# Planejamento da Solução

   - Produto Final:
        - Um link (URL):
            - Informações importantes
            - Mapa interativo
            - Dashboard interativo
            
            
   - Ferramentas:
       - Jupyter


   - Processos:
       - Organizar o código em funções e seções
       - Pesquisar bibliotecas para a criação de mapas na Web

# Imports

In [24]:
import pandas as pd
import numpy as np
import plotly.express as px

from geopy.geocoders import Nominatim

# Functions

In [25]:
def show_dtypes(data):
    """Print the dtype listing of ``data``.

    Args:
        data: Any object exposing a ``dtypes`` attribute, such as a pandas
            DataFrame.

    Returns:
        None. The listing is only written to stdout.
    """
    column_types = data.dtypes
    print(column_types)

In [26]:
def show_dimensions(data):
    """Print the number of rows and columns of ``data``.

    Args:
        data: Any object whose ``shape`` has at least two entries, such as a
            pandas DataFrame.

    Returns:
        None. The two counts are only written to stdout.
    """
    # One template per axis, in axis order: shape[0] is rows, shape[1] columns.
    templates = ('Number of rows:{}', 'Number of columns:{}')
    for axis, template in enumerate(templates):
        print(template.format(data.shape[axis]))

In [27]:
def collect_geodata(data, cols, geolocator=None):
    """Reverse-geocode every row of ``data`` and store two address fields.

    Each row's ``lat`` and ``long`` values are joined as ``"<lat>,<long>"``
    and sent to ``geolocator.reverse``. The address entries named by
    ``cols[0]`` and ``cols[1]`` are written to columns of the same names;
    rows for which an entry is unavailable get the placeholder ``'NA'``.

    Args:
        data: DataFrame with ``lat`` and ``long`` columns. It is modified in
            place (the two columns are added or overwritten).
        cols: Sequence whose first two items are the address keys to
            extract, e.g. ``['road', 'house_number']``.
        geolocator: Optional object exposing ``reverse(query)``. When
            omitted, ``Nominatim(user_agent='geoapi')`` is created, which
            matches the previous behaviour.

    Returns:
        The same ``data`` object, with the two address columns filled in.
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent='geoapi')

    first_key, second_key = cols[0], cols[1]
    first_values = []
    second_values = []

    # Walk the coordinate columns positionally so the frame does not need a
    # default 0..n-1 index (label lookups with .loc[i] would fail otherwise).
    for lat, lon in zip(data['lat'], data['long']):
        response = geolocator.reverse(str(lat) + ',' + str(lon))

        # reverse() yields None when the service finds nothing for the point,
        # and the raw payload is not guaranteed to carry an 'address' entry.
        address = {} if response is None else response.raw.get('address', {})

        first_values.append(address.get(first_key, 'NA'))
        second_values.append(address.get(second_key, 'NA'))

    data[first_key] = first_values
    data[second_key] = second_values

    return data

# ETL

In [37]:
def data_collect(path):
    """Read the raw dataset from a CSV file and print a short extraction report.

    Args:
        path: Location of the CSV file; forwarded unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    data = pd.read_csv(path)

    # Extraction analysis: report the shape and the column dtypes of what was
    # just read. (A bare ``data.head()`` used to sit here; outside a notebook
    # cell its result is discarded, so it was dropped.)
    show_dimensions(data)
    show_dtypes(data)

    return data

In [38]:
def _summarize_numeric(data):
    """Build a min/max/mean/median/std table for the int64/float64 columns."""
    num_attributes = data.select_dtypes(include=['int64', 'float64'])

    # The order of the reducers must line up with the labels assigned below.
    reducers = [np.min, np.max, np.mean, np.median, np.std]
    summary = pd.concat(
        [pd.DataFrame(num_attributes.apply(reducer, axis=0)) for reducer in reducers],
        axis=1,
    ).reset_index()
    summary.columns = ['Atributos', 'Minimo', 'Maximo', 'Media', 'Mediana', 'Std']
    return summary


def _classify_dormitory(bedrooms):
    """Map bedroom counts to 'studio' (1), 'apartment' (2) or 'house' (rest)."""
    return np.select(
        [bedrooms == 1, bedrooms == 2],
        ['studio', 'apartment'],
        default='house',
    )


def _classify_price(price):
    """Map prices to the integer levels 0-3 using contiguous price bands."""
    # np.select takes the first matching condition, so each band implicitly
    # starts where the previous one ends and no price can fall in a gap.
    return np.select(
        [price < 321950, price < 450000, price < 645000],
        [0, 1, 2],
        default=3,
    )


def data_transform(data):
    """Clean the raw dataset and derive the classification columns.

    Steps performed on ``data`` (which is modified in place):

    * ``date`` is converted with ``pd.to_datetime``.
    * A descriptive-statistics table of the numeric columns is built and its
      dimensions are printed.
    * ``dormitory_type`` is derived from ``bedrooms``: 1 -> ``'studio'``,
      2 -> ``'apartment'``, anything else -> ``'house'``.
    * ``nivel`` is derived from ``price`` as an integer level:
      below 321950 -> 0, 321950 up to (not including) 450000 -> 1,
      450000 up to (not including) 645000 -> 2, everything else -> 3.
      Prices of exactly 321950 or 450000 previously matched no band and were
      labelled 3; they now land in levels 1 and 2 respectively.
    * The first 20 rows are reverse-geocoded through ``collect_geodata``.

    Args:
        data: DataFrame with at least the ``date``, ``bedrooms``, ``price``,
            ``lat`` and ``long`` columns.

    Returns:
        The same ``data`` object with ``dormitory_type`` and ``nivel`` added.
    """
    # Convert object to date.
    data['date'] = pd.to_datetime(data['date'])

    # Descriptive statistics; floats are displayed with two decimals.
    # Note that set_option changes a process-wide pandas display setting.
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    show_dimensions(_summarize_numeric(data))

    # Vectorised classification: the values are assigned positionally, so the
    # frame does not need a default 0..n-1 index, and 'nivel' gets a numeric
    # dtype instead of an object column that started out as the string 'NA'.
    data['dormitory_type'] = _classify_dormitory(data['bedrooms'].to_numpy())
    data['nivel'] = _classify_price(data['price'].to_numpy())

    # Geocode a 20-row sample on an explicit copy so the new address columns
    # never touch (or warn about) the parent frame.
    # NOTE(review): as before, the geocoded sample is not merged back into
    # `data`; confirm whether the address columns are meant to be returned.
    collect_geodata(data.head(20).copy(), ['road', 'house_number'])

    return data

In [39]:
def data_load(data):
    """Render the houses of ``data`` on an interactive map.

    Args:
        data: DataFrame providing the ``id``, ``price``, ``lat``, ``long``
            and ``nivel`` columns.

    Returns:
        None. The figure is displayed through ``fig.show()``.
    """
    # Work on a copy restricted to the columns selected for the map.
    map_columns = ['id', 'price', 'lat', 'long', 'nivel']
    house = data[map_columns].copy()

    # Markers are coloured by 'nivel' and sized by 'price'.
    marker_options = dict(
        lat='lat',
        lon='long',
        color='nivel',
        size='price',
        color_continuous_scale=px.colors.cyclical.Edge,
        size_max=15,
        zoom=10,
    )
    fig = px.scatter_mapbox(house, **marker_options)

    # Base map style plus a fixed-height, margin-free layout.
    fig.update_layout(
        mapbox_style='open-street-map',
        height=600,
        margin={'r': 0, 't': 0, 'l': 0, 'b': 0},
    )
    fig.show()

In [None]:
if __name__ == '__main__':
    # Pipeline entry point: each stage feeds the next one.

    # Extract: read the raw CSV from a path relative to the working directory.
    data_raw = data_collect('../datasets/kc_house_data.csv')
    
    # Transform: derive the extra columns used by the map.
    data_processing = data_transform(data_raw)
    
    # Load: display the result as an interactive map.
    data_load(data_processing)