# Imports

In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path so the
# local ``ppm`` package imported below can be resolved.
PROJECT_DIR = Path.cwd().parent
sys.path += [str(PROJECT_DIR)]

# basics
import numpy as np
import pandas as pd
import requests as r
import re
import geopandas as gpd

# viz
import matplotlib.pyplot as plt
import seaborn as sns

# utils
import os
import time
from tqdm import tqdm
from geopy.geocoders import Nominatim
import geopandas as gpd
import pgeocode
import urllib
from shapely.geometry import Point
from itertools import chain
from kedro.config import ConfigLoader
from ppm.utils.external_get_data import (
    find_regex_cep,
    search_cep,
    find_census_area_by_zip,
    get_connection,
    scrapping_zipimoveis,
    find_cbg
)
from ppm.utils.readers import (
    reader,
    unzip_file
)
from ppm.utils.feature_process import (
    replace_names,
    transform_float,
    convert_to_float
)

# Parameters

In [None]:
# Federative unit (state abbreviation) used in folder names and addresses.
uf = 'PB'

# Root folders. ``path_root_oos`` holds the outputs written by this notebook
# (intermediate / primary); ``path_root_data`` holds the inputs it reads.
path_root_oos = os.path.join('..', 'oos_n2')  # data or oos
path_root_data = os.path.join('..', 'data')  # data or oos

# Layer folders derived from the two roots.
path_raw = os.path.join(path_root_data, '01_raw')
path_intermediate = os.path.join(path_root_oos, '02_intermediate')
path_intermediate_data = os.path.join(path_root_data, '02_intermediate')
path_primary = os.path.join(path_root_oos, '03_primary')

# Remote endpoints.
root_brasilapi = 'https://brasilapi.com.br/api'
root_zap = 'https://www.zapimoveis.com.br/'
# A URL is not a filesystem path: concatenate it explicitly instead of relying
# on os.path.join, which only produced a valid URL because ``root_zap`` ends
# with "/". The ``{}`` placeholder receives the page number via ``.format``.
url_path_zapimoveis = root_zap + 'venda/imoveis/pb+joao-pessoa/?pagina={}'

# Census archive (unzipped folder) and the CSV folder nested inside it.
path_census_data = os.path.join(path_raw, 'PB_20171016')
path_census_data_csv = os.path.join(
    path_census_data,
    uf,
    f'Base informaçoes setores2010 universo {uf}',
    'CSV',
)

# Individual input / output files.
file_path_data_merged = os.path.join(
    path_intermediate, 'scrapping_data_concat.csv'
)
file_path_data_shp = os.path.join(
    path_raw, 'PB_Setores_2021', 'PB_Setores_2021.shp'
)
file_path_shp_processed = os.path.join(
    path_intermediate_data, "data_shp_processed"
)
file_path_census_processed = os.path.join(
    path_intermediate_data, "data_census_processed.csv"
)
file_path_processed = os.path.join(path_primary, "data_processed.csv")
file_path_processed_concat = os.path.join(
    path_primary, "data_processed_concat.csv"
)
file_path_data_input = os.path.join(path_primary, "data_input.csv")

# Read datasets

In [None]:
# Extract the census archive only while the target path does not exist yet;
# the archive is expected next to it, with a ".zip" suffix.
if not os.path.exists(path_census_data):
    unzip_file(f'{path_census_data}.zip', path_census_data)

In [None]:
# Map every entry of the census CSV folder to its full path, keyed by the part
# of the file name that precedes the first dot.
file_path_census_data = {}
for csv_name in os.listdir(path_census_data_csv):
    dataset_key = csv_name.split(".", 1)[0]
    file_path_census_data[dataset_key] = os.path.join(path_census_data_csv, csv_name)

In [None]:
# Census table, read as a plain CSV.
data_census_merged = pd.read_csv(filepath_or_buffer=file_path_census_processed)
# Sector layer, read through geopandas (later used to locate listing points).
data_shp = gpd.read_file(file_path_shp_processed)

# Scraping

In [None]:
# First and last result page to download (the scraping loop includes both).
n_pages_init, n_pages_final = 51, 100  # 101

In [None]:
# Download every result page in the configured range; results are stored under
# the key ``page_n<number>``.
data_results_scrapping = {}
page_numbers = range(n_pages_init, n_pages_final + 1)
for page_number in tqdm(page_numbers):
    page_url = url_path_zapimoveis.format(page_number)
    data_results_scrapping[f'page_n{page_number}'] = scrapping_zipimoveis(page_url)
    # Wait 10 seconds before requesting the next page.
    time.sleep(10)

In [None]:
# Flatten the scraped pages into a single table.
# Each listing becomes its own DataFrame (ID, price, local + one column per
# feature); listings are concatenated per page, then all pages together.
page_frames = {}
errors = []
for page, content in tqdm(data_results_scrapping.items()):
    listing_frames = []
    for listing_id, listing in content.items():
        record = {
            'ID': listing_id.replace('id-', ''),
            'price': transform_float(listing['price'][0]),
            'local': listing['local'],
        }
        for f_name, f_value in listing['features'].items():
            record[f_name] = replace_names(f_value)
        listing_frames.append(pd.DataFrame(record))
    try:
        page_frames[page] = pd.concat(listing_frames, ignore_index=True)
    except Exception as e:
        # Pages that cannot be concatenated are skipped and recorded.
        errors.append((page, e))

# Empty strings are treated as missing values in the final table.
content_final = pd.concat(
    list(page_frames.values()), ignore_index=True
).replace('', np.nan)

In [None]:
cols_selected = ['areas', 'bedrooms', 'parking-spaces', 'bathrooms']


def _range_midpoint(text):
    """Return the mean of the first two "-"-separated integers in *text*."""
    parts = text.split("-")
    return (int(parts[0]) + int(parts[1])) / 2


# True where the cell is a string containing a dash (e.g. "2-3").
string_rows = content_final[cols_selected].apply(
    lambda column: column.str.contains('-', na=False)
)

# Replace every dashed value by the midpoint of its two bounds.
for col_name in cols_selected:
    midpoints = content_final.loc[string_rows[col_name], col_name].apply(_range_midpoint)
    content_final.loc[midpoints.index, col_name] = midpoints.values

In [None]:
# Look up a postal code (CEP) for every listing address.
# The lookup is best effort: any failure for one address yields NaN instead of
# aborting the batch. ``Exception`` (rather than a bare ``except``) keeps
# Ctrl-C / kernel interrupts working during this long-running loop.
ceps = []
for address in tqdm(content_final['local']):
    try:
        ceps.append(search_cep(address)[1])
    except Exception:
        ceps.append(np.nan)
content_final['cep'] = ceps
# Presumably normalises other missing markers (e.g. None) to NaN.
content_final['cep'] = content_final['cep'].fillna(np.nan)

# Pre-processing

In [None]:
# Column groups used by the pre-processing steps.
cols_to_drop = ["bathrooms"]
which_rows_to_drop = ['areas', 'bedrooms', 'parking-spaces']
id_tag = ["ID"]

# Work on a copy so the scraped table itself is left untouched.
content_final_process = content_final.copy(deep=True)

In [None]:
# Convert every price through the project helper ``convert_to_float``.
content_final_process['price'] = content_final_process['price'].apply(convert_to_float)

In [None]:
def _geocode(geolocator, address):
    """Return ``(latitude, longitude)`` for *address*, or ``(nan, nan)``.

    Best effort: a lookup with no match (``geocode`` returns ``None``) or any
    error raised by the service yields NaNs so a single address never aborts
    the batch. Both coordinates are produced together, so the latitude and
    longitude lists can never get out of step.
    """
    try:
        location = geolocator.geocode(address)
        if location is None:
            return np.nan, np.nan
        return float(location.latitude), float(location.longitude)
    except Exception:
        return np.nan, np.nan


lats = []
longs = []
# ``cbg`` and ``nomi`` are not referenced again in the visible notebook; they
# are kept so that any later cell relying on them keeps working.
cbg = []
nomi = pgeocode.Nominatim('br')

# Build the geocoding client once instead of once per address.
# NOTE(review): the public Nominatim service limits request rates; consider
# geopy's RateLimiter if lookups start failing — confirm against usage policy.
geolocator = Nominatim(user_agent="geolocalização")
for local in tqdm(content_final_process.local):
    # Missing addresses (None or NaN) cannot be geocoded.
    if pd.isna(local):
        latitude, longitude = np.nan, np.nan
    else:
        latitude, longitude = _geocode(geolocator, local + f', {uf}, Brasil')
    lats.append(latitude)
    longs.append(longitude)

In [None]:
# Attach the geocoded coordinates as two new columns.
content_final_process = content_final_process.assign(lats=lats, longs=longs)

In [None]:
# Create one point object per listing from its (longitude, latitude) pair.
pontos = list(
    map(Point, zip(content_final_process['longs'], content_final_process['lats']))
)

In [None]:
# Ask the project helper for one sector code per point and store the result
# in the ``cd_setor`` column.
cbgs = find_cbg(data_shp, pontos)
content_final_process = content_final_process.assign(cd_setor=cbgs)

In [None]:
# Move the identifier columns to the front, keeping the others in their
# current order.
id_columns = ["ID", "cd_setor"]
remaining_columns = [
    name for name in content_final_process.columns if name not in id_columns
]
content_final_process = content_final_process[id_columns + remaining_columns]

# Keep only the listings that were assigned a sector code.
content_processed = (
    content_final_process
    .dropna(subset=["cd_setor"])
    .reset_index(drop=True)
)

In [None]:
# Cast the join key to string on both tables so the merge compares like with like.
content_processed['cd_setor'] = content_processed['cd_setor'].astype(str)
data_census_merged['cd_setor'] = data_census_merged['cd_setor'].astype(str)

## Save scraped data

In [None]:
# Persist the geocoded listings (the index is written as the first column).
content_processed.to_csv(path_or_buf=file_path_processed)

# Concat census

In [None]:
# Left-join the census table, then the ``nm_sit`` column of the sector layer,
# on the sector code so every listing is kept.
data_merged = pd.merge(
    content_processed, data_census_merged, on="cd_setor", how="left"
)
shp_columns = data_shp[["cd_setor", "nm_sit"]]
data_merged = pd.merge(data_merged, shp_columns, on="cd_setor", how="left")

# ``bairro`` is the text after the last ", " of the address; it is inserted
# as the fifth column.
bairro_values = data_merged["local"].apply(
    lambda address: address.rsplit(", ", 1)[-1]
)
data_merged.insert(4, "bairro", bairro_values)

In [None]:
# Replace spaces in column names by underscores.
data_merged.columns = data_merged.columns.str.replace(" ", "_")
# Names of the columns that contain at least one missing value.
na_counts = data_merged.isna().sum()
columns_na = na_counts[na_counts != 0].index

In [None]:
# Fill missing values with the mean of the listing's own neighbourhood.
#
# Rows are grouped by *exact* neighbourhood name. Matching with
# ``str.contains`` treats the name as a substring/regex, so e.g. "Centro" also
# captured "Centro Histórico": rows were duplicated across groups and group
# means were computed over the wrong neighbourhoods.
data_bairros = {}
errors = []
for bairro in tqdm(data_merged.bairro.unique()):
    group = data_merged[data_merged.bairro == bairro].reset_index(drop=True)
    for na_col in tqdm(columns_na):
        try:
            as_float = group[na_col].astype(float)
        except (ValueError, TypeError) as e:
            # Column is not numeric for this group: leave it untouched and
            # record why.
            errors.append(e)
            continue
        group[na_col] = as_float.fillna(as_float.mean())
    data_bairros[bairro] = group

data_merged_not_nan = pd.concat(list(data_bairros.values()), ignore_index=True)
# Rows that still have gaps (e.g. groups with no value at all) are discarded.
data_merged_not_nan = data_merged_not_nan.dropna().reset_index(drop=True)
# Keep the first occurrence of every listing ID.
data_merged_not_nan = data_merged_not_nan.drop_duplicates(
    subset='ID'
).reset_index(drop=True)

In [None]:
# Remove every column whose name contains "situacao".
situacao_columns = data_merged_not_nan.columns[
    data_merged_not_nan.columns.str.contains("situacao")
]
data_merged_not_nan = data_merged_not_nan.drop(columns=situacao_columns)

## Save data processed

In [None]:
# Persist the merged, gap-free table (the index is written as the first column).
data_merged_not_nan.to_csv(path_or_buf=file_path_processed_concat)

# Drop columns with problems

In [None]:
# Columns to discard: every column whose name contains "cod", plus a fixed
# list of address / location columns.
column_names = data_merged_not_nan.columns
cods_cols = list(column_names[column_names.str.contains("cod")])
other_cols = ["cep", "lats", "longs", "local", "nm_sit"]
all_columns = cods_cols + other_cols
data_merged_not_nan = data_merged_not_nan.drop(columns=all_columns)

# Categorical columns process

In [None]:
# Encode categorical columns as integer codes, numbered in order of first
# appearance.
#
# ``cat_dict[col]`` keeps the *complete* ``category -> code`` mapping so the
# encoding can be inspected or reused; building it once also avoids rewriting
# the whole column once per category.
cat_cols = ["bairro"]
cat_dict = {}
for col in cat_cols:
    cat_dict[col] = {
        category: idx
        for idx, category in enumerate(data_merged_not_nan[col].unique())
    }
    data_merged_not_nan[col] = data_merged_not_nan[col].map(cat_dict[col])

In [None]:
# New table in which the literal "X" is treated as a missing value;
# ``replace`` returns a fresh DataFrame, so the source table is not modified.
data_final = data_merged_not_nan.replace("X", np.nan)

## Process problematic columns

In [None]:
# Encode every column whose name contains "nome" as integer codes, numbered in
# order of first appearance.
nome_columns = data_final.columns[data_final.columns.str.contains("nome")]
for cat in nome_columns:
    cat_values = {
        value: code for code, value in enumerate(data_final[cat].unique())
    }
    data_final[cat] = data_final[cat].replace(cat_values)

# Cast every column to float. Columns that fail the direct cast are retried
# after swapping decimal commas for dots (e.g. "1,5" -> "1.5"). Only conversion
# errors are caught, so interrupts and unrelated failures still surface.
for pro_col in data_final.columns:
    try:
        data_final[pro_col] = data_final[pro_col].astype(float)
    except (ValueError, TypeError):
        data_final[pro_col] = (
            data_final[pro_col].astype(str).str.replace(",", ".").astype(float)
        )

## Drop bathrooms

In [None]:
# Remove the ``bathrooms`` column from the final table.
data_final.drop(columns=["bathrooms"], inplace=True)

# Save dataset final

In [None]:
# Persist the final table (the index is written as the first column).
data_final.to_csv(path_or_buf=file_path_data_input)