# Imports

In [13]:
# basics
import numpy as np
import pandas as pd
import requests as r
import re
import zipfile
import geopandas as gpd

# model
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    r2_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
    median_absolute_error
)

# viz
import matplotlib.pyplot as plt
import seaborn as sns

# utils
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from tqdm import tqdm
from geopy.geocoders import Nominatim
import geopandas as gpd
import pgeocode
import urllib
from shapely.geometry import Point
from itertools import chain
import sys
from pathlib import Path
import kedro

import sys
from kedro.config import ConfigLoader
from pathlib import Path
PROJECT_DIR = Path.cwd().parent
sys.path.append(str(PROJECT_DIR))

# Parameters

In [16]:
# State (UF) abbreviation used to build API URLs and census folder names.
uf = 'PB'

# Local data folders (relative to the notebook directory).
root = os.path.join('..', 'oos')  # data or oos
path_raw = os.path.join(root, '01_raw')
path_intermediate = os.path.join(root, '02_intermediate')
path_primary = os.path.join(root, '03_primary')

# Remote endpoints. URLs are assembled with "/" rather than os.path.join so
# they do not depend on the OS path separator, and they are rooted at the
# BrasilAPI base URL instead of the local data folder.
root_brasilapi = 'https://brasilapi.com.br/api'
root_zap = 'https://www.zapimoveis.com.br/'
# Kept under its own name: it used to be assigned to `url_path_ibge` and was
# immediately overwritten by the municipalities URL below.
url_path_ibge_uf = '/'.join([root_brasilapi, 'ibge', 'uf', 'v1'])
url_path_pix = '/'.join([root_brasilapi, 'pix', 'v1', 'participants'])
# `{}` is the page-number placeholder filled in with str.format().
url_path_zapimoveis = root_zap + 'venda/imoveis/pb+joao-pessoa/?pagina={}'
url_path_ibge = '/'.join([
    root_brasilapi, 'ibge', 'municipios', 'v1',
    f'{uf}?providers=dados-abertos-br,gov,wikipedia',
])

# Census files extracted from the raw archive.
path_census_data = os.path.join(path_raw, 'PB_20171016')
path_census_data_csv = os.path.join(
    path_census_data,
    f'{uf}',
    f'Base informaçoes setores2010 universo {uf}',
    'CSV',
)

# Input / output files.
file_path_data_merged = os.path.join(
    path_intermediate, 'scrapping_data_concat.csv'
)
file_path_data_shp = os.path.join(
    path_raw,
    'PB_Setores_2021',
    'PB_Setores_2021.shp',
)
file_path_processed = os.path.join(path_primary, "data_processed.csv")
file_path_data_input = os.path.join(path_primary, "data_input.csv")

In [17]:
# Manual check of the BrasilAPI CEP endpoint using one sample postal code;
# the decoded JSON is displayed as the cell output.
response = r.get('https://brasilapi.com.br/api/cep/v1/58070403')
response.json()

{'cep': '58070403',
 'state': 'PB',
 'city': 'João Pessoa',
 'neighborhood': 'Cristo Redentor',
 'street': 'Rua José Borges Coutinho',
 'service': 'correios'}

# Methods

In [None]:
def unzip_file(zip_path: str,
               extract_path: str) -> bool:
    """Extract every member of a zip archive into a directory.

    Args:
        zip_path: Path of the ``.zip`` file to read.
        extract_path: Directory that receives the extracted members.

    Returns:
        Always ``True`` once extraction finishes; failures surface as
        exceptions raised by ``zipfile``.
    """
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_path)
    return True

def replace_names(value: str) -> str:
    """Strip feature labels, the ``m²`` unit and all spaces from a feature text.

    Args:
        value: Raw feature text (e.g. ``'area 70 m²'``).

    Returns:
        The text with each known token removed, in a fixed order.
    """
    # Order matters: labels go first, the blanket space removal goes last.
    tokens_to_strip = ('parking ', 'bedroom ', 'bathroom ', 'area ', 'm²', ' ')
    for token in tokens_to_strip:
        value = value.replace(token, '')
    return value

def transform_float(value: str) -> "float | str":
    """Parse a Brazilian-formatted price string into a float.

    The ``"R$ "`` prefix and the ``.`` thousands separators are removed, and a
    decimal comma (``"1.234,56"``) is read as a decimal point.

    Args:
        value: Price text such as ``"R$ 450.000"``.

    Returns:
        The parsed number, or the cleaned text (prefix and dots removed) when
        it is not numeric, so callers can still inspect the raw value.
    """
    cleaned = value.replace("R$ ", "").replace(".", "")
    try:
        return float(cleaned.replace(",", "."))
    except ValueError:
        # Non-numeric text (e.g. a "price on request" label) is passed through.
        return cleaned

def convert_to_float(x: str) -> float:
    """Convert a value to float, using ``-1`` as the "not a number" marker.

    Args:
        x: Value to convert (typically a string or a number).

    Returns:
        ``float(x)`` when the conversion succeeds, otherwise ``-1``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to the sentinel; interrupts still propagate.
        return -1

def get_cbg(lat, long):
    """Query the FCC census-block lookup endpoint for a coordinate pair.

    Args:
        lat: Latitude placed in the ``latitude`` query parameter.
        long: Longitude placed in the ``longitude`` query parameter.

    Returns:
        The raw HTTP response object; JSON decoding is left to the caller.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url_r = f'https://geo.fcc.gov/api/census/block/find?latitude={lat}&longitude={long}&censusYear=2020&showall=true&format=json'
    # The requests module is imported as `r` in this file; the bare name
    # `requests` was never bound. A timeout keeps a stalled request from
    # hanging the notebook.
    result = r.get(url_r, timeout=30)
    return result

In [None]:
def find_regex_cep(text: str) -> str:
    """Return the first ``NNNNN-NNN`` postal code found in *text*.

    Args:
        text: Text to scan.

    Returns:
        The matched code, or ``None`` when the text contains none.
    """
    match = re.search(r"\d{5}-\d{3}", text)
    return match.group() if match else None
    
def search_cep(endereco):
    """Geocode an address with Nominatim and pull a postal code from the top hit.

    Args:
        endereco: Free-text address passed to the geocoder.

    Returns:
        A ``(location, cep)`` tuple. ``location`` is whatever the geocoder
        returned for ``exactly_one=False`` (``None`` when nothing was found);
        ``cep`` is the code found in the first result's ``display_name``, or
        ``None``.
    """
    location = Nominatim(user_agent="my_geocoder").geocode(
        endereco, exactly_one=False
    )
    if location is None:
        return location, None
    display_name = location[0].raw['display_name']
    return location, find_regex_cep(display_name)
    
def find_census_area_by_zip(zip_code: str) -> str:
    """Look up the ``ibge`` field that ViaCEP reports for a postal code.

    Args:
        zip_code: Postal code inserted into the ViaCEP URL.

    Returns:
        The ``ibge`` value as a string, or ``-1`` when the request does not
        return HTTP 200 or the payload has no ``ibge`` field.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = f'https://viacep.com.br/ws/{zip_code}/json/'
    # The requests module is imported as `r` in this file; the bare name
    # `requests` was never bound.
    response = r.get(url, timeout=30)

    if response.status_code != 200:
        return -1
    census_area = response.json().get('ibge')
    if census_area is None:
        # Avoid returning the literal string 'None' for a payload without the field.
        return -1
    return str(census_area)
#search_cep("Rua jose borges coutinho, 68, cristo")

In [None]:
def get_connection(url: str,
                   root_xpath: str,
                   max_retries: int = 6) -> tuple:
    """Open *url* in headless Chrome and collect the elements matching *root_xpath*.

    The page is reloaded up to *max_retries* times while no element is found.

    Args:
        url: Page to load.
        root_xpath: XPath selecting the elements of interest.
        max_retries: Maximum number of reloads when the first load yields nothing.

    Returns:
        ``(driver, list_local, limit_find)``: the live driver (the caller must
        ``quit()`` it), the matched elements, and ``True`` when every retry was
        used without finding any element.
    """
    # Configure Selenium to run Chrome in headless mode.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        list_local = driver.find_elements(By.XPATH, root_xpath)
        print('- number of locals found: [{}]'.format(len(list_local)))
        print()
        attempt = 0
        while not list_local and attempt < max_retries:
            attempt += 1
            print(f'trying: [{attempt}]')
            # Reload the requested page itself, not the unformatted URL template.
            driver.get(url)
            list_local = driver.find_elements(By.XPATH, root_xpath)
    except Exception:
        # Do not leave a Chrome process behind when navigation fails.
        driver.quit()
        raise
    # Only report the limit when the retries ended with nothing found.
    limit_find = not list_local
    return driver, list_local, limit_find

In [None]:
def scrapping_zipimoveis(url: str) -> dict:
    """Scrape one listing page and return the data of each listing card.

    Args:
        url: Listing page to open.

    Returns:
        A dict keyed by ``'id-<data-id>'``. Each value holds ``'price'`` (a
        one-item list with the price text), ``'local'`` (a one-item list with
        the address text) and ``'features'`` (feature name -> feature text).
        Cards that cannot be parsed are skipped; an empty dict is returned when
        the page itself cannot be loaded.
    """
    root_xpath = "//div[@class='card-container js-listing-card']"
    # Every lookup below is relative (".//") so it is scoped to the current
    # card. An XPath starting with "//" searches the whole document even when
    # called on an element, which forced positional indexing across the page.
    i_xpath_price = ".//div[@class='simple-card__listing-prices simple-card__prices']"
    i_xpath_amenities = ".//ul[@class='feature__container simple-card__amenities']"
    i_xpath_feature = ".//li[contains(@class, 'feature__item')]"
    i_xpath_local = ".//h2[@class='simple-card__address color-dark text-regular']"

    # Bound before the try block so `finally` and `return` are always safe.
    content_data = {}
    driver = None
    try:
        driver, list_local, _ = get_connection(
            url, root_xpath
        )

        for card in list_local:
            ID = f'id-{card.get_attribute("data-id")}'
            try:
                price_box = card.find_element(By.XPATH, i_xpath_price)
                price_extracted = price_box.find_element(By.TAG_NAME, "strong").text
                local_extracted = card.find_element(By.XPATH, i_xpath_local).text

                # A card without an amenities list simply has no features.
                card_amenities = card.find_elements(By.XPATH, i_xpath_amenities)
                elementos = (
                    card_amenities[0].find_elements(By.XPATH, i_xpath_feature)
                    if card_amenities else []
                )
                # Feature name = last CSS class of the item minus its 3-char prefix.
                features = {
                    el.get_attribute("class").split(" ")[-1][3:]: el.text.strip()
                    for el in elementos
                }
            except Exception as e:
                # Skip only the broken card; keep what was already collected.
                print('Erro durante a execução:', e)
                continue

            # Store the entry only once complete, so every value has all three keys.
            content_data[ID] = {
                'price': [price_extracted],
                'local': [local_extracted],
                'features': features,
            }
    except Exception as e:
        print('Erro durante a execução:', e)

    finally:
        if driver is not None:
            driver.quit()
    return content_data

In [None]:
def reader(url: str) -> pd.DataFrame:
    """Read a ``;``-separated CSV, falling back to Latin-1 when it cannot be decoded.

    Args:
        url: Path or URL accepted by ``pandas.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    try:
        return pd.read_csv(url, sep=';')
    except UnicodeDecodeError:
        # The retry differs only in its encoding, so it is attempted only for
        # decoding failures; any other error propagates with its original cause.
        return pd.read_csv(url, sep=';', encoding='latin-1')

In [None]:
def transform_coefs(x: float) -> float:
    """Scale absolute values so they add up to one.

    Args:
        x: Values to normalise. The built-in ``sum`` is applied to the absolute
            values, so this is presumably a 1-D array of coefficients — confirm
            against the caller.

    Returns:
        ``|x|`` divided by the sum of ``|x|``.
    """
    magnitudes = np.abs(x)
    total = sum(magnitudes)
    return magnitudes / total

# Read datasets

In [None]:
# Extract the census archive only when its target folder does not exist yet.
if not os.path.exists(path_census_data):
    unzip_file(f'{path_census_data}.zip', path_census_data)

In [None]:
# Read every census CSV into a dict of DataFrames, keeping the keys of
# `file_path_census_data`.
# NOTE(review): `file_path_census_data` is not defined in the cells above —
# presumably a mapping of table name -> CSV path; confirm where it is created.
data_census = {
    i: reader(url) for i, url in file_path_census_data.items()
}

In [None]:
# Load the census-sector shapefile and normalise its column names to lower case.
data_shp = gpd.read_file(file_path_data_shp)
data_shp.columns = [column.lower() for column in data_shp.columns]

# Concatenate census data

In [None]:
# Merge every census table into one wide frame keyed by the sector code.
data_census_merged = None
for name, content in data_census.items():
    # Work on a copy so the frames kept in `data_census` retain their original
    # headers; renaming them in place made a second run of this cell append
    # the table suffix twice.
    table = content.copy()
    table.columns = table.columns.str.lower()
    # The first column is the sector code; every other column is tagged with
    # the table name so columns from different tables cannot collide.
    table.columns = ["cod_setor"] + [
        '{}_{}'.format(col, name) for col in table.columns[1:]
    ]
    if data_census_merged is None:
        data_census_merged = table
    else:
        # No `suffixes` argument: the original passed the same suffix for both
        # sides, which cannot disambiguate overlapping columns.
        data_census_merged = data_census_merged.merge(
            table,
            on=["cod_setor"],
        )
    data_census_merged.columns = data_census_merged.columns.str.lower()

In [None]:
# Remove pandas' placeholder "unnamed" columns, keep the first occurrence of
# any repeated column name, and align the key name with the shapefile.
unnamed_cols = data_census_merged.columns[
    data_census_merged.columns.str.contains('unnamed')
]
data_census_merged = data_census_merged.drop(columns=unnamed_cols)
is_repeated = data_census_merged.columns.duplicated()
data_census_merged = data_census_merged.loc[:, ~is_repeated]
data_census_merged = data_census_merged.rename(columns={"cod_setor": "cd_setor"})

# Filter data 

In [None]:
# filter_joao_pessoa
# Keep only sectors whose code starts with 2507507. A prefix match is used
# because a substring match would also keep any sector whose code merely
# contains those digits further along.
data_census_merged = data_census_merged[
    data_census_merged.cd_setor.astype(str).str.startswith('2507507')
].copy()

In [None]:
# filter_joao_pessoa
# Keep only sectors whose code starts with 2507507. A prefix match is used
# because a substring match would also keep any sector whose code merely
# contains those digits further along.
data_shp = data_shp[
    data_shp.cd_setor.astype(str).str.startswith('2507507')
].copy()
# Kept for compatibility: the filter above already requires a `cd_setor`
# column, so this rename only matters if a `cod_setor` column also exists.
data_shp.rename(
    columns={"cod_setor": "cd_setor"},
    inplace=True,
)

# Scraping

In [None]:
# First and last result page to scrape (both inclusive in the loop below).
n_pages_init = 51
n_pages_final = 101

In [None]:
# Scrape each result page in turn, pausing 10 seconds after every request.
data_results_scrapping = {}
for page_number in tqdm(range(n_pages_init, n_pages_final + 1)):
    page_url = url_path_zapimoveis.format(page_number)
    data_results_scrapping[f'page_n{page_number}'] = scrapping_zipimoveis(page_url)
    time.sleep(10)

In [None]:
# Turn the nested scraping results into one flat DataFrame (one row per listing).
content_final = {}
errors = []
for page, content in tqdm(data_results_scrapping.items()):
    listing_frames = []
    for listing_id, listing in content.items():
        row = {
            'ID': listing_id.replace('id-', ''),
            'price': transform_float(listing['price'][0]),
            # 'local' is a one-item list, which makes each frame exactly one row.
            'local': listing['local'],
        }
        for f_name, f_value in listing['features'].items():
            row[f_name] = replace_names(f_value)
        listing_frames.append(pd.DataFrame(row))
    try:
        content_final[page] = pd.concat(listing_frames, ignore_index=True)
    except Exception as e:
        # E.g. a page with no listings; remember it and move on.
        errors.append((page, e))
content_final = pd.concat(
    list(content_final.values()), ignore_index=True
).replace('', np.nan)

In [None]:
# Features published as a range ("2-3") are replaced by the range midpoint.
cols_selected = ['areas', 'bedrooms', 'parking-spaces', 'bathrooms']
string_rows = content_final[cols_selected].apply(
    lambda x: x.str.contains('-', na = False)
)

for col in cols_selected:
    ranged = content_final.loc[string_rows[col], col]
    midpoints = ranged.apply(
        lambda x: (int(x.split("-")[0]) + int(x.split("-")[1])) / 2
    )
    content_final.loc[midpoints.index, col] = midpoints.values

In [None]:
# Look up a postal code for every scraped address (best effort).
ceps = []
for i in tqdm(content_final['local']):
    try:
        ceps.append(search_cep(i)[1])
    except Exception:
        # A failed lookup leaves the row without a code. `Exception` rather
        # than a bare except, so Ctrl-C can still stop this long network loop.
        ceps.append(np.nan)
content_final['cep'] = ceps
# Presumably normalises the `None` returned for "no code found" to NaN.
content_final['cep'] = content_final['cep'].fillna(np.nan)

## Save scraping

In [None]:
# Save the raw scraping result to the intermediate file. It used to be written
# to `file_path_processed`, which the later processed-data save overwrites,
# while `file_path_data_merged` was defined but never used.
content_final.to_csv(file_path_data_merged)

# Pre-processing

In [None]:
# Column groups for the pre-processing stage, plus a working copy of the
# scraped data so `content_final` itself stays untouched.
cols_to_drop = ["bathrooms"]
which_rows_to_drop = ['areas', 'bedrooms', 'parking-spaces']
id_tag = ["ID"]
content_final_process = content_final.copy()

In [None]:
# Coerce prices to float; values that cannot be converted become -1.
content_final_process['price'] = content_final_process['price'].map(convert_to_float)

In [None]:
# Geocode every listing address into latitude / longitude (best effort).
lats = []
longs = []
cbg = []
nomi = pgeocode.Nominatim('br')
# One geocoder client for the whole loop instead of one per address.
geolocator = Nominatim(user_agent="geolocalização")
for local in tqdm(content_final_process.local):
    coords = (np.nan, np.nan)
    # Missing or non-text addresses are not geocoded.
    if isinstance(local, str):
        address = local + f', {uf}, Brasil'
        try:
            location = geolocator.geocode(address)
        except Exception:
            # A geocoder or network failure leaves this row without
            # coordinates; Ctrl-C still interrupts the loop.
            location = None
        if location is not None:
            coords = (float(location.latitude), float(location.longitude))
    # Append both values together so the two lists can never get out of step.
    lats.append(coords[0])
    longs.append(coords[1])

In [None]:
# Attach the geocoded coordinates as two new columns.
content_final_process = content_final_process.assign(lats=lats, longs=longs)

In [None]:
# Build one point per listing from its (longitude, latitude) pair.
pontos = [
    Point(xy)
    for xy in zip(
        content_final_process['longs'],
        content_final_process['lats'],
    )
]

def find_cbg(shp, points):
    """Find, for each point, the census sector whose geometry contains it.

    Args:
        shp: GeoDataFrame with a ``geometry`` column and a ``cd_setor`` column.
        points: Iterable of point geometries to locate.

    Returns:
        A list aligned with *points* holding the first matching ``cd_setor``,
        or ``np.nan`` when no geometry contains the point.
    """
    cbgs = []
    # Use the arguments: the body used to read the globals `pontos` and
    # `data_shp`, silently ignoring whatever the caller passed in.
    for point in tqdm(points):
        matches = list(shp.loc[shp.geometry.contains(point), "cd_setor"].values)
        cbgs.append(matches[0] if matches else np.nan)
    return cbgs

In [None]:
# Tag every listing with the census sector that contains its point.
cbgs = find_cbg(data_shp, pontos)
content_final_process = content_final_process.assign(cd_setor=cbgs)

In [None]:
# Move the identifier columns to the front, then keep only listings that were
# matched to a census sector.
id_columns = ["ID", "cd_setor"]
remaining_columns = [
    column for column in content_final_process.columns if column not in id_columns
]
content_final_process = content_final_process[id_columns + remaining_columns]
content_processed = (
    content_final_process
    .dropna(subset=["cd_setor"])
    .reset_index(drop=True)
)

In [None]:
# Cast the join key to text on both sides so the merge compares equal types.
content_processed['cd_setor'] = content_processed['cd_setor'].astype(str)
data_census_merged['cd_setor'] = data_census_merged['cd_setor'].astype(str)

In [None]:
# Enrich each listing with its sector's census variables and `nm_sit`, then
# derive the neighbourhood from the last comma-separated part of the address.
data_merged = (
    content_processed
    .merge(data_census_merged, on=["cd_setor"], how="left")
    .merge(data_shp[["cd_setor", "nm_sit"]], on=["cd_setor"], how="left")
)
bairro_values = data_merged.local.apply(lambda x: x.split(", ")[-1])
data_merged.insert(4, "bairro", bairro_values)

In [None]:
# Replace spaces in column names and list the columns with any missing value.
data_merged.columns = data_merged.columns.str.replace(" ", "_")
na_counts = data_merged.isna().sum()
columns_na = na_counts[na_counts != 0].index

In [None]:
# Fill missing values with the mean of the same column inside each neighbourhood.
data_bairros = {}
errors = []
for bairro in tqdm(data_merged.bairro.unique()):
    # Exact match: a substring/regex match would also pull in rows from any
    # neighbourhood whose name merely contains this one, distorting the mean.
    bairro_rows = data_merged[
        data_merged.bairro == bairro
    ].reset_index(drop = True)
    for na_col in columns_na:
        try:
            as_float = bairro_rows[na_col].astype(float)
        except (TypeError, ValueError) as e:
            # Non-numeric column: leave it as is and record why it was skipped.
            errors.append(e)
            continue
        bairro_rows[na_col] = as_float.fillna(as_float.mean())
    data_bairros[bairro] = bairro_rows

# Rebuild the full table, drop rows that still have gaps, and keep one row per listing.
data_merged_not_nan = pd.concat(list(data_bairros.values()), ignore_index = True)
data_merged_not_nan = data_merged_not_nan.dropna().reset_index(drop = True)
data_merged_not_nan = data_merged_not_nan[
    ~data_merged_not_nan['ID'].duplicated()
].reset_index(drop = True)

In [None]:
# Drop every column whose name contains "situacao".
situacao_cols = [
    column for column in data_merged_not_nan.columns if "situacao" in column
]
data_merged_not_nan = data_merged_not_nan.drop(columns=situacao_cols)

## Save data processed

In [None]:
# Persist the imputed, de-duplicated dataset to `file_path_processed`.
data_merged_not_nan.to_csv(file_path_processed)

# Drop columns with problems

In [None]:
# Remove code-like columns ("cod" in the name) and the location helpers.
cods_cols = [column for column in data_merged_not_nan.columns if "cod" in column]
other_cols = ["cep", "lats", "longs", "local", "nm_sit"]
all_columns = cods_cols + other_cols
data_merged_not_nan = data_merged_not_nan.drop(columns=all_columns)

# Categorical columns process

In [None]:
# Encode categorical columns as integer codes (order of first appearance).
cat_cols = [
    "bairro"
]
cat_dict = {}
for col in cat_cols:
    # Build the whole mapping first and apply it in one pass. Previously
    # `cat_dict[col]` was overwritten on every iteration, so only the last
    # category's code was kept and the column was rewritten once per category.
    cat_dict[col] = {
        category: idx
        for idx, category in enumerate(data_merged_not_nan[col].unique())
    }
    data_merged_not_nan[col] = data_merged_not_nan[col].map(cat_dict[col])

In [None]:
# Work on a copy and turn the "X" placeholder into a missing value.
data_final = data_merged_not_nan.copy().replace("X", np.nan)

## Process column with problem

In [None]:
# Encode the "nome" (name) columns as integer codes, in order of first appearance.
for cat in data_final.columns[data_final.columns.str.contains("nome")]:
    cat_values = {
        value: idx for idx, value in enumerate(data_final[cat].unique())
    }
    data_final[cat] = data_final[cat].replace(cat_values)

# Cast every column to float; text with a decimal comma is converted on a
# second attempt.
for pro_col in data_final.columns:
    try:
        data_final[pro_col] = data_final[pro_col].astype(float)
    except (ValueError, TypeError):
        # Only conversion failures trigger the fallback; interrupts propagate.
        data_final[pro_col] = (
            data_final[pro_col].astype(str).str.replace(",", ".").astype(float)
        )

## Drop bathrooms

In [None]:
# Remove the bathrooms feature from the final dataset.
data_final = data_final.drop(columns="bathrooms")

# Save dataset final

In [None]:
# Write the final, fully numeric dataset to `file_path_data_input`.
data_final.to_csv(file_path_data_input)