# Geocoding the data

In [1]:
import pandas as pd
from geopy.geocoders import Nominatim, ArcGIS, Photon
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# Geocoding backend: Nominatim, identified by a custom user agent string.
# (Photon is imported as an alternative backend: `geolocator = Photon()`.)
USER_AGENT = 'em_nome_do_pai'
MIN_DELAY_SECONDS = 1

geolocator = Nominatim(user_agent=USER_AGENT)

# Wrap the geocode call so consecutive requests are spaced at least
# MIN_DELAY_SECONDS apart.
delayed_geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=MIN_DELAY_SECONDS,
)

In [24]:
# Load the cleaned listings table.
# For a quick trial run, chain `.sample(n=50)` onto the read to work on a subset.
stock_path = '../stock_cleaned.parquet'
df = pd.read_parquet(stock_path)
df.shape

(2297, 6)

Aqui pegamos apenas os endereços únicos para evitar requisições redundantes ao geocoder.

In [25]:
# Single query string per listing: "<street>, <neighborhood>".
df['full_address'] = df['street'] + ', ' + df['neighborhood']

In [14]:
# One row per distinct address string, in order of first appearance.
distinct_addresses = df['full_address'].unique()
unique_addresses = pd.DataFrame({'full_address': distinct_addresses})
unique_addresses

Unnamed: 0,full_address
0,"Rua Lamute, São João Clímaco, São Paulo"
1,"Rua Galeno de Castro, Jurubatuba, São Paulo"
2,"Alameda Nothmann, Campos Elíseos, São Paulo"
3,"Rua Luís Correia de Melo, Santo Amaro, São Paulo"
4,"Rua Jorge Rizzo, Pinheiros, São Paulo"
...,...
1612,"Rua Capitães Mores, Mooca, São Paulo"
1613,"Avenida Diederichsen, Vila Guarani (z Sul), Sã..."
1614,"Rua Eleutério, Campo Belo, São Paulo"
1615,"Rua Paulo de Avelar, Vila Dom Pedro Ii, São Paulo"


In [15]:
def get_lat_long(address: str):
    """Geocode ``address`` through the rate-limited geocoder.

    Args:
        address: Free-text address passed to ``delayed_geocode``.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the geocoder result, or
        ``(None, None)`` when the geocoder returns ``None``.
    """
    match = delayed_geocode(address)

    # No result for this address: signal it with a pair of Nones.
    if match is None:
        return None, None

    return match.latitude, match.longitude

In [None]:
import time

# Create the (empty) coordinate columns up front so each chunk can be filled in place.
unique_addresses[['latitude', 'longitude']] = None, None

n_rows = 50  # number of addresses geocoded and saved per chunk
n_total = len(unique_addresses.index)

# Positional indices of the coordinate columns, needed for .iloc assignment.
coord_cols = [unique_addresses.columns.get_loc(c) for c in ['latitude', 'longitude']]

# Stride through the frame in steps of n_rows instead of iterating over
# n_total // n_rows full chunks, so a trailing partial chunk is geocoded too.
for i, start in enumerate(range(0, n_total, n_rows)):
    stop = min(start + n_rows, n_total)
    chunk_len = stop - start  # smaller than n_rows only for the final chunk

    print(f'Getting coordinates for rows {start} to {stop}')

    tic = time.perf_counter()

    # One geocoder call per address; each (lat, long) tuple becomes one row of a
    # two-column frame that is assigned into the coordinate columns of this chunk.
    unique_addresses.iloc[start:stop, coord_cols] = (
        unique_addresses.iloc[start:stop]
        .full_address
        .apply(lambda address: pd.Series(get_lat_long(address)))
    )

    tac = time.perf_counter()

    print(f'Done {chunk_len} rows in {tac-tic:.2f}s ({chunk_len/(tac-tic):.2f} rows/s). NA count:\n{unique_addresses.iloc[start:stop].isna().sum()}')

    # Write this chunk to its own file right after geocoding it.
    file_path = f'../data/chunks/cleaned_chunk_{i}.parquet'

    print(f'Saving to {file_path}')

    unique_addresses.iloc[start:stop].to_parquet(file_path)

    print('Saved\n')


In [26]:
# Attach the geocoded coordinates to every listing via its address string.
# A left join keeps listings whose address has no coordinates (they get NaN).
df = df.merge(
    unique_addresses,
    how='left',
    on='full_address',
)
df

Unnamed: 0,suite_area,street,neighborhood,condominium,tax,asking_price,full_address,latitude,longitude
0,70,Rua Lamute,"São João Clímaco, São Paulo",0,59,1100,"Rua Lamute, São João Clímaco, São Paulo",-23.625642,-46.591113
1,34,Rua Galeno de Castro,"Jurubatuba, São Paulo",0,0,2725,"Rua Galeno de Castro, Jurubatuba, São Paulo",,
2,12,Alameda Nothmann,"Campos Elíseos, São Paulo",150,69,1515,"Alameda Nothmann, Campos Elíseos, São Paulo",,
3,33,Rua Luís Correia de Melo,"Santo Amaro, São Paulo",506,91,2700,"Rua Luís Correia de Melo, Santo Amaro, São Paulo",-23.633037,-46.715464
4,104,Rua Jorge Rizzo,"Pinheiros, São Paulo",1680,0,5600,"Rua Jorge Rizzo, Pinheiros, São Paulo",-23.571447,-46.696173
...,...,...,...,...,...,...,...,...,...
2292,29,Rua Eleutério,"Campo Belo, São Paulo",355,25,2868,"Rua Eleutério, Campo Belo, São Paulo",,
2293,25,Rua General Jardim,"Vila Buarque, São Paulo",550,0,1950,"Rua General Jardim, Vila Buarque, São Paulo",-23.544719,-46.646533
2294,99,Rua Paulo de Avelar,"Vila Dom Pedro Ii, São Paulo",150,0,3100,"Rua Paulo de Avelar, Vila Dom Pedro Ii, São Paulo",,
2295,100,Avenida Leonardo da Vinci,"Vila Guarani (z Sul), São Paulo",0,0,3500,"Avenida Leonardo da Vinci, Vila Guarani (z Sul...",,


In [28]:
# Overwrite asking_price with the price per unit of suite_area.
# NOTE: re-running this cell divides again, so run it once per fresh `df`.
df['asking_price'] = df['asking_price'].div(df['suite_area'])

In [29]:
# Keep only listings for which both coordinates are present.
has_coordinates = df[['latitude', 'longitude']].notna().all(axis=1)
df = df[has_coordinates]
df.shape

(1317, 9)

In [30]:
# Persist the geocoded listings.
geocoded_path = '../data/geocoded_stock.parquet'
df.to_parquet(geocoded_path)