In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import reverse_geocoder as rg 
import pprint 

# Utilities

In [12]:
def reverseGeocode(coordinates): 
    result = rg.search(coordinates) 
    
    return result[0]['admin1'], result[0]['name']

# Dataframe must have been given reset_indexed
# Complete missing values where deparment and municipality are unknown
def relocate(df_geo):
    n = df_geo.shape[0]
    for i in tqdm(range(0,n)):
        lat = df_geo.loc[i, 'lat']
        lon = df_geo.loc[i, 'lon']
        l2 = df_geo.loc[i, 'l2']
        l3 = df_geo.loc[i, 'l3']
        if (~np.isnan(lat) and ~np.isnan(lon)) and (pd.isnull(l3) or pd.isnull(l2)):
            df_geo.loc[i, 'l2'], df_geo.loc[i, 'l3'] = reverseGeocode((lat,lon))
            
    return df_geo

#Calculates missing data and shows dataframe
def calculate_missing(df):
    s_types = df.dtypes
    missing = df.isnull().sum()
    missing_per = (df.isnull().sum()/df.shape[0])*100

    explo = pd.DataFrame({'Type' : s_types, 'Missing' : missing,
                       'Missing%' : missing_per}).sort_values(by=['Missing%', 'Type'], ascending=False)

    return explo

#Graph missing data given dataframe
def graph_missing(df):

    df_missing = calculate_missing(df)

    df_missing = df_missing.where(df_missing['Missing%'] > 25).dropna(subset=['Missing%'])

    sns.barplot(x=df_missing.index, y=df_missing['Missing%'], data=df_missing)

## Processing

In [13]:
df = pd.read_csv('properties_co.csv')

In [14]:
df = df.where(df['operation_type'] == 'Venta').dropna(subset=['operation_type'])
df.drop(columns=['operation_type', 'l6', 'l5', 'l4', 'l3', 'l1', 'rooms', 'price_period', 'surface_covered', 'id', 'start_date', 'Unnamed: 0', 'title', 'description',  'start_date', 'end_date', 'created_on', 'ad_type'], inplace=True)

In [15]:
def cop_to_usd(currency, price):
    if currency == 'USD':
        return price*3200
    return price

In [16]:
df['price'] = tqdm(df.apply(lambda x: cop_to_usd(x.currency, x.price), axis=1))

100%|██████████| 613054/613054 [00:00<00:00, 3427820.31it/s]


In [17]:
df.drop(columns=['currency'], inplace = True)

In [19]:
calculate_missing(df)

Unnamed: 0,Type,Missing,Missing%
bedrooms,float64,351027,57.258741
surface_total,float64,231445,37.752792
bathrooms,float64,90642,14.785321
lat,float64,82426,13.445145
lon,float64,82426,13.445145
price,float64,5444,0.888013
l2,object,0,0.0
property_type,object,0,0.0
