In [5]:
import sys
import os
import math
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from IPython.display import HTML, display

def load_data(prices_filepath, regions_filepath):
    '''
    Read the two CSV inputs of the ETL: the property listings and the region lookup.

    Params:
        prices_filepath (str): Path to the listings CSV (co_properties.csv, prices of
                               properties in Colombia). Read with pandas defaults.
        regions_filepath (str): Path to the regions CSV (regions.csv). Read as a
                                ';'-separated, latin-1 encoded file.

    Returns:
        (df_prices, regions) (tuple of pandas DataFrame):
            df_prices: listings table. Columns expected by the rest of this notebook:
                id                              --> Id of each property
                ad_type                         --> Constant column
                start_date, end_date, created_on--> Dates of the sale offer
                lat, lon                        --> Latitude and longitude
                l1                              --> Country (all properties are in Colombia)
                l2, l3, l4, l5, l6              --> Department, City, Zone, Locality, Neighborhood
                rooms, bedrooms, bathrooms,
                surface_total, surface_covered  --> Features of the property
                price                           --> Target variable for the prediction model
                currency                        --> Almost all prices are in COP
                price_period                    --> Described as constant in the dataset notes
                title, description              --> Ad title and description
                property_type                   --> Otro, Apartamento, Casa, Lote, Oficina, PH, etc.
                operation_type                  --> Venta, Arriendo, Arriendo temporal
                NOTE(review): earlier notes also listed `messages` and `categories`
                columns; nothing in this notebook uses them -- confirm against the CSV.
            regions: lookup table with columns:
                l2     --> Department
                Region --> Region where the department is located
                l2shp  --> Department name in the format used by the department shapefile
    '''
    # The regions file is not a default-format CSV, so its options are spelled out.
    regions_read_options = {"sep": ";", "encoding": "latin-1"}

    listings = pd.read_csv(prices_filepath)
    region_lookup = pd.read_csv(regions_filepath, **regions_read_options)

    return listings, region_lookup

def clean_data(df_prices):
    '''
    Clean the raw listings table so it can be used by the model and the maps.

    Steps:
        1. Keep only rows whose currency is COP.
        2. Remove constant columns (a single distinct value, NaN included). Columns that
           later steps need (the model features and operation_type) are never removed
           here, so a constant one cannot break steps 3 and 5.
        3. Keep only rows with operation_type == "Venta", then drop operation_type.
        4. Replace non-positive values of surface_total and price with NaN. Both columns
           are moved to the end of the table (surface_covered is not modified).
        5. Add one indicator column missing_<feature> per model feature:
           1 --> the value is missing, 0 --> the value is present.
        6. Drop the string and date columns that are not used by the model or the maps.

    Params:
        df_prices (pandas DataFrame): Raw listings table. It must contain the columns
            currency, operation_type and every model feature listed below.

    Returns:
        df_prices (pandas DataFrame): New cleaned table; the input frame is not modified.
            The original row index is kept (it is not reset).
    '''

    # Each feature appears once: repeated names would only recompute the same indicator.
    columns_for_model = ['lat', 'lon', 'rooms', 'l2', 'l3', 'l4', 'l5', 'l6', 'bedrooms',
                         'bathrooms', 'surface_total', 'surface_covered', 'price']
    non_used = ['id', 'start_date', 'end_date', 'created_on', 'title', 'description', "price_period"]

    #Step 1: Remove cases with currency different to COP
    # .copy() so the column assignments below act on an independent frame.

    df_prices = df_prices[df_prices['currency'] == "COP"].copy()

    #Step 2: Remove constants columns:

    protected = set(columns_for_model) | {'operation_type'}
    columns_to_remove = [col for col in df_prices.columns
                         if col not in protected and df_prices[col].nunique(dropna=False) == 1]
    df_prices = df_prices.drop(columns=columns_to_remove)

    #Step 3: Choose cases with operation type = Venta

    df_prices = df_prices[df_prices['operation_type'] == "Venta"].drop(columns=['operation_type'])

    #Step 4: Assign missing value to invalid values of variables: ('surface_total','price')
    # pop + re-insert keeps the historical column order (both columns end up last).

    for col in ('surface_total', 'price'):
        values = df_prices.pop(col)
        df_prices[col] = values.where(values > 0)

    #Step 5: Create dummies variables for missing values in features

    for col in columns_for_model:
        df_prices['missing_' + col] = df_prices[col].isna().astype('int64')

    #Step 6: Remove string and date variables no-used in model or maps.
    # errors='ignore': some of these may already be gone if they were constant in step 2.

    df_prices = df_prices.drop(columns=non_used, errors='ignore')

    print("INFO[] The cleaned table has the following fields: ")
    print("\n{}\n".format(df_prices.columns))

    return df_prices

def join_data(df_prices, regions):
    '''
    Attach the region information to every listing by matching on the department column l2.

    The merge uses the pandas default join, so listings whose l2 has no match in
    `regions` do not appear in the result. The added columns (Region and l2shp) are
    what the choropleth map needs.

    Params:
        df_prices (pandas DataFrame): Cleaned listings table; must contain the column l2.
        regions (pandas DataFrame): Content of regions.csv; must contain the column l2.
    Returns:
        df_prices_full (pandas DataFrame): df_prices plus the remaining columns of regions.
    '''
    return df_prices.merge(regions, on="l2")

def save_data(df_prices_full, database_filename):
    '''
    Write the joined table to a SQLite database file as the table 'Cleaned_prices'.

    An existing 'Cleaned_prices' table is replaced, and the DataFrame index is not stored.
    The table is read back later in this notebook for the modelling and mapping stages.

    Params:
        df_prices_full (pandas DataFrame): Cleaned listings with the Region and l2shp columns.
        database_filename (str): Path of the SQLite file to write.
    Returns:
        None
    '''
    connection_url = 'sqlite:///{}'.format(database_filename)
    sqlite_engine = create_engine(connection_url)

    df_prices_full.to_sql(
        name='Cleaned_prices',
        con=sqlite_engine,
        if_exists='replace',
        index=False,
    )
    
def main():
    '''
    Drive the ETL flow: load both CSV files, clean the listings, join the regions and
    save the result to SQLite.

    The arguments are taken from the hard-coded `sys_argv` list below (script name,
    listings CSV, regions CSV, database file) rather than from the real command line.
    '''

    sys_argv = ['process_data.py', 'co_properties.csv', 'regions.csv', 'PropertiesPrices.db']

    # Guard clause: anything other than exactly three arguments prints the usage text.
    if len(sys_argv) != 4:
        usage = ('Please provide the filepaths of the df_prices and regions '
                 'datasets as the first and second argument respectively, as '
                 'well as the filepath of the database to save the joined and cleaned data '
                 'to as the third argument. \n\nExample: python process_data.py '
                 'co_properties.csv regions.csv '
                 'PropertiesPrices.db')
        print(usage)
        return

    _, df_prices_filepath, regions_filepath, database_filepath = sys_argv

    print('Loading data...\n    df_prices: {}\n    regions: {}'
          .format(df_prices_filepath, regions_filepath))
    prices_raw, regions = load_data(df_prices_filepath, regions_filepath)

    print('Cleaning data...')
    prices_clean = clean_data(prices_raw)

    print('Joining data...')
    prices_joined = join_data(prices_clean, regions)

    print('Saving data...\n    DATABASE: {}'.format(database_filepath))
    save_data(prices_joined, database_filepath)

    print('Cleaned and joined data saved to database!')


if __name__ == '__main__':
    main()

Loading data...
    df_prices: co_properties.csv
    regions: regions.csv
Cleaning data...
INFO[] The cleaned table has the following fields: 

Index(['lat', 'lon', 'l2', 'l3', 'l4', 'l5', 'l6', 'rooms', 'bedrooms',
       'bathrooms', 'surface_covered', 'property_type', 'surface_total',
       'price', 'missing_lat', 'missing_lon', 'missing_rooms', 'missing_l2',
       'missing_l3', 'missing_l4', 'missing_l5', 'missing_l6',
       'missing_bedrooms', 'missing_bathrooms', 'missing_surface_total',
       'missing_surface_covered', 'missing_price'],
      dtype='object')

Joining data...
Saving data...
    DATABASE: PropertiesPrices.db
Cleaned and joined data saved to database!


In [7]:
# Read the 'Cleaned_prices' table back from the SQLite database and display it.
# NOTE(review): main() above writes to 'PropertiesPrices.db' in the working directory,
# while this cell reads '../data/PropertiesPrices.db' -- confirm both point to the same file.
database_filepath = "../data/PropertiesPrices.db"
engine = create_engine('sqlite:///'+database_filepath)
df = pd.read_sql_table("Cleaned_prices",con=engine)
display(df)

Unnamed: 0,lat,lon,l2,l3,l4,l5,l6,rooms,bedrooms,bathrooms,...,missing_l4,missing_l5,missing_l6,missing_bedrooms,missing_bathrooms,missing_surface_total,missing_surface_covered,missing_price,Region,l2shp
0,6.287127,-75.336540,Antioquia,,,,,,,,...,1,1,1,1,1,1,1,0,Andina,ANTIOQUIA
1,6.287127,-75.336540,Antioquia,,,,,,,,...,1,1,1,1,1,1,1,0,Andina,ANTIOQUIA
2,,,Antioquia,,,,,,,,...,1,1,1,1,1,1,1,0,Andina,ANTIOQUIA
3,6.291447,-75.338812,Antioquia,,,,,,,,...,1,1,1,1,1,1,1,0,Andina,ANTIOQUIA
4,,,Antioquia,Bello,,,,,,,...,1,1,1,1,1,1,1,0,Andina,ANTIOQUIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595415,2.328000,-72.861000,Guaviare,,,,,,,,...,1,1,1,1,1,1,1,0,Amazonia,GUAVIARE
595416,2.571000,-72.643000,Guaviare,,,,,,,4.0,...,1,1,1,1,0,1,1,0,Amazonia,GUAVIARE
595417,2.328000,-72.861000,Guaviare,,,,,,,,...,1,1,1,1,1,1,1,0,Amazonia,GUAVIARE
595418,2.571764,-72.643701,Guaviare,,,,,,,4.0,...,1,1,1,1,0,1,1,0,Amazonia,GUAVIARE


In [8]:
# Distinct values of l3 (described as City in the load_data docstring).
df['l3'].unique()

array([None, 'Bello', 'Itagui', 'Envigado', 'Rionegro', 'Medellín',
       'Barbosa', 'Sabaneta', 'La Ceja', 'Guarne', 'La Estrella',
       'Girardota', 'Copacabana', 'Marinilla', 'Sopetrán', 'Caldas',
       'San Rafael', 'San Jerónimo', 'Guatapé', 'Venecia',
       'Ciudad Bolívar', 'Fredonia', 'El Carmen de Viboral',
       'Santafé de Antioquia', 'Retiro', 'Puerto Triunfo', 'Abejorral',
       'Turbo', 'Gómez Plata', 'Jardín', 'Hispania', 'Armenia',
       'San Pedro de Uraba', 'San Andrés de Cuerquía', 'Chigorodó',
       'Cali', 'Jamundí', 'Palmira', 'Tuluá', 'Yumbo', 'La Cumbre',
       'Candelaria', 'Cartago', 'Restrepo', 'Caicedonia', 'El Cerrito',
       'Dagua', 'Yotoco', 'Pradera', 'Zarzal', 'Vijes', 'Buenaventura',
       'Calima', 'Andalucía', 'Guacarí', 'Trujillo',
       'Guadalajara de Buga', 'Riofrío', 'Florida', 'Chía', 'Sopó',
       'Bogotá D.C', 'Soacha', 'Ricaurte', 'La Calera', 'Sibaté',
       'Mosquera', 'Zipaquirá', 'Cajicá', 'Tabio', 'Medina', 'La Mesa',
  

In [9]:
# Distinct values of l4 (described as Zone in the load_data docstring).
df['l4'].unique()

array([None, 'El Poblado', 'Candelaria', 'La América', 'Guayabal',
       'Buenos Aires', 'Belén', 'Laureles', 'Santa Elena', 'Altavista',
       'Aranjuez', 'Robledo', 'Castilla', 'San Antonio de Prado',
       'San Cristóbal', 'Doce de Octubre', 'Santa Cruz', 'Villa Hermosa',
       'San Javier', 'Manrique', 'Popular', 'Palmitas',
       'San Fernando Nuevo', 'Lili', 'El Ingenio', 'Santa Isabel',
       'Ciudad Jardín', 'San Fernando Viejo', 'Caney', 'Pance',
       'La Flora', 'El Limonar', 'Santa Mónica', 'Zona Norte',
       'Zona Chapinero', 'Zona Noroccidental', 'Zona Sur',
       'Zona Suroccidental', 'Zona Occidental', 'Zona Centro',
       'San Mateo', 'Olaya', 'El Recreo', 'Las Palmas', 'Nuevo Horizonte',
       'Norte-Centro Histórico', 'Paseo de la Castellana', 'Ríomar',
       'San Felipe', 'Nueva Granada', 'Metropolitana', 'Campo Alegre',
       'Suroccidente', 'Soledad', 'Puerto Colombia', 'Carrizal',
       'San Alonso', 'Antonia Santos', 'Alarcón', 'El Prado',
       

In [10]:
# Distinct values of l2 (described as Department in the load_data docstring).
df['l2'].unique()

array(['Antioquia', 'Valle del Cauca', 'Cundinamarca', 'Huila',
       'Atlántico', 'Bolívar', 'Quindío', 'Caldas', 'Norte de Santander',
       'Risaralda', 'Cauca', 'Santander', 'Magdalena', 'Tolima',
       'Córdoba', 'Nariño', 'Meta', 'Casanare', 'Boyacá', 'Caquetá',
       'La Guajira', 'Sucre', 'Cesar', 'Chocó', 'Arauca', 'Guainía',
       'San Andrés Providencia y Santa Catalina', 'Amazonas', 'Putumayo',
       'Vichada', 'Guaviare'], dtype=object)

In [11]:
# Keep only the numeric columns (features plus the missing_* indicators) and display them.
df_nums = df.select_dtypes(include=np.number)
display(df_nums)

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_covered,surface_total,price,missing_lat,missing_lon,...,missing_l2,missing_l3,missing_l4,missing_l5,missing_l6,missing_bedrooms,missing_bathrooms,missing_surface_total,missing_surface_covered,missing_price
0,6.287127,-75.336540,,,,,,9.000000e+07,0,0,...,0,1,1,1,1,1,1,1,1,0
1,6.287127,-75.336540,,,,,,4.500000e+08,0,0,...,0,1,1,1,1,1,1,1,1,0
2,,,,,,,,2.600000e+09,1,1,...,0,1,1,1,1,1,1,1,1,0
3,6.291447,-75.338812,,,,,,9.500000e+07,0,0,...,0,1,1,1,1,1,1,1,1,0
4,,,,,,,,1.300000e+08,1,1,...,0,0,1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595415,2.328000,-72.861000,,,,,,9.000000e+07,0,0,...,0,1,1,1,1,1,1,1,1,0
595416,2.571000,-72.643000,,,4.0,,,6.000000e+08,0,0,...,0,1,1,1,1,1,0,1,1,0
595417,2.328000,-72.861000,,,,,,9.000000e+07,0,0,...,0,1,1,1,1,1,1,1,1,0
595418,2.571764,-72.643701,,,4.0,,,6.000000e+08,0,0,...,0,1,1,1,1,1,0,1,1,0


In [12]:
# Summary statistics; the mean of each missing_* indicator is the share of missing values.
df_nums.describe()

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_covered,surface_total,price,missing_lat,missing_lon,...,missing_l2,missing_l3,missing_l4,missing_l5,missing_l6,missing_bedrooms,missing_bathrooms,missing_surface_total,missing_surface_covered,missing_price
count,507234.0,507234.0,128727.0,149421.0,475503.0,97420.0,93352.0,595420.0,595420.0,595420.0,...,595420.0,595420.0,595420.0,595420.0,595420.0,595420.0,595420.0,595420.0,595420.0,595420.0
mean,5.856534,-74.998213,3.318403,3.106123,2.739158,3868.892,1428.080084,750929200.0,0.148107,0.148107,...,0.0,0.091974,0.699845,0.812591,0.939255,0.749049,0.201399,0.843217,0.836384,0.0
std,2.302451,1.06168,1.774053,7.691291,1.472847,462357.5,8460.16243,2703534000.0,0.355207,0.355207,...,0.0,0.288989,0.458325,0.39024,0.238863,0.43356,0.401046,0.363597,0.369927,0.0
min,-1.83,-81.730319,1.0,0.0,1.0,1.0,10.0,890000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.6223,-75.611352,3.0,2.0,2.0,72.0,72.0,210000000.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
50%,4.924257,-75.430571,3.0,3.0,2.0,112.0,117.0,350000000.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
75%,6.262852,-74.076517,4.0,3.0,3.0,206.0,273.0,680000000.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
max,13.351917,-67.48257,40.0,2018.0,20.0,132300000.0,200000.0,850000000000.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


**Conclusion**:

Once the cleaning is done, the mean of each binary "missing" column gives the proportion of missing data for the corresponding variable. A large share of the data is notably absent in the main covariates to be used.

The following function allows us to build a report for missing data in a list of variables, but using a segmentation variable such as the property type. This function will be used in the app's visualizations

In [13]:
def create_missing_report(df, list_features, segment_by):
    '''
    Build a report of missing values for a list of features, computed separately for
    each category of the column passed in `segment_by`.

    The function does not inspect the features themselves: for every name in
    `list_features` it sums the indicator column 'missing_<feature>' of `df`
    (1 = missing, 0 = present), so those columns must already exist.

    Params:
        df (pandas DataFrame): Table holding `segment_by` and the 'missing_<feature>' columns.
        list_features (list of str): Feature names to report on.
        segment_by (str): Name of the column used for segmentation. Rows where this
                          column is missing are not counted in any category.
    Returns:
        df_report (pandas DataFrame): One row per category, ordered from the most to the
            least frequent category. Columns, in this order:
                count               --> number of rows in the category
                missing_<feature>   --> number of missing values of the feature
                perc_<feature>      --> missing_<feature> / count
            All columns are numeric.
    '''

    # Number of rows per category; value_counts also fixes the row order of the report.
    counts = df[segment_by].value_counts()
    categories = list(counts.index)
    count_values = counts.to_numpy()

    perc_names = ['perc_' + x for x in list_features]
    missing_names = ['missing_' + x for x in list_features]

    # One grouped sum replaces filtering the whole table once per category.
    # reindex aligns the sums with the value_counts order.
    missing_counts = df.groupby(segment_by)[missing_names].sum().reindex(categories)

    report_data = {'count': count_values}
    for missing_name in missing_names:
        report_data[missing_name] = missing_counts[missing_name].to_numpy()
    for perc_name, missing_name in zip(perc_names, missing_names):
        report_data[perc_name] = missing_counts[missing_name].to_numpy() / count_values

    df_report = pd.DataFrame(report_data, index=categories)
    return df_report

In [14]:
# Missing-value report per property type for the main model features.
segment_by = 'property_type'
list_features = ["rooms", "bedrooms","bathrooms","surface_total","surface_covered","lat","lon","price"]
df_report = create_missing_report(df,list_features,segment_by)
display(df_report)

Unnamed: 0,count,missing_rooms,missing_bedrooms,missing_bathrooms,missing_surface_total,missing_surface_covered,missing_lat,missing_lon,missing_price,perc_rooms,perc_bedrooms,perc_bathrooms,perc_surface_total,perc_surface_covered,perc_lat,perc_lon,perc_price
Apartamento,249836,170610,165350,24973,205348,194028,45037,45037,0,0.682888,0.661834,0.099958,0.821931,0.776621,0.180266,0.180266,0.0
Casa,183768,145187,138680,13334,153294,156512,24283,24283,0,0.790056,0.754647,0.072559,0.834171,0.851683,0.132139,0.132139,0.0
Otro,114087,109322,106883,46188,110158,110315,10466,10466,0,0.958234,0.936855,0.404849,0.965561,0.966938,0.091737,0.091737,0.0
Lote,27708,27261,23220,26909,21543,25264,5152,5152,0,0.983867,0.838025,0.971164,0.777501,0.911794,0.185939,0.185939,0.0
Local comercial,7045,6423,4784,3773,4873,4261,1215,1215,0,0.91171,0.679063,0.535557,0.691696,0.604826,0.172463,0.172463,0.0
Finca,6249,1717,3095,2241,2396,3313,843,843,0,0.274764,0.495279,0.358617,0.383421,0.530165,0.134902,0.134902,0.0
Oficina,6070,5530,3391,2364,4380,3676,1141,1141,0,0.911038,0.558649,0.389456,0.721582,0.605601,0.187974,0.187974,0.0
Depósito,574,574,561,88,10,574,30,30,0,1.0,0.977352,0.15331,0.017422,1.0,0.052265,0.052265,0.0
Parqueadero,59,47,35,46,42,33,13,13,0,0.79661,0.59322,0.779661,0.711864,0.559322,0.220339,0.220339,0.0
PH,24,22,0,1,24,24,6,6,0,0.916667,0.0,0.041667,1.0,1.0,0.25,0.25,0.0


In [15]:
# Heatmap of the share of missing values (the perc_* columns of the report):
# one row per property type, one column per feature.
import plotly
import plotly.graph_objs as go
columns_for_map = ['perc_'+x for x in list_features]
fig = go.Figure(data=go.Heatmap(z = df_report[columns_for_map],
                                x = list_features,
                                y = list(df_report.index),hoverongaps = False))
fig.show()

### Choropleth for property prices

It is possible to create Choropleths that draw the median price for each department in the country, for a given property type. This is very useful in exploratory data analysis, and is a very aesthetic and informative way of presenting georeferenced data.

In [16]:
def construct_geodf(df, property_type, path_shape):
    
    '''
    
    Build a geopandas dataframe with the median price per department for one property
    type, ready to be mapped. Rows flagged as missing lon or lat are excluded first.
    
    Parameters:
    -----------
        df(pandas DataFrame): Property prices joined with the regions table. Columns used:
                              missing_lon, missing_lat, property_type, price and l2shp.
        property_type(string): Type of property for which the map will be displayed.
        path_shape(String): Path to the shapefile with the Department layer. The file is
                            expected to have a NOMBRE_DPT column. This source is public
                            and available at:
                            https://sites.google.com/site/seriescol/shapes
         
    Returns:
    -----------
        deptos(geodataframe): One row per department present both in the shapefile and in
                              the filtered data, with columns NOMBRE_DPT, price (median)
                              and geometry. Empty when no row matches the filter.
    
    '''
    
    import geopandas as gpd
    import warnings
    
    #Step 1: Filter the data: valid coordinates and the requested property type
    
    keep = (df['missing_lon'] == 0) & (df['missing_lat'] == 0) & (df['property_type'] == property_type)
    df = df[keep]
    
    #Step 2: load the shape file using geopandas

    deptos = gpd.GeoDataFrame.from_file(path_shape)
    
    #Step 3: transform geographic coordinates in shape file to convenient system (EPSG:4326)
    #        This is important for correct visualization the maps in folium
    #        Warnings are silenced only around the re-projection, not for the whole session.

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        deptos = deptos.to_crs("EPSG:4326")
    
    #Step 4: Compute medians for each department

    medians = df.groupby('l2shp', as_index=False)['price'].median()
    medians = medians.rename(columns={'l2shp': 'NOMBRE_DPT'})[['NOMBRE_DPT', 'price']]

    # Step 5: Reuse the Cundinamarca median for the Bogotá polygon
    #         (Bogotá is inconveniently separated in the shapefile).
    #         Skipped when Cundinamarca has no data for this property type.

    cund_prices = medians.loc[medians['NOMBRE_DPT'] == "CUNDINAMARCA", 'price']
    if not cund_prices.empty:
        bogota = pd.DataFrame({'NOMBRE_DPT': ['SANTAFE DE BOGOTA D.C'],
                               'price': [cund_prices.iloc[0]]})
        # pd.concat instead of DataFrame.append, which no longer exists in pandas 2.x
        medians = pd.concat([medians, bogota], ignore_index=True)
    
    # Step 6: Merge data of price to geodataframe:

    deptos = pd.merge(deptos, medians, on="NOMBRE_DPT")
    
    #Step 7: Retain only informative columns for this map:
    
    deptos = deptos[['NOMBRE_DPT','price','geometry']]
    
    return deptos

In [19]:
# Median price per department for houses ('Casa'); shown here and also written to geodf.html.
geodf = construct_geodf(df,'Casa',"../app/source/depto.shp")
display(geodf)
geodf.to_html("geodf.html")

Unnamed: 0,NOMBRE_DPT,price,geometry
0,ANTIOQUIA,485000000.0,"POLYGON ((-76.30393 8.61657, -76.29474 8.61367..."
1,ATLANTICO,375000000.0,"POLYGON ((-74.86719 10.35843, -74.87300 10.340..."
2,SANTAFE DE BOGOTA D.C,541554000.0,"POLYGON ((-74.01949 4.79225, -74.02299 4.78075..."
3,BOLIVAR,480000000.0,"POLYGON ((-75.15610 10.42083, -75.15090 10.406..."
4,BOYACA,390000000.0,"POLYGON ((-72.20950 7.02466, -72.20261 7.02176..."
5,CALDAS,320000000.0,"POLYGON ((-74.69159 5.74992, -74.68129 5.74822..."
6,CAQUETA,175000000.0,"POLYGON ((-74.68927 2.49504, -74.68578 2.47654..."
7,CAUCA,300000000.0,"MULTIPOLYGON (((-78.20846 2.97229, -78.24574 2..."
8,CESAR,280000000.0,"POLYGON ((-73.27890 10.85351, -73.27720 10.835..."
9,CORDOBA,210000000.0,"POLYGON ((-75.81610 9.42021, -75.80569 9.41671..."


In [20]:
def create_choropleth(geodf, col_to_plot, col_id, factor_scale):
    
    '''
    
    Build a folium Choropleth map from a geodataframe and one of its numeric columns.
    Any extra column of the geodataframe is exported in the returned GeoJSON, which is
    useful for other purposes such as inserting markers or enabling filters.
    
    Adjust factor_scale to get readable numbers in the legend bar.
    
    Parameters:
    -----------
        geodf (geopandas DataFrame): Geodataframe with the information to link to the map.
                                     It is not modified.
        col_to_plot (str): Column whose values colour the map.
        col_id(str): Column used as the id of each shape.
        factor_scale (float): Values of col_to_plot are divided by this number before plotting.
    Returns:
    -----------
        map_ (folium map object): folium map with the Choropleth layer and a layer control.
        geodf_geo_json: GeoJSON string of geodf indexed by col_id.
    
    '''
    
    import folium
    
    #Data attribute for the choropleth: id column plus the re-scaled value.
    #.copy() so the re-scaling is written to an independent frame, not to a slice of geodf.
    
    geodf_dict = geodf[[col_id,col_to_plot]].copy()
    geodf_dict[col_to_plot] = geodf_dict[col_to_plot] / factor_scale
        
    #Create a geo_json. This contain the geographic coordinates for each point in the map
    
    #It is very important to fit an index to the geodataframe before exporting it to geo_json, because folium uses
    #it for indexing
    
    geodf = geodf.set_index(col_id)
    geodf_geo_json = geodf.to_json()

    #Bin edges at fixed quantiles, rounded to one decimal for a readable legend.
    #Rounding can move the outer edges inside the data range, and folium rejects values
    #that fall outside the bins, so the first and last edges are widened when needed.
    
    scaled_values = geodf_dict[col_to_plot]
    bins = scaled_values.quantile([0, 0.15, 0.45, 0.60, 0.80, 0.90, 1]).round(1).tolist()
    bins[0] = min(bins[0], scaled_values.min())
    bins[-1] = max(bins[-1], scaled_values.max())

    map_ = folium.Map(location=[5.170035, -74.914305], tiles='cartodbpositron', zoom_start=6)

    folium.Choropleth(
        geo_data=geodf_geo_json,
        name="choropleth",
        data=geodf_dict,
        columns=[col_id, col_to_plot],
        key_on="feature.id",
        fill_color="BuPu",
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name="re-scaled Median of price(Millions COP)",
        bins = bins,
        reset = True
    ).add_to(map_)

    folium.LayerControl().add_to(map_)
    
    return map_, geodf_geo_json

In [21]:
# Dividing by 1000000 expresses the prices in millions, as the map legend states.
map_, geodf_geo_json = create_choropleth(geodf,'price','NOMBRE_DPT',1000000)

In [22]:
def load_source_map(map_):
    
    '''
    Render a map to HTML and escape it so it can be placed inside a double-quoted
    `srcdoc` attribute of an iframe.

    Params:
        map_: object exposing get_root().render(), such as a folium map.
    Returns:
        source (str): the rendered HTML with '&' replaced by '&amp;' and '"' by '&quot;'.
    
    '''
    rendered = map_.get_root().render()

    # '&' must be escaped first: otherwise the '&' of the '&quot;' entities produced
    # below would be escaped again, and entities already present in the page would be
    # decoded one level too early when the browser reads the attribute.
    source = rendered.replace('&', '&amp;').replace('"', '&quot;')
    
    return source

In [23]:
# Inline style of the iframe, including the '>' that closes the opening tag.
# The former 'width: 500px' declaration was dropped: 'width: 50%' came after it in the
# same style attribute and therefore always overrode it.
style = ('style="float:left;'
         'height: 500px;'
         'display:inline-block;'
         'width: 50%;'
         'margin: 0 auto;'
         'border: 2px solid black">')

# A space separates the srcdoc and style attributes so the tag is well-formed HTML.
string_map = '<iframe srcdoc="' + load_source_map(map_) + '" ' + style + '</iframe>'

display(HTML(string_map))