Library Import

In [116]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from google.colab import drive
import datetime
from sklearn.preprocessing import (PowerTransformer, StandardScaler, 
                                   MinMaxScaler, LabelEncoder, OneHotEncoder)

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Data Preprocessing

In [438]:
df = pd.read_csv('/content/drive/My Drive/project/nutritive.csv')

#remove columns
df=df.drop('start_date',axis=1)
df=df.drop('ad_type',axis=1)
df=df.drop('Unnamed: 0',axis=1)
df=df.drop('id',axis=1)
df=df.drop('description',axis=1)
df=df.drop('title',axis=1)
df=df.drop('l1',axis=1)
df=df.drop('l2',axis=1)
df=df.drop('l4',axis=1)
df=df.drop('l5',axis=1)
df=df.drop('l6',axis=1)
df=df.drop('price_period',axis=1)

#clean dataset, rows with years higher than 2020 will be removed
df['año']=df['end_date'].str[:4]
d1=df[df['año']=='2019']
d2=df[df['año']=='2018']
d3=df[df['año']=='2020']
df4=pd.concat([d1,d2])
df=pd.concat([df4,d3])

#split 'created_on' variable
dfechas=df['created_on'].str.split('-', expand=True)
df=pd.concat([df,dfechas], axis=1)

# change format of variables created on and end date
df['created_on']=pd.to_datetime(df['created_on'], format="%Y-%m-%d")
df['end_date']=pd.to_datetime(df['end_date'], format="%Y-%m-%d")

#substract dates to get selling time
df['selling_time']=df['end_date']-df['created_on']
df['price/m2']=np.divide(df['price'],df['surface_total'])
df=df.drop('end_date',axis=1)

#rename columns

df.columns=[          'created_on',             'lat',
                   'lon',              'l3',           'rooms',
              'bedrooms',       'bathrooms',   'surface_total',
       'surface_covered',           'price',        'currency',
         'property_type',  'operation_type',             'año',
                       'Year',                 'Month',                 'Day',
          'selling_time',        'price/m2']

#create numbers from strings
df['Month']=pd.to_numeric(df['Month'])
df['Year']=pd.to_numeric(df['Year'])
df['Day']=pd.to_numeric(df['Day'])

# remove created_on cause it has been splited into month year and day
df=df.drop('created_on',axis=1)

#remove 'año'
df=df.drop('año',axis=1)

#drop rows with na values
df=df.dropna()

# get number of days from date
df['selling_time']=df['selling_time'].apply(lambda x: x.days)

#apply filters 'Currency'=COP, property type='Apto or casa'

df=df[(df['property_type']=='Casa') | (df['property_type']=='Apartamento')]
df=df[(df['currency']=='COP') & (df['operation_type']=='Venta')]
df=df.drop('currency',axis=1)
df=df.drop('operation_type',axis=1)

#flitering the 8 most frecuent cities
df=df[(df['l3']=='Bogotá D.C') |(df['l3']=='Barranquilla' )|(df['l3']=='Cali' )|(df['l3']=='Cucuta' )|(df['l3']=='Medellín' )|(df['l3']=='Chía' )|(df['l3']=='Pereira' )|(df['l3']=='Envigado' )]

df=df[df['selling_time']!=0]

df=df[df['bedrooms']<df['bedrooms'].quantile(0.95)]

df=df[df['bathrooms']<df['bathrooms'].quantile(0.95)]

df=df[df['surface_total']<df['surface_total'].quantile(0.95)]

df=df[df['surface_covered']<df['surface_covered'].quantile(0.95)]

df=df[df['price/m2']<df['price/m2'].quantile(0.95)]


#get dummies
df=pd.get_dummies(df,columns=['l3','property_type'],drop_first=True)

# reset index
df=df.reset_index()




  interactivity=interactivity, compiler=compiler, result=result)


(Optional) Encode selling time

In [0]:
#q1=df['selling_time'].quantile(0.5)

#def vel(x):
#  if x<q1:
#    x='0'  
#  else:
#    x='1'
#  return(x) 

#df['selling_time']=df['selling_time'].apply(lambda x:vel(x))

Define Numerical Variables

In [439]:
# df_num is made of only the numerical values
df_num=df[['lat', 'lon', 'rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered', 'price', 'Year', 'Month', 'Day', 'selling_time','price/m2', ]]
df_num.shape

(23052, 13)

Define Non numerical Values (dummies columns)

In [0]:
df_dummies=df[['l3_Bogotá D.C', 'l3_Cali', 'l3_Chía', 'l3_Cucuta', 'l3_Envigado', 'l3_Medellín', 'l3_Pereira', 'property_type_Casa']]

Scalling (only numerical columns)

In [443]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_num)
print(scaler.mean_)
df_scaled=scaler.transform(df_num)
#dataframe from scaled values
df_scaled=pd.DataFrame(df_scaled, columns=['lat', 'lon', 'rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered', 'price', 'Year', 'Month', 'Day', 'selling_time','price/m2', ])
df_scaled=pd.merge(df_scaled,df_dummies,left_index=True,right_index=True)
df_scaled

[ 6.51131563e+00 -7.48104394e+01  2.86391636e+00  2.86391636e+00
  2.46369079e+00  1.12801275e+02  1.13865738e+02  3.97338654e+08
  2.01875798e+03  6.43037480e+00  1.55685407e+01  1.12651527e+02
  3.54591576e+06]


Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,Year,Month,Day,selling_time,price/m2,l3_Bogotá D.C,l3_Cali,l3_Chía,l3_Cucuta,l3_Envigado,l3_Medellín,l3_Pereira,property_type_Casa
0,-0.121456,-0.664540,-1.127070,-1.127070,-0.549332,-0.293790,-0.333006,-0.008808,-1.758121,0.749814,-0.217528,0.796269,0.404662,0,0,0,0,1,0,0,0
1,-1.067066,-1.451694,0.177535,0.177535,-1.734028,0.125878,-0.556678,-1.255403,-1.758121,0.749814,-0.217528,-0.035308,-2.143746,0,1,0,0,0,0,0,1
2,-0.119387,-0.675809,0.177535,0.177535,0.635363,0.825326,0.859913,-0.121792,-1.758121,0.749814,-0.217528,1.511812,-0.899933,0,0,0,0,1,0,0,1
3,-0.117319,-0.677543,0.177535,0.177535,0.635363,-0.066470,-0.090694,-0.441915,-1.758121,0.749814,-0.217528,0.332133,-0.695308,0,0,0,0,1,0,0,1
4,-0.122835,-0.683612,0.177535,0.177535,-0.549332,-0.800890,-0.873547,-0.592561,-1.758121,0.749814,-0.217528,1.695533,0.025741,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23047,-0.649246,0.581211,2.786746,2.786746,1.820059,0.842812,1.493651,0.951560,0.561355,-1.000979,-0.217528,1.801898,0.349644,1,0,0,0,0,0,0,1
23048,-0.626494,0.641028,2.786746,2.786746,1.820059,0.475602,1.475012,1.215191,0.561355,-1.000979,-0.217528,1.801898,1.136380,1,0,0,0,0,0,0,1
23049,-0.658210,0.608952,2.786746,2.786746,1.820059,0.265768,0.263454,-0.366592,0.561355,-1.000979,-0.217528,1.801898,-0.855458,1,0,0,0,0,0,0,1
23050,-0.622357,0.677438,2.786746,2.786746,1.820059,0.563033,2.444259,1.817774,0.561355,-1.000979,-0.217528,1.801898,1.795396,1,0,0,0,0,0,0,1


MODEL 1 (Price Forecast)

In [445]:
#drop price/m2
df_model1=df_scaled.drop('price/m2', axis=1)

#Split
# partir los datos en input y output(val_rpta)
inputs=df_model1.drop(['price'],axis=1).values #todas las variables
#inputs_sin_escalar=df_debajo01_.iloc[:,3:] #eliminando variables con coeficientes inferiories a 0,01
output=df_model1['price'].values

#train test split

from sklearn.model_selection import train_test_split
train_test_split(inputs, output)
X_train, X_test, Y_train, Y_test=train_test_split(inputs, output, train_size=0.8, random_state=60)
print('tamaños de train , test ',X_train.shape, X_test.shape)

tamaños de train , test  (18441, 19) (4611, 19)


MODEL 2 (Selling Time Forecast)

In [451]:
#Split
# partir los datos en input y output(val_rpta)
inputs=df_scaled.drop(['selling_time'],axis=1).values #todas las variables
#inputs_sin_escalar=df_debajo01_.iloc[:,3:] #eliminando variables con coeficientes inferiories a 0,01
output=df_scaled['selling_time'].values

#train test split

from sklearn.model_selection import train_test_split
train_test_split(inputs, output)
X_train, X_test, Y_train, Y_test=train_test_split(inputs, output, train_size=0.8, random_state=60)
print('tamaños de train , test ',X_train.shape, X_test.shape)

tamaños de train , test  (18441, 20) (4611, 20)


Bagging

In [452]:
import sklearn
from sklearn.ensemble import BaggingRegressor
model2 = sklearn.ensemble.BaggingRegressor(n_estimators = 2000, max_samples = 20, max_features =19)
model2.fit(X_train, Y_train)

prediction2 = model2.predict(X_test)
score = model2.score(X_test, Y_test)

score

0.21334492367298186

Bagging with ExtraTree

In [453]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor
extra_tree = ExtraTreeRegressor(splitter = "best" ,random_state=1000)
lin_reg = BaggingRegressor(extra_tree, random_state=1000, n_estimators=20)
lin_reg.fit(X_train, Y_train)

BaggingRegressor(base_estimator=ExtraTreeRegressor(ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   random_state=1000,
                                                   splitter='best'),
                 bootstrap=True, bootstrap_features=False, max_features=1.0,
                 max_samples=1.0, n_estimators=20, n_jobs=Non

In [454]:
Y_test_estimate = lin_reg.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
rmse = mean_squared_error(Y_test, Y_test_estimate)
print("Root Mean Square Error (RMSE)= {:.2f}".format(rmse))
mae = mean_absolute_error(Y_test, Y_test_estimate)
print("Mean Absolute Error (MAE)= {:.2f}".format(mae))
r2 = r2_score(Y_test, Y_test_estimate)
print("R^2 = {:.3f}".format(r2))

Root Mean Square Error (RMSE)= 0.46
Mean Absolute Error (MAE)= 0.46
R^2 = 0.524
