Library Import

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from google.colab import drive
import datetime
from sklearn.preprocessing import (PowerTransformer, StandardScaler, 
                                   MinMaxScaler, LabelEncoder, OneHotEncoder)

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Data Preprocessing

In [2]:
df = pd.read_csv('/content/drive/My Drive/project/nutritive.csv')

#remove columns
df=df.drop('start_date',axis=1)
df=df.drop('ad_type',axis=1)
df=df.drop('Unnamed: 0',axis=1)
df=df.drop('id',axis=1)
df=df.drop('description',axis=1)
df=df.drop('title',axis=1)
df=df.drop('l1',axis=1)
df=df.drop('l2',axis=1)
df=df.drop('l4',axis=1)
df=df.drop('l5',axis=1)
df=df.drop('l6',axis=1)
df=df.drop('price_period',axis=1)

#clean dataset, rows with years higher than 2020 will be removed
df['año']=df['end_date'].str[:4]
d1=df[df['año']=='2019']
d2=df[df['año']=='2018']
d3=df[df['año']=='2020']
df4=pd.concat([d1,d2])
df=pd.concat([df4,d3])

#split 'created_on' variable
dfechas=df['created_on'].str.split('-', expand=True)
df=pd.concat([df,dfechas], axis=1)

# change format of variables created on and end date
df['created_on']=pd.to_datetime(df['created_on'], format="%Y-%m-%d")
df['end_date']=pd.to_datetime(df['end_date'], format="%Y-%m-%d")

#substract dates to get selling time
df['selling_time']=df['end_date']-df['created_on']
df['price/m2']=np.divide(df['price'],df['surface_total'])
df=df.drop('end_date',axis=1)

#rename columns

df.columns=[          'created_on',             'lat',
                   'lon',              'l3',           'rooms',
              'bedrooms',       'bathrooms',   'surface_total',
       'surface_covered',           'price',        'currency',
         'property_type',  'operation_type',             'año',
                       'Year',                 'Month',                 'Day',
          'selling_time',        'price/m2']

#create numbers from strings
df['Month']=pd.to_numeric(df['Month'])
df['Year']=pd.to_numeric(df['Year'])
df['Day']=pd.to_numeric(df['Day'])

# remove created_on cause it has been splited into month year and day
df=df.drop('created_on',axis=1)

#remove 'año'
df=df.drop('año',axis=1)

#drop rows with na values
df=df.dropna()

# get number of days from date
df['selling_time']=df['selling_time'].apply(lambda x: x.days)

#apply filters 'Currency'=COP, property type='Apto or casa'

df=df[(df['property_type']=='Casa') | (df['property_type']=='Apartamento')]
df=df[(df['currency']=='COP') & (df['operation_type']=='Venta')]
df=df.drop('currency',axis=1)
df=df.drop('operation_type',axis=1)

#flitering the 8 most frecuent cities
df=df[(df['l3']=='Bogotá D.C') |(df['l3']=='Barranquilla' )|(df['l3']=='Cali' )|(df['l3']=='Cucuta' )|(df['l3']=='Medellín' )|(df['l3']=='Chía' )|(df['l3']=='Pereira' )|(df['l3']=='Envigado' )]

df=df[df['selling_time']!=0]

df=df[df['bedrooms']<df['bedrooms'].quantile(0.98)]

df=df[df['bathrooms']<df['bathrooms'].quantile(0.98)]

df=df[df['surface_total']<df['surface_total'].quantile(0.98)]

df=df[df['surface_covered']<df['surface_covered'].quantile(0.98)]

df=df[df['price/m2']<df['price/m2'].quantile(0.98)]


#get dummies
df=pd.get_dummies(df,columns=['l3','property_type'])

# reset index
df=df.reset_index()




  interactivity=interactivity, compiler=compiler, result=result)


Define Non numerical Values (dummies columns)

In [0]:
df_dummies=df[['l3_Bogotá D.C', 'l3_Cali', 'l3_Chía', 'l3_Cucuta', 'l3_Envigado', 'l3_Medellín', 'l3_Pereira', 'property_type_Casa']]

Define numerical Values

In [5]:
# df_num is made of only the numerical values
df_num=df[['lat', 'lon', 'rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered', 'price', 'Year', 'Month', 'Day', 'selling_time','price/m2', ]]
df_num.shape

(27946, 13)

Scalling (only numerical columns)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_num)
print(scaler.mean_)
df_scaled=scaler.transform(df_num)
#dataframe from scaled values
df_scaled=pd.DataFrame(df_scaled, columns=['lat', 'lon', 'rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered', 'price', 'Year', 'Month', 'Day', 'selling_time','price/m2', ])
df_scaled=pd.merge(df_scaled,df_dummies,left_index=True,right_index=True)
df_scaled

[ 6.37559257e+00 -7.48025383e+01  3.03449510e+00  3.03449510e+00
  2.70453732e+00  1.48510485e+02  1.39383740e+02  5.04382645e+08
  2.01876873e+03  6.32577113e+00  1.55393616e+01  1.14148035e+02
  3.63905273e+06]


Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,Year,Month,Day,selling_time,price/m2,l3_Bogotá D.C,l3_Cali,l3_Chía,l3_Cucuta,l3_Envigado,l3_Medellín,l3_Pereira,property_type_Casa
0,-0.075857,-0.671515,-1.075062,-1.075062,-0.682322,-0.382125,-0.513130,-0.254827,-1.812634,0.782036,-0.215945,0.784666,0.287396,0,0,0,0,1,0,0,0
1,-1.036530,-1.458818,-0.035848,-0.035848,-1.650791,-0.207474,-0.655062,-1.025954,-1.812634,0.782036,-0.215945,-0.049962,-1.876998,0,1,0,0,0,0,0,1
2,-0.073755,-0.682787,-0.035848,-0.035848,0.286146,0.083611,0.243843,-0.324718,-1.812634,0.782036,-0.215945,1.502835,-0.820613,0,0,0,0,1,0,0,1
3,-0.071654,-0.684521,-0.035848,-0.035848,0.286146,-0.287523,-0.359370,-0.522742,-1.812634,0.782036,-0.215945,0.318828,-0.646822,0,0,0,0,1,0,0,1
4,-0.077258,-0.690591,-0.035848,-0.035848,-0.682322,-0.593162,-0.856133,-0.615929,-1.812634,0.782036,-0.215945,1.687229,-0.034427,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27941,-0.599796,0.638633,2.042580,2.042580,2.223083,1.226119,2.100791,0.339243,0.545317,-0.972569,-0.215945,1.793984,-0.960087,1,0,0,0,0,0,0,1
27942,-0.605050,0.636899,2.042580,2.042580,2.223083,2.048435,3.437321,2.202994,0.545317,-0.972569,-0.215945,1.793984,-0.161342,1,0,0,0,0,0,0,1
27943,-0.588939,0.662911,2.042580,2.042580,2.223083,1.975663,3.319044,3.018384,0.545317,-0.972569,-0.215945,1.793984,0.390822,1,0,0,0,0,0,0,0
27944,-0.586488,0.630829,3.081794,3.081794,2.223083,1.044191,1.757788,0.688696,0.545317,-0.972569,-0.215945,1.793984,-0.543524,1,0,0,0,0,0,0,1


MODEL 1 (Price Forecast)

In [7]:
#drop price/m2
df_model1=df_scaled.drop('price/m2', axis=1)

#Split
# partir los datos en input y output(val_rpta)
inputs=df_model1.drop(['price'],axis=1).values #todas las variables
#inputs_sin_escalar=df_debajo01_.iloc[:,3:] #eliminando variables con coeficientes inferiories a 0,01
output=df_model1['price'].values

#train test split

from sklearn.model_selection import train_test_split
train_test_split(inputs, output)
X_train, X_test, Y_train, Y_test=train_test_split(inputs, output, train_size=0.8, random_state=60)
print('tamaños de train , test ',X_train.shape, X_test.shape)

tamaños de train , test  (22356, 19) (5590, 19)


Bagging

In [8]:
import sklearn
from sklearn.ensemble import BaggingRegressor
model2 = sklearn.ensemble.BaggingRegressor(n_estimators = 2000, max_samples = 20, max_features =19)
model2.fit(X_train, Y_train)

prediction2 = model2.predict(X_test)
score = model2.score(X_test, Y_test)

score

0.5951068478434389

Bagging with ExtraTree

In [9]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor
extra_tree = ExtraTreeRegressor(splitter = "best" ,random_state=1000)
lin_reg = BaggingRegressor(extra_tree, random_state=1000, n_estimators=20)
lin_reg.fit(X_train, Y_train)

BaggingRegressor(base_estimator=ExtraTreeRegressor(ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   random_state=1000,
                                                   splitter='best'),
                 bootstrap=True, bootstrap_features=False, max_features=1.0,
                 max_samples=1.0, n_estimators=20, n_jobs=Non

In [10]:
Y_test_estimate = lin_reg.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
rmse = mean_squared_error(Y_test, Y_test_estimate)
print("Root Mean Square Error (RMSE)= {:.2f}".format(rmse))
mae = mean_absolute_error(Y_test, Y_test_estimate)
print("Mean Absolute Error (MAE)= {:.2f}".format(mae))
r2 = r2_score(Y_test, Y_test_estimate)
print("R^2 = {:.3f}".format(r2))

Root Mean Square Error (RMSE)= 0.16
Mean Absolute Error (MAE)= 0.22
R^2 = 0.838


(Optional) Encode selling time

In [0]:
#dias en que se vende
q1=df['selling_time'].quantile(0.5)

def vel(x):
  if x<q1:
    x='0'  
  else:
    x='1'
  return(x) 

df['selling_time']=df['selling_time'].apply(lambda x:vel(x))



In [0]:
#norte sur centro
a=np.array(['l3_Barranquilla','l3_Bogotá D.C', 'l3_Cali', 'l3_Chía', 'l3_Cucuta', 'l3_Envigado','l3_Medellín', 'l3_Pereira'])

dl_=df[df[a[0]]==1]

cero=dl_['lat'].min()
norte=dl_['lat'].quantile(0.66)
sur=dl_['lat'].quantile(0.33)
maxim=dl_['lat'].max()

latitud=pd.cut(dl_.lat,bins=[cero,sur,norte,maxim],labels=['sur','centro','norte'])

dl_['lat']=latitud

for l in range(1,a.shape[0]+1):

  dl_1=df[df[a[l]]==1]

  cero=dl_1['lat'].min()
  norte=dl_1['lat'].quantile(0.66)
  sur=dl_1['lat'].quantile(0.33)
  maxim=dl_1['lat'].max()

  latitud=pd.cut(dl_1.lat,bins=[cero,sur,norte,maxim],labels=['sur','centro','norte'])
  
  dl_1['lat']=latitud

  dl_=dl_.append(dl_1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


IndexError: ignored

In [0]:
# oriente centro occidente

dl_=dl_[dl_[a[0]]==1]

cero=dl_['lon'].min()
occidente=dl_['lon'].quantile(0.66)
oriente=dl_['lon'].quantile(0.33)
maxim=dl_['lon'].max()

longitud=pd.cut(dl_.lon,bins=[cero,oriente,occidente,maxim],labels=['oriente','centro','occidente'])

dl_['lon']=latitud

for l in range(1,a.shape[0]):

  dl_1=df[df[a[l]]==1]

  cero=dl_1['lon'].min()
  occidente=dl_1['lon'].quantile(0.66)
  oriente=dl_1['lon'].quantile(0.33)
  maxim=dl_1['lon'].max()

  longitud=pd.cut(dl_.lon,bins=[cero,oriente,occidente,maxim],labels=['oriente','centro','occidente'])
  
  dl_1['lon']=latitud

  dl_=dl_.append(dl_1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Define Numerical Variables (For Non-categorical Y variable)

Define Numerical Variables (For categorical Y variable)

MODEL 2 (Selling Time Forecast)

In [0]:
#Split
# partir los datos en input y output(val_rpta)
inputs=df_scaled.drop(['selling_time'],axis=1).values #todas las variables
#inputs_sin_escalar=df_debajo01_.iloc[:,3:] #eliminando variables con coeficientes inferiories a 0,01
output=df_scaled['selling_time'].values

#train test split

from sklearn.model_selection import train_test_split
train_test_split(inputs, output)
X_train, X_test, Y_train, Y_test=train_test_split(inputs, output, train_size=0.8, random_state=60)
print('tamaños de train , test ',X_train.shape, X_test.shape)

tamaños de train , test  (18441, 20) (4611, 20)


AdaBoost Cassifier

In [0]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [0]:
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200
)
classifier.fit(X_train, Y_train)

ValueError: ignored