In [1]:
import numpy as np
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Import estimators
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Import metrics
from sklearn.metrics import mean_absolute_error

pd.set_option('display.max_columns', None)

In [2]:
path_processed = '/home/matteo@COPPET/Documents/data_science/projects/housing_prices_firenze/data/processed/'

In [3]:
df = pd.read_excel(path_processed+'data.xlsx')
df.head()

Unnamed: 0,Data annuncio,Zona,Tipologia,Prezzo_EUR,Prezzo_per_m2,Superficie_m2,Superficie_Bins,Numero totale di locali,Numero di bagni,Stato,Tipologia di riscaldamento
0,2020-05-10,Bellosguardo Galluzzo,Appartamento,260000,4482.758621,58.0,"(0, 60]",2,1,Ottimo / Ristrutturato,Autonomo
1,2020-05-10,Coverciano Bellariva,Appartamento,275000,4230.769231,65.0,"(60, 80]",3,1,Ottimo / Ristrutturato,Centralizzato
2,2020-05-10,Firenze Nord,Appartamento,195000,3611.111111,54.0,"(0, 60]",2,1,Ottimo / Ristrutturato,Centralizzato
3,2020-05-10,Firenze Nord,Appartamento,195000,3482.142857,56.0,"(0, 60]",2,1,Ottimo / Ristrutturato,Centralizzato
4,2020-05-10,Campo Di Marte Liberta,Appartamento,328000,3858.823529,85.0,"(80, 100]",4,1,Buono / Abitabile,Centralizzato


### 1. Pre-processing

In [4]:
features = ['Zona', 'Tipologia', 'Superficie_m2', 'Numero totale di locali',
            'Numero di bagni', 'Stato', 'Tipologia di riscaldamento']
target = 'Prezzo_EUR'
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#### One-hot encoding

In [5]:
cat_cols = ['Zona', 'Tipologia', 'Stato', 'Tipologia di riscaldamento']

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[cat_cols]).toarray(),
                             columns=OH_encoder.get_feature_names())
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[cat_cols]).toarray(),
                            columns=OH_encoder.get_feature_names())

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(cat_cols, axis=1)
num_X_test = X_test.drop(cat_cols, axis=1)

# Add one-hot encoded columns to numerical features
X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

#### Scaling

In [6]:
scaler = StandardScaler()

X_train['Superficie_m2'] = scaler.fit_transform(X_train[['Superficie_m2']])
X_test['Superficie_m2'] = scaler.transform(X_test[['Superficie_m2']])

### 2. Modeling

####  First model

In [7]:
reg = LinearRegression()
reg.fit(X_train, y_train)
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)

# Train set score
#print('Training set score: {}'.format(reg.score(X_train, y_train)))
print('Training set score: {}'.format(mean_absolute_error(y_train, y_train_pred)))
print('Test set score: {}'.format(mean_absolute_error(y_test, y_test_pred)))

Training set score: 149548.55356190496
Test set score: 142669.0535981996


#### Other models

In [8]:
rdmf = RandomForestRegressor(random_state=0)
rdmf.fit(X_train, y_train)

y_train_pred = rdmf.predict(X_train)
y_test_pred = rdmf.predict(X_test)

print('Training set score: {}'.format(mean_absolute_error(y_train, y_train_pred)))
print('Test set score: {}'.format(mean_absolute_error(y_test, y_test_pred)))

Training set score: 45952.00349853058
Test set score: 98000.99595632931


In [9]:
KNN = KNeighborsRegressor()
KNN.fit(X_train, y_train)

y_train_pred = KNN.predict(X_train)
y_test_pred = KNN.predict(X_test)

print('Training set score: {}'.format(mean_absolute_error(y_train, y_train_pred)))
print('Test set score: {}'.format(mean_absolute_error(y_test, y_test_pred)))

Training set score: 101105.79897113462
Test set score: 106317.64868571427


In [10]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

print('Training set score: {}'.format(mean_absolute_error(y_train, y_train_pred)))
print('Test set score: {}'.format(mean_absolute_error(y_test, y_test_pred)))

Training set score: 91639.12268795549
Test set score: 105199.22660714286


### 3. Hyperparameter tuning

In [11]:
from sklearn.model_selection import GridSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


grid_search = GridSearchCV(rdmf, param_grid, cv=5, verbose=1, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4320 candidates, totalling 21600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 