In [7]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Import estimators and transformers
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from src.features import CustomEncoder

# Import predictors
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

%load_ext autoreload
%autoreload 2

path_processed = '/home/matteo@COPPET/Documents/data_science/projects/housing_prices_firenze/data/processed/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Read the cleaned dataset from the processed-data directory and print a
# per-column summary (non-null counts and dtypes).
df = pd.read_csv(f'{path_processed}data_clean.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8714 entries, 0 to 8713
Data columns (total 57 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Tipologia                                  8714 non-null   object 
 1   Tipo_proprietà                             8277 non-null   object 
 2   Zona                                       8714 non-null   object 
 3   Anno_costruzione                           6334 non-null   float64
 4   Stato                                      8483 non-null   object 
 5   Prezzo_EUR                                 8714 non-null   float64
 6   Superficie_m2                              8714 non-null   float64
 7   Prezzo_per_m2                              8714 non-null   float64
 8   Riscaldamento_A_C                          8714 non-null   object 
 9   Tipo_riscaldamento                         6760 non-null   object 
 10  Alimentazione_riscaldame

### 1. Feature selection and splitting data

In [9]:
# Column the models are trained to predict.
target = 'Prezzo_EUR'

# Feature matrix: everything except the target and Prezzo_per_m2.
# NOTE(review): Prezzo_per_m2 is presumably derived from the price and would
# leak the target — confirm against the cleaning notebook.
X = df.drop(columns=['Prezzo_per_m2', target])
y = df[target].to_numpy()

# Hold out 20% of the rows; random_state pins the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### 2. Pre-processing pipeline

In [15]:
# Columns routed to the categorical branch (order is kept as-is because it
# determines the order of the encoded output columns).
cat_features = [
    'Tipologia',
    'Zona',
    'Stato',
    'Tipo_proprietà',
    'Riscaldamento_A_C',
    'Tipo_riscaldamento',
    'Alimentazione_riscaldamento',
    'Classe_energetica',
    'Piano',
]

# Columns routed to the numerical branch.
num_features = [
    'Superficie_m2',
    'Num_bagni',
    'Num_tot_locali',
    'Anno_costruzione',
]

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes categories not seen
# during fit encode as all zeros instead of raising.
cat_transformer = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy='most_frequent')),
    ('oh_encoding', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: standardise, then mean-impute. StandardScaler disregards
# NaNs when fitting and leaves them in place on transform, so the imputer
# fills the gaps with the mean of the already-scaled column.
num_transformer = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('imputing', SimpleImputer(strategy='mean')),
])

# Apply each branch to its columns; every column not listed above is passed
# through to the model untouched.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('categoricals', cat_transformer, cat_features),
        ('numericals', num_transformer, num_features),
    ],
    remainder='passthrough',
)

### 3. Modeling

#### 3.1 First model (benchmarking)

In [16]:
# Benchmark: ordinary least squares on top of the shared preprocessing step.
pipe = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', LinearRegression()),
])

# Fit on the whole training split and predict the same rows back.
pipe.fit(X_train, y_train)
y_train_pred = pipe.predict(X_train)

# In-sample mean absolute error.
train_mae = mean_absolute_error(y_train, y_train_pred)
print('Training set score: {}'.format(train_mae))

# 10-fold cross-validation on the training split. The scorer returns the
# negated MAE, so the sign is flipped before averaging.
scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=10,
)
cv_mae = np.mean(-scores)
print('Mean cross validation score: {}'.format(cv_mae))

Training set score: 135810.07034591644
Mean cross validation score: 3639530827.4302


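The performance of this model is better on the training set compared to the previous pipeline (notebook 3.1) but terrible in cross-validation (mean MAE of about 3.6 billion, versus about 136 thousand on the training set). Note that the second figure is a 10-fold cross-validation score computed on the training data, not a test-set score.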

#### 3.2 Other models

In [17]:
# Ridge regression (default settings) on top of the shared preprocessing step.
pipe = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', Ridge()),
])

# Fit on the whole training split and predict the same rows back.
pipe.fit(X_train, y_train)
y_train_pred = pipe.predict(X_train)

# In-sample mean absolute error.
train_mae = mean_absolute_error(y_train, y_train_pred)
print('Training set score: {}'.format(train_mae))

# 10-fold cross-validation on the training split. The scorer returns the
# negated MAE, so the sign is flipped before averaging.
scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=10,
)
cv_mae = np.mean(-scores)
print('Mean cross validation score: {}'.format(cv_mae))

Training set score: 135767.89612546202
Mean cross validation score: 144091.57258373205


This model performs better on the training set but a little worse in cross-validation. Overall, though, there isn't much of a difference.

In [18]:
# Random forest (seeded for reproducibility) on top of the shared
# preprocessing step.
pipe = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', RandomForestRegressor(random_state=0)),
])

# Fit on the whole training split and predict the same rows back.
pipe.fit(X_train, y_train)
y_train_pred = pipe.predict(X_train)

# In-sample mean absolute error.
train_mae = mean_absolute_error(y_train, y_train_pred)
print('Training set score: {}'.format(train_mae))

# 10-fold cross-validation on the training split. The scorer returns the
# negated MAE, so the sign is flipped before averaging.
scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=10,
)
cv_mae = np.mean(-scores)
print('Mean cross validation score: {}'.format(cv_mae))

Training set score: 36525.38813032678
Mean cross validation score: 99136.71497637955


This model performs better both on the training set and in cross-validation compared to the previous preprocessing pipeline.