In [2]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Import estimators and transformers
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from src.features import CustomEncoder
from sklearn.decomposition import PCA

# Import predictors
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

# Notebook-wide pandas display settings: never truncate the column list and
# render floats with exactly three decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.3f' % value

%load_ext autoreload
%autoreload 2

import os

# Folder that holds the processed data set. Set the HOUSING_DATA_PROCESSED
# environment variable to run the notebook on another machine; the original
# location remains the fallback. os.path.join(..., '') guarantees a trailing
# separator because the value is concatenated directly with a file name below.
path_processed = os.path.join(
    os.environ.get(
        'HOUSING_DATA_PROCESSED',
        '/home/matteo@COPPET/Documents/data_science/projects/housing_prices_firenze/data/processed/'
    ),
    ''
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the cleaned listings table from the processed-data folder.
df = pd.read_csv(f'{path_processed}data_clean.csv')

### 1. Feature selection and splitting data

In [4]:
# Column to predict.
target = 'Prezzo_EUR'

# Features are every column except the target and 'Prezzo_per_m2'
# (presumably dropped because it is derived from the price — confirm).
X = df.drop(columns=['Prezzo_per_m2', target])
y = df[target].values

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### 2. Pre-processing pipeline

In [5]:
# Columns routed through the categorical branch.
cat_features = [
    'Tipologia', 'Zona', 'Stato', 'Tipo_proprietà', 'Riscaldamento_A_C',
    'Tipo_riscaldamento', 'Alimentazione_riscaldamento', 'Classe_energetica',
    'Piano',
]
# Columns routed through the numerical branch.
num_features = ['Superficie_m2', 'Num_bagni', 'Num_tot_locali', 'Anno_costruzione']

# Reduced feature set, kept for reference:
# cat_features = ['Tipologia', 'Zona', 'Tipo_proprietà', 'Tipo_riscaldamento', 'Classe_energetica']
# num_features = ['Superficie_m2', 'Num_tot_locali']

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy='most_frequent')),
    ('oh_encoding', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: standardise first, then fill remaining gaps with the
# column mean (step order is intentional and kept as is).
num_transformer = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('imputing', SimpleImputer(strategy='mean')),
])

# Apply each branch to its columns; columns listed in neither group are
# forwarded unchanged.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('categoricals', cat_transformer, cat_features),
        ('numericals', num_transformer, num_features),
    ],
    remainder='passthrough',
)

### 3. Modeling

#### 3.1 First model (benchmarking)

In [9]:
# Benchmark: preprocessing -> PCA (20 components) -> ordinary least squares.
# PCA is seeded so that results stay reproducible when scikit-learn selects a
# randomized SVD solver, in line with the seeded train/test split.
pipe = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('pca', PCA(n_components=20, random_state=0)),
    ('model', LinearRegression())
])

pipe.fit(X_train, y_train)
y_train_pred = pipe.predict(X_train)

# Mean absolute error on the data the model was fitted on (lower is better)
print('Training set score: {}'.format(mean_absolute_error(y_train, y_train_pred)))

# 10-fold cross validation; the scorer returns negated MAE, so the sign is
# flipped before averaging
scores = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_absolute_error', cv=10)
print('Mean cross validation score: {}'.format(np.mean(-scores)))

Training set score: 140714.30325511587
Mean cross validation score: 145152.11194017527


#### 3.2 Other models

In [10]:
# Same pipeline as the benchmark with a Ridge regressor (default settings).
# PCA is seeded so that results stay reproducible when scikit-learn selects a
# randomized SVD solver.
pipe = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('pca', PCA(n_components=20, random_state=0)),
    ('model', Ridge())
])

pipe.fit(X_train, y_train)
y_train_pred = pipe.predict(X_train)

# Mean absolute error on the data the model was fitted on (lower is better)
print('Training set score: {}'.format(mean_absolute_error(y_train, y_train_pred)))

# 10-fold cross validation; the scorer returns negated MAE, so the sign is
# flipped before averaging
scores = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_absolute_error', cv=10)
print('Mean cross validation score: {}'.format(np.mean(-scores)))

Training set score: 140608.05917375383
Mean cross validation score: 145229.24037456853


In [11]:
# Same pipeline with a random forest regressor. Both stochastic steps (PCA and
# the forest) are seeded so repeated runs give the same numbers.
pipe = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('pca', PCA(n_components=20, random_state=0)),
    ('model', RandomForestRegressor(random_state=0))
])

pipe.fit(X_train, y_train)
y_train_pred = pipe.predict(X_train)

# Mean absolute error on the data the model was fitted on (lower is better)
print('Training set score: {}'.format(mean_absolute_error(y_train, y_train_pred)))

# 10-fold cross validation; the scorer returns negated MAE, so the sign is
# flipped before averaging
scores = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_absolute_error', cv=10)
print('Mean cross validation score: {}'.format(np.mean(-scores)))

Training set score: 52457.00963679461
Mean cross validation score: 127183.15819502738


### 4. Tuning PCA

In [12]:
# Random-forest pipeline whose number of PCA components is tuned below.
# PCA is seeded so that the comparison between grid points is not blurred by
# run-to-run variation of a randomized SVD solver.
pipe = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('pca', PCA(random_state=0)),
    ('model', RandomForestRegressor(random_state=0))
])

# Candidate component counts; None keeps all components.
param_grid = {'pca__n_components': [5, 10, 20, 30, 40, 50, 60, None]}

# 5-fold grid search scored on negated mean absolute error
grid_search = GridSearchCV(pipe, param_grid, cv=5, verbose=0, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Report the mean CV error (sign flipped back to a positive MAE) per candidate
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(-mean_score, params)

138092.86333382674 {'pca__n_components': 5}
127370.2117892134 {'pca__n_components': 10}
126728.2960333098 {'pca__n_components': 20}
126374.47363327583 {'pca__n_components': 30}
127525.06746861087 {'pca__n_components': 40}
127186.68925650857 {'pca__n_components': 50}
126187.37486419182 {'pca__n_components': 60}
127213.68811021633 {'pca__n_components': None}


In [13]:
# GridSearchCV (refit=True by default) has already refitted the best pipeline
# on the whole of X_train, so it can be used for prediction directly; fitting
# it again would only repeat that work.
pipe = grid_search.best_estimator_

y_train_pred = pipe.predict(X_train)

# Mean absolute error on the data the model was fitted on (lower is better)
print('Training set score: {}'.format(mean_absolute_error(y_train, y_train_pred)))

# 10-fold cross validation (cross_val_score clones and refits the pipeline per
# fold); the scorer returns negated MAE, so the sign is flipped before averaging
scores = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_absolute_error', cv=10)
print('Mean cross validation score: {}'.format(np.mean(-scores)))

Training set score: 47843.23735200023
Mean cross validation score: 125536.28658040872
