In [1]:
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import estimators and transformers
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

# Import predictors
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

# Import project-local helpers
from src.features import CustomEncoder

# Display settings: never truncate the column list when a DataFrame is shown,
# and render every float with exactly three decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.3f' % value

# Load IPython's autoreload extension and set it to mode 2, which reloads
# imported modules before each cell runs so that edits to local modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Directory that holds the processed datasets.
#
# The location can be overridden without editing the notebook by setting the
# HOUSING_PRICES_DATA_DIR environment variable; when it is unset (or empty) the
# original machine-specific default below is used.
#
# The value is always a string ending with a path separator, because the file
# name is appended to it directly when the data is loaded.
path_processed = os.path.join(
    os.environ.get('HOUSING_PRICES_DATA_DIR')
    or '/home/matteo@COPPET/Documents/data_science/projects/housing_prices_firenze/data/processed/',
    '',  # joining with an empty component adds a trailing separator only if one is missing
)

In [2]:
# Read the cleaned dataset from the processed-data directory into a DataFrame.
df = pd.read_csv(f'{path_processed}data_clean.csv')

### 1. Feature selection and splitting data

In [4]:
# Name of the column the models will predict.
target = 'Prezzo_EUR'

# Columns excluded from the feature matrix; the target itself is listed last so
# that it is removed from X as well.
drop_cols = [
    'Prezzo_per_m2',
    'Anno_costruzione',
    'Tipo_riscaldamento',
    'Alimentazione_riscaldamento',
    'Classe_energetica',
    target,
]

# Feature matrix (DataFrame) and target vector (NumPy array).
X = df.drop(columns=drop_cols)
y = df[target].values

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

### 2. Pre-processing pipeline

In [5]:
# Columns routed to the categorical and the numerical branch respectively.
cat_features = [
    'Tipologia',
    'Zona',
    'Stato',
    'Tipo_proprietà',
    'Riscaldamento_A_C',
    'Piano',
]
num_features = [
    'Superficie_m2',
    'Num_bagni',
    'Num_tot_locali',
]

# Alternative feature selection (currently disabled):
# cat_features = ['Tipologia', 'Zona', 'Tipo_proprietà', 'Tipo_riscaldamento', 'Classe_energetica']
# num_features = ['Superficie_m2', 'Num_tot_locali']

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
cat_transformer = Pipeline(steps=[
    ('imputing', SimpleImputer(strategy='most_frequent')),
    ('oh_encoding', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: standardise first, then fill any remaining missing values
# with the column mean of the scaled data.
# NOTE(review): scaling precedes imputing here; the more common order is the
# reverse — confirm this ordering is intentional.
num_transformer = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('imputing', SimpleImputer(strategy='mean')),
])

# Apply each branch to its own columns; every column not listed above is passed
# through to the model unchanged.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('categoricals', cat_transformer, cat_features),
        ('numericals', num_transformer, num_features),
    ],
    remainder='passthrough',
)

### 3. Modeling

#### 3.1 First model (benchmarking)

In [7]:
# Benchmark model: ordinary least squares on top of the preprocessing step.
pipe = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', LinearRegression()),
])

# Fit on the training split and predict that same split (in-sample).
y_train_pred = pipe.fit(X_train, y_train).predict(X_train)

# Train set score: mean absolute error on the data the model was fitted on.
print('Training set score: {}'.format(
    mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
))

# Cross validation score: 10-fold CV on the training split. The scorer returns
# negated MAE values, so the sign is flipped before averaging.
# NOTE(review): the saved output of this cell shows a CV score of about 1.3e13
# against about 1.35e5 on the training set; a gap of that size suggests the
# unregularised fit is unstable in some folds — TODO investigate.
scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=10,
)
print('Mean cross validation score: {}'.format((-scores).mean()))

Training set score: 134833.75986228662
Mean cross validation score: 13255542670473.53


#### 3.2 Other models

In [8]:
# Ridge regression (default settings) on top of the preprocessing step.
pipe = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', Ridge()),
])

# Fit on the training split and predict that same split (in-sample).
y_train_pred = pipe.fit(X_train, y_train).predict(X_train)

# Train set score: mean absolute error on the data the model was fitted on.
print('Training set score: {}'.format(
    mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
))

# Cross validation score: 10-fold CV on the training split. The scorer returns
# negated MAE values, so the sign is flipped before averaging.
scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=10,
)
print('Mean cross validation score: {}'.format((-scores).mean()))

Training set score: 134760.16481154386
Mean cross validation score: 142635.27057315363


In [9]:
# Random forest regressor (seeded for reproducibility) on top of the
# preprocessing step.
pipe = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', RandomForestRegressor(random_state=0)),
])

# Fit on the training split and predict that same split (in-sample).
y_train_pred = pipe.fit(X_train, y_train).predict(X_train)

# Train set score: mean absolute error on the data the model was fitted on.
print('Training set score: {}'.format(
    mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
))

# Cross validation score: 10-fold CV on the training split. The scorer returns
# negated MAE values, so the sign is flipped before averaging.
scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=10,
)
print('Mean cross validation score: {}'.format((-scores).mean()))

Training set score: 36356.276877881835
Mean cross validation score: 98677.69355130232
