In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler

# Functions

In [2]:
def filter_outliers(data):
    q1_prix, q2_prix, q3_prix, q4_prix = data.prix.quantile([0.25, 0.5, 0.75, 1])
    iqr_prix = q3_prix - q1_prix
    lower_prix = q1_prix- 1.5*iqr_prix
    upper_prix = q3_prix + 1.5*iqr_prix
    data_filtered = data.query('prix>=@lower_prix and prix<=@upper_prix')
    return data_filtered

def randomized_search(estimator_, X_train_, y_train_, param_grid_, cv_, verbose_, n_jobs_, n_iter_, scoring_='neg_mean_squared_error', random_state=1):
    estimator_grid = RandomizedSearchCV(estimator=estimator_, param_distributions=param_grid_,
                       cv=cv_, scoring=scoring_,
                       verbose=verbose_, n_jobs=n_jobs_, n_iter=n_iter_, random_state=seed)
    estimator_grid.fit(X_train_, y_train_)
    return estimator_grid.best_estimator_, estimator_grid.best_params_

def cross_val_rmse_(model_, X_train_, y_train_, cv_, scoring_='neg_mean_squared_error'):
    model_cv_scores = cross_val_score(model_, X_train_, y_train_, cv=cv_, scoring=scoring_)
    model_rmse = np.sqrt(-model_cv_scores.mean())
    return model_rmse

def cross_val_rmse_score_printer(models, n_folds, X_train, y_train):
    models_rmses = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_rmse = cross_val_rmse_(model, X_train, y_train, cv_=n_folds)
        models_rmses[name] = model_rmse
    for key, value in models_rmses.items():
        print(f"\033[1m{key}\033[0m cross-validation RMSE: \033[1m{value:.2f}\033[0m")
    return models_rmses

# Load the data

In [3]:
data = pd.read_csv("real_estate_PML.csv")
data.head()
keys = ['Très Petit', 'Petit', 'Moyen', 'Grand', 'Très Grand' ]
values = [1,2,3,4,5]
mapped_dict = dict(zip(keys, values))

# Outliers removal

In [4]:
paris, lyon, mrs = data.loc[data['ville']=='Paris',].copy(), data.loc[data['ville']=='Lyon',].copy(), data.loc[data['ville']=='Marseille',].copy()
paris.shape, lyon.shape, mrs.shape

((2921, 8), (2176, 8), (876, 8))

In [5]:
paris_filtered = filter_outliers(paris)
lyon_filtered = filter_outliers(lyon)
mrs_filtered = filter_outliers(mrs)


In [6]:
paris_filtered.shape, lyon_filtered.shape, mrs_filtered.shape

((2691, 8), (2053, 8), (798, 8))

In [7]:
df = pd.concat([paris_filtered, lyon_filtered, mrs_filtered], axis=0, ignore_index=True)
data.shape, df.shape

((5973, 8), (5542, 8))

# Dummies

In [8]:
dummy = pd.get_dummies(df['ville'], dtype=float)
df = pd.concat([dummy, df], axis=1).drop(['type_l', 'prix_m2', 'ville'], axis=1)

In [9]:
df.loc[df['n_pieces']==0, 'n_pieces'] = 1
df.n_pieces.value_counts()

n_pieces
3.0     1775
2.0     1388
4.0     1127
1.0      716
5.0      447
6.0       73
7.0       12
8.0        2
10.0       1
25.0       1
Name: count, dtype: int64

# Data split and scaling

In [10]:
features = [col for col in df.columns if col != 'prix']
seed = 25
X_train, X_test, y_train, y_test = train_test_split(df[features].values, df.prix.values, test_size=0.2, random_state=seed)

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Establish base models

In [12]:
lreg = LinearRegression()
ridge = Ridge(alpha=0.5)
dt = DecisionTreeRegressor(random_state=seed)
rf = RandomForestRegressor(random_state=seed)
knr = KNeighborsRegressor()

In [13]:
# RMSE on non-scaled features
models = {'Linear Regresion': LinearRegression(),
          'Ridge': Ridge(alpha=0.5),
          'Decision Tree Regressor': DecisionTreeRegressor(random_state=25),
          'Random Forest Regressor': RandomForestRegressor(random_state=25),
          'KNeighbors Regressor': KNeighborsRegressor()}
non_scaled_10cv_rmse = cross_val_rmse_score_printer(models, n_folds=10, X_train=X_train, y_train=y_train)

[1mLinear Regresion[0m cross-validation RMSE: [1m177582.75[0m
[1mRidge[0m cross-validation RMSE: [1m177582.06[0m
[1mDecision Tree Regressor[0m cross-validation RMSE: [1m150136.42[0m
[1mRandom Forest Regressor[0m cross-validation RMSE: [1m123069.19[0m
[1mKNeighbors Regressor[0m cross-validation RMSE: [1m187849.65[0m


In [14]:
# RMSE on scaled features
scaled_10cv_rmse = cross_val_rmse_score_printer(models, n_folds=10, X_train=X_train_scaled, y_train=y_train)

[1mLinear Regresion[0m cross-validation RMSE: [1m177536.17[0m
[1mRidge[0m cross-validation RMSE: [1m177581.88[0m
[1mDecision Tree Regressor[0m cross-validation RMSE: [1m150349.17[0m
[1mRandom Forest Regressor[0m cross-validation RMSE: [1m123088.81[0m
[1mKNeighbors Regressor[0m cross-validation RMSE: [1m133710.88[0m


# Add new feature

In [15]:
containers = [0,20,40,80,160,df['surface_m2'].max()]
df['largeur'] = pd.cut(df['surface_m2'], containers, labels = keys)
df['largeur']=df['largeur'].map(mapped_dict)
df.head()

Unnamed: 0,Lyon,Marseille,Paris,arr,n_pieces,n_chambres,surface_m2,prix,largeur
0,0.0,0.0,1.0,17,3.0,1.0,51.84,535000.0,3
1,0.0,0.0,1.0,15,3.0,2.0,40.77,470000.0,3
2,0.0,0.0,1.0,13,2.0,1.0,39.0,385000.0,2
3,0.0,0.0,1.0,13,2.0,1.0,36.03,383000.0,2
4,0.0,0.0,1.0,10,2.0,1.0,40.7,399000.0,3


In [21]:
new_features = [col for col in df.columns if col!='prix']
X_new = df[new_features].values
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new, df.prix.values, test_size=0.2, random_state=seed)
X_new_train_scaled = scaler.fit_transform(X_new_train)
X_new_test_scaled = scaler.transform(X_new_test)

In [22]:
scaled_10cv_rmse_new_features = cross_val_rmse_score_printer(models, n_folds=10, X_train=X_new_train_scaled, y_train=y_new_train)

[1mLinear Regresion[0m cross-validation RMSE: [1m174499.15[0m
[1mRidge[0m cross-validation RMSE: [1m174539.41[0m
[1mDecision Tree Regressor[0m cross-validation RMSE: [1m150735.50[0m
[1mRandom Forest Regressor[0m cross-validation RMSE: [1m123099.26[0m
[1mKNeighbors Regressor[0m cross-validation RMSE: [1m134284.77[0m


## Comment on results


# Hyper parameter Tuning

In [26]:
# Linear Models.
knr.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [30]:
ridge_param = {
    'alpha':[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 50],
    'solver':['svd', 'cholesky', 'saga']}

dt_params = {
    'max_depth':range(1,20),
    'min_samples_leaf':[0.1, 0.2, 0.3, 0.5, 0.8, 1],
    'min_samples_split':[1, 2, 3, 5, 10]}

rf_params = {
    'bootstrap':[True, False],
    'ccp_alpha':[0.2, 0.4, 0.6, 0.8, 1],
    'max_depth':[1, 2, 4, 6, 8, 10, 12, 14, 15],
    'min_samples_leaf':[0.2, 0.4, 0.6, 0.8, 1],
    'n_estimators':[20, 50, 100, 250, 500]}

knr_params = {
    'algorithm':['ball_tree', 'kd_tree'],
    'leaf_size':[10,20,30],
    'n_neighbors':[1,3,5,10,15]}
params_list = [ridge_param, dt_params, rf_params, knr_params]

In [38]:
new_models = {
          'Ridge': Ridge(alpha=0.5),
          'Decision Tree Regressor': DecisionTreeRegressor(random_state=25),
          'Random Forest Regressor': RandomForestRegressor(random_state=25),
          'KNeighbors Regressor': KNeighborsRegressor()}

best_model_dict = {}

for (key,val), parameters in zip(new_models.items(), params_list):
    best_model, model_best_hp = randomized_search(v, X_new_train_scaled, y_new_train, param_grid_=parameters, cv_=5, verbose_=0, n_jobs_=-1, n_iter_= 200)


Model name: Ridge
Model: Ridge(alpha=0.5)
params: {'alpha': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50], 'solver': ['svd', 'cholesky', 'saga']}
-------------------
Model name: Decision Tree Regressor
Model: DecisionTreeRegressor(random_state=25)
params: {'max_depth': range(1, 20), 'min_samples_leaf': [0.1, 0.2, 0.3, 0.5, 0.8, 1], 'min_samples_split': [1, 2, 3, 5, 10]}
-------------------
Model name: Random Forest Regressor
Model: RandomForestRegressor(random_state=25)
params: {'bootstrap': [True, False], 'ccp_alpha': [0.2, 0.4, 0.6, 0.8, 1], 'max_depth': [1, 2, 4, 6, 8, 10, 12, 14, 15], 'min_samples_leaf': [0.2, 0.4, 0.6, 0.8, 1], 'n_estimators': [20, 50, 100, 250, 500]}
-------------------
Model name: KNeighbors Regressor
Model: KNeighborsRegressor()
params: {'algorithm': ['ball_tree', 'kd_tree'], 'leaf_size': [10, 20, 30], 'n_neighbors': [1, 3, 5, 10, 15]}
-------------------
