# Optimizing the model with a series of experiments

In [1]:
import pathlib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

### Let's first reload the data from the previous notebook, then perform `train_test_split` once again.

In [2]:
# The data folder is a sibling of the directory the notebook is run from.
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(DATA_DIR)

/home/arthur/Documentos/Insper/6_semestre/projetos_ml/AmesHousingDataset/data


In [3]:
# Location of the pickled, scaled modelling table inside data/processed.
model_data_scaled_path = DATA_DIR.joinpath('processed', 'ames_model_data_scaled.pkl')

In [4]:
# Load the processed modelling table. Unpickling can execute arbitrary code,
# so this path should only ever point at a trusted, project-generated file.
data = pd.read_pickle(model_data_scaled_path)

In [5]:
# Separate the target column from the features; both are handed to
# scikit-learn as plain arrays taken from independent copies of the frame.
y = data['SalePrice'].copy().values
X = data.drop(columns='SalePrice').copy().values

In [6]:
# Quick sanity check of the feature-matrix and target dimensions.
X.shape, y.shape

((2877, 164), (2877,))

In [7]:
# Fixed seed so the split is reproducible; the particular value is arbitrary.
RANDOM_SEED = 42

# Hold out 25% of the rows as the test portion.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=RANDOM_SEED)

### Experiment 1: LinearRegression with encoded categorical values AND scaled continuous variables

Our first experiment is to rerun the linear regression model with the same features as before, but this time on the new data with scaled continuous variables.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares baseline, fitted on the training split only.
# `fit` returns the estimator itself, so construction and fitting can be chained.
linear_scaled_model = LinearRegression().fit(Xtrain, ytrain)
linear_scaled_model

In [None]:
# 10-fold cross-validated RMSE on the training split. The scorer returns the
# negated MSE, hence the sign flip before taking the square root.
scores_linear_scaled = np.sqrt(
    -cross_val_score(
        linear_scaled_model,
        Xtrain,
        ytrain,
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1,
    )
)
print(f'MEAN: {scores_linear_scaled.mean():.2f} +/- {scores_linear_scaled.std():.2f}')

# NOTE(review): 10**x - 1 reads as a relative error only if SalePrice was
# log10-transformed upstream — confirm against the preprocessing notebook.
error_percent_linear = (10**scores_linear_scaled.mean() - 1) * 100
std_percent_linear = (10**scores_linear_scaled.std() - 1) * 100
print(f'Average error is {error_percent_linear:.2f}%')
print(f'Standard deviation of error is {std_percent_linear:.2f}%')

# Append one "<model>,<error %>,<std %>" row to the shared results log.
path = DATA_DIR.joinpath('processed', 'models_log.csv')
with path.open('a') as log_file:
    log_file.write(f'Linear Regression Scaled,{error_percent_linear:.2f},{std_percent_linear:.2f}\n')

The experiment showed no significant difference in the model performance. This is expected: ordinary least squares compensates for a linearly rescaled feature by rescaling its coefficient, so the LinearRegression model is not sensitive to the scale of the continuous variables. We will, however, still keep the scaled data for the next experiments.

### Experiment 2: Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# The grid includes selection='random', which picks coordinates at random
# during coordinate descent. Seeding the estimator makes the search (and
# therefore the chosen hyper-parameters) reproducible across runs.
lasso = Lasso(random_state=RANDOM_SEED)

params = {
    'alpha': np.logspace(-4, 0, 100),  # 100 log-spaced penalties from 1e-4 to 1
    'max_iter': [15000, 20000, 30000],
    'tol': [0.0001, 0.001],
    'selection': ['cyclic', 'random'],
}

# 5-fold exhaustive search over every combination, using all available cores.
grid = GridSearchCV(lasso, params, cv=5, verbose=1, n_jobs=-1)

In [None]:
# Run the cross-validated Lasso grid search on the training split only.
grid.fit(Xtrain, ytrain)

In [None]:
# Keep the winning hyper-parameter combination of the Lasso search and show it.
best_params = grid.best_params_
print(best_params)

In [None]:
# Rebuild the Lasso with the selected hyper-parameters. The seed is passed
# explicitly because the search may select selection='random', which would
# otherwise make this fit (and its CV scores) vary from run to run.
lasso_best = Lasso(**best_params, random_state=RANDOM_SEED)
lasso_best.fit(Xtrain, ytrain)

In [None]:
# Cross-validated RMSE of the tuned Lasso on the training split.
# cv=10 matches the evaluation of the other experiments in this notebook, so
# the rows appended to models_log.csv are measured the same way.
scores_lasso = cross_val_score(lasso_best, Xtrain, ytrain, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
scores_lasso = np.sqrt(-scores_lasso)  # the scorer returns negated MSE
print(f'MEAN: {scores_lasso.mean():.2f} +/- {scores_lasso.std():.2f}')

# NOTE(review): 10**x - 1 reads as a relative error only if SalePrice was
# log10-transformed upstream — confirm against the preprocessing notebook.
error_percent_lasso = 100 * (10**scores_lasso.mean() - 1)
std_percent_lasso = 100 * (10**scores_lasso.std() - 1)
print(f'Average error is {error_percent_lasso:.2f}%')
print(f'Standard deviation of error is {std_percent_lasso:.2f}%')

# Append one "<model>,<error %>,<std %>" row to the shared results log.
path = DATA_DIR / 'processed' / 'models_log.csv'

with open(path, 'a') as f:
    f.write(f'Lasso,{error_percent_lasso:.2f},{std_percent_lasso:.2f}\n')

### Experiment 3:  Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seed the forest so the search result is repeatable.
rf = RandomForestRegressor(random_state=RANDOM_SEED)

params = {
    'n_estimators': [100, 600, 1200, 1800],
    'min_samples_split': [3],
    # 1.0 (all features) is what 'auto' meant for regressors; the 'auto' alias
    # was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': [1.0, 'sqrt'],
    # Logging verbosity does not change the fitted model, so it is pinned
    # rather than searched (searching it only multiplied the number of fits).
    'verbose': [0],
    'max_depth': [None],
    'bootstrap': [False],
    'min_weight_fraction_leaf': [0.0],
    'min_samples_leaf': [1],
}

# 5-fold exhaustive search over every combination, using all available cores.
grid = GridSearchCV(rf, params, cv=5, verbose=1, n_jobs=-1)

In [None]:
# Run the cross-validated random-forest grid search on the training split only.
grid.fit(Xtrain, ytrain)

In [None]:
# Keep the winning hyper-parameter combination of the forest search and show it.
best_params = grid.best_params_
print(best_params)

In [None]:
# Rebuild the forest with the selected hyper-parameters. Tree construction is
# randomised, so the seed is fixed to make this fit and its CV scores repeatable.
rf_best = RandomForestRegressor(**best_params, random_state=RANDOM_SEED)
rf_best.fit(Xtrain, ytrain)

In [None]:
# 10-fold cross-validated RMSE of the tuned forest on the training split.
# The scorer returns the negated MSE, hence the sign flip before the root.
scores_rf = np.sqrt(
    -cross_val_score(
        rf_best,
        Xtrain,
        ytrain,
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1,
    )
)

# NOTE(review): 10**x - 1 reads as a relative error only if SalePrice was
# log10-transformed upstream — confirm against the preprocessing notebook.
error_percent_rf = (10**scores_rf.mean() - 1) * 100
std_percent_rf = (10**scores_rf.std() - 1) * 100
print(f'Average error is {error_percent_rf:.2f}%')
print(f'Standard deviation of error is {std_percent_rf:.2f}%')

# Append one "<model>,<error %>,<std %>" row to the shared results log.
path = DATA_DIR.joinpath('processed', 'models_log.csv')
with path.open('a') as log_file:
    log_file.write(f'Random Forest,{error_percent_rf:.2f},{std_percent_rf:.2f}\n')

### Experiment 4:  Gradient Boosting Regression

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Seeded because subsample < 1 and max_features='sqrt' make boosting stochastic.
gbr = GradientBoostingRegressor(random_state=RANDOM_SEED)

# Every value in a GridSearchCV grid must be a list/array of candidates; bare
# scalars or strings are rejected, so fixed settings are one-element lists.
params = {
    'alpha': [0.9],
    'criterion': ['friedman_mse'],
    'learning_rate': [0.1, 0.5, 0.01],
    'max_depth': [3, 4, 5],
    'max_features': ['sqrt'],
    'min_samples_leaf': [2, 3],
    'n_estimators': [1600, 200],
    'subsample': [0.9, 0.8, 0.7, 0.6],
    # Verbosity only controls logging, and warm_start has no effect on the
    # freshly cloned estimators the search fits. Both are pinned instead of
    # searched, which previously multiplied the number of fits by six.
    'verbose': [0],
    'warm_start': [False],
}

# 5-fold exhaustive search over every combination, using all available cores.
grid = GridSearchCV(gbr, params, cv=5, verbose=1, n_jobs=-1)

In [None]:
# Run the cross-validated gradient-boosting grid search on the training split only.
grid.fit(Xtrain, ytrain)

In [None]:
# Keep the winning combination, persist its text form to best_params.txt in
# the current working directory, and echo it.
best_params = grid.best_params_
with open('best_params.txt', mode='w') as params_file:
    params_file.write(f'{best_params}')
print(best_params)

In [None]:
# Rebuild the booster with the selected hyper-parameters. Row subsampling and
# feature subsampling are random, so the seed is fixed to make this fit and
# its CV scores repeatable.
gbr_best = GradientBoostingRegressor(**best_params, random_state=RANDOM_SEED)
gbr_best.fit(Xtrain, ytrain)

In [None]:
# 10-fold cross-validated RMSE of the tuned booster on the training split.
# The scorer returns the negated MSE, hence the sign flip before the root.
scores_gbr = np.sqrt(
    -cross_val_score(
        gbr_best,
        Xtrain,
        ytrain,
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1,
    )
)
print(f'MEAN: {scores_gbr.mean():.2f} +/- {scores_gbr.std():.2f}')

# NOTE(review): 10**x - 1 reads as a relative error only if SalePrice was
# log10-transformed upstream — confirm against the preprocessing notebook.
error_percent_gbr = (10**scores_gbr.mean() - 1) * 100
std_percent_gbr = (10**scores_gbr.std() - 1) * 100
print(f'Average error is {error_percent_gbr:.2f}%')
print(f'Standard deviation of error is {std_percent_gbr:.2f}%')

# Append one "<model>,<error %>,<std %>" row to the shared results log.
path = DATA_DIR.joinpath('processed', 'models_log.csv')
with path.open('a') as log_file:
    log_file.write(f'Gradient Boosting,{error_percent_gbr:.2f},{std_percent_gbr:.2f}\n')

### Experiment 5:  KNN Regression

In [9]:
from sklearn.neighbors import KNeighborsRegressor

In [12]:
# Candidate settings for the exhaustive k-nearest-neighbours search.
params = dict(
    n_neighbors=[5, 6, 7, 8],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2, 3],
    leaf_size=[30, 40, 50],
)

knn = KNeighborsRegressor()

# 5-fold exhaustive search over every combination, using all available cores.
grid = GridSearchCV(estimator=knn, param_grid=params, cv=5, verbose=1, n_jobs=-1)

In [14]:
# Run the cross-validated k-NN grid search on the training split only.
grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [15]:
# Keep the winning hyper-parameter combination of the k-NN search and show it.
best_params = grid.best_params_
print(best_params)

{'algorithm': 'auto', 'leaf_size': 30, 'n_neighbors': 6, 'p': 1, 'weights': 'distance'}


In [16]:
# Rebuild the regressor with the selected hyper-parameters and fit it on the
# training split; `fit` returns the estimator, so the two steps are chained.
knn_best = KNeighborsRegressor(**best_params).fit(Xtrain, ytrain)
knn_best

In [17]:
# 10-fold cross-validated RMSE of the tuned k-NN model on the training split.
# The scorer returns the negated MSE, hence the sign flip before the root.
scores_knn = np.sqrt(
    -cross_val_score(
        knn_best,
        Xtrain,
        ytrain,
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1,
    )
)

# NOTE(review): 10**x - 1 reads as a relative error only if SalePrice was
# log10-transformed upstream — confirm against the preprocessing notebook.
error_percent_knn = (10**scores_knn.mean() - 1) * 100
std_percent_knn = (10**scores_knn.std() - 1) * 100
print(f'Average error is {error_percent_knn:.2f}%')
print(f'Standard deviation of error is {std_percent_knn:.2f}%')

# Append one "<model>,<error %>,<std %>" row to the shared results log.
path = DATA_DIR.joinpath('processed', 'models_log.csv')
with path.open('a') as log_file:
    log_file.write(f'knn,{error_percent_knn:.2f},{std_percent_knn:.2f}\n')

Average error is 19.22%
Standard deviation of error is 1.09%


### Experiment 6:  Regression Stack

### Retraining of best model on the whole dataset

In [None]:
from sklearn.base import clone

# Take an unfitted copy with the same hyper-parameters instead of aliasing the
# original object: refitting `best_model` on the full dataset must not
# overwrite the train-only `linear_scaled_model` evaluated in Experiment 1.
# NOTE(review): the linear model is hard-coded as "best" here — confirm the
# choice against the results collected in models_log.csv.
best_model = clone(linear_scaled_model)

In [None]:
# Final refit on every available row (train and test portions together).
best_model.fit(X, y)