# Optimizing the model with series of experiments

In [1]:
import pathlib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

### Let´s first reload the data from the previous notebook. Then, perform train_test_split once again.

In [2]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

/home/arthur/Documentos/Insper/6_semestre/projetos_ml/AmesHousingDataset/data


In [3]:
model_data_scaled_path = DATA_DIR / 'processed' / 'ames_model_data_scaled.pkl'

In [4]:
data = pd.read_pickle(model_data_scaled_path)

In [5]:
X = data.drop(columns=['SalePrice']).copy().values
y = data['SalePrice'].copy().values

In [6]:
X.shape, y.shape

((2877, 164), (2877,))

In [8]:
RANDOM_SEED = 42  # Any number here, really.

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_SEED,
)

### Experiment 1: LinearRegression with encoded categorical values AND scaled continuous variables

Our first experiment is to rerun the linear regression model with the same features as before, but this time on the new data with scaled continuous variables. 

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
linear_scaled_model = LinearRegression()

linear_scaled_model.fit(Xtrain, ytrain)

In [24]:
scores_linear_scaled = cross_val_score(linear_scaled_model,
                                       Xtrain, 
                                       ytrain,
                                       cv=5,
                                       scoring='neg_mean_squared_error',
                                       n_jobs=-1)

scores_linear_scaled = np.sqrt(-scores_linear_scaled)
print(f'MEAN: {scores_linear_scaled.mean():.2f} +/- {scores_linear_scaled.std():.2f}')
error_percent_linear = 100 * (10**scores_linear_scaled.mean() - 1)
print(f'Average error is {error_percent_linear:.2f}%')

MEAN: 0.06 +/- 0.01
Average error is 14.02%


The experiment showed no signifficant difference in the model performance. This means that the LinearRegression model is not sensitive to the scale of the continuous variables. We will, however, still keep the scaled data for the next experiments.

### Experiment 2: Lasso Regression

In [28]:
from sklearn.linear_model import Lasso

In [29]:
lasso = Lasso()

params = {
    'alpha': np.logspace(-4, 0, 100)
}

grid = GridSearchCV(lasso, params, cv=5, verbose=1, n_jobs=-1)

In [30]:
grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  model = cd_fast.enet_coordinate_descent(


In [31]:
best_params = grid.best_params_
print(best_params)

{'alpha': 0.0001}


In [32]:
lasso_best = Lasso(**best_params)
lasso_best.fit(Xtrain, ytrain)

In [33]:
scores_lasso = cross_val_score(lasso_best, Xtrain, ytrain, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
scores_lasso = np.sqrt(-scores_lasso)
print(f'MEAN: {scores_lasso.mean():.2f} +/- {scores_lasso.std():.2f}')
error_percent_lasso = 100 * (10**scores_lasso.mean() - 1)
print(f'Average error is {error_percent_lasso:.2f}%')

MEAN: 0.06 +/- 0.01
Average error is 13.80%


  model = cd_fast.enet_coordinate_descent(


### Experiment 3:  Random Forest Regression

In [35]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
rf = RandomForestRegressor()

params = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 600],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20, 50, 100],
    'min_impurity_decrease': [0.0, 0.1, 0.2],
    'bootstrap': [True, False],
    'oob_score': [True, False],
    'verbose': [0, 1, 2],
    'warm_start': [True, False],
    'ccp_alpha': [0.0, 0.1, 0.2],
}

grid = GridSearchCV(rf, params, cv=5, verbose=1, n_jobs=-1)

In [37]:
grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 1088640 candidates, totalling 5443200 fits


In [None]:
best_params = grid.best_params_
print(best_params)

In [None]:
rf_best = RandomForestRegressor(**best_params)
rf_best.fit(Xtrain, ytrain)

In [None]:
ypred_rf = rf_best.predict(Xtest)

In [None]:
RMSE = np.sqrt(mean_squared_error(ytest, ypred_rf))
error_percent = 100 * (10**RMSE - 1)
print(f'RMSE: {RMSE}')
print(f'Average error is {error_percent:.2f}%')

### Experiment 4:  Gradient Boosting Regression

### Experiment 5:  KNN Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knn = KNeighborsRegressor()

params = {
    'n_neighbors': [5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3],
}

grid = GridSearchCV(knn, params, cv=5, verbose=1, n_jobs=-1)

In [None]:
grid.fit(Xtrain, ytrain)

In [None]:
best_params = grid.best_params_
print(best_params)

In [None]:
knn_best = KNeighborsRegressor(**best_params)
knn_best.fit(Xtrain, ytrain)

In [None]:
y_pred_knn = knn_best.predict(Xtest)

In [None]:
RMSE = np.sqrt(mean_squared_error(ytest, y_pred_knn))
error_percent = 100 * (10**RMSE - 1)
print(f'RMSE: {RMSE}')
print(f'Average error is {error_percent:.2f}%')

### Experiment 6:  Regression Stack

### Retraining of best model on the whole dataset

In [None]:
best_model = linear_scaled_model

In [None]:
best_model.fit(X, y)