# Optimizing the model with a series of experiments

In [74]:
import pickle
import pathlib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

### Let's first reload the data from the previous notebook. Then, perform train_test_split once again.

In [32]:
# The `data` folder sits next to the current working directory
# (i.e. it is a child of the working directory's parent).
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(DATA_DIR)

c:\Users\felip\OneDrive\Documentos\GitHub\AmesHousingDataset\data


In [33]:
# Location of the pickled, scaled modelling table inside DATA_DIR.
model_data_scaled_path = DATA_DIR.joinpath('processed', 'ames_model_data_scaled.pkl')

In [None]:
# Deserialize the scaled modelling table.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that were produced by this project.
with model_data_scaled_path.open('rb') as pickle_file:
    data = pickle.load(pickle_file)

In [34]:
# Target vector: the SalePrice column as a NumPy array.
y = data['SalePrice'].copy().values
# Feature matrix: every remaining column as a NumPy array.
X = data.drop(columns='SalePrice').copy().values

In [35]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((2877, 164), (2877,))

In [36]:
# Fixed seed so the train/test partition is identical on every run.
RANDOM_SEED = 42  # Any number here, really.

# Hold out 25% of the rows as the test set.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=RANDOM_SEED)

### Experiment 1: LinearRegression with encoded categorical values AND scaled continuous variables

Our first experiment is to rerun the linear regression model with the same features as before, but this time on the new data with scaled continuous variables. 

In [37]:
from sklearn.linear_model import LinearRegression

In [38]:
# Baseline: plain linear regression fitted on the training split only.
linear_scaled_model = LinearRegression()
linear_scaled_model.fit(X=Xtrain, y=ytrain)

In [75]:
# Predictions of the linear baseline on the held-out test split.
ypred_linear = linear_scaled_model.predict(X=Xtest)

In [76]:
# Root-mean-squared error of the linear baseline on the test split.
RMSE = np.sqrt(mean_squared_error(y_true=ytest, y_pred=ypred_linear))
# NOTE(review): 10**RMSE - 1 reads the RMSE as a relative error, which
# presumably relies on SalePrice being log10-transformed upstream; confirm.
error_percent = (10**RMSE - 1) * 100
print(
    f'RMSE: {RMSE}',
    f'Average error is {error_percent:.2f}%',
    sep='\n',
)

RMSE: 0.06112775876094033
Average error is 15.11%


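The experiment showed no significant difference in model performance. This is expected: ordinary least squares compensates for a linear rescaling of a feature by rescaling its coefficient, so the predictions of the LinearRegression model are essentially insensitive to the scale of the continuous variables. We will, however, still keep the scaled data for the next experiments, because regularized and distance-based models (such as Lasso and KNN) do depend on the feature scale.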

### Experiment 2: Lasso Regression

In [41]:
from sklearn.linear_model import Lasso

In [42]:
# Candidate regularization strengths: 100 log-spaced values from 1e-4 to 1.
params = dict(alpha=np.logspace(-4, 0, num=100))

# 5-fold cross-validated search over alpha, using all available cores.
lasso = Lasso()
grid = GridSearchCV(estimator=lasso, param_grid=params, cv=5, verbose=1, n_jobs=-1)

In [43]:
# Run the Lasso search on the training split only; the test split stays unseen.
grid.fit(X=Xtrain, y=ytrain)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [44]:
# Parameter setting of the best-scoring candidate found by the Lasso search.
best_params = grid.best_params_
print(best_params)

{'alpha': 0.0001}


In [45]:
# Rebuild a Lasso with the selected hyper-parameters and fit it on the training split.
lasso_best = Lasso(**best_params)
lasso_best.fit(X=Xtrain, y=ytrain)

In [77]:
# Predictions of the tuned Lasso on the held-out test split.
ypred_lasso = lasso_best.predict(X=Xtest)

In [78]:
# Root-mean-squared error of the tuned Lasso on the test split.
RMSE = np.sqrt(mean_squared_error(y_true=ytest, y_pred=ypred_lasso))
# NOTE(review): 10**RMSE - 1 reads the RMSE as a relative error, which
# presumably relies on SalePrice being log10-transformed upstream; confirm.
error_percent = (10**RMSE - 1) * 100
print(
    f'RMSE: {RMSE}',
    f'Average error is {error_percent:.2f}%',
    sep='\n',
)

RMSE: 0.062039744688892146
Average error is 15.36%


### Experiment 3:  Random Forest Regression

In [48]:
from sklearn.ensemble import RandomForestRegressor

In [51]:
# Seed the forest: an unseeded RandomForestRegressor draws different bootstrap
# samples / feature subsets on every run, which makes the cross-validation
# scores (and therefore the selected hyper-parameters) non-reproducible.
rf = RandomForestRegressor(random_state=RANDOM_SEED)

# Search space: forest size and the two split/leaf size constraints;
# tree depth is left unbounded.
params = {
    'n_estimators': [10, 50, 100, 500, 600],
    'max_depth': [None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
}

# 5-fold cross-validated search, using all available cores.
grid = GridSearchCV(rf, params, cv=5, verbose=1, n_jobs=-1)

In [52]:
# Run the random-forest search on the training split only; the test split stays unseen.
grid.fit(X=Xtrain, y=ytrain)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [53]:
# Parameter setting of the best-scoring candidate found by the random-forest search.
best_params = grid.best_params_
print(best_params)

{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [54]:
# Seed the final forest so the reported test RMSE is repeatable.
# best_params only holds the grid-searched keys (n_estimators, max_depth,
# min_samples_split, min_samples_leaf), so it cannot clash with random_state.
rf_best = RandomForestRegressor(random_state=RANDOM_SEED, **best_params)
rf_best.fit(Xtrain, ytrain)

In [79]:
# Predictions of the tuned random forest on the held-out test split.
ypred_rf = rf_best.predict(X=Xtest)

In [80]:
# Root-mean-squared error of the tuned random forest on the test split.
RMSE = np.sqrt(mean_squared_error(y_true=ytest, y_pred=ypred_rf))
# NOTE(review): 10**RMSE - 1 reads the RMSE as a relative error, which
# presumably relies on SalePrice being log10-transformed upstream; confirm.
error_percent = (10**RMSE - 1) * 100
print(
    f'RMSE: {RMSE}',
    f'Average error is {error_percent:.2f}%',
    sep='\n',
)

RMSE: 0.06320588866028146
Average error is 15.67%


### Experiment 4:  Gradient Boosting Regression

### Experiment 5:  KNN Regression

In [58]:
from sklearn.neighbors import KNeighborsRegressor

In [83]:
# Search space: neighbourhood size, vote weighting and Minkowski power p.
params = dict(
    n_neighbors=[5, 7, 9, 11, 13, 15],
    weights=['uniform', 'distance'],
    p=[1, 2, 3],
)

# 5-fold cross-validated search, using all available cores.
knn = KNeighborsRegressor()
grid = GridSearchCV(estimator=knn, param_grid=params, cv=5, verbose=1, n_jobs=-1)

In [84]:
# Run the KNN search on the training split only; the test split stays unseen.
grid.fit(X=Xtrain, y=ytrain)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [85]:
# Parameter setting of the best-scoring candidate found by the KNN search.
best_params = grid.best_params_
print(best_params)

{'n_neighbors': 9, 'p': 1, 'weights': 'distance'}


In [62]:
# Rebuild a KNN regressor with the selected hyper-parameters and fit it on the training split.
knn_best = KNeighborsRegressor(**best_params)
knn_best.fit(X=Xtrain, y=ytrain)

In [86]:
# Predictions of the tuned KNN regressor on the held-out test split.
y_pred_knn = knn_best.predict(X=Xtest)

In [87]:
# Root-mean-squared error of the tuned KNN regressor on the test split.
RMSE = np.sqrt(mean_squared_error(y_true=ytest, y_pred=y_pred_knn))
# NOTE(review): 10**RMSE - 1 reads the RMSE as a relative error, which
# presumably relies on SalePrice being log10-transformed upstream; confirm.
error_percent = (10**RMSE - 1) * 100
print(
    f'RMSE: {RMSE}',
    f'Average error is {error_percent:.2f}%',
    sep='\n',
)

RMSE: 0.07969911719108594
Average error is 20.14%


### Experiment 6:  Regression Stack

### Retraining of best model on the whole dataset

In [None]:
from sklearn.base import clone

# Work on an unfitted copy with the same hyper-parameters. A plain assignment
# would only alias linear_scaled_model, so refitting best_model on the full
# dataset would silently overwrite the train-only model evaluated earlier.
best_model = clone(linear_scaled_model)

In [None]:
# Final fit on every available row (train and test together).
best_model.fit(X=X, y=y)