# Optimizing the model with series of experiments

In [21]:
import pickle
import pathlib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

### Let´s first reload the data from the previous notebook. Then, perform train_test_split once again.

In [2]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

c:\Users\felip\OneDrive\Documentos\GitHub\AmesHousingDataset\data


In [3]:
model_data_scaled_path = DATA_DIR / 'processed' / 'ames_model_data_scaled.pkl'

In [4]:
with open(model_data_scaled_path, 'rb') as file:
    data = pickle.load(file)

In [5]:
X = data.drop(columns=['SalePrice']).copy().values
y = data['SalePrice'].copy().values

In [6]:
X.shape, y.shape

((2877, 164), (2877,))

In [7]:
RANDOM_SEED = 42  # Any number here, really.

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_SEED,
)

### Experiment 1: LinearRegression with encoded categorical values AND scaled continuous variables

Our first experiment is to rerun the linear regression model with the same features as before, but this time on the new data with scaled continuous variables. 

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
model = LinearRegression()

model.fit(Xtrain, ytrain)

In [10]:
ypred = model.predict(Xtest)

In [11]:
RMSE = np.sqrt(mean_squared_error(ytest, ypred))
print(f'RMSE: {RMSE}')

RMSE: 0.06112775876094033


In [12]:
error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 15.11%


The experiment showed no signifficant difference in the model performance. This means that the LinearRegression model is not sensitive to the scale of the continuous variables. We will, however, still keep the scaled data for the next experiments.

### Experiment 2: Lasso Regression

In [31]:
from sklearn.linear_model import Lasso

In [32]:
lasso = Lasso()

params = {
    'alpha': np.logspace(-4, 0, 100)
}

grid = GridSearchCV(lasso, params, cv=5, verbose=1, n_jobs=-1)

In [33]:
grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [34]:
best_params = grid.best_params_
print(best_params)

{'alpha': 0.0001}


In [35]:
lasso_best = Lasso(**best_params)
lasso_best.fit(Xtrain, ytrain)

In [36]:
ypred = lasso_best.predict(Xtest)

In [37]:
RMSE = np.sqrt(mean_squared_error(ytest, ypred))
print(f'RMSE: {RMSE}')

RMSE: 0.062039744688892146


In [38]:
error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 15.36%


### Experiment 3:  Random Forest Regression

In [39]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
rf = RandomForestRegressor()

params = {
    'n_estimators': [10, 50, 100, 500],
    'max_depth': [None, 3, 5, 7, 9],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
}

grid = GridSearchCV(rf, params, cv=5, verbose=1, n_jobs=-1)

In [49]:
grid.fit(Xtrain, ytrain)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


KeyboardInterrupt: 

In [50]:
best_params = grid.best_params_
print(best_params)

{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [51]:
rf_best = RandomForestRegressor(**best_params)
rf_best.fit(Xtrain, ytrain)

In [52]:
ypred = rf_best.predict(Xtest)

In [53]:
RMSE = np.sqrt(mean_squared_error(ytest, ypred))
print(f'RMSE: {RMSE}')

RMSE: 0.06204157118246429


In [54]:
error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 15.36%
