In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from scipy import stats

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold


Next, I execute the functions that were created in the `02 - Feature Engineering & Functions` notebook, as they are required for the operations in this notebook.

In [2]:
%run clean_data.py

### Data Imports

In [3]:
ames_train = pd.read_csv('../datasets/train.csv')
ames_train, training_cat_label = clean_data(ames_train, version='mean')
ames_train = ames_train[ames_train['gr_liv_area']<4000]

In [4]:
ames_test = pd.read_csv('../datasets/test.csv')
ames_test, categories = clean_data(ames_test, version='mean', cat_labels=training_cat_label)

In [5]:
ames_train.isna().sum().sort_values()

Id                 0
electrical         0
central_air        0
heating_qc         0
bsmtfin_type_1     0
                  ..
porch_sf           0
SalePrice          0
yr_sold            0
screen_porch       0
house_remodeled    0
Length: 90, dtype: int64

In [6]:
ames_test.isna().sum().sort_values()

Id                 0
electrical         0
central_air        0
heating_qc         0
bsmtfin_type_1     0
                  ..
porch_sf           0
yr_sold            0
mo_sold            0
lot_config         0
house_remodeled    0
Length: 89, dtype: int64

### Outlier Censoring:

As we saw in the EDA notebook, many of the numerical features in the sample have extreme outliers that can influence the performance of our developed models. Therefore, I'm implementing a conservative approach to replace the extreme outliers in all numerical features with the value at the 99.5th percentile in the sample.

# Benchmark Model 

In [7]:
X = ames_train.drop(columns = 'SalePrice')
y = ames_train['SalePrice']
preds = y.mean()

df_test = ames_test.copy()
df_test['SalePrice'] = preds
submission = df_test[['Id', 'SalePrice']]
submission.set_index('Id', inplace=True)
submission.to_csv('../datasets/Kaggle_submissions/benchmark_model_.csv')

### Linear regression with best predictors:

In [8]:
models_predictors = pd.read_pickle(r'../datasets/models_predictor.pkl')

In [9]:
X_train = ames_train[models_predictors['lr_best_predictors']]

y_train = np.log(ames_train['SalePrice'])

X_test = ames_test[models_predictors['lr_best_predictors']]

### Feature preparation
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='infrequent_if_exist')

ss = StandardScaler()

ct = ColumnTransformer(
    [
    ('ohe', ohe, X_train.select_dtypes(include='object').columns),
    ('ss', ss, X_train.select_dtypes(include=['float64', 'int64']).columns)
    ],
    remainder='passthrough'
    )

# ####

X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)

lr = LinearRegression()
lr.fit(X_train_ct, y_train)

print('Train RMSE:', np.sqrt(metrics.mean_squared_error(np.exp(y_train), np.exp(lr.predict(X_train_ct)))))
print('Train R2:', np.sqrt(metrics.r2_score(y_train, lr.predict(X_train_ct))))

Train RMSE: 18841.60807654265
Train R2: 0.9616815060476851


In [10]:
preds = np.exp(lr.predict(X_test_ct))
df_test['SalePrice'] = preds
submission = df_test[['Id', 'SalePrice']]
submission.set_index('Id', inplace=True)
submission.to_csv('../datasets/Kaggle_submissions/linear_model_.csv')

### Ridge regression with best predictors

In [11]:
X_train = ames_train[models_predictors['ridge_best_predictors']]

y_train = np.log(ames_train['SalePrice'])

X_test = ames_test[models_predictors['ridge_best_predictors']]

### Feature preparation
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='infrequent_if_exist')

ss = StandardScaler()

ct = ColumnTransformer(
    [
    ('ohe', ohe, X_train.select_dtypes(include='object').columns),
    ('ss', ss, X_train.select_dtypes(include=['float64', 'int64']).columns)
    ],
    remainder='passthrough'
    )

# ####

r_alphas = np.logspace(-2, 2.2, 300)
 
ridge_cv = RidgeCV(alphas=r_alphas, cv=10)
ridge_cv.fit(X_train_ct, y_train)

print('Train RMSE:', np.sqrt(metrics.mean_squared_error(np.exp(y_train), np.exp(ridge_cv.predict(X_train_ct)))))
print('Train R2:', np.sqrt(metrics.r2_score(y_train, ridge_cv.predict(X_train_ct))))

Train RMSE: 18885.198355658355
Train R2: 0.960888966520748


In [12]:
preds = np.exp(ridge_cv.predict(X_test_ct))
df_test['SalePrice'] = preds
submission = df_test[['Id', 'SalePrice']]
submission.set_index('Id', inplace=True)
submission.to_csv('../datasets/Kaggle_submissions/ridge_cv_model_.csv')

### LASSO regression with best predictors

In [13]:
X_train = ames_train[models_predictors['lasso_best_predictors']]

y_train = np.log(ames_train['SalePrice'])

X_test = ames_test[models_predictors['lasso_best_predictors']]

### Feature preparation
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='infrequent_if_exist')

ss = StandardScaler()

ct = ColumnTransformer(
    [
    ('ohe', ohe, X_train.select_dtypes(include='object').columns),
    ('ss', ss, X_train.select_dtypes(include=['float64', 'int64']).columns)
    ],
    remainder='passthrough'
    )

# ####

alphas_range = np.logspace(-2, 2.2, 300)
lasso_cv = LassoCV(n_alphas=300, cv=10, max_iter=5000)

lasso_cv.fit(X_train_ct, y_train)

print('Train RMSE:', np.sqrt(metrics.mean_squared_error(np.exp(y_train), np.exp(lasso_cv.predict(X_train_ct)))))
print('Train R2:', np.sqrt(metrics.r2_score(y_train, lasso_cv.predict(X_train_ct))))

Train RMSE: 19377.309385621353
Train R2: 0.9593562775396465


In [14]:
preds = np.exp(lasso_cv.predict(X_test_ct))
df_test['SalePrice'] = preds
submission = df_test[['Id', 'SalePrice']]
submission.set_index('Id', inplace=True)
submission.to_csv('../datasets/Kaggle_submissions/lasso_cv_model.csv')