In [90]:
import sys
sys.path.append("../src")

import pandas as pd
import numpy as np
from functions import *
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline


In [91]:
# Notebook-wide display settings: show up to 100 rows / 100 columns before
# pandas truncates a frame, and render floats with three decimals.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.3f}".format)

# Load the raw dataset and report its (n_rows, n_columns) shape.
df = pd.read_csv(filepath_or_buffer="../data/01_raw/data.csv")
print(df.shape)

(1460, 81)


In [92]:
# List the most strongly correlated column pairs of `df`; the second argument
# (10) matches the number of pairs shown in the output.
# NOTE(review): get_corr_pairs is not defined in this notebook - presumably it
# comes from the star import of `functions`; confirm how it ranks pairs
# (e.g. absolute vs. signed correlation) before relying on the ordering.
get_corr_pairs(df, 10)

0.88248 ('GarageArea', 'GarageCars')
0.82567 ('GarageYrBlt', 'YearBuilt')
0.82549 ('GrLivArea', 'TotRmsAbvGrd')
0.81953 ('TotalBsmtSF', '1stFlrSF')
0.79098 ('OverallQual', 'SalePrice')
0.70862 ('SalePrice', 'GrLivArea')
0.68750 ('2ndFlrSF', 'GrLivArea')
0.67662 ('TotRmsAbvGrd', 'BedroomAbvGr')
0.64921 ('BsmtFullBath', 'BsmtFinSF1')
0.64228 ('YearRemodAdd', 'GarageYrBlt')


In [93]:
# Partition the column labels by dtype: object-dtype ("O") columns are treated
# as categorical, all remaining columns go to num_cols.
num_cols = df.select_dtypes(exclude=["O"]).columns
cat_cols = df.select_dtypes(include=["O"]).columns

In [94]:
# Impute missing values in `df`.

# Categorical columns: a missing entry becomes the literal category "NaN", so
# that "missing" is counted as a regular value by later frequency checks.
df[cat_cols] = df[cat_cols].fillna("NaN")

# A missing masonry veneer area is treated as "no veneer".
df["MasVnrArea"] = df["MasVnrArea"].fillna(0.0)

# A missing garage year falls back to the year the house was built.
df["GarageYrBlt"] = df["GarageYrBlt"].fillna(df["YearBuilt"])

# Fill missing lot frontage with the mean of the observed LotFrontage values
# (Series.mean skips NaN). The previous code used df["YearBuilt"].mean() here,
# which injected a calendar year into a frontage column.
df["LotFrontage"] = df["LotFrontage"].fillna(df["LotFrontage"].mean())

In [95]:
# Inspect the categorical columns with major_values; the output is a float
# Series indexed by column name with 15 entries, matching the second argument.
# NOTE(review): major_values is not defined in this notebook - presumably it
# returns, per column, the percentage of rows taken by the most frequent value,
# sorted descending; confirm against its definition in `functions`.
major_values(df[cat_cols],15)

Utilities     99.932
Street        99.589
PoolQC        99.521
Condition2    98.973
RoofMatl      98.219
Heating       97.808
MiscFeature   96.301
LandSlope     94.658
Alley         93.767
CentralAir    93.493
Functional    93.151
PavedDrive    91.781
Electrical    91.370
GarageCond    90.822
GarageQual    89.795
dtype: float64

In [96]:
# Remove from `df`, in place, every categorical column reported by
# major_values(df[cat_cols], 15); the result's index holds the column names.
df.drop(columns=major_values(df[cat_cols], 15).index, inplace=True)

In [97]:
# Columns removed by hand from the feature set.
# The list is bound to a descriptive name instead of `_`, because IPython uses
# `_` for its "last output" cache and a user assignment to it shadows that.
# NOTE(review): the list contains 'OverallQual' and 'GrLivArea', the two
# columns most correlated with SalePrice in the correlation listing earlier in
# this notebook - confirm that excluding them is intentional.
cols_to_drop = [
    'Id',
    'MSSubClass',
    'LotFrontage',
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'LowQualFinSF',
    'GrLivArea',
    'Fireplaces',
    'GarageYrBlt',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'MoSold',
    'YrSold'
]

# Drop in place; a KeyError here means a listed column is already gone
# (e.g. the cell was run twice on the same kernel).
df.drop(columns=cols_to_drop, inplace=True)

In [98]:
# Split the frame into target (y) and features (X).
# The target is selected by name rather than by position, so the split stays
# correct even if SalePrice is not the last column of `df`; Index.drop raises
# KeyError if the target column is missing.
y = df["SalePrice"]
df_features = df.columns.drop("SalePrice")
X = df[df_features]

In [99]:
# Integer-encode the feature frame: X.apply calls le.fit_transform once per
# column, so the single LabelEncoder is refit for every column and afterwards
# only holds the classes of the last column processed.
# NOTE(review): every column of X is encoded, not only the object-dtype ones.
# Numeric columns that are still present (e.g. GarageArea, TotalBsmtSF) are
# replaced by integer class codes, which discards their scale - confirm this
# is intended before changing it, since the metrics below depend on it.
# NOTE(review): the encoding is fit on all rows before any train/test split
# visible in this notebook - confirm where the split happens.
le = LabelEncoder()
X_2 = X.apply(le.fit_transform)

In [100]:
# Baseline: ordinary least squares on the label-encoded features X_2; the
# output reports R2 and RMSE for a training and a testing set.
# NOTE(review): get_model_metrics is not defined in this notebook (presumably
# star-imported from `functions`); the meaning of b1/b2 and how the train/test
# split is made and seeded cannot be seen here - confirm.
get_model_metrics(LinearRegression(), X_2, y, b1=False, b2=False)

Training set : R2 = 0.829, RMSE = 32271.381
Testing  set : R2 = 0.809, RMSE = 36917.141




In [101]:
# Ridge regression with built-in cross-validation; RidgeCV() is constructed
# with no arguments, so scikit-learn's default alpha candidates are used.
# NOTE(review): no feature scaling is visible in this notebook before the
# call, and the penalty is scale-sensitive - confirm whether get_model_metrics
# standardizes internally (possibly what b1/b2 control).
get_model_metrics(RidgeCV(), X_2, y, b1=False, b2=False)

Training set : R2 = 0.829, RMSE = 32312.096
Testing  set : R2 = 0.81, RMSE = 36836.176




In [102]:
# Lasso regression with built-in cross-validation; LassoCV() is constructed
# with no arguments, so scikit-learn's defaults are used throughout.
# NOTE(review): no feature scaling is visible in this notebook before the
# call, and the L1 penalty is scale-sensitive - confirm whether
# get_model_metrics standardizes internally (possibly what b1/b2 control).
# NOTE(review): the recorded output shows a higher R2 on the testing set than
# on the training set - worth checking the split and the chosen alpha.
get_model_metrics(LassoCV(), X_2, y, b1=False, b2=False)

Training set : R2 = 0.726, RMSE = 40873.28
Testing  set : R2 = 0.766, RMSE = 40884.075


