In [1]:
# Imports
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Import the project-local helpers (JgKMeans, ModelPreprocessor)
from jg_ml_models import JgKMeans, ModelPreprocessor

In [3]:
# Load the training CSV and discard its Id column before any preprocessing.
train_full_df = pd.read_csv('../data/train.csv').drop(columns='Id')
train_full_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Wrap the training frame in the project's ModelPreprocessor and tell it which
# column is the target. Every later preprocessing cell goes through `pre_proc`.
target_col = 'SalePrice'
pre_proc = ModelPreprocessor(train_full_df, target_col=target_col)

In [5]:
# Column groups as reported by the preprocessor for this dataset: the numeric
# columns and the categorical columns that will be encoded later.
numeric_cols: list[str] = pre_proc.get_numeric_columns()
encode_cols: list[str] = pre_proc.get_categorical_columns()
# One list per output line, numeric first.
print(numeric_cols, encode_cols, sep="\n")

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual

In [9]:
# Scale the numeric columns. Called without arguments, so the preprocessor
# presumably works on the frame it was constructed with — confirm in jg_ml_models.
# NOTE(review): the displayed output includes a scaled SalePrice column, i.e. the
# target is still part of X at this point.
X = pre_proc.scale_numeric_columns()
# Sanity check on the index of the result (prints None in the saved output).
print(X.index.name)
X.head()

None


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,0.073375,-0.208034,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.510015,0.575425,-0.288653,...,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777,0.347273
1,-0.872563,0.409895,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.572835,1.171992,-0.288653,...,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439,0.007288
2,0.073375,-0.084449,0.07348,0.651479,-0.5172,0.984752,0.830215,0.322174,0.092907,-0.288653,...,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777,0.536154
3,0.309859,-0.414011,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.572835,-0.499274,-0.288653,...,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655,-0.515281
4,0.073375,0.574676,0.375148,1.374795,-0.5172,0.951632,0.733308,1.360826,0.463568,-0.288653,...,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777,0.869843


In [10]:
# One-hot encode the categorical columns listed in `encode_cols`.
# Per the displayed output, the result holds 0.0/1.0 indicator columns
# (including `*_nan` levels) plus the target column with its raw values.
encoded_df = pre_proc.encode_categorical_columns_ohe(encode_cols=encode_cols, target_col=target_col)
# Sanity check on the index of the result (prints None in the saved output).
print(encoded_df.index.name)
encoded_df.head()

None


Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_Pave,Alley_nan,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,208500
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,181500
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,223500
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,140000
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,250000


In [11]:
# Run the same scaling step on the one-hot encoded frame.
# NOTE(review): the displayed output shows the indicator columns and SalePrice
# are rescaled too (no longer 0/1) — confirm that is intended for the
# correlation / ridge steps that consume this frame.
scaled_encoded_df = pre_proc.scale_numeric_columns(encoded_df)
scaled_encoded_df.head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_Pave,Alley_nan,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,-0.083045,-0.215859,-0.105263,0.518133,-0.418955,-0.064238,0.064238,-0.188311,-0.169981,0.257821,...,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995,0.347273
1,-0.083045,-0.215859,-0.105263,0.518133,-0.418955,-0.064238,0.064238,-0.188311,-0.169981,0.257821,...,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995,0.007288
2,-0.083045,-0.215859,-0.105263,0.518133,-0.418955,-0.064238,0.064238,-0.188311,-0.169981,0.257821,...,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995,0.536154
3,-0.083045,-0.215859,-0.105263,0.518133,-0.418955,-0.064238,0.064238,-0.188311,-0.169981,0.257821,...,-0.301962,-0.045376,0.390293,3.668167,-0.052414,-0.091035,-0.117851,-2.138345,-0.305995,-0.515281
4,-0.083045,-0.215859,-0.105263,0.518133,-0.418955,-0.064238,0.064238,-0.188311,-0.169981,0.257821,...,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995,0.869843


In [67]:
# Absolute correlation per encoded feature, returned as a one-column frame
# ("correlation") indexed by feature name.
# Presumably this is the correlation with the target column and sorted
# descending (the displayed head is in descending order) — verify in jg_ml_models.
corr_df = pre_proc.get_pd_corr_abs_encoded_features_df(scaled_encoded_df)
corr_df.head()

Unnamed: 0,correlation
ExterQual_TA,0.589044
BsmtQual_Ex,0.553105
KitchenQual_TA,0.519298
KitchenQual_Ex,0.504094
Foundation_PConc,0.497734


In [68]:
# Absolute ridge coefficients, returned as a one-column frame ("coefficient")
# indexed by feature name. The call itself prints the selected alpha and R^2.
# NOTE(review): the method name says "numeric_features" but it is given the
# encoded frame here, and the displayed index holds encoded feature names.
coef_df = pre_proc.get_ridge_coeff_abs_numeric_features_df(scaled_encoded_df)
coef_df.head()

Best alpha: 100.0 (R^2 score:  0.86)


Unnamed: 0,coefficient
MSZoning_C (all),0.306094
MSZoning_FV,0.057453
MSZoning_RH,0.078078
MSZoning_RL,0.138896
MSZoning_RM,0.144411


In [57]:
def get_vif_numeric_features_df(X: pd.DataFrame) -> pd.DataFrame:
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Each column is scored with statsmodels' ``variance_inflation_factor``
    against all the other columns of ``X``.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix, one column per feature. The caller in this notebook
        drops NaN-containing columns and the target before calling; whether
        statsmodels tolerates missing values is not checked here.

    Returns
    -------
    pd.DataFrame
        One row per column of ``X``, in the same order, indexed by column name
        (index named ``"variables"``), with two columns:

        * ``"vif"``     - the variance inflation factor,
        * ``"vif_inv"`` - its reciprocal ``1 / vif``.

        A ``vif`` of ``inf`` yields a ``vif_inv`` of ``0.0``; statsmodels emits
        a divide-by-zero ``RuntimeWarning`` when it produces such a value.
    """
    # Convert once: the array is identical for every column being scored.
    design = X.values
    vif = np.asarray(
        [variance_inflation_factor(design, i) for i in range(X.shape[1])],
        dtype=float,
    )
    return pd.DataFrame(
        {"vif": vif, "vif_inv": 1.0 / vif},
        index=pd.Index(X.columns, name="variables"),
    )


In [62]:
# Rebuild X from the preprocessor (same call as the earlier scaling cell) so
# the VIF cells below start from the full scaled frame, target column included.
X = pre_proc.scale_numeric_columns()
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,0.073375,-0.208034,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.510015,0.575425,-0.288653,...,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777,0.347273
1,-0.872563,0.409895,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.572835,1.171992,-0.288653,...,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439,0.007288
2,0.073375,-0.084449,0.07348,0.651479,-0.5172,0.984752,0.830215,0.322174,0.092907,-0.288653,...,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777,0.536154
3,0.309859,-0.414011,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.572835,-0.499274,-0.288653,...,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655,-0.515281
4,0.073375,0.574676,0.375148,1.374795,-0.5172,0.951632,0.733308,1.360826,0.463568,-0.288653,...,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777,0.869843


In [63]:
# Keep only complete predictor columns: drop every column that contains a NaN
# and remove the target so it is not scored as a feature.
# X is reassigned in place, so on a second run of this cell SalePrice is already
# gone; errors="ignore" keeps the cell re-runnable instead of raising KeyError.
X = X.dropna(axis=1).drop(columns="SalePrice", errors="ignore")
# One VIF per remaining column, in column order.
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
# Column vector for display. The `inf` entries, together with the
# divide-by-zero RuntimeWarning from statsmodels (vif = 1 / (1 - R^2)), mark
# columns whose auxiliary regression has R^2 == 1, i.e. columns that are an
# exact linear combination of the others.
vif_a = np.array(vif).reshape(-1, 1)
vif_a

  vif = 1. / (1. - r_squared_i)


array([[1.4839875 ],
       [1.21876007],
       [3.23342832],
       [1.56371177],
       [4.04058919],
       [2.22036952],
       [       inf],
       [       inf],
       [       inf],
       [       inf],
       [       inf],
       [       inf],
       [       inf],
       [       inf],
       [2.20967505],
       [1.14970532],
       [2.90131637],
       [2.16355283],
       [2.30645561],
       [1.58424767],
       [4.87701104],
       [1.56345985],
       [5.49376453],
       [5.18434072],
       [1.20321126],
       [1.21419699],
       [1.28154296],
       [1.02137259],
       [1.10824954],
       [1.07993741],
       [1.02274862],
       [1.04866126],
       [1.05085768]])

In [66]:
# Score every column of X, then order the table from lowest to highest VIF.
vif_df = get_vif_numeric_features_df(X)
vif_df = vif_df.sort_values(by="vif")
vif_df.head()

  vif = 1. / (1. - r_squared_i)


Unnamed: 0_level_0,vif,vif_inv
variables,Unnamed: 1_level_1,Unnamed: 2_level_1
3SsnPorch,1.021373,0.979075
MiscVal,1.022749,0.977757
MoSold,1.048661,0.953597
YrSold,1.050858,0.951604
PoolArea,1.079937,0.92598


In [69]:
# Line the three per-feature tables up side by side, aligned on their index labels.
# NOTE(review): vif_df is indexed by the raw numeric column names while corr_df
# and coef_df are indexed by encoded feature names, so in the displayed rows the
# vif / vif_inv cells are NaN — confirm this is the intended comparison.
combo_df = pd.concat([corr_df, coef_df, vif_df], axis="columns")
combo_df.head()

Unnamed: 0,correlation,coefficient,vif,vif_inv
ExterQual_TA,0.589044,0.266217,,
BsmtQual_Ex,0.553105,0.780588,,
KitchenQual_TA,0.519298,0.371717,,
KitchenQual_Ex,0.504094,0.766181,,
Foundation_PConc,0.497734,0.119363,,
