In [11]:
# Imports
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [12]:
# Import JgKMeans and ModelPreprocessor from jg_ml_models
from jg_ml_models import JgKMeans, ModelPreprocessor

In [13]:
# Read the training CSV and discard the 'Id' column before any processing
train_full_df = (
    pd.read_csv('../data/train.csv')
    .drop(columns='Id')
)
train_full_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [14]:
# Build a ModelPreprocessor on the loaded training frame, with 'SalePrice'
# as the target column.
target_col = 'SalePrice'
pre_proc = ModelPreprocessor(train_full_df, target_col=target_col)
# Rebinds train_full_df to the frame returned by the preprocessor; in the
# recorded output, columns such as Street and Utilities are no longer present.
# NOTE(review): presumably drops columns whose most frequent value covers
# more than 90% of rows -- confirm the exact threshold semantics against
# ModelPreprocessor. After this line the frame as loaded from disk is no
# longer reachable under this name.
train_full_df = pre_proc.remove_cols_with_mostly_same_values(freq_value_threshold=0.9)
train_full_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,Neighborhood,Condition1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,PoolQC,Fence,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,,Reg,Lvl,Inside,CollgCr,Norm,...,0,61,0,,,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,,Reg,Lvl,FR2,Veenker,Feedr,...,298,0,0,,,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,,IR1,Lvl,Inside,CollgCr,Norm,...,0,42,0,,,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,,IR1,Lvl,Corner,Crawfor,Norm,...,0,35,272,,,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,,IR1,Lvl,FR2,NoRidge,Norm,...,192,84,0,,,12,2008,WD,Normal,250000


In [15]:
# Column groups reported by the preprocessor for this dataset.
# Each print emits the list on one line and its length on the next.
numeric_cols: list[str] = pre_proc.get_numeric_columns()
print(numeric_cols, len(numeric_cols), sep='\n')
encode_cols: list[str] = pre_proc.get_categorical_columns()
print(encode_cols, len(encode_cols), sep='\n')

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'MoSold', 'YrSold', 'SalePrice']
30
['MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish', 'PoolQC', 'Fence', 'SaleType', 'SaleCondition']
29


In [16]:
# Ask the preprocessor for a scaled and encoded version of the reduced
# training frame. In the recorded output the numeric features appear
# standardized, categorical levels appear as 0/1 indicator columns (e.g.
# SaleType_WD), and SalePrice keeps its original values.
# NOTE(review): the scaler/encoder actually used lives inside
# ModelPreprocessor -- confirm there before relying on these details.
scaled_encoded_df = pre_proc.get_scaled_and_encoded_df(train_full_df)
scaled_encoded_df.head()


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,0.073375,-0.208034,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.510015,0.575425,-0.288653,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,208500
1,-0.872563,0.409895,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.572835,1.171992,-0.288653,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,181500
2,0.073375,-0.084449,0.07348,0.651479,-0.5172,0.984752,0.830215,0.322174,0.092907,-0.288653,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,223500
3,0.309859,-0.414011,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.572835,-0.499274,-0.288653,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,140000
4,0.073375,0.574676,0.375148,1.374795,-0.5172,0.951632,0.733308,1.360826,0.463568,-0.288653,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,250000


In [17]:
# Build the per-feature summary table from the scaled/encoded frame. The
# recorded output has the columns correlation, coefficient, pval_inv, vif,
# vif_inv and total, and the helper also prints a "Best alpha" line.
# NOTE(review): the recorded output contains truncated warning fragments and
# `inf` VIF values for indicator columns such as BsmtQual_Ex; presumably the
# one-hot columns are perfectly collinear -- confirm whether one level per
# category should be dropped before computing VIF.
combo_df = pre_proc.get_combined_important_numeric_features_df(scaled_df=scaled_encoded_df)
combo_df.head()

  c /= stddev[:, None]
  c /= stddev[None, :]


Best alpha: 100.0 (R^2 score:  0.86)


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


Unnamed: 0,correlation,coefficient,pval_inv,vif,vif_inv,total
OverallQual,0.797881,1.0,1.0,5.7618,0.173557,2.797881
BsmtQual_Ex,0.58045,0.650323,0.999328,inf,0.0,2.230101
KitchenQual_Ex,0.5349,0.638686,1.0,inf,0.0,2.173586
GarageCars,0.647034,0.52325,0.999765,5.633601,0.177506,2.170048
Neighborhood_NridgHt,0.421879,0.486109,1.0,inf,0.0,1.907988


In [21]:
# Fetch the scaled numeric feature columns from the preprocessor.
# NOTE(review): the execution count jumps from 17 to 21, and the recorded
# output lacks BsmtFinSF2 and EnclosedPorch, both of which appear in the
# numeric column list printed earlier. The state of pre_proc may have been
# changed by cells that are no longer in the notebook -- verify this cell
# reproduces the same columns under Restart Kernel -> Run All.
scaled_df = pre_proc.scale_numeric_columns()
scaled_df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,...,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,MoSold,YrSold
0,0.073375,-0.208034,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.510015,0.575425,-0.944591,...,0.163779,0.91221,-0.951226,0.992426,0.311725,0.351,-0.752176,0.216503,-1.599111,0.138777
1,-0.872563,0.409895,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.572835,1.171992,-0.641228,...,0.163779,-0.318683,0.600495,-0.101543,0.311725,-0.060731,1.626195,-0.704483,-0.48911,-0.614439
2,0.073375,-0.084449,0.07348,0.651479,-0.5172,0.984752,0.830215,0.322174,0.092907,-0.301643,...,0.163779,-0.318683,0.600495,0.911391,0.311725,0.631726,-0.752176,-0.070361,0.990891,0.138777
3,0.309859,-0.414011,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.572835,-0.499274,-0.06167,...,0.163779,0.296763,0.600495,0.789839,1.650307,0.790804,-0.752176,-0.176048,-1.599111,-1.367655
4,0.073375,0.574676,0.375148,1.374795,-0.5172,0.951632,0.733308,1.360826,0.463568,-0.174865,...,1.390023,1.527656,0.600495,0.870874,1.650307,1.698485,0.780197,0.56376,2.100892,0.138777


In [22]:
# Attach the unscaled target column to the scaled numeric features and
# compute the per-feature p-value table with the preprocessor helper.
# DataFrame.assign returns a new frame, so the object handed back by
# pre_proc.scale_numeric_columns() (and displayed by the previous cell) is
# not modified in place; the name scaled_df is rebound so that later cells
# still see the target column. Values are aligned on the index, exactly as
# with column assignment.
scaled_df = scaled_df.assign(**{target_col: train_full_df[target_col]})
pvals_df = pre_proc.get_lr_pvals_numeric_features_df(scaled_df)
# Show the table ordered with the largest p-values first.
pvals_df.sort_values(by='pval', ascending=False)

Unnamed: 0,pval
1stFlrSF,0.983862
BsmtFinSF1,0.860464
GrLivArea,0.857855
2ndFlrSF,0.755525
BsmtFullBath,0.74854
MasVnrArea,0.722859
BsmtUnfSF,0.668381
MoSold,0.665167
OverallCond,0.641794
WoodDeckSF,0.569328
