In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# reading: load the housing classification dataset from the course repository
data = pd.read_csv('https://raw.githubusercontent.com/JoanClaverol/housing_data/main/housing-classification-iter5.csv')

# X and y creation
# X is built as a new frame rather than as an alias of `data`, so extracting
# the target does not silently remove the "Expensive" column from `data`.
X = data.drop(columns="Expensive")
y = data["Expensive"]

# data splitting: hold out 20% of the rows for testing; the seed is fixed so
# the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
318,9900,90.0,1347,4,1,0,3,340,0,RL,...,1,9,1993.0,656,60,144,0,0,4,2009
580,14585,,1144,3,2,0,2,216,0,RL,...,1,7,1960.0,572,110,0,0,0,6,2007
961,12227,,1330,4,1,0,2,550,0,RL,...,1,11,1977.0,619,282,0,0,0,7,2008
78,10778,72.0,1768,4,0,0,0,0,0,RL,...,2,8,,0,0,0,0,0,4,2010
5,14115,85.0,796,1,0,0,2,40,0,RL,...,1,5,1993.0,480,30,0,320,700,10,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,9130,,800,4,0,0,2,0,0,RL,...,1,7,1966.0,484,40,0,0,0,7,2008
1122,8926,,672,3,0,0,1,64,160,RL,...,1,5,1956.0,288,0,0,0,0,10,2009
1346,20781,,1568,3,1,0,2,0,0,RL,...,1,9,1968.0,508,80,0,290,0,6,2006
1406,8445,70.0,768,2,0,0,2,58,0,RL,...,1,5,1988.0,396,0,0,0,0,3,2009


In [None]:
# number of missing values in each column of the training features
X_train.isnull().sum()

LotArea            0
LotFrontage      189
TotalBsmtSF        0
BedroomAbvGr       0
Fireplaces         0
PoolArea           0
GarageCars         0
WoodDeckSF         0
ScreenPorch        0
MSZoning           0
Condition1         0
Heating            0
Street             0
CentralAir         0
Foundation         0
ExterQual          0
ExterCond          0
BsmtQual          29
BsmtCond          29
BsmtExposure      30
BsmtFinType1      29
KitchenQual        0
FireplaceQu      555
MSSubClass         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         4
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
KitchenAbvGr       0
TotRmsAbvGrd       0
GarageYrBlt       67
GarageArea         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
MiscVal      

## 2. Categorical encoding - "Manual" approach (without using Pipelines)

#### 2.1. Replacing NaNs in categorical features

In [None]:
# keep only the non-numerical columns of the training set
X_train_cat = X_train.select_dtypes(exclude="number")

# constant-value imputer: every missing entry is replaced by the string "N_A";
# set_output makes transform return a pandas DataFrame
cat_imputer = SimpleImputer(strategy="constant", fill_value="N_A")
cat_imputer = cat_imputer.set_output(transform="pandas")

# learn and apply the imputation in a single step
X_cat_imputed = cat_imputer.fit_transform(X_train_cat)

X_cat_imputed.head()

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
318,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,Gd,GLQ,Gd,TA
580,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,No,BLQ,Gd,Gd
961,RL,PosN,GasA,Pave,Y,CBlock,TA,Gd,Gd,Gd,No,ALQ,TA,TA
78,RL,Norm,GasA,Pave,N,CBlock,TA,TA,TA,TA,No,Unf,TA,N_A
5,RL,Norm,GasA,Pave,Y,Wood,TA,TA,Gd,TA,No,GLQ,TA,N_A


#### 2.2. Replacing NaNs in numerical features

In [None]:
# keep only the numerical columns of the training set
X_train_num = X_train.select_dtypes(include="number")

# mean imputer: missing entries are replaced by the mean of their column;
# set_output makes transform return a pandas DataFrame
num_imputer = SimpleImputer(strategy="mean")
num_imputer = num_imputer.set_output(transform="pandas")

# learn the column means and fill the gaps in a single step
X_num_imputed = num_imputer.fit_transform(X_train_num)

X_num_imputed.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSSubClass,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
318,9900.0,90.0,1347.0,4.0,1.0,0.0,3.0,340.0,0.0,60.0,...,1.0,9.0,1993.0,656.0,60.0,144.0,0.0,0.0,4.0,2009.0
580,14585.0,69.58427,1144.0,3.0,2.0,0.0,2.0,216.0,0.0,20.0,...,1.0,7.0,1960.0,572.0,110.0,0.0,0.0,0.0,6.0,2007.0
961,12227.0,69.58427,1330.0,4.0,1.0,0.0,2.0,550.0,0.0,60.0,...,1.0,11.0,1977.0,619.0,282.0,0.0,0.0,0.0,7.0,2008.0
78,10778.0,72.0,1768.0,4.0,0.0,0.0,0.0,0.0,0.0,90.0,...,2.0,8.0,1978.213442,0.0,0.0,0.0,0.0,0.0,4.0,2010.0
5,14115.0,85.0,796.0,1.0,0.0,0.0,2.0,40.0,0.0,50.0,...,1.0,5.0,1993.0,480.0,30.0,0.0,320.0,700.0,10.0,2009.0


In [None]:
# place the imputed categorical and numerical columns side by side
X_imputed = pd.concat([X_cat_imputed, X_num_imputed], axis="columns")

X_imputed.head()

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
318,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,...,1.0,9.0,1993.0,656.0,60.0,144.0,0.0,0.0,4.0,2009.0
580,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,...,1.0,7.0,1960.0,572.0,110.0,0.0,0.0,0.0,6.0,2007.0
961,RL,PosN,GasA,Pave,Y,CBlock,TA,Gd,Gd,Gd,...,1.0,11.0,1977.0,619.0,282.0,0.0,0.0,0.0,7.0,2008.0
78,RL,Norm,GasA,Pave,N,CBlock,TA,TA,TA,TA,...,2.0,8.0,1978.213442,0.0,0.0,0.0,0.0,0.0,4.0,2010.0
5,RL,Norm,GasA,Pave,Y,Wood,TA,TA,Gd,TA,...,1.0,5.0,1993.0,480.0,30.0,0.0,320.0,700.0,10.0,2009.0


## 3. One-Hot encoding

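As you have learnt in the Platform lesson, one-hot encoding means creating a new binary column for each category in every categorical column. Fortunately, a Scikit-Learn transformer (`OneHotEncoder`) takes care of everything. Note that the encoder below is created with `drop="first"`, so the first category of each column gets no column of its own: a row belonging to that category has 0 in all the remaining columns of that feature.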

In [None]:
# import
from sklearn.preprocessing import OneHotEncoder

# dense encoder that drops the first category of every feature and
# returns a pandas DataFrame
my_onehot = OneHotEncoder(drop="first", sparse_output=False)
my_onehot = my_onehot.set_output(transform="pandas")

# learn the categories from the imputed categorical columns
my_onehot.fit(X_cat_imputed)

# encode those same columns
X_cat_imputed_onehot = my_onehot.transform(X_cat_imputed)

In [None]:
# preview the first five rows of the one-hot encoded columns
X_cat_imputed_onehot.head(n=5)

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,...,BsmtFinType1_Rec,BsmtFinType1_Unf,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_N_A,FireplaceQu_Po,FireplaceQu_TA
318,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
580,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
961,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
78,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


#### 3.1. Concatenating "one-hot" columns with numerical columns

In [None]:
# join the one-hot encoded columns with the imputed numerical columns
X_imputed = pd.concat(
    [X_cat_imputed_onehot, X_num_imputed],
    axis="columns",
)

X_imputed.head(n=3)

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
318,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,9.0,1993.0,656.0,60.0,144.0,0.0,0.0,4.0,2009.0
580,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,7.0,1960.0,572.0,110.0,0.0,0.0,0.0,6.0,2007.0
961,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,11.0,1977.0,619.0,282.0,0.0,0.0,0.0,7.0,2008.0


## 4. Categorical encoding - "Automated" approach (using Pipelines)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# building the pipeline
# split the training columns by dtype; only their column labels are used below
X_cat = X_train.select_dtypes(exclude="number").copy()
X_num = X_train.select_dtypes(include="number").copy()

# numerical branch: fill missing values with the column mean
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# categorical branch: fill missing values with "N_A", then one-hot encode;
# handle_unknown="ignore" avoids an error on categories not seen during fit
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore"),
)

# route each group of columns to its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Full model: the preprocessor followed by a decision tree.
# random_state is fixed (same seed as the train/test split) so that the tree,
# and therefore the grid-search scores and the selected parameters, are
# reproducible from one run to the next.
full_pipeline = make_pipeline(preprocessor,
                              DecisionTreeClassifier(random_state=123))

# Hyper-parameters to explore. Keys follow the "<step>__<param>" naming that
# make_pipeline / ColumnTransformer generate from the step names.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2)
}

# Exhaustive search over the grid with 5-fold cross-validation
search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [None]:
# best mean cross-validated score among the grid-search candidates
search.best_score_

0.9289314405194233

In [None]:
# parameter combination that achieved the best cross-validated score
search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'decisiontreeclassifier__max_depth': 8,
 'decisiontreeclassifier__min_samples_leaf': 7}

In [None]:
# training accuracy: score the fitted search on the data it was trained on
y_train_pred = search.predict(X_train)

accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9648972602739726

In [None]:
# testing accuracy: score the fitted search on the held-out test split
y_test_pred = search.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9315068493150684