## Housing Data - Model

In [62]:
import pandas as pd
from sklearn import set_config
# Make every scikit-learn transformer return a pandas DataFrame, so that
# transformers nested later in a pipeline can still select columns by name.
set_config(transform_output='pandas')

### Imports

In [63]:

from numpy import arange
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [64]:
# Read the labeled competition data, using the 'Id' column as the row index
path = '../Data/Competition_Labeled_Data.csv'
data = pd.read_csv(path, index_col='Id')

### Data Exploration

In [65]:
# Checking data types and non-null counts per column
# (a non-null count below the number of entries means the column has missing values)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual     

In [66]:
# (rows, columns) of the labeled data; the 'Expensive' target is still one of the columns
data.shape

(1460, 80)

In [67]:
# Keeping the unsplit data in the original "data" variable:
# X is an independent copy, so changes made to X do not touch `data`
X = data.copy()

In [68]:
# Separating target feature from predictor features.
# Both are derived from the untouched `data` frame rather than popping the
# column out of X in place, so this cell can be re-run without a KeyError.
y = data['Expensive'].copy()
X = data.drop(columns='Expensive')

In [69]:
# Feature Engineering (candidate features, currently disabled; uncomment to add them to X before the split)
#X["HasGarage"] = (X["GarageCars"] > 0).astype(int)
#X["HasPool"] = (X["PoolArea"] > 0).astype(int)
#X["HasDeck"] = (X["WoodDeckSF"] > 0).astype(int)
#X["TotalOutdoorSpace"] = X["WoodDeckSF"] + X["ScreenPorch"]

In [70]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Preprocessing

### Pipeline

In [71]:
# Column names grouped by dtype, taken from the training split
X_num = X_train.select_dtypes(include="number").columns
X_cat = X_train.select_dtypes(exclude="number").columns

# Numerical branch: the only step is filling missing values with the column mean
num_imputer = SimpleImputer(strategy='mean')
num_pipe = Pipeline(steps=[('num_imputer', num_imputer)])


In [72]:
# Identify ordinal features (there are more than just these two)
ord_features = [
    'KitchenQual',
    'FireplaceQu'
]

# Make explicit the order of the categories. Both features share one rating
# scale; 'NA' is the value the categorical imputer fills in for missing
# entries, so it has to be listed as a known category.
quality_scale = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']
kitchen_qual = list(quality_scale)
fireplace_qu = list(quality_scale)

# Compile ordered lists in a master list (same order as ord_features)
categories = [
    kitchen_qual,
    fireplace_qu
]

# Provide ordered lists to the ordinal encoder
ord_encoder = OrdinalEncoder(categories=categories)


# One-Hot Encoder:

# Identify features for one-hot encoding (the remaining categorical features).
# The comprehension keeps the original column order; a set difference has no
# stable ordering, so the encoded column order could change between kernel
# sessions and make results harder to reproduce.
oh_features = [col for col in X_cat if col not in ord_features]

# Categories seen in fewer than 1% of rows are grouped as "infrequent";
# unseen categories at predict time are mapped to that group when it exists.
oh_encoder = OneHotEncoder(
    handle_unknown='infrequent_if_exist',
    sparse_output=False,
    min_frequency=0.01
)


# Create full categorical encoder which sends some features to either encoder
cat_encoder = ColumnTransformer([
    ('oh_encoder', oh_encoder, oh_features),
    ('ord_encoder', ord_encoder, ord_features)
])

In [73]:
# Categorical branch: fill missing values with the literal 'NA' category, then encode
cat_imputer = SimpleImputer(strategy='constant', fill_value='NA')

# NOTE: the second step is named 'oh_encoder' but holds the combined
# one-hot + ordinal ColumnTransformer (cat_encoder). The name is kept as is
# because step names are part of the pipeline's parameter keys.
cat_pipe = Pipeline(steps=[
    ('cat_imputer', cat_imputer),
    ('oh_encoder', cat_encoder),
])

### Create column transformer to preprocess numerical and categorical data separately

In [74]:
# Route the numeric columns through num_pipe and the non-numeric columns through cat_pipe
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipe', num_pipe, X_num),
        ('cat_pipe', cat_pipe, X_cat),
    ]
)

In [75]:
# Display the assembled (not yet fitted) preprocessor to check its structure
preprocessor

## Modeling

In [76]:
# Full model: preprocessing followed by a decision tree classifier
# (DecisionTreeClassifier comes from sklearn.tree).
# The fixed random_state makes the fitted tree, and therefore the grid-search
# results, reproducible across runs; it matches the seed used for the split.
tree_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ("dtree", DecisionTreeClassifier(random_state=42))
])


### Tuning

In [77]:
# Candidate hyperparameters for the 'dtree' step (3 x 3 = 9 combinations)
param_grid = {
    'dtree__max_depth': [3, 5, 7],
    'dtree__min_samples_split': [2, 5, 10],
}

# Exhaustive search with 5-fold cross-validation, scored by balanced accuracy,
# running on all available cores
tree_search = GridSearchCV(
    estimator=tree_pipeline,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

tree_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [78]:
# Hyperparameter combination with the best mean cross-validated score
tree_search.best_params_

{'dtree__max_depth': 7, 'dtree__min_samples_split': 10}

In [84]:
# Helper: list every parameter name of the pipeline; these are the keys
# (e.g. 'dtree__max_depth') that can be used in param_grid.
# NOTE(review): this cell carries execution count 84, i.e. it was run after the
# cells below it; it is a lookup aid only and nothing else depends on it.
tree_pipeline.get_params().keys()


dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'dtree', 'preprocessor__force_int_remainder_cols', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num_pipe', 'preprocessor__cat_pipe', 'preprocessor__num_pipe__memory', 'preprocessor__num_pipe__steps', 'preprocessor__num_pipe__verbose', 'preprocessor__num_pipe__num_imputer', 'preprocessor__num_pipe__num_imputer__add_indicator', 'preprocessor__num_pipe__num_imputer__copy', 'preprocessor__num_pipe__num_imputer__fill_value', 'preprocessor__num_pipe__num_imputer__keep_empty_features', 'preprocessor__num_pipe__num_imputer__missing_values', 'preprocessor__num_pipe__num_imputer__strategy', 'preprocessor__cat_pipe__memory', 'preprocessor__cat_pipe__steps', 'preprocessor__cat_pipe__verbose', 'preprocessor__cat_pipe__cat_imputer', 'preprocessor__cat_pipe_

In [79]:
# Plain accuracy on the training split (note: the search itself optimised balanced accuracy)
train_predictions = tree_search.predict(X_train)
accuracy_score(y_true=y_train, y_pred=train_predictions)

0.9777397260273972

## Final Implementation

In [80]:
# Plain accuracy on the held-out test split
test_predictions = tree_search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=test_predictions)

0.928082191780822

In [81]:
# Now that we are done tuning and testing, right before deployment we can train with all our labeled data.
# The best pipeline is refitted in place, so tree_search.predict(...) uses the
# all-data model from here on. Do not re-run the accuracy cells above after
# this one: the test rows are now part of the training data.
final_model = tree_search.best_estimator_
final_model.fit(X, y)

### Submission

In [82]:
# Bring in unlabeled data
competition_path = '../Data/Unlabeled_Competition_Data.csv'
X_competition = pd.read_csv(competition_path)

# Move Id into the index so the model does not see it as a feature
# (the labeled dataset was indexed by Id in the same way when it was loaded)
X_comp = X_competition.set_index('Id')

In [83]:
# Predict the 'Expensive' label for every row of the unlabeled competition data
preds = tree_search.predict(X_comp)

# One-column frame keyed by the competition Ids, as required for the upload
submission = pd.DataFrame(data={'Expensive': preds}, index=X_comp.index)

# Write the predictions to the csv that gets uploaded to the competition
submission_path = '../Data/tree_submission.csv'
submission.to_csv(submission_path)