In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.options.display.max_columns = 999
sns.set_style('whitegrid')

# https://metadata.phila.gov/#home/datasetdetails/5543865f20583086178c4ee5/representationdetails/55d624fdad35c7e854cb21a4/

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import RobustScaler, OneHotEncoder

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
df = pd.read_csv('PHL_Building_Dataset_ML_Classification.csv')
df = df.drop(columns=['Unnamed: 0', 'basements_bin'])
df

Unnamed: 0,basements,building_code_description,central_air,depth,exterior_condition,fireplaces,frontage,garage_type,interior_condition,market_value,number_of_bathrooms,number_of_bedrooms,number_of_rooms,number_stories,parcel_shape,street_designation,topography,total_area,total_livable_area,type_heater,view_type,year_built,zoning,building_description,segment
0,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,29.17,A,4.0,257500.0,2.0,3.0,6.0,2.0,E,ST,6,2625.30,1266.0,H,I,1960.0,1,MASONRY,2
1,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,18.03,A,4.0,249400.0,2.0,3.0,6.0,2.0,E,ST,6,1622.70,1266.0,A,I,1960.0,1,MASONRY,2
2,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,18.05,A,4.0,249500.0,2.0,3.0,6.0,2.0,E,ST,6,1624.50,1266.0,A,I,1960.0,1,MASONRY,2
3,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,18.08,A,4.0,249500.0,2.0,3.0,6.0,2.0,E,ST,6,1627.20,1266.0,H,I,1960.0,1,MASONRY,2
4,0,ROW B/GAR 2 STY MASONRY,Y,90.00,4.0,0.0,18.71,A,4.0,253800.0,2.0,3.0,6.0,2.0,E,ST,6,1683.90,1310.0,H,I,1960.0,1,MASONRY,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44479,D,ROW 3 STY MASONRY,N,100.00,4.0,0.0,21.87,0,4.0,117500.0,0.0,3.0,6.0,3.0,A,ST,6,1651.00,2907.0,H,I,1929.0,1,MASONRY,1
44480,D,ROW 2 STY MASONRY,Y,102.19,3.0,0.0,18.24,0,3.0,70800.0,2.0,3.0,9.0,3.0,A,ST,6,1811.00,2026.0,A,I,2005.0,1,MASONRY,1
44481,D,ROW 3 STY MASONRY,Y,103.26,3.0,0.0,16.00,0,3.0,23600.0,1.0,3.0,6.0,3.0,A,ST,6,1604.00,2082.0,A,I,2005.0,1,MASONRY,1
44482,D,ROW 3 STY MASONRY,Y,90.00,3.0,0.0,16.00,0,3.0,23600.0,1.0,3.0,6.0,3.0,E,ST,6,1440.00,1387.0,A,I,2005.0,1,MASONRY,1


In [6]:
# Rename Segment
df['segment'] = df['segment'].replace({1:'Bottom', 2:'Lower Middle', 3:'Middle', 4:'Upper Middle', 5:'Top'})

# Splitting Data

In [7]:
X = df.drop(columns=['building_code_description', 'segment'])
y = df['segment']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 42)

# Evaluation Matrix

In [9]:
# Evaluation Matrix Classification
def Eva_Matrix1(Model, X_train, y_train, X_test, y_test, Name):
    y_pred_train = Model.predict(X_train)
    acc_train = accuracy_score(y_train, y_pred_train)
    rec_train = recall_score(y_train, y_pred_train)
    prec_train = precision_score(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train)
    y_pred_test = Model.predict(X_test)
    acc_test = accuracy_score(y_test, y_pred_test)
    rec_test = recall_score(y_test, y_pred_test)
    prec_test = precision_score(y_test, y_pred_test)
    f1_test = f1_score(y_train, y_pred_train)
    data = {
        f"Training {Name}" : [acc_train, rec_train, prec_train, f1_train],
        f"Test {Name}" : [acc_test, rec_test, prec_test, f1_test]
    }
    df = pd.DataFrame(data=data, index=['Accuracy', 'Recall', 'Precision', 'F1'])
    return df

# Pipeline

In [10]:
num_columns = ['fireplaces', 'market_value', 'number_of_rooms', 'total_area', 
               'total_livable_area']

cat_columns = ['basements', 'central_air', 'exterior_condition', 'garage_type',
               'interior_condition', 'parcel_shape', 'topography',
               'type_heater', 'view_type', 'building_description']

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    #('scaler', RobustScaler()),
    #('poly', PolynomialFeatures(degree=3, include_bias=False)),
    #('power', PowerTransformer(method='yeo-johnson'))
])

categoric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer([
    ('numeric', numeric_pipeline, num_columns),
    ('categoric', categoric_pipeline, cat_columns)
])

pipeSVM = Pipeline([
    ("prep", preprocessor),
    ("algo", SVC(max_iter=400, probability=True))
])

pipeLR = Pipeline([
    ("prep", preprocessor),
    ("algo", LogisticRegression())
])

pipeKNN = Pipeline([
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier())
])
               
pipeDT = Pipeline([
    ("prep", preprocessor),
    ("algo", DecisionTreeClassifier())
])
            
pipeRF = Pipeline([
    ("prep", preprocessor),
    ("algo", RandomForestClassifier())
])

# Base Model (KNN)

In [10]:
pipeKNN.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [11]:
y_KNN_train = pipeKNN.predict(X_train)

In [12]:
y_KNN_test = pipeKNN.predict(X_test)

In [13]:
accuracy_score(y_train, y_KNN_train)

0.9995784977660381

In [14]:
accuracy_score(y_test, y_KNN_test)

0.9993256153759694

In [15]:
print(classification_report(y_test, y_KNN_test))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      1.00      1.00      3089
      Middle       1.00      1.00      1.00       826
         Top       1.00      1.00      1.00        42
Upper Middle       1.00      0.98      0.99       152

    accuracy                           1.00      8897
   macro avg       1.00      1.00      1.00      8897
weighted avg       1.00      1.00      1.00      8897



# Base Model Logistic Regression

In [16]:
pipeLR.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [17]:
y_LR_train = pipeLR.predict(X_train)

In [18]:
y_LR_test = pipeLR.predict(X_test)

In [19]:
accuracy_score(y_train, y_LR_train)

0.7611487340882906

In [20]:
accuracy_score(y_test, y_LR_test)

0.7591322917837473

In [21]:
print(classification_report(y_test, y_LR_test))

              precision    recall  f1-score   support

      Bottom       0.86      0.90      0.88      4788
Lower Middle       0.65      0.77      0.71      3089
      Middle       0.36      0.08      0.13       826
         Top       0.09      0.07      0.08        42
Upper Middle       0.32      0.05      0.08       152

    accuracy                           0.76      8897
   macro avg       0.46      0.37      0.37      8897
weighted avg       0.73      0.76      0.73      8897



# Base Model SVM

In [22]:
pipeSVM.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [23]:
y_SVM_train = pipeSVM.predict(X_train)

In [24]:
y_SVM_test = pipeSVM.predict(X_test)

In [25]:
accuracy_score(y_train, y_SVM_train)

0.9610812937308568

In [26]:
accuracy_score(y_test, y_SVM_test)

0.9598741148701809

In [27]:
print(classification_report(y_test, y_SVM_test))

              precision    recall  f1-score   support

      Bottom       1.00      0.93      0.96      4788
Lower Middle       0.90      1.00      0.95      3089
      Middle       0.99      1.00      1.00       826
         Top       1.00      1.00      1.00        42
Upper Middle       1.00      0.97      0.98       152

    accuracy                           0.96      8897
   macro avg       0.98      0.98      0.98      8897
weighted avg       0.96      0.96      0.96      8897



# Base Model (DecisionTreeClassifier)

In [28]:
pipeDT.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [29]:
y_DT_train = pipeDT.predict(X_train)

In [30]:
y_DT_test = pipeDT.predict(X_test)

In [31]:
accuracy_score(y_train, y_DT_train)

1.0

In [32]:
accuracy_score(y_test, y_DT_test)

0.9998876025626616

In [33]:
print(classification_report(y_test, y_DT_test))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      1.00      1.00      3089
      Middle       1.00      1.00      1.00       826
         Top       1.00      1.00      1.00        42
Upper Middle       1.00      1.00      1.00       152

    accuracy                           1.00      8897
   macro avg       1.00      1.00      1.00      8897
weighted avg       1.00      1.00      1.00      8897



# Base Model (Random Forest Classifier)

In [34]:
pipeRF.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [35]:
y_RF_train = pipeRF.predict(X_train)

In [36]:
y_RF_test = pipeRF.predict(X_test)

In [37]:
accuracy_score(y_train, y_RF_train)

1.0

In [38]:
accuracy_score(y_test, y_RF_test)

0.9956164999438013

In [39]:
print(classification_report(y_test, y_RF_test))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      1.00      1.00      3089
      Middle       0.97      1.00      0.98       826
         Top       0.94      0.81      0.87        42
Upper Middle       0.93      0.82      0.87       152

    accuracy                           1.00      8897
   macro avg       0.97      0.92      0.94      8897
weighted avg       1.00      1.00      1.00      8897



# =================================================

# Hyper Parameter Tuning

## Logistic Regression

In [41]:
skf = StratifiedKFold(n_splits=3, random_state=42)

param_LR = {
    "algo__C" : np.logspace(-3, 3, 7),
    "algo__penalty" : ['l1', 'l2', 'elasticnet'],
    "algo__fit_intercept" : [True,False]
}

GS_LR = GridSearchCV(pipeLR, param_LR, cv = skf, n_jobs = 1, verbose = 1, scoring='balanced_accuracy')
GS_LR.fit(X_train, y_train)

Fitting 3 folds for each of 42 candidates, totalling 126 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 126 out of 126 | elapsed:  5.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer())]),
                                                                         ['fireplaces',
                                                                          'market_value',
                                                                          'number_of_rooms',
                                                                          'total_area',
                                                                          'total_livable_area']),
                                                                        ('categoric',
                                     

In [42]:
GS_LR.best_params_

{'algo__C': 0.001, 'algo__fit_intercept': True, 'algo__penalty': 'l2'}

In [43]:
LR_Tuned = GS_LR.best_estimator_

In [44]:
y_LR_Tuned_train = LR_Tuned.predict(X_train)

In [45]:
y_LR_Tuned_test = LR_Tuned.predict(X_test)

In [46]:
accuracy_score(y_train, y_LR_Tuned_train)

0.7603338297692978

In [47]:
accuracy_score(y_test, y_LR_Tuned_test)

0.7566595481623019

In [48]:
print(classification_report(y_test, y_LR_Tuned_test))

              precision    recall  f1-score   support

      Bottom       0.85      0.90      0.88      4788
Lower Middle       0.65      0.76      0.70      3089
      Middle       0.34      0.07      0.11       826
         Top       0.00      0.00      0.00        42
Upper Middle       0.22      0.04      0.07       152

    accuracy                           0.76      8897
   macro avg       0.41      0.35      0.35      8897
weighted avg       0.72      0.76      0.73      8897



# DecisionTreeClassifier

In [49]:
skf = StratifiedKFold(n_splits=3, random_state=42)

param_DT = {
   'algo__max_depth': [None, 5, 10, 15], 
   'algo__min_samples_leaf' : np.arange(1, 17, 5), 
   'algo__max_features' : [0.3, 0.5, 0.7, 0.8]
}

GS_DT = GridSearchCV(pipeDT, param_DT, cv = skf, n_jobs = 1, verbose = 1, scoring='balanced_accuracy')
GS_DT.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed:  5.2min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer())]),
                                                                         ['fireplaces',
                                                                          'market_value',
                                                                          'number_of_rooms',
                                                                          'total_area',
                                                                          'total_livable_area']),
                                                                        ('categoric',
                                     

In [50]:
GS_DT.best_params_

{'algo__max_depth': None,
 'algo__max_features': 0.8,
 'algo__min_samples_leaf': 6}

In [51]:
DT_Tuned = GS_DT.best_estimator_

In [52]:
y_DT_Tuned_train = DT_Tuned.predict(X_train)

In [53]:
y_DT_Tuned_test = DT_Tuned.predict(X_test)

In [54]:
accuracy_score(y_train, y_DT_Tuned_train)

0.9999437997021384

In [55]:
accuracy_score(y_test, y_DT_Tuned_test)

0.9998876025626616

In [56]:
print(classification_report(y_test, y_DT_Tuned_test))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      1.00      1.00      3089
      Middle       1.00      1.00      1.00       826
         Top       1.00      1.00      1.00        42
Upper Middle       1.00      1.00      1.00       152

    accuracy                           1.00      8897
   macro avg       1.00      1.00      1.00      8897
weighted avg       1.00      1.00      1.00      8897



# RandomForestClassifier

In [30]:
skf = StratifiedKFold(n_splits=3, random_state=42)

param_RF = {
    "algo__n_estimators" : np.arange(100, 301, 100),
    "algo__max_depth" : [None, 5],
    "algo__min_samples_leaf" : np.arange(1, 12, 5),
    "algo__max_features" : [0.3, 0.5]
}

GS_RF = GridSearchCV(pipeRF, param_RF, cv = skf, n_jobs = -1, verbose = 1, scoring='balanced_accuracy')
GS_RF.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  3.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer())]),
                                                                         ['fireplaces',
                                                                          'market_value',
                                                                          'number_of_rooms',
                                                                          'total_area',
                                                                          'total_livable_area']),
                                                                        ('categoric',
                                     

In [31]:
GS_RF.best_params_

{'algo__max_depth': None,
 'algo__max_features': 0.5,
 'algo__min_samples_leaf': 1,
 'algo__n_estimators': 200}

In [32]:
RF_Tuned = GS_RF.best_estimator_

In [33]:
y_RF_Tuned_train = RF_Tuned.predict(X_train)

In [34]:
y_RF_Tuned_test = RF_Tuned.predict(X_test)

In [35]:
accuracy_score(y_train, y_RF_Tuned_train)

1.0

In [36]:
accuracy_score(y_test, y_RF_Tuned_test)

0.9998876025626616

In [37]:
print(classification_report(y_test, y_RF_Tuned_test))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      1.00      1.00      3089
      Middle       1.00      1.00      1.00       826
         Top       1.00      1.00      1.00        42
Upper Middle       1.00      1.00      1.00       152

    accuracy                           1.00      8897
   macro avg       1.00      1.00      1.00      8897
weighted avg       1.00      1.00      1.00      8897



# KNN

In [19]:
skf = StratifiedKFold(n_splits=3, random_state=42)

param_KNN = {
    "algo__n_neighbors" : np.arange(1, 15, 2),
    "algo__p" : [1, 2],
    "algo__weights" : ['uniform', 'distance']
}

GS_KNN = GridSearchCV(pipeKNN, param_KNN, cv = skf, n_jobs = -1, verbose = 1, scoring='f1')
GS_KNN.fit(X_train, y_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


MemoryError: Unable to allocate 1.00 GiB for an array with shape (5657, 23724) and data type float64

In [185]:
GS_KNN.best_params_

{'algo__n_neighbors': 11, 'algo__p': 2, 'algo__weights': 'uniform'}

In [186]:
KNN_Tuned = GS_KNN.best_estimator_

In [187]:
y_KNN_Tuned_train = KNN_Tuned.predict(X_train)

In [188]:
y_KNN_Tuned_test = KNN_Tuned.predict(X_test)

In [189]:
accuracy_score(y_train, y_KNN_Tuned_train)

0.9993536965745918

In [190]:
accuracy_score(y_test, y_KNN_Tuned_test)

0.9993256153759694

In [191]:
print(classification_report(y_test, y_KNN_Tuned_test))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      1.00      1.00      3089
      Middle       1.00      1.00      1.00       826
         Top       1.00      1.00      1.00        42
Upper Middle       1.00      0.98      0.99       152

    accuracy                           1.00      8897
   macro avg       1.00      1.00      1.00      8897
weighted avg       1.00      1.00      1.00      8897



# =================================================

# Fine Tuning

## DecisionTreeClassifier

In [11]:
pipeDT_Tuning = Pipeline([
    ("prep", preprocessor),
    ("algo", DecisionTreeClassifier(max_depth = 4, max_features = 0.5, min_samples_leaf = 4, min_samples_split = 2, random_state=42))
])

In [12]:
pipeDT_Tuning.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [13]:
y_train_DT_Fine_Tuned = pipeDT_Tuning.predict(X_train)

In [14]:
y_test_DT_Fine_Tuned = pipeDT_Tuning.predict(X_test)

In [15]:
accuracy_score(y_train, y_train_DT_Fine_Tuned)

0.9324472419703824

In [16]:
accuracy_score(y_test, y_test_DT_Fine_Tuned)

0.9337979094076655

In [17]:
print(classification_report(y_test, y_test_DT_Fine_Tuned))

              precision    recall  f1-score   support

      Bottom       1.00      0.97      0.98      4788
Lower Middle       0.89      0.98      0.93      3089
      Middle       0.77      0.74      0.76       826
         Top       0.00      0.00      0.00        42
Upper Middle       0.46      0.27      0.34       152

    accuracy                           0.93      8897
   macro avg       0.62      0.59      0.60      8897
weighted avg       0.93      0.93      0.93      8897



In [None]:
"n_estimators" : np.arange(100, 301, 100),
    "algo__max_depth" : [None, 5],
    "algo__min_samples_leaf" : np.arange(1, 12, 5),
    "algo__max_features" : [0.3, 0.5]
}

In [43]:
pipeRF_tuning = Pipeline([
    ("prep", preprocessor),
    ("algo", RandomForestClassifier(n_estimators = 100, max_depth= None, min_samples_leaf= 2, max_features= 0.3))
])

In [44]:
pipeRF_tuning.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [45]:
y_train_RF_Fine_Tuned = pipeRF_tuning.predict(X_train)

In [50]:
y_test_RF_Fine_Tuned = pipeRF_tuning.predict(X_test)

In [51]:
accuracy_score(y_train, y_train_RF_Fine_Tuned)

0.9999718998510692

In [52]:
accuracy_score(y_test, y_test_RF_Fine_Tuned)

0.9996628076879848

In [53]:
print(classification_report(y_test, y_test_DT_Fine_Tuned))

              precision    recall  f1-score   support

      Bottom       1.00      0.97      0.98      4788
Lower Middle       0.89      0.98      0.93      3089
      Middle       0.77      0.74      0.76       826
         Top       0.00      0.00      0.00        42
Upper Middle       0.46      0.27      0.34       152

    accuracy                           0.93      8897
   macro avg       0.62      0.59      0.60      8897
weighted avg       0.93      0.93      0.93      8897



# over sampling

In [141]:
from sklearn.utils import resample

In [142]:
df_train = pd.concat([X_train, y_train], axis=1)

In [143]:
df_train['segment'].value_counts()

Bottom          19150
Lower Middle    12354
Middle           3306
Upper Middle      608
Top               169
Name: segment, dtype: int64

In [144]:
top = df_train[df_train['segment'] == 'Top'] ## Kelas Minority
upper_middle = df_train[df_train['segment'] == 'Upper Middle']
middle = df_train[df_train['segment'] == 'Middle']

In [145]:
lower_middle = df_train[df_train['segment'] == 'Lower Middle'] ## Kelas Majority


In [146]:
bottom = df_train[df_train['segment'] == 'Bottom'] ## Kelas Majority


In [147]:
top_oversample = resample(top, ## Kelas Minority
                           replace=True,
                           n_samples=len(bottom), ## Kelas Majority
                           random_state = 42)
upper_middle_oversample = resample(upper_middle, ## Kelas Minority
                           replace=True,
                           n_samples=len(bottom), ## Kelas Majority
                           random_state = 42)
middle_oversample = resample(middle, ## Kelas Minority
                           replace=True,
                           n_samples=len(bottom), ## Kelas Majority
                           random_state = 42)

In [148]:
df_OverSample = pd.concat([bottom, lower_middle, top_oversample, upper_middle_oversample, middle_oversample])

In [149]:
X_train_OS = df_OverSample.drop(columns='segment')
y_train_OS = df_OverSample['segment']

# LR oversampling

In [265]:
pipeLR.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [266]:
y_LR_trainOS = pipeLR.predict(X_train)

In [267]:
y_LR_testOS = pipeLR.predict(X_test)

In [268]:
accuracy_score(y_train, y_LR_trainOS)

0.4681765813358811

In [269]:
accuracy_score(y_test, y_LR_testOS)

0.4708328650106778

In [271]:
print(classification_report(y_test, y_LR_testOS))

              precision    recall  f1-score   support

      Bottom       0.90      0.78      0.84      4788
Lower Middle       0.00      0.00      0.00      3089
      Middle       0.11      0.46      0.17       826
         Top       0.05      0.79      0.09        42
Upper Middle       0.05      0.20      0.09       152

    accuracy                           0.47      8897
   macro avg       0.22      0.45      0.24      8897
weighted avg       0.49      0.47      0.47      8897



# KNN oversampling

In [272]:
pipeKNN.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [273]:
y_KNN_trainOS = pipeKNN.predict(X_train)

In [274]:
y_KNN_testOS = pipeKNN.predict(X_test)

In [275]:
accuracy_score(y_train, y_KNN_trainOS)

0.999606597914969

In [276]:
accuracy_score(y_test, y_KNN_testOS)

0.9987636281892773

In [278]:
print(classification_report(y_test, y_KNN_testOS))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      1.00      1.00      3089
      Middle       0.99      1.00      0.99       826
         Top       1.00      1.00      1.00        42
Upper Middle       0.99      0.99      0.99       152

    accuracy                           1.00      8897
   macro avg       1.00      1.00      1.00      8897
weighted avg       1.00      1.00      1.00      8897



# Dt oversample 

In [280]:
pipeDT.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [281]:
y_DT_trainOS = pipeDT.predict(X_train)

In [282]:
y_DT_testOS = pipeDT.predict(X_test)

In [284]:
accuracy_score(y_train, y_DT_trainOS)

1.0

In [285]:
accuracy_score(y_test, y_DT_testOS)

0.9998876025626616

In [286]:
print(classification_report(y_test, y_DT_testOS))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      1.00      1.00      3089
      Middle       1.00      1.00      1.00       826
         Top       1.00      1.00      1.00        42
Upper Middle       1.00      1.00      1.00       152

    accuracy                           1.00      8897
   macro avg       1.00      1.00      1.00      8897
weighted avg       1.00      1.00      1.00      8897



# DT Oversample fine tuning

In [258]:
pipeDT_Tuning = Pipeline([
    ("prep", preprocessor),
    ("algo", DecisionTreeClassifier(max_depth =5, max_features = 0.4, min_samples_leaf = 2, min_samples_split = 2, random_state=42))
])

In [259]:
pipeDT_Tuning.fit(X_train_OS, y_train_OS)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['fireplaces', 'market_value',
                                                   'number_of_rooms',
                                                   'total_area',
                                                   'total_livable_area']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder())]),
                                           

In [260]:
y_train_DT_Fine_TunedOS = pipeDT_Tuning.predict(X_train)

In [261]:
y_test_DT_Fine_TunedOS = pipeDT_Tuning.predict(X_test)

In [262]:
accuracy_score(y_train, y_train_DT_Fine_TunedOS)

0.9140416444207154

In [263]:
accuracy_score(y_test, y_test_DT_Fine_TunedOS)

0.9182870630549623

In [264]:
print(classification_report(y_test, y_test_DT_Fine_TunedOS))

              precision    recall  f1-score   support

      Bottom       1.00      1.00      1.00      4788
Lower Middle       1.00      0.81      0.89      3089
      Middle       0.56      0.90      0.69       826
         Top       0.52      0.76      0.62        42
Upper Middle       0.53      0.77      0.63       152

    accuracy                           0.92      8897
   macro avg       0.72      0.85      0.76      8897
weighted avg       0.95      0.92      0.93      8897



# Joblib Model Pipeline Fine Tuning DecisionTreeClassifier

import joblib

joblib.dump(pipeDT_Tuning, 'Model_PHL_Building_Classification_DT')