In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from slugify import slugify
import glob
import re
from scipy.stats import randint, uniform

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier



from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedShuffleSplit, cross_validate, TimeSeriesSplit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_curve, f1_score,roc_auc_score, roc_curve, precision_recall_fscore_support, make_scorer, recall_score
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight

from sklearn.inspection import PartialDependenceDisplay
from sklearn import tree

from treeinterpreter import treeinterpreter
from waterfall_chart import plot as waterfall

from imblearn.over_sampling import SMOTE,ADASYN,RandomOverSampler,BorderlineSMOTE,SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Read the three training datasets. Judging by the file names these are the
# baseline set plus SMOTE- and GAN-augmented variants (TODO confirm upstream).
# keep_default_na=False stops pandas from turning strings such as "NA" or ""
# into NaN while parsing.
df_normal, df_smote, df_gan = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in (
        r'../../data/clean/train/HealthInspectionTrain.csv',
        r'../../data/clean/train/HealthInspectionSMOTETrain.csv',
        r'../../data/clean/train/HealthInspectionGANTrain.csv',
    )
)

In [9]:
# Validation data, parsed with the same NaN handling as the training files.
df_val = pd.read_csv(
    r'../../data/clean/val/HealthInspectionVal.csv',
    keep_default_na=False,
)

In [3]:
# Print column names, dtypes and non-null counts of the baseline training frame.
df_normal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16909 entries, 0 to 16908
Data columns (total 38 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   violation_count                                         16909 non-null  float64
 1   latitude                                                16909 non-null  float64
 2   longitude                                               16909 non-null  float64
 3   failFlag                                                16909 non-null  int64  
 4   avg_violations_last_3                                   16909 non-null  float64
 5   fail_rate_last_3                                        16909 non-null  float64
 6   days_since_last_inspection                              16909 non-null  float64
 7   trend_last_3                                            16909 non-null  float64
 8   BusinessName_id                     

In [5]:
# Print column names, dtypes and non-null counts of the GAN training frame.
df_gan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25909 entries, 0 to 25908
Data columns (total 38 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   violation_count                                         25909 non-null  float64
 1   latitude                                                25909 non-null  float64
 2   longitude                                               25909 non-null  float64
 3   failFlag                                                25909 non-null  float64
 4   avg_violations_last_3                                   25909 non-null  float64
 5   fail_rate_last_3                                        25909 non-null  float64
 6   days_since_last_inspection                              25909 non-null  float64
 7   trend_last_3                                            25909 non-null  float64
 8   BusinessName_id                     

In [10]:
# Print column names, dtypes and non-null counts of the validation frame.
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9147 entries, 0 to 9146
Data columns (total 38 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   violation_count                                         9147 non-null   float64
 1   latitude                                                9147 non-null   float64
 2   longitude                                               9147 non-null   float64
 3   failFlag                                                9147 non-null   int64  
 4   avg_violations_last_3                                   9147 non-null   float64
 5   fail_rate_last_3                                        9147 non-null   float64
 6   days_since_last_inspection                              9147 non-null   float64
 7   trend_last_3                                            9147 non-null   float64
 8   BusinessName_id                       

In [6]:
# Drop the BusinessName_id and Address_id columns from all three training dataframes

In [7]:
# Remove the BusinessName_id / Address_id columns from every training frame
# so they are not used as model features.
df_normal, df_smote, df_gan = (
    frame.drop(['BusinessName_id', 'Address_id'], axis=1)
    for frame in (df_normal, df_smote, df_gan)
)

In [11]:
# Remove the BusinessName_id / Address_id columns from the validation frame.
df_val = df_val.drop(['BusinessName_id', 'Address_id'], axis=1)

In [8]:
# Split each training frame into a feature matrix (everything except failFlag)
# and the binary target failFlag.
X_normal = df_normal.drop(columns='failFlag')
Y_normal = df_normal['failFlag']

X_smote = df_smote.drop(columns='failFlag')
Y_smote = df_smote['failFlag']

X_gan = df_gan.drop(columns='failFlag')
# failFlag is stored as float64 in the GAN file but as int64 in the other
# frames. Cast it so every target carries the same integer class labels;
# the check guards against the cast silently truncating a non-binary value.
if not df_gan['failFlag'].isin([0, 1]).all():
    raise ValueError("df_gan['failFlag'] contains values other than 0 and 1")
Y_gan = df_gan['failFlag'].astype(int)

In [12]:
# Validation features (everything except failFlag) and validation target.
Y_val = df_val['failFlag']
X_val = df_val.drop('failFlag', axis=1)

In [14]:
# Standardise the features before building the model: the scaler is fitted on
# the training features only, and that fitted transform is then applied to
# both the training and the validation features.
scaler = StandardScaler()
scaler.fit(X_normal)
X_normal_scaled = scaler.transform(X_normal)
X_val_normal_scaled = scaler.transform(X_val)

In [15]:
# Refit the shared scaler on the SMOTE training features, then scale the
# validation features with those SMOTE-derived statistics.
X_smote_scaled = scaler.fit(X_smote).transform(X_smote)
X_val_smote_scaled = scaler.transform(X_val)

In [16]:
# Refit the shared scaler on the GAN training features, then scale the
# validation features with those GAN-derived statistics.
X_gan_scaled = scaler.fit(X_gan).transform(X_gan)
X_val_gan_scaled = scaler.transform(X_val)

In [21]:
# Baseline random forest: library defaults, with only the seed fixed so the
# run is repeatable.
model = RandomForestClassifier(random_state=55)
model.fit(X=X_normal_scaled, y=Y_normal)

RandomForestClassifier(random_state=55)

In [22]:

# Predict once per split and reuse the labels for both the accuracy and the
# per-class report; model.score() would run a second prediction pass and
# returns the same accuracy value.
train_pred = model.predict(X_normal_scaled)
val_pred = model.predict(X_val_normal_scaled)

print("TRAINING METRICS")
print("Accuracy:", accuracy_score(Y_normal, train_pred))
print(classification_report(Y_normal, train_pred))


# NOTE: this section is evaluated on the validation split (X_val / Y_val).
print("\nTEST METRICS")
print("Accuracy:", accuracy_score(Y_val, val_pred))
print(classification_report(Y_val, val_pred))


TRAINING METRICS
Accuracy: 0.9990537583535395
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15576
           1       1.00      0.99      0.99      1333

    accuracy                           1.00     16909
   macro avg       1.00      0.99      1.00     16909
weighted avg       1.00      1.00      1.00     16909


TEST METRICS
Accuracy: 0.9369192084836558
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8574
           1       0.39      0.01      0.02       573

    accuracy                           0.94      9147
   macro avg       0.66      0.51      0.50      9147
weighted avg       0.90      0.94      0.91      9147



In [23]:
# Class-weighted random forest on the baseline data: leaves must hold at least
# 5 samples, and class weights are recomputed on each tree's bootstrap sample
# ('balanced_subsample').
model = RandomForestClassifier(
    class_weight='balanced_subsample',
    min_samples_leaf=5,
    random_state=55,
)
model.fit(X=X_normal_scaled, y=Y_normal)

RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=5,
                       random_state=55)

In [24]:
# Predict once per split and reuse the labels for both the accuracy and the
# per-class report; model.score() would run a second prediction pass and
# returns the same accuracy value.
train_pred = model.predict(X_normal_scaled)
val_pred = model.predict(X_val_normal_scaled)

print("TRAINING METRICS")
print("Accuracy:", accuracy_score(Y_normal, train_pred))
print(classification_report(Y_normal, train_pred))


# NOTE: this section is evaluated on the validation split (X_val / Y_val).
print("\nTEST METRICS")
print("Accuracy:", accuracy_score(Y_val, val_pred))
print(classification_report(Y_val, val_pred))


TRAINING METRICS
Accuracy: 0.9520965166479389
              precision    recall  f1-score   support

           0       0.99      0.96      0.97     15576
           1       0.64      0.89      0.75      1333

    accuracy                           0.95     16909
   macro avg       0.82      0.92      0.86     16909
weighted avg       0.96      0.95      0.96     16909


TEST METRICS
Accuracy: 0.9074013337706351
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      8574
           1       0.21      0.18      0.19       573

    accuracy                           0.91      9147
   macro avg       0.58      0.57      0.57      9147
weighted avg       0.90      0.91      0.90      9147



# Try dropping violation_count

In [33]:
# Rebuild the feature matrix from df_normal rather than dropping from X_normal
# in place: the result is the same, but the cell can be re-run without raising
# a KeyError on the already-removed column.
X_normal = df_normal.drop(columns=['failFlag', 'violation_count'])

In [34]:
# Refit the shared scaler on the reduced training features and apply it to the
# validation features with violation_count removed, so both matrices have the
# same columns.
scaler.fit(X_normal)
X_normal_scaled = scaler.transform(X_normal)
X_val_normal_scaled = scaler.transform(X_val.drop(columns='violation_count'))

In [37]:
# Class-weighted random forest on the features without violation_count:
# 200 trees, at least 5 samples per leaf, 'balanced' class weights.
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=55,
)
model.fit(X=X_normal_scaled, y=Y_normal)

RandomForestClassifier(class_weight='balanced', min_samples_leaf=5,
                       n_estimators=200, random_state=55)

In [38]:
# Predict once per split and reuse the labels for both the accuracy and the
# per-class report; model.score() would run a second prediction pass and
# returns the same accuracy value.
train_pred = model.predict(X_normal_scaled)
val_pred = model.predict(X_val_normal_scaled)

print("TRAINING METRICS")
print("Accuracy:", accuracy_score(Y_normal, train_pred))
print(classification_report(Y_normal, train_pred))


# NOTE: this section is evaluated on the validation split (X_val / Y_val).
print("\nTEST METRICS")
print("Accuracy:", accuracy_score(Y_val, val_pred))
print(classification_report(Y_val, val_pred))


TRAINING METRICS
Accuracy: 0.9551126618960317
              precision    recall  f1-score   support

           0       0.99      0.96      0.98     15576
           1       0.66      0.90      0.76      1333

    accuracy                           0.96     16909
   macro avg       0.82      0.93      0.87     16909
weighted avg       0.96      0.96      0.96     16909


TEST METRICS
Accuracy: 0.9116650267847382
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      8574
           1       0.23      0.17      0.19       573

    accuracy                           0.91      9147
   macro avg       0.59      0.57      0.57      9147
weighted avg       0.90      0.91      0.91      9147



## SMOTE 

In [43]:
# Random forest trained on the SMOTE training set: 300 trees, at least
# 7 samples per leaf, class weights recomputed per bootstrap sample.
model = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=7,
    class_weight='balanced_subsample',
    random_state=55,
)
model.fit(X=X_smote_scaled, y=Y_smote)

RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=7,
                       n_estimators=300, random_state=55)

In [44]:
# Predict once per split and reuse the labels for both the accuracy and the
# per-class report; model.score() would run a second prediction pass and
# returns the same accuracy value.
train_pred = model.predict(X_smote_scaled)
val_pred = model.predict(X_val_smote_scaled)

print("TRAINING METRICS")
print("Accuracy:", accuracy_score(Y_smote, train_pred))
print(classification_report(Y_smote, train_pred))


# NOTE: this section is evaluated on the validation split (X_val / Y_val).
print("\nTEST METRICS")
print("Accuracy:", accuracy_score(Y_val, val_pred))
print(classification_report(Y_val, val_pred))


TRAINING METRICS
Accuracy: 0.9291217257318952
              precision    recall  f1-score   support

           0       0.92      0.93      0.93     15576
           1       0.93      0.92      0.93     15576

    accuracy                           0.93     31152
   macro avg       0.93      0.93      0.93     31152
weighted avg       0.93      0.93      0.93     31152


TEST METRICS
Accuracy: 0.8339346233737838
              precision    recall  f1-score   support

           0       0.95      0.86      0.91      8574
           1       0.16      0.38      0.22       573

    accuracy                           0.83      9147
   macro avg       0.56      0.62      0.57      9147
weighted avg       0.90      0.83      0.86      9147



## GAN

In [47]:
# Random forest trained on the GAN training set: 300 trees, at least
# 10 samples per leaf, class weights recomputed per bootstrap sample.
model = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=10,
    class_weight='balanced_subsample',
    random_state=55,
)
model.fit(X=X_gan_scaled, y=Y_gan)

RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=10,
                       n_estimators=300, random_state=55)

In [48]:
# Predict once per split and reuse the labels for both the accuracy and the
# per-class report; model.score() would run a second prediction pass and
# returns the same accuracy value.
train_pred = model.predict(X_gan_scaled)
val_pred = model.predict(X_val_gan_scaled)

print("TRAINING METRICS")
print("Accuracy:", accuracy_score(Y_gan, train_pred))
print(classification_report(Y_gan, train_pred))


# NOTE: this section is evaluated on the validation split (X_val / Y_val).
print("\nTEST METRICS")
print("Accuracy:", accuracy_score(Y_val, val_pred))
print(classification_report(Y_val, val_pred))


TRAINING METRICS
Accuracy: 0.9485892932957659
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     15576
         1.0       1.00      0.87      0.93     10333

    accuracy                           0.95     25909
   macro avg       0.96      0.94      0.95     25909
weighted avg       0.95      0.95      0.95     25909


TEST METRICS
Accuracy: 0.7817863780474472
              precision    recall  f1-score   support

           0       0.94      0.82      0.88      8574
           1       0.07      0.20      0.10       573

    accuracy                           0.78      9147
   macro avg       0.50      0.51      0.49      9147
weighted avg       0.88      0.78      0.83      9147

