## Description of our dataset
* **inspection_date** (Floating Timestamp): Date and time when the inspection occurred.
* **inspection_type_clean** (Text): Type/category of the inspection conducted.
* **facility_rating_status** (Text): Rating status of the facility after inspection.
* **violation_count** (Number): Total number of violations observed.
* **violation_codes** (Text): Codes corresponding to the observed violations.
* **analysis_neighborhood** (Text): Neighborhood used for analysis purposes.
* **violation_observed** (Number) :Any violation observed during the inspection
* **address** (Text): Street address of the business.
* **Name** (Text): Name of the business.
* **has_violation_count** (Binary) : has violation_count variable or no.
* **has_violation_observed** (Binary) : has violation_observed or no.


In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from slugify import slugify
import glob
import re
from scipy.stats import randint, uniform

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import joblib


from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedShuffleSplit, cross_validate, TimeSeriesSplit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_curve, f1_score,roc_auc_score, roc_curve, precision_recall_fscore_support, make_scorer, recall_score
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight

from sklearn.inspection import PartialDependenceDisplay
from sklearn import tree

from treeinterpreter import treeinterpreter
from waterfall_chart import plot as waterfall

from imblearn.over_sampling import SMOTE,ADASYN,RandomOverSampler,BorderlineSMOTE,SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df_before = pd.read_csv(r"../../data/clean/Merged_non_null_dataset.csv", parse_dates=['inspection_date'], keep_default_na = False)

In [3]:
df = df_before.copy()

In [4]:
df = df.sort_values(['name', 'inspection_date'])


In [5]:
# Encode target
target_map = {"Pass": 0, "Conditional Pass": 1, "Closure": 2}
df['facility_rating_status'] = df['facility_rating_status'].map(target_map)

In [6]:
df['prev_rating_majority_3'] = df.groupby('name')['facility_rating_status'] \
                                 .apply(lambda x: x.shift(1).rolling(3, min_periods=1)
                                         .apply(lambda y: y.mode()[0] if len(y.mode())>0 else -1))


df['days_since_last_inspection'] = df.groupby('name')['inspection_date'].diff().dt.days
# df['prev_violation_count'] = df.groupby('name')['violation_count'].shift(1)
df['avg_violation_count_last_3'] = df.groupby('name')['violation_count'].shift(1).rolling(3, min_periods=1).mean()
df['is_first_inspection'] = df.groupby('name')['facility_rating_status'].cumcount().eq(0).astype(int)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65504 entries, 89 to 51569
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   inspection_date             65504 non-null  datetime64[ns]
 1   facility_rating_status      65504 non-null  int64         
 2   violation_observed          65504 non-null  float64       
 3   violation_codes             65504 non-null  object        
 4   address                     65504 non-null  object        
 5   name                        65504 non-null  object        
 6   violation_count             65504 non-null  float64       
 7   analysis_neighborhood       65504 non-null  object        
 8   inspection_type_clean       65504 non-null  object        
 9   has_violation_count         65504 non-null  int64         
 10  has_violation_observed      65504 non-null  int64         
 11  prev_rating_majority_3      57460 non-null  float64  

In [8]:
df['prev_rating_majority_3'] = df['prev_rating_majority_3'].fillna(-1)
df['avg_violation_count_last_3'] = df['avg_violation_count_last_3'].fillna(-1)
df['days_since_last_inspection'] = df['days_since_last_inspection'].fillna(0)


In [9]:
# Drop columns that are unnecessary for model. We will try to do something with violation_code at 
#last to see if there can be some improvement. Violation_observed and has_violation_observed has no use in production. 
# It is not used in the future datasets anymore
drop_cols = ['violation_codes', 'violation_observed', 'has_violation_observed']  
df.drop(columns=drop_cols, inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65504 entries, 89 to 51569
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   inspection_date             65504 non-null  datetime64[ns]
 1   facility_rating_status      65504 non-null  int64         
 2   address                     65504 non-null  object        
 3   name                        65504 non-null  object        
 4   violation_count             65504 non-null  float64       
 5   analysis_neighborhood       65504 non-null  object        
 6   inspection_type_clean       65504 non-null  object        
 7   has_violation_count         65504 non-null  int64         
 8   prev_rating_majority_3      65504 non-null  float64       
 9   days_since_last_inspection  65504 non-null  float64       
 10  avg_violation_count_last_3  65504 non-null  float64       
 11  is_first_inspection         65504 non-null  int32    

In [11]:
#Saving data for visualization purpose
df.to_csv('../../data/clean/dataset_visualization.csv', index=False)


In [12]:
df = df.drop(['name', 'address'], axis = 1)

In [13]:
# 3. One-hot encoding
# ======================
cat_cols = ['analysis_neighborhood', 'inspection_type_clean']
df = pd.get_dummies(df, columns=cat_cols, drop_first= True)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65504 entries, 89 to 51569
Data columns (total 60 columns):
 #   Column                                                Non-Null Count  Dtype         
---  ------                                                --------------  -----         
 0   inspection_date                                       65504 non-null  datetime64[ns]
 1   facility_rating_status                                65504 non-null  int64         
 2   violation_count                                       65504 non-null  float64       
 3   has_violation_count                                   65504 non-null  int64         
 4   prev_rating_majority_3                                65504 non-null  float64       
 5   days_since_last_inspection                            65504 non-null  float64       
 6   avg_violation_count_last_3                            65504 non-null  float64       
 7   is_first_inspection                                   65504 non-null  int32

In [15]:
df = df.sort_values('inspection_date')

In [16]:
df = df.drop(columns='inspection_date')

In [17]:
# Save the model_ready dataset
df.to_csv('../../data/clean/model_dataset.csv', index=False)


In [18]:
#Try standardisation
#Try dropping both vilatin_observed cols and try to see the score

In [19]:
#Reading the model datset
df = pd.read_csv(r'../../data/clean/model_dataset.csv', keep_default_na = False)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65504 entries, 0 to 65503
Data columns (total 59 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   facility_rating_status                                65504 non-null  int64  
 1   violation_count                                       65504 non-null  float64
 2   has_violation_count                                   65504 non-null  int64  
 3   prev_rating_majority_3                                65504 non-null  float64
 4   days_since_last_inspection                            65504 non-null  float64
 5   avg_violation_count_last_3                            65504 non-null  float64
 6   is_first_inspection                                   65504 non-null  int64  
 7   analysis_neighborhood_Bernal Heights                  65504 non-null  int64  
 8   analysis_neighborhood_Castro/Upper Market             65

In [21]:
df

Unnamed: 0,facility_rating_status,violation_count,has_violation_count,prev_rating_majority_3,days_since_last_inspection,avg_violation_count_last_3,is_first_inspection,analysis_neighborhood_Bernal Heights,analysis_neighborhood_Castro/Upper Market,analysis_neighborhood_Chinatown,...,inspection_type_clean_foodborne_illness,inspection_type_clean_new_construction,inspection_type_clean_new_ownership,inspection_type_clean_new_ownership_followup,inspection_type_clean_plan_check,inspection_type_clean_plan_check_reinspection,inspection_type_clean_reinspection,inspection_type_clean_routine,inspection_type_clean_site_visit,inspection_type_clean_structural
0,0,0.0,0,0.0,0.0,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0.0,0,-1.0,0.0,0.000000,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0.0,0,-1.0,0.0,0.000000,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0.0,0,0.0,0.0,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0.0,0,0.0,0.0,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65499,0,1.0,1,0.0,0.0,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
65500,0,7.0,1,0.0,55.0,2.000000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
65501,0,10.0,1,0.0,161.0,3.000000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
65502,0,0.0,1,0.0,27.0,3.666667,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [22]:
def split_vals(X,Y):
    # 70/30 split
    global X_train, X_test, Y_train, Y_test
    split_idx = int(len(X) * 0.7)
    X_train = X.iloc[:split_idx].copy()
    X_test  = X.iloc[split_idx:].copy()
    Y_train = Y.iloc[:split_idx].copy()
    Y_test  = Y.iloc[split_idx:].copy()
    return None


In [23]:
def print_score(m):
    l=['Training Score','Testing Score','Training Classification Report','Test Classification Report','OOB_Score']
    res = [m.score(X_train, Y_train), m.score(X_test, Y_test),classification_report(Y_train,m.predict(X_train)),
           classification_report(Y_test,m.predict(X_test))]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    
    for i,j in dict(zip(l,res)).items():
        print(f'\n{i}: ')
        print(j)

In [24]:
target = 'facility_rating_status'

# Drop target and inspection_date from features
features = [c for c in df.columns if c != target]

X = df[features]
Y = df[target]


In [25]:
split_vals(X,Y)

In [26]:
#Standardising the data for the building the model
scaler=StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## XG BOOST

In [27]:
# 3. Train XGBoost classifier
model = XGBClassifier(random_state=42, n_estimators=500)
model.fit(X_train, Y_train)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

In [28]:
print_score(model)


Training Score: 
0.96370932565646

Testing Score: 
0.9030633014451456

Training Classification Report: 
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     37679
           1       0.99      0.82      0.90      5666
           2       0.99      0.78      0.87      2507

    accuracy                           0.96     45852
   macro avg       0.98      0.86      0.91     45852
weighted avg       0.96      0.96      0.96     45852


Test Classification Report: 
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     18014
           1       0.71      0.37      0.49      1045
           2       0.18      0.25      0.21       593

    accuracy                           0.90     19652
   macro avg       0.61      0.52      0.55     19652
weighted avg       0.91      0.90      0.90     19652



In [29]:
model = XGBClassifier(random_state=42, n_estimators=500)
model.fit(X_train_scaled, Y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

In [30]:

print("TRAINING METRICS")
print("Accuracy:", model.score(X_train_scaled,Y_train))
print(classification_report(Y_train,model.predict(X_train_scaled)))


print("\nTEST METRICS")
print("Accuracy:", model.score(X_test_scaled,Y_test))
print(classification_report(Y_test,model.predict(X_test_scaled)))


TRAINING METRICS
Accuracy: 0.96370932565646
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     37679
           1       0.99      0.82      0.90      5666
           2       0.99      0.78      0.87      2507

    accuracy                           0.96     45852
   macro avg       0.98      0.86      0.91     45852
weighted avg       0.96      0.96      0.96     45852


TEST METRICS
Accuracy: 0.9024017911662935
              precision    recall  f1-score   support

           0       0.94      0.95      0.95     18014
           1       0.71      0.37      0.49      1045
           2       0.18      0.25      0.21       593

    accuracy                           0.90     19652
   macro avg       0.61      0.52      0.55     19652
weighted avg       0.91      0.90      0.90     19652



In [31]:
#Standardisation did not make much difference

## Class_weights

In [32]:
# import numpy as np
# import xgboost as xgb
# from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(Y_train)

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=Y_train
)

weight_map = dict(zip(classes, class_weights))
sample_weights = np.array([weight_map[y] for y in Y_train])

model = XGBClassifier(
    objective="multi:softprob",
    num_class=len(classes),
    max_depth=4,
    learning_rate=0.1,
    n_estimators=400,
    random_state=42
)

model.fit(X_train, Y_train, sample_weight=sample_weights)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=400, n_jobs=None, num_class=3,
              num_parallel_tree=None, objective='multi:softprob', ...)

In [33]:
print_score(model)


Training Score: 
0.9413547936840269

Testing Score: 
0.8246488906981477

Training Classification Report: 
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     37679
           1       0.92      0.82      0.87      5666
           2       0.69      0.82      0.75      2507

    accuracy                           0.94     45852
   macro avg       0.86      0.87      0.86     45852
weighted avg       0.94      0.94      0.94     45852


Test Classification Report: 
              precision    recall  f1-score   support

           0       0.94      0.87      0.90     18014
           1       0.51      0.39      0.44      1045
           2       0.08      0.31      0.13       593

    accuracy                           0.82     19652
   macro avg       0.51      0.52      0.49     19652
weighted avg       0.89      0.82      0.85     19652



## Random Forest

In [34]:
#Basic RandomForest Model
base_rf_model = RandomForestClassifier(random_state=55)
base_rf_model_res = base_rf_model.fit(X_train,Y_train)

In [35]:
print_score(base_rf_model)


Training Score: 
0.9707973479891826

Testing Score: 
0.8973132505597394

Training Classification Report: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     37679
           1       0.99      0.86      0.92      5666
           2       0.99      0.82      0.89      2507

    accuracy                           0.97     45852
   macro avg       0.98      0.89      0.93     45852
weighted avg       0.97      0.97      0.97     45852


Test Classification Report: 
              precision    recall  f1-score   support

           0       0.94      0.95      0.95     18014
           1       0.52      0.38      0.44      1045
           2       0.20      0.25      0.23       593

    accuracy                           0.90     19652
   macro avg       0.55      0.53      0.54     19652
weighted avg       0.90      0.90      0.90     19652



In [36]:

balanced_rf_model = RandomForestClassifier(n_estimators=500, class_weight= 'balanced', random_state=55, min_samples_leaf= 5)
balanced_rf_model.fit(X_train,Y_train)

RandomForestClassifier(class_weight='balanced', min_samples_leaf=5,
                       n_estimators=500, random_state=55)

In [37]:
print_score(balanced_rf_model)


Training Score: 
0.9431867748407922

Testing Score: 
0.860828414410747

Training Classification Report: 
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     37679
           1       0.92      0.82      0.87      5666
           2       0.72      0.82      0.76      2507

    accuracy                           0.94     45852
   macro avg       0.87      0.87      0.87     45852
weighted avg       0.94      0.94      0.94     45852


Test Classification Report: 
              precision    recall  f1-score   support

           0       0.95      0.91      0.93     18014
           1       0.53      0.39      0.45      1045
           2       0.12      0.33      0.18       593

    accuracy                           0.86     19652
   macro avg       0.53      0.54      0.52     19652
weighted avg       0.90      0.86      0.88     19652



In [38]:
Y.value_counts()

0    55693
1     6711
2     3100
Name: facility_rating_status, dtype: int64

## Under Sampling 

In [39]:
rus = RandomUnderSampler(
    sampling_strategy={0: 10000, 1: 5666, 2: 2507},
    random_state=42
)


In [40]:
X_train_us, Y_train_us = rus.fit_resample(X_train, Y_train)

In [69]:

balanced_rf_model = RandomForestClassifier(n_estimators=300, random_state=55, min_samples_leaf=7)
balanced_rf_model.fit(X_train_us,Y_train_us)

RandomForestClassifier(min_samples_leaf=7, n_estimators=300, random_state=55)

In [70]:

print("TRAINING METRICS")
print("Accuracy:", balanced_rf_model.score(X_train_us,Y_train_us))
print(classification_report(Y_train_us,balanced_rf_model.predict(X_train_us)))


print("\nTEST METRICS")
print("Accuracy:", balanced_rf_model.score(X_test,Y_test))
print(classification_report(Y_test,balanced_rf_model.predict(X_test)))


TRAINING METRICS
Accuracy: 0.8946238925879051
              precision    recall  f1-score   support

           0       0.85      0.99      0.91     10000
           1       0.98      0.80      0.88      5666
           2       1.00      0.72      0.84      2507

    accuracy                           0.89     18173
   macro avg       0.94      0.84      0.88     18173
weighted avg       0.91      0.89      0.89     18173


TEST METRICS
Accuracy: 0.9341033991451252
              precision    recall  f1-score   support

           0       0.94      0.99      0.97     18014
           1       0.65      0.40      0.49      1045
           2       0.87      0.24      0.38       593

    accuracy                           0.93     19652
   macro avg       0.82      0.54      0.61     19652
weighted avg       0.93      0.93      0.92     19652



In [71]:

#Saving the model
joblib.dump(model, "random_forest_multiclass.pkl")



['random_forest_multiclass.pkl']

In [74]:
model = XGBClassifier(random_state=42, n_estimators=500)
model.fit(X_train_us, Y_train_us)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

In [75]:

print("TRAINING METRICS")
print("Accuracy:", model.score(X_train_us,Y_train_us))
print(classification_report(Y_train_us,model.predict(X_train_us)))


print("\nTEST METRICS")
print("Accuracy:", model.score(X_test,Y_test))
print(classification_report(Y_test,model.predict(X_test)))


TRAINING METRICS
Accuracy: 0.9270346117867165
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     10000
           1       0.99      0.85      0.92      5666
           2       0.99      0.82      0.90      2507

    accuracy                           0.93     18173
   macro avg       0.96      0.89      0.92     18173
weighted avg       0.93      0.93      0.93     18173


TEST METRICS
Accuracy: 0.8354874821901078
              precision    recall  f1-score   support

           0       0.94      0.88      0.91     18014
           1       0.35      0.41      0.38      1045
           2       0.10      0.29      0.15       593

    accuracy                           0.84     19652
   macro avg       0.47      0.53      0.48     19652
weighted avg       0.89      0.84      0.86     19652



In [76]:
Y_train.value_counts()

0    37679
1     5666
2     2507
Name: facility_rating_status, dtype: int64

## Over Sampling

In [77]:
#Many methods are available but here we use a combination of SMOTE and RANdomUnderSampler. 
#Apparaently, the combination of two gives us good results.
smote=SMOTE(random_state=42, sampling_strategy= {1: 12000, 2: 12000})
rus = RandomUnderSampler(sampling_strategy={0:15000}, random_state=42)

In [78]:
#We only balance our training dataset
X_train_smote,Y_train_smote=smote.fit_resample(X_train,Y_train)

In [79]:
X_train_sm, Y_train_sm = rus.fit_resample(X_train_smote,Y_train_smote)

In [80]:
Y_train_sm.value_counts()

0    15000
1    12000
2    12000
Name: facility_rating_status, dtype: int64

In [81]:
rf_base_smote_model = RandomForestClassifier(min_samples_leaf=3, random_state=42)
rf_base_smote_model.fit(X_train_sm,Y_train_sm)

RandomForestClassifier(min_samples_leaf=3, random_state=42)

In [82]:

print("TRAINING METRICS")
print("Accuracy:", rf_base_smote_model.score(X_train_sm,Y_train_sm))
print(classification_report(Y_train_sm,rf_base_smote_model.predict(X_train_sm)))


print("\nTEST METRICS")
print("Accuracy:", rf_base_smote_model.score(X_test,Y_test))
print(classification_report(Y_test,rf_base_smote_model.predict(X_test)))


TRAINING METRICS
Accuracy: 0.8933589743589744
              precision    recall  f1-score   support

           0       0.82      0.98      0.89     15000
           1       0.98      0.82      0.89     12000
           2       0.93      0.86      0.90     12000

    accuracy                           0.89     39000
   macro avg       0.91      0.89      0.89     39000
weighted avg       0.90      0.89      0.89     39000


TEST METRICS
Accuracy: 0.8202727457765113
              precision    recall  f1-score   support

           0       0.94      0.86      0.90     18014
           1       0.55      0.38      0.45      1045
           2       0.08      0.31      0.12       593

    accuracy                           0.82     19652
   macro avg       0.52      0.52      0.49     19652
weighted avg       0.89      0.82      0.85     19652



In [83]:
model = XGBClassifier(random_state=42, n_estimators=500)
model.fit(X_train_sm, Y_train_sm)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

In [84]:

print("TRAINING METRICS")
print("Accuracy:", model.score(X_train_sm,Y_train_sm))
print(classification_report(Y_train_sm, model.predict(X_train_sm)))


print("\nTEST METRICS")
print("Accuracy:", model.score(X_test,Y_test))
print(classification_report(Y_test, model.predict(X_test)))


TRAINING METRICS
Accuracy: 0.9190512820512821
              precision    recall  f1-score   support

           0       0.84      0.99      0.91     15000
           1       0.99      0.87      0.93     12000
           2       0.97      0.88      0.93     12000

    accuracy                           0.92     39000
   macro avg       0.93      0.91      0.92     39000
weighted avg       0.93      0.92      0.92     39000


TEST METRICS
Accuracy: 0.8018522287807857
              precision    recall  f1-score   support

           0       0.94      0.84      0.89     18014
           1       0.29      0.41      0.34      1045
           2       0.08      0.28      0.13       593

    accuracy                           0.80     19652
   macro avg       0.44      0.51      0.45     19652
weighted avg       0.88      0.80      0.84     19652



In [85]:
Y_test.value_counts(normalize=True)

0    0.916650
1    0.053175
2    0.030175
Name: facility_rating_status, dtype: float64

In [86]:
Y_train_sm.value_counts(normalize=True)

0    0.384615
1    0.307692
2    0.307692
Name: facility_rating_status, dtype: float64

In [87]:
# Lets try to keep violation_code feature and see if it helps somehow

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65504 entries, 0 to 65503
Data columns (total 59 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   facility_rating_status                                65504 non-null  int64  
 1   violation_count                                       65504 non-null  float64
 2   has_violation_count                                   65504 non-null  int64  
 3   prev_rating_majority_3                                65504 non-null  float64
 4   days_since_last_inspection                            65504 non-null  float64
 5   avg_violation_count_last_3                            65504 non-null  float64
 6   is_first_inspection                                   65504 non-null  int64  
 7   analysis_neighborhood_Bernal Heights                  65504 non-null  int64  
 8   analysis_neighborhood_Castro/Upper Market             65

In [None]:
df['violation_codes'] = df_before['violation_codes']

In [None]:
import string
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

# stopwords setup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')
stopwords = set(en_stopwords + ['rt'])  # add any extra stopwords

def clean_text(docs):
    table = str.maketrans({key: None for key in string.punctuation + string.digits})
    clean_docs = [d.translate(table) for d in docs]
    nlp_docs = [nlp(d) for d in clean_docs]
    lemmatized_docs = [[w.lemma_ if w.lemma_ != '-PRON-' else w.lower_ for w in d] for d in nlp_docs]
    lemmatized_docs = [[lemma for lemma in doc if lemma not in stopwords] for doc in lemmatized_docs]
    clean_docs = [' '.join(l) for l in lemmatized_docs]
    return clean_docs


In [None]:
just_violation = df['violation_codes']
clean_texts = clean_text(just_violation)

In [None]:
import re
import time
import string
from glob import iglob

# data handling and machine learning
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# text processing
import spacy
#nlp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')  # might have to use 'en' instead of 'english'
stopwords = set(en_stopwords + ['rt'])  # add RT to stopwords (retweet)

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer