In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
'''
Trying to predict: Whether restaurant at latest point will have a critical flag inspection
Parameters:
 - poverty rate (socioeconomic) 
 - median income (socioeconomic) 
 - graduation rate (socioeconomic) 
 - Average healthpoints in zipcode (REGULARIZE)
 - Average rating per cuisine (REGULARIZE)
 - avg num of critical flags per restaurant 
'''

# Importing necessary files
rest = pd.read_csv('../Restaurant_Grades_20251109.csv')
socio = pd.read_csv('../socioparam.csv', header = 1)


In [2]:
# Cleaning up socioeconomic factors dataset for join
socio[['Irrelevant', 'Zipcode']] = socio['Geographic Area Name'].str.split(n = 1, expand = True)

socio = socio[["Zipcode"
                ,"Estimate!!Total!!MEDIAN EARNINGS IN THE PAST 12 MONTHS (IN 2023 INFLATION-ADJUSTED DOLLARS)!!Population 25 years and over with earnings"
               , "Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Bachelor's degree or higher"
               ,"Estimate!!Percent!!POVERTY RATE FOR THE POPULATION 25 YEARS AND OVER FOR WHOM POVERTY STATUS IS DETERMINED BY EDUCATIONAL ATTAINMENT LEVEL!!Less than high school graduate"
              ]]

rename_socio = {"Estimate!!Total!!MEDIAN EARNINGS IN THE PAST 12 MONTHS (IN 2023 INFLATION-ADJUSTED DOLLARS)!!Population 25 years and over with earnings": "Median Income",
                "Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Bachelor's degree or higher": "College Graduation Rate",
                "Estimate!!Percent!!POVERTY RATE FOR THE POPULATION 25 YEARS AND OVER FOR WHOM POVERTY STATUS IS DETERMINED BY EDUCATIONAL ATTAINMENT LEVEL!!Less than high school graduate": "Poverty Rate"}

socio.rename(columns = rename_socio, inplace = True)
socio = socio.apply(pd.to_numeric, errors='coerce')
socio = socio.astype('float')
rest = rest.merge(socio, left_on = 'ZIPCODE', right_on = 'Zipcode', how = 'left')

In [3]:
# Drop single restaurant iterates
# Create a critical flag column as y column where the most updated restaurant is and the critical flag score of 1 if cf; 0 else

# For the case scenario where there's a subway in brooklyn and manhattan
rest['DBA'] = rest['DBA'] + '_' + rest['BUILDING'] + '_' + rest['STREET']

# repeated rows with different violations
rest = rest.drop('VIOLATION DESCRIPTION', axis=1)

rest.drop_duplicates(inplace = True)
rest.dropna(how = 'all', inplace = True)

rest = rest[rest['Zipcode'].notna()]

rest.sort_values(by = 'GRADE DATE', inplace = True)

rest['DBA'] = rest['DBA'].str.strip().str.capitalize()

rest['CRITICAL FLAG'] = np.where(
    rest['CRITICAL FLAG'] == 'Critical',
    1,   # Value if True
    0    # Value if False
)

latest = rest.groupby('DBA').last() # y-value

rest = rest[rest['DBA'].duplicated(keep = 'last')] # Keeping Everything but last

In [4]:
# Average number of critical flags restaurant recieved 
avg_crit_flag = rest.groupby(['DBA'])['CRITICAL FLAG'].mean()
mean_val = avg_crit_flag.mean() # global mean important for regularization
avg_crit_flag = avg_crit_flag.to_frame()
avg_crit_flag.reset_index(inplace = True)
rest = rest.merge(avg_crit_flag, on = 'DBA', how = 'inner', suffixes = ('rest', 'r')) # Column of values indicating if previously recieved a critical flag

In [5]:
# Average rating per zipcoode
avg_zip_value = rest.groupby('ZIPCODE')['CRITICAL FLAGrest'].mean()
avg_zip_mean = avg_zip_value.mean() # global mean important for regularization
avg_zip_value = avg_zip_value.to_frame()
avg_zip_value.reset_index(inplace = True)
avg_zip_value.rename(columns={'CRITICAL FLAGrest': 'ZipMean'}, inplace=True)
rest = rest.merge(avg_zip_value, on = 'ZIPCODE', how = 'inner')

# Zipcount per item
avg_zip_count = rest.groupby('ZIPCODE')['CRITICAL FLAGrest'].count()
avg_zip_count = avg_zip_count.to_frame()
avg_zip_count.reset_index(inplace = True)
avg_zip_count.rename(columns={'CRITICAL FLAGrest': 'ZipCount'}, inplace=True)
rest = rest.merge(avg_zip_count, on = 'ZIPCODE', how = 'inner')

In [6]:
# Average rating per cuisine
avg_cuis_value = rest.groupby(['CUISINE DESCRIPTION'])['CRITICAL FLAGrest'].mean()
avg_cuis_mean = avg_cuis_value.mean()
avg_cuis_value = avg_cuis_value.to_frame()
avg_cuis_value.reset_index(inplace = True)
avg_cuis_value.rename(columns={'CRITICAL FLAGrest': 'cuis_mean'}, inplace=True)
rest = rest.merge(avg_cuis_value, on = 'CUISINE DESCRIPTION', how = 'inner')
rest.rename(columns={'CRITICAL FLAGr': 'avg_crit_flag'}, inplace = True)

# Cuiscount per item
avg_cuis_count = rest.groupby('ZIPCODE')['CRITICAL FLAGrest'].count()
avg_cuis_count = avg_cuis_count.to_frame()
avg_cuis_count.reset_index(inplace = True)
avg_cuis_count.rename(columns={'CRITICAL FLAGrest': 'CuisCount'}, inplace=True)
rest = rest.merge(avg_cuis_count, on = 'ZIPCODE', how = 'inner')


In [7]:
rest = rest[[
    'DBA', 'Median Income', 'College Graduation Rate', 'Poverty Rate', 'ZipCount', 'CuisCount',
      'avg_crit_flag',  'ZipMean', 'cuis_mean'
]]

In [8]:
latest.reset_index(inplace = True)
latest = latest[['DBA', 'CRITICAL FLAG']]
rest

Unnamed: 0,DBA,Median Income,College Graduation Rate,Poverty Rate,ZipCount,CuisCount,avg_crit_flag,ZipMean,cuis_mean
0,China pagoda_6918_5 avenue,69403.0,52.8,21.9,662,662,0.500000,0.504532,0.506594
1,Happy bowls_61_4 avenue,118533.0,82.2,55.2,1420,1420,0.600000,0.507042,0.482553
2,Happy bowls_61_4 avenue,118533.0,82.2,55.2,1420,1420,0.600000,0.507042,0.482553
3,La antioquena ii bakery_4007_national st,35167.0,14.4,17.8,652,652,0.500000,0.518405,0.494096
4,Da vinci pizza_6514_18 avenue,37092.0,30.5,21.3,329,329,0.600000,0.480243,0.504644
...,...,...,...,...,...,...,...,...,...
61192,Mike's diner_1454_86 street,51914.0,37.1,23.8,172,172,0.400000,0.488372,0.505051
61193,Dunkin'_2370_grand concourse road,34914.0,16.1,32.6,511,511,0.600000,0.510763,0.465254
61194,Burger king_19510_jamaica ave,49230.0,34.3,20.1,112,112,0.500000,0.473214,0.451380
61195,Unico taco bar_3010_jerome avenue,32137.0,15.6,36.5,264,264,1.000000,0.515152,0.520480


In [9]:
rest = rest.drop_duplicates(subset=['DBA'], keep='first')
final = rest.merge(latest, how = 'inner', on = 'DBA')
final.dropna(inplace = True)
final

Unnamed: 0,DBA,Median Income,College Graduation Rate,Poverty Rate,ZipCount,CuisCount,avg_crit_flag,ZipMean,cuis_mean,CRITICAL FLAG
0,China pagoda_6918_5 avenue,69403.0,52.8,21.9,662,662,0.5,0.504532,0.506594,0
1,Happy bowls_61_4 avenue,118533.0,82.2,55.2,1420,1420,0.6,0.507042,0.482553,0
2,La antioquena ii bakery_4007_national st,35167.0,14.4,17.8,652,652,0.5,0.518405,0.494096,1
3,Da vinci pizza_6514_18 avenue,37092.0,30.5,21.3,329,329,0.6,0.480243,0.504644,0
4,Grill point_69-54_main street,51297.0,45.7,24.5,106,106,0.5,0.518868,0.513158,1
...,...,...,...,...,...,...,...,...,...,...
22036,Miso sushi_328_east 6 street,118533.0,82.2,55.2,1420,1420,1.0,0.507042,0.512737,0
22037,The spotted dog / culture in a bowl_1154_1 avenue,123553.0,85.9,13.3,368,368,1.0,0.494565,0.517857,0
22038,Pizza spot_431_dekalb avenue,72818.0,52.5,54.1,338,338,1.0,0.502959,0.504644,0
22039,Twentyonegrains_445_albee square,119940.0,79.2,39.6,1096,1096,1.0,0.495438,0.508621,0


In [10]:
# Mean transformation
'''
def mean_transform(df, avg, count, alpha, gmean):
    return ((df[avg] * df[count]) + (alpha * gmean)) / (df[count] + alpha)

rest['ZipMean'] = mean_transform(rest, 'ZipMean', 'ZipCount', 3, avg_zip_mean)
rest['cuis_mean'] = mean_transform(rest, 'cuis_mean', 'CuisCount', 3, avg_cuis_mean)
'''

"\ndef mean_transform(df, avg, count, alpha, gmean):\n    return ((df[avg] * df[count]) + (alpha * gmean)) / (df[count] + alpha)\n\nrest['ZipMean'] = mean_transform(rest, 'ZipMean', 'ZipCount', 3, avg_zip_mean)\nrest['cuis_mean'] = mean_transform(rest, 'cuis_mean', 'CuisCount', 3, avg_cuis_mean)\n"

In [11]:
# Actual ML Implementation
X = final.iloc[:, 1:-1].values # All but last column
y = final.iloc[:, -1].values # Only last column

kf = KFold(n_splits=20, shuffle=True, random_state=42) # kfold validation

scaler = StandardScaler() # Normalizing Data 
rfc = RandomForestClassifier(n_estimators=100, random_state=42) # Applying randomforest; performs the worst
#svc = SVC() # performs better (1a)
#lr = LogisticRegression() # performs possibly better (2)
#gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42) #(1b)


all_scores = []

# Used for final scores
acc_tot = 0
f1_tot = 0
recall_tot = 0
prec_tot = 0

# Iterate through each fold generated by kf.split()
for train_index, test_index in kf.split(X):
    
    # 1. SPLIT DATA
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # 2. STANDARDIZATION (Crucial Step)
    
    # Calculate mu and sigma ONLY on the training data, then transform it
    X_train_scaled = scaler.fit_transform(X_train)
    
    X_test_scaled = scaler.transform(X_test)
    
    # 3. TRAIN & EVALUATE MODEL
    
    # Train the model on the scaled training data
    #rfc.fit(X_train_scaled, y_train)
    #svc.fit(X_train_scaled, y_train)
    #lr.fit(X_train_scaled, y_train)
    #gb_classifier.fit(X_train_scaled, y_train)

    
    # Predict and evaluate on the scaled test data
    #y_pred = rfc.predict(X_test_scaled)
    #y_pred = svc.predict(X_test_scaled)
    #y_pred = lr.predict(X_test_scaled)
    #y_pred = gb_classifier.predict(X_test_scaled)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    # For final average
    acc_tot += acc
    f1_tot += f1
    recall_tot += recall
    prec_tot += precision

    all_scores.append((acc, f1, recall, precision))


print(f'Recall: {recall_tot/20}\nAccuracy: {acc_tot/20}\nPrecision: {prec_tot/20}\nF1 {f1_tot/20}\n')

Recall: 0.8627795063570955
Accuracy: 0.8060280500129263
Precision: 0.7753480765270624
F1 0.8166219481049455



In [12]:
# Test pure mean vs bayesian target encoding 
# Test out other algorithms to find optimal algorithm
