In [312]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt


In [301]:
# Food -> allergy lookup table (Kaggle):
# https://www.kaggle.com/datasets/boltcutters/food-allergens-and-allergies?resource=download
# Read the local CSV copy and preview the first rows.
allergies = pd.read_csv(filepath_or_buffer='FoodData.csv')
allergies.head(n=5)

Unnamed: 0,Class,Type,Group,Food,Allergy
0,Plant origin,Nut and seed,Oil seed,Almond,Nut Allergy
1,Plant origin,Fruit,Pome fruit,Apple,Oral Allergy Syndrome
2,Plant origin,Fruit,Stone fruit,Apricot,Stone Fruit Allergy
3,Plant origin,Vegetable,Composite vegetable,Artichoke,Insulin Allergy
4,Plant origin,Vegetable,Liliaceous vegetable,Asparagus,Allium Allergy


In [302]:
# New York restaurant menu items (Kaggle):
# https://www.kaggle.com/datasets/anoopjohny/new-york-restaurant-menus-and-details
# Read the local CSV copy and preview the first rows.
restaurantDishes = pd.read_csv(filepath_or_buffer='menu_data.csv')
restaurantDishes.head(n=5)

Unnamed: 0,Restaurant,Item,Price
0,Malecon,Potato Salad,$6.00
1,Malecon,Avocado Salad,$6.00
2,Malecon,Mixed Green Salad,$6.00
3,Malecon,Mixed Green Salad w/ Avocado,$7.00
4,Malecon,Seasard Chicken Salad,$17.00


In [303]:
# Normalise text: discard rows with a missing food or allergy, then
# lower-case both columns.
allergies = (
    allergies
    .dropna(subset=['Food', 'Allergy'])
    .assign(
        Food=lambda frame: frame['Food'].str.lower(),
        Allergy=lambda frame: frame['Allergy'].str.lower(),
    )
)

# Map each allergy category to the list of foods filed under it.
allergy_map = {
    category: list(foods)
    for category, foods in allergies.groupby("Allergy")["Food"]
}
allergy_map


{'allium allergy': ['asparagus',
  'garlic',
  'leek',
  'nira',
  'onion',
  'shallot',
  'welsh'],
 'alpha-gal syndrome': ['cattle', 'deer', 'goat', 'horse', 'pig', 'sheep'],
 'aquagenic urticaria': ['mineral water'],
 'banana allergy': ['banana'],
 'beer allergy': ['hop'],
 'broccoli allergy': ['broccoli'],
 'citrus allergy': ['grapefruit', 'lemon', 'lime', 'orange', 'orange pulp'],
 'corn allergy': ['corn', 'popcorn', 'sweet corn'],
 'cruciferous allergy': ['brussels sprouts',
  'cabbage',
  'cauliflower',
  'chinese cabbage',
  'horseradish',
  'kale',
  'kyona',
  'mustard spinach',
  'qing-geng-cai'],
 'fish allergy': ['eel',
  'globfish',
  'horse mackerel',
  'mackerel',
  'percifomes',
  'salmon',
  'sea bass',
  'sea bream',
  'shelled mollusc',
  'tetraodontiformes',
  'trout',
  'tuna'],
 'gluten allergy': ['barley', 'buckwheat', 'rye', 'wheat'],
 'histamine allergy': ['bamboo shoot', 'bonito', 'ginger', 'okra', 'spinach'],
 'honey allergy': ['honey', 'royal jelly'],
 'hyp

In [304]:
# Food names and their allergy categories, pulled out as parallel arrays.
X_text, y_labels = (allergies[column].values for column in ("Food", "Allergy"))

# Show the first three entries of each array as a quick sanity check.
print("Examples of words in X_text:", X_text[0:3])
print("Examples of words in y_labels:", y_labels[0:3])

Examples of words in X_text: ['almond' 'apple' 'apricot']
Examples of words in y_labels: ['nut allergy' 'oral allergy syndrome' 'stone fruit allergy']


In [305]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF usage follows the scikit-learn reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# Fit the vectorizer on the food names first, then encode those same names
# with the fitted vectorizer.
vectorizer = TfidfVectorizer().fit(X_text)
X = vectorizer.transform(X_text)

In [306]:
# MLPClassifier reference:
# https://scikit-learn.org/1.5/modules/generated/sklearn.neural_network.MLPClassifier.html#
# The solver is left at its default ('adam'), which handles weight optimization.
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    max_iter=800,
    random_state=42,
)

# fit() returns the estimator itself, so printing `mlp` afterwards shows the
# same summary as printing the return value of fit().
mlp.fit(X, y_labels)
print(mlp)

MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=800, random_state=42)


In [307]:
# Lower-case the dish names; missing names are replaced by the placeholder
# "unknown" so that every entry handed to the vectorizer is a string.
restaurantDishes['Item_lower'] = restaurantDishes['Item'].fillna("unknown").astype(str).str.lower()

# Encode menu items with the already-fitted vectorizer (transform only, no refit).
X_menu = vectorizer.transform(restaurantDishes['Item_lower'])

# One row per dish, one column per allergy class (column order = mlp.classes_).
probas = mlp.predict_proba(X_menu)

allergy_classes = mlp.classes_
proba_df = pd.DataFrame(probas, columns=[f"prob_{a}" for a in allergy_classes])

# Make the cell safe to re-run: remove any prob_* columns left over from a
# previous execution before concatenating. Without this, every re-run appends
# a duplicate set of prob_* columns and restaurantDishes["prob_<allergy>"]
# starts returning a DataFrame instead of a Series.
restaurantDishes = pd.concat(
    [
        restaurantDishes
        .reset_index(drop=True)
        .drop(columns=proba_df.columns, errors="ignore"),
        proba_df,
    ],
    axis=1,
)

restaurantDishes.head()

Unnamed: 0,Restaurant,Item,Price,Item_lower,prob_allium allergy,prob_alpha-gal syndrome,prob_aquagenic urticaria,prob_banana allergy,prob_beer allergy,prob_broccoli allergy,...,prob_ragweed allergy,prob_rice allergy,prob_salicylate allergy,prob_seed allergy,prob_shellfish allergy,prob_soy allergy,prob_stone fruit allergy,prob_sugar allergy / intolerance,prob_tannin allergy,prob_thyroid
0,Malecon,Potato Salad,$6.00,potato salad,1.7e-05,0.000437,6e-06,3.3e-05,3.2e-05,1.2e-05,...,5.4e-05,5.194296e-05,9e-06,7.6e-05,1.2e-05,1e-06,0.000504,6.6e-05,3e-05,6.9e-05
1,Malecon,Avocado Salad,$6.00,avocado salad,8.6e-05,4.3e-05,1.1e-05,0.000366,4e-06,0.000369,...,8.4e-05,3.521664e-07,7.4e-05,6.4e-05,2e-06,1.1e-05,9.2e-05,7.4e-05,0.000207,5e-06
2,Malecon,Mixed Green Salad,$6.00,mixed green salad,6.4e-05,6.6e-05,0.000113,8.8e-05,3.3e-05,3e-05,...,3.9e-05,2.398222e-06,1.6e-05,0.000146,4.7e-05,0.0001,1.3e-05,1.9e-05,1e-05,4.1e-05
3,Malecon,Mixed Green Salad w/ Avocado,$7.00,mixed green salad w/ avocado,0.000458,0.000313,0.000188,0.001345,4.9e-05,0.000636,...,0.000318,1.69434e-06,0.000143,0.000758,2.7e-05,0.00015,0.0001,0.000168,0.000193,5.9e-05
4,Malecon,Seasard Chicken Salad,$17.00,seasard chicken salad,2.2e-05,0.000112,4.5e-05,7.8e-05,0.000223,3.4e-05,...,5.7e-05,5.121787e-05,0.00027,0.000327,3.6e-05,0.000131,9.7e-05,0.000132,0.000109,0.000116


In [311]:
# Per-dish safety score = 1 - the largest predicted allergy probability for
# that dish, so a higher score means a lower peak probability.
# NOTE(review): predict_proba presumably returns a distribution over the
# allergy classes, in which case this is the model's confidence in its top
# class rather than the chance that a dish contains an allergen - confirm this
# is the intended metric.
restaurantDishes["dish_safety"] = 1 - proba_df.max(axis=1)

# Average the dish scores per restaurant and rank the highest (safest) first.
restaurantSafety = (
    restaurantDishes
    .groupby("Restaurant")["dish_safety"]
    .mean()
    .rename("SafetyScore")
    .reset_index()
    .sort_values(by="SafetyScore", ascending=False)
)
restaurantSafety


Unnamed: 0,Restaurant,SafetyScore
485,il Buco,0.910554
386,San Pietro,0.910554
86,Celeste,0.910554
424,Taqueria San Pedro,0.905824
245,La Nueva Espana,0.898498
...,...,...
289,Modern Gourmet,0.334527
470,Vivi Bubble Tea,0.328997
269,Mango Mango Dessert,0.294569
39,Bagel Works,0.294250


In [309]:
def rank_by_specific_allergy(allergy: str, dishes: "pd.DataFrame | None" = None) -> "pd.DataFrame":
    """Rank restaurants by their mean per-dish safety for one allergy.

    Safety for a dish is ``1 - prob_<allergy>``; a restaurant's score is the
    mean of its dishes' safety values. The five highest and five lowest
    scoring restaurants are printed, and the full ranking is returned.

    Parameters
    ----------
    allergy : str
        Allergy name; surrounding whitespace is stripped and the name is
        lower-cased before being matched against a ``prob_<allergy>`` column.
    dishes : pandas.DataFrame, optional
        Frame with a ``Restaurant`` column and ``prob_<allergy>`` columns.
        Defaults to the notebook-level ``restaurantDishes`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``Restaurant`` and ``safety_<allergy>``, sorted from the
        highest (safest) score to the lowest.

    Raises
    ------
    ValueError
        If the frame has no ``prob_<allergy>`` column.
    """
    # Fall back to the notebook global so existing one-argument calls still work.
    if dishes is None:
        dishes = restaurantDishes

    allergy = allergy.strip().lower()
    col = f"prob_{allergy}"

    if col not in dishes.columns:
        raise ValueError(f"Allergy '{allergy}' not found.")

    safety_col = f"safety_{allergy}"

    # Compute the safety values on a standalone Series instead of writing a
    # new column into the frame: the input is left untouched, so repeated
    # calls no longer pile up safety_* columns on the shared DataFrame.
    safety = (1 - dishes[col]).rename(safety_col)

    ranking = (
        safety
        .groupby(dishes["Restaurant"])
        .mean()
        .reset_index()
        .sort_values(by=safety_col, ascending=False)
    )

    print(f"\nTop 5 safest restaurants for allergy '{allergy}':")
    print(ranking.head(5).to_string(index=False))

    print(f"\nBottom 5 least safe restaurants for allergy '{allergy}':")
    print(ranking.tail(5).to_string(index=False))

    return ranking

# Swap in any other allergy name here; it must match one of the prob_<allergy> columns.
rank_by_specific_allergy(allergy="tannin allergy")


Top 5 safest restaurants for allergy 'tannin allergy':
    Restaurant  safety_tannin allergy
Rice To Riches               0.994496
 Pret a Manger               0.989917
       Cottage               0.989671
     Red Mango               0.989224
    Golden Wok               0.989181

Bottom 5 least safe restaurants for allergy 'tannin allergy':
                    Restaurant  safety_tannin allergy
                 Cafe Himalaya               0.906816
               Vivi Bubble Tea               0.906728
             Cupping Room Cafe               0.893426
               Alice's Tea Cup               0.884697
Grace Street Coffee & Desserts               0.848913


Unnamed: 0,Restaurant,safety_tannin allergy
370,Rice To Riches,0.994496
361,Pret a Manger,0.989917
103,Cottage,0.989671
368,Red Mango,0.989224
173,Golden Wok,0.989181
...,...,...
75,Cafe Himalaya,0.906816
470,Vivi Bubble Tea,0.906728
110,Cupping Room Cafe,0.893426
14,Alice's Tea Cup,0.884697
