In [1]:
import json
from pathlib import Path

# Path to your JSON file
json_path = Path("../data/recipes_images.json")

# Load it
with open(json_path, "r", encoding="utf-8") as f:
    recipes = json.load(f)

# Now you have a list of dicts
print(f"Loaded {len(recipes)} recipes")
print(recipes[0]["title"])

Loaded 20935 recipes
Pasta-Shell Risotto with Broccoli Rabe


In [4]:
import pandas as pd

df = pd.DataFrame(recipes)
print(df.columns)

Index(['title', 'description', 'ingredients', 'instructions', 'cooking_time',
       'servings', 'ratings', 'tags', 'publish_date', 'image_filename'],
      dtype='object')


In [5]:
df.head()

Unnamed: 0,title,description,ingredients,instructions,cooking_time,servings,ratings,tags,publish_date,image_filename
0,Pasta-Shell Risotto with Broccoli Rabe,Slowly cooking the pasta risotto-style by ladl...,"[1 quart reduced-sodium chicken broth, 1 quart...","{'1': 'Bring broth, water, and 1 teaspoon salt...",35.0,,"{'rating': 3.3, 'count': 9}","{'type': ['Risotto', 'Pasta Shells', 'Pasta'],...","October 15, 2007",
1,"Penne with Shrimp, Red Onion, and Goat Cheese",,"[1 pound whole grain penne, Kosher salt, 3 tab...",{'1': 'Cook penne in a large pot of boiling sa...,30.0,6.0,,"{'type': ['Pasta', 'Penne'], 'cuisine': ['Ital...","June 6, 2012",ba-syn-penne-with-shrimp-red-onion-and-goat-ch...
2,Roasted Salmon with Lentils,The cooking time for lentils can vary widely d...,"[3/4 cup lentils, picked over and rinsed, 1/2 ...",{'1': 'Combine the lentils and 2 1/2 cups wate...,,,,"{'cuisine': ['Mediterranean'], 'ingredient': [...","December 14, 2011",
3,Radicchio with Bacon,Closely related in flavor and spirit to the cl...,"[2 tablespoons extra virgin olive oil, 1/4 pou...","{'1': 'Put the olive oil in a large skillet, p...",,4.0,,"{'ingredient': ['Bacon', 'Cured Meat', 'Meat',...","December 14, 2011",
4,Louisiana Deviled Crab Cakes,"These crab cakes are flavored with the ""holy t...","[1/4 cup finely chopped onion, 1/4 cup finely ...","{'1': 'Cook onion, bell pepper, and celery in ...",,,"{'rating': 4.6, 'count': 110}","{'cuisine': ['Cajun & Creole', 'Southern', 'Am...","August 20, 2004",louisiana-deviled-crab-cakes-104883


In [11]:


# Make sure Pandas shows the full content (e.g., full ingredients or instructions)
pd.set_option("display.max_colwidth", None)  # show full width for each column
pd.set_option("display.max_columns", None)   # show all columns
pd.set_option("display.expand_frame_repr", False)  # don’t wrap to next line

# Show full first row
print(df.iloc[0])


title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [None]:
from rapidfuzz import process
from nltk.stem import WordNetLemmatizer
import re

known_ingredients = ["garlic", "salt", "olive oil", "butter", "onion"]

def normalize_ingredient(raw):
    # Step 1: Clean
    text = re.sub(r"\b[\d/.,]+\s*(cups?|tbsp|tsp|oz|cloves?|grams?|ml|g)\b", "", raw, flags=re.I)
    text = re.sub(r"[^\w\s]", "", text).lower().strip()
    
    # Step 2: Lemmatize
    lemmatizer = WordNetLemmatizer()
    words = " ".join([lemmatizer.lemmatize(w) for w in text.split()])
    
    # Step 3: Match to known ingredients
    match, score = process.extractOne(words, known_ingredients)
    return match if score > 70 else words


In [None]:
import re
from collections import Counter

all_ingredients = []

for ingredients_list in df['ingredients']:
    if isinstance(ingredients_list, list):
        all_ingredients.extend(ingredients_list)

import re

UNITS = r"(cups?|tbsp|tablespoons?|tsp|teaspoons?|oz|ml|g|grams?|kg|lb|pounds?|cloves?|slices?|bunch|quart|pinch|dash|can|ounce|inch)"
DESCRIPTORS = r"(fresh|finely|coarsely|thinly|grated|chopped|minced|diced|ground|optional|divided|packed|seeded|room temperature|beaten|lightly|preferably|local|cooked|leftover)"

def clean_ingredient(raw):
    raw = raw.lower()
    raw = re.sub(r"\b\d+/?\d*\b", "", raw)  # remove numbers and fractions
    raw = re.sub(r"½", "", raw)  # remove half symbol
    raw = re.sub(rf"\b({UNITS})\b", "", raw)
    raw = re.sub(rf"\b({DESCRIPTORS})\b", "", raw)
    raw = re.sub(r"\bingredient info\b", "", raw)

    # ✅ Fix: remove duplicate words like "egg egg"
    raw = re.sub(r"\b(\w+)\s+\1\b", r"\1", raw)

    raw = re.sub(r"[^\w\s]", "", raw)
    raw = re.sub(r"\s+", " ", raw).strip()
    return raw

cleaned_ingredients = [clean_ingredient(i) for i in all_ingredients]
ingredient_counts = Counter(cleaned_ingredients)



In [21]:
cleaned_ingredients[50:100]

['shallot',
 'plus apple cider vinegar',
 'orange juice',
 'white miso fermented soybean paste',
 'peeled ginger',
 'orange peel',
 'extravirgin olive oil',
 'coriander seeds',
 'sesame seeds',
 'coarse kosher salt',
 'diameter red beets peeled',
 'carrots peeled',
 'white miso also known as shiro miso is available in the refrigerated asian foods section of some supermarkets and at a natural food stores and japanese markets',
 'olive oil',
 'garlic',
 'crushed tomatoes',
 'dried oregano',
 'kosher salt freshly pepper',
 'small zucchini sliced',
 'rice or any other grain',
 'large eggs to blend',
 'large eggs farm eggs',
 'plus coarse kosher salt',
 'small garlic sliced',
 'boiling water',
 'watercress',
 'italian parsley',
 'cayenne pepper',
 'extravirgin olive oil',
 'two cans solid white tuna drained',
 'breadcrumbs',
 'large eggs',
 'onion',
 'celery',
 'pimento',
 'lemon juice',
 'prepared horseradish',
 'garlic',
 'pepper',
 'vegetable oil',
 'onion',
 'lemon juice',
 'vegetable o

In [23]:
df[["ingredients"]].to_csv("ingredients_column.csv", index=False)


In [None]:
df["tags"]

In [5]:
import ast
import pandas as pd
# Skip ast.literal_eval — your 'tags' column is already a dict
df_tags_expanded = pd.json_normalize(df["tags"].fillna({}))

# Merge back with original DataFrame
df = df.drop(columns=["tags"]).join(df_tags_expanded)


In [26]:
df.head()

Unnamed: 0,title,description,ingredients,instructions,cooking_time,servings,ratings,publish_date,image_filename,type,cuisine,ingredient,meal,special-consideration,simple-cooking,technique,source,equipment,occasion,misc,cne-video-tags
0,Pasta-Shell Risotto with Broccoli Rabe,"Slowly cooking the pasta risotto-style by ladling hot broth over it allows the shells to soak up all of the broth's aroma and results in a velvety, creamy texture. The bitter bite of the greens adds just enough edge.","[1 quart reduced-sodium chicken broth, 1 quart water, 1 bunch broccoli rabe, 1/4 cup olive oil, 1 medium onion, chopped, 2 garlic cloves, chopped, 1 lb medium pasta shells, 1/2 cup grated Parmigiano-Reggiano]","{'1': 'Bring broth, water, and 1 teaspoon salt to a boil in a medium saucepan, then reduce heat and keep at a bare simmer.', '2': 'Trim tough stem ends from broccoli rabe, then cut remaining stems crosswise into 1-inch pieces and reserve. Coarsely chop florets and leaves.', '3': 'Heat oil in a 5- to 6-quart heavy pot over medium-high heat until it shimmers, then sauté onion with 1/2 teaspoon salt until softened, 3 to 5 minutes. Stir in garlic and sauté 1 minute, then add pasta, broccoli rabe stems, and 3/4 cup hot broth and simmer briskly, stirring constantly, until broth is absorbed. Continue simmering and adding hot broth, about 3/4 cup at a time, stirring frequently and letting each addition be absorbed before adding the next, until pasta is almost al dente and creamy-looking, 12 to 13 minutes. Stir in remaining broccoli rabe and 1/4 teaspoon pepper and cook, stirring, until pasta is al dente and broccoli rabe is just tender, 2 to 3 minutes. (If needed, add more hot broth to moisten; there will be some broth left over.) Stir in cheese and salt and pepper to taste. Serve with more cheese on the side.'}",35.0,,"{'rating': 3.3, 'count': 9}","October 15, 2007",,"[Risotto, Pasta Shells, Pasta]","[Italian, European]","[Broccoli Rabe, Vegetable]","[Side, Dinner]","[Nut Free, Vegetarian]","[30 Minutes or Less, Weeknight Meals]",[Saute],[Gourmet],,,,
1,"Penne with Shrimp, Red Onion, and Goat Cheese",,"[1 pound whole grain penne, Kosher salt, 3 tablespoons extra-virgin olive oil, divided, 1 1/4 pounds medium uncooked shrimp, peeled, deveined, Freshly ground black pepper, 1 large red onion (about 1 pound), halved, thinly sliced crosswise, 3 cups cherry tomatoes (about 1 1/2 pints), 7 tablespoons chopped flat-leaf parsley, divided, 3 tablespoons chopped fresh oregano, divided, 1 5-ounce log fresh goat cheese, crumbled]","{'1': 'Cook penne in a large pot of boiling salted water, stirring occasionally, until al dente.', '2': 'Meanwhile, heat 1 Tbsp. oil in a large heavy skillet over medium-high heat. Season shrimp with salt and pepper. Cook shrimp until just opaque in the center, 1–2 minutes per side. Using a slotted spoon, transfer shrimp to a medium bowl. Add remaining 2 Tbsp. oil and onion to same skillet. Season with salt and pepper and sauté until onion begins to brown and wilt, 5–7 minutes. Add tomatoes; cook, mashing some of tomatoes with the back of a spoon to release juices, until tomatoes are just beginning to soften, about 3 minutes. Stir in 4 Tbsp. parsley, 2 Tbsp. oregano, and shrimp with any accumulated juices.', '3': 'Drain pasta, reserving 1/2 cup pasta cooking liquid; transfer pasta back to pot. Add shrimp mixture to pasta and toss to evenly incorporate; cook, stirring and adding pasta cooking liquid by tablespoonfuls if pasta is too dry, until pasta is tender and mixture is heated through, 1–2 minutes. Season to taste with salt and pepper.', '4': 'Transfer pasta mixture to a large wide bowl. Add cheese; toss gently 2 or 3 times. Sprinkle remaining 3 Tbsp. parsley and 1 Tbsp. oregano over.'}",30.0,6.0,,"June 6, 2012",ba-syn-penne-with-shrimp-red-onion-and-goat-cheese,"[Pasta, Penne]","[Italian American, Italian, European]","[Shrimp, Shellfish, Seafood, Cherry Tomato, Tomato, Vegetable, Goat Cheese, Cheese, Dairy]","[Main, Lunch, Dinner]",[Nut Free],"[Quick, Weeknight Meals]",[Saute],[Bon Appétit],,,,
2,Roasted Salmon with Lentils,"The cooking time for lentils can vary widely depending on their age, as older lentils require more time. For the best results, buy a fresh, new bag. A mustard vinaigrette boosts the flavor of this Mediterranean-inspired dish.","[3/4 cup lentils, picked over and rinsed, 1/2 small onion, chopped, 1 large celery stalk, chopped, 2 tablespoons red-wine vinegar, 1 tablespoon olive oil, 1 tablespoon Dijon mustard, 1 garlic clove, minced, 1/3 cup chopped fresh parsley, Coarse salt and fresh ground pepper, 4 skinless salmon fillets (6 ounces each), Nonstick cooking spray]","{'1': 'Combine the lentils and 2 1/2 cups water in a medium saucepan. Bring to a boil; reduce the heat, and simmer, covered, for 5 minutes. Add the onion and celery; cover, and continue cooking until the lentils and vegetables are just tender, 15 to 25 minutes more. Drain, reserving the cooking liquid. Transfer the lentil mixture to a medium bowl.', '2': 'In a bowl, whisk 2 tablespoons of the reserved cooking liquid with the vinegar, oil, mustard, garlic, and parsley; season with salt and pepper. Toss half the dressing with the lentil mixture.', '3': 'Heat the broiler. Season the salmon with salt and pepper. Coat a baking sheet with cooking spray. Arrange the salmon on the sheet; broil until opaque throughout, 8 to 10 minutes. Spoon the lentils onto 4 plates; top with the salmon, flaking it into large pieces, if desired. Drizzle with the remaining dressing.', '4': 'Chopped celery leaves add wonderful flavor to the lentils. Use them in place of or in addition to the parsley.'}",,,,"December 14, 2011",,,[Mediterranean],"[Salmon, Fish, Seafood, Lentil, Bean and Legume, Vegetable]","[Main, Dinner]","[Dairy Free, Gluten Free, Nut Free]","[Easy, Weeknight Meals]",[Broil],[Cookbooks],,,,
3,Radicchio with Bacon,"Closely related in flavor and spirit to the classic French pissenlit (dandelion greens with bacon), this differs in that the greens are cooked from the start. Also, though it may be finished with lemon, vinegar is almost never used. Other vegetables you can prepare this way: any relatively tender, bitter green—curly endive, escarole, dandelion, even Belgian endive, cut crosswise.","[2 tablespoons extra virgin olive oil, 1/4 pound slab bacon, cut into 1/4- to 1/2-inch chunks, 1 pound radicchio, roughly chopped, Salt and black pepper to taste, Lemon wedges for serving, optional]","{'1': 'Put the olive oil in a large skillet, preferably nonstick, over medium-high heat. Add the bacon and cook, stirring occasionally, until crisp, 5 to 10 minutes. Remove with a slotted spoon and set aside for the moment.', '2': 'Reduce the heat to medium, add the radicchio, and cook, stirring occasionally, until it wilts and becomes tender, about 10 minutes. Add some pepper and salt if necessary, then return the bacon to the pan and cook, stirring occasionally, until the bacon reheats. Serve hot or at room temperature, with the lemon wedges if you like.'}",,4.0,,"December 14, 2011",,,,"[Bacon, Cured Meat, Meat, Radicchio, Chicory, Leafy Greens, Vegetable]","[Starter, Side, Dinner]","[Dairy Free, Gluten Free, Nut Free, Keto]","[Quick, Easy, Weeknight Meals, 5 Ingredients or Fewer]",[Saute],[Cookbooks],,,,
4,Louisiana Deviled Crab Cakes,"These crab cakes are flavored with the ""holy trinity"" of Cajun cooking: onion, green bell pepper, and celery. Serve them with a helping of coleslaw or a green salad.","[1/4 cup finely chopped onion, 1/4 cup finely chopped green bell pepper, 3 tablespoons finely chopped celery, 3 tablespoons unsalted butter, 1 large egg, 1 tablespoon sour cream, 1/2 teaspoon dry mustard, 1/2 teaspoon Worcestershire sauce, 3/4 teaspoon cayenne, 3/4 teaspoon salt, 2 tablespoons thinly sliced scallion greens, 16 saltines, finely ground, 1/2 lb jumbo lump crabmeat, picked over, 1 tablespoon vegetable oil, Accompaniments: tartar sauce and lemon wedges, ]","{'1': 'Cook onion, bell pepper, and celery in 1 tablespoon butter in a 10-inch nonstick skillet over moderately low heat, stirring occasionally, until vegetables are softened, about 8 minutes.', '2': 'Whisk together egg, sour cream, mustard, Worcestershire sauce, cayenne, and salt in a large bowl, then stir in scallion, cooked vegetables, and 1/4 cup saltine crumbs. Gently stir in crabmeat, then form into 4 cakes (2 1/2 to 3 inches in diameter). Dredge cakes in remaining saltine crumbs.', '3': 'Heat oil and remaining 2 tablespoons butter in cleaned skillet over moderate heat until foam subsides, then cook crab cakes, turning once, until golden brown, 8 minutes total.'}",,,"{'rating': 4.6, 'count': 110}","August 20, 2004",louisiana-deviled-crab-cakes-104883,,"[Cajun & Creole, Southern, American]","[Onion, Root Vegetable, Vegetable, Bell Pepper, Celery, Sour Cream, Dairy, Scallion, Crab, Shellfish, Seafood]","[Main, Starter, Lunch, Dinner]",[Nut Free],"[Quick, Easy]",[Fry],[Gourmet],,,,


In [None]:
pd.reset_option("display.max_colwidth")
pd.reset_option("display.max_columns")
pd.reset_option("display.expand_frame_repr")

df.head()



Unnamed: 0,title,description,ingredients,instructions,cooking_time,servings,ratings,publish_date,image_filename,type,...,ingredient,meal,special-consideration,simple-cooking,technique,source,equipment,occasion,misc,cne-video-tags
0,Pasta-Shell Risotto with Broccoli Rabe,Slowly cooking the pasta risotto-style by ladl...,"[1 quart reduced-sodium chicken broth, 1 quart...","{'1': 'Bring broth, water, and 1 teaspoon salt...",35.0,,"{'rating': 3.3, 'count': 9}","October 15, 2007",,"[Risotto, Pasta Shells, Pasta]",...,"[Broccoli Rabe, Vegetable]","[Side, Dinner]","[Nut Free, Vegetarian]","[30 Minutes or Less, Weeknight Meals]",[Saute],[Gourmet],,,,
1,"Penne with Shrimp, Red Onion, and Goat Cheese",,"[1 pound whole grain penne, Kosher salt, 3 tab...",{'1': 'Cook penne in a large pot of boiling sa...,30.0,6.0,,"June 6, 2012",ba-syn-penne-with-shrimp-red-onion-and-goat-ch...,"[Pasta, Penne]",...,"[Shrimp, Shellfish, Seafood, Cherry Tomato, To...","[Main, Lunch, Dinner]",[Nut Free],"[Quick, Weeknight Meals]",[Saute],[Bon Appétit],,,,
2,Roasted Salmon with Lentils,The cooking time for lentils can vary widely d...,"[3/4 cup lentils, picked over and rinsed, 1/2 ...",{'1': 'Combine the lentils and 2 1/2 cups wate...,,,,"December 14, 2011",,,...,"[Salmon, Fish, Seafood, Lentil, Bean and Legum...","[Main, Dinner]","[Dairy Free, Gluten Free, Nut Free]","[Easy, Weeknight Meals]",[Broil],[Cookbooks],,,,
3,Radicchio with Bacon,Closely related in flavor and spirit to the cl...,"[2 tablespoons extra virgin olive oil, 1/4 pou...","{'1': 'Put the olive oil in a large skillet, p...",,4.0,,"December 14, 2011",,,...,"[Bacon, Cured Meat, Meat, Radicchio, Chicory, ...","[Starter, Side, Dinner]","[Dairy Free, Gluten Free, Nut Free, Keto]","[Quick, Easy, Weeknight Meals, 5 Ingredients o...",[Saute],[Cookbooks],,,,
4,Louisiana Deviled Crab Cakes,"These crab cakes are flavored with the ""holy t...","[1/4 cup finely chopped onion, 1/4 cup finely ...","{'1': 'Cook onion, bell pepper, and celery in ...",,,"{'rating': 4.6, 'count': 110}","August 20, 2004",louisiana-deviled-crab-cakes-104883,,...,"[Onion, Root Vegetable, Vegetable, Bell Pepper...","[Main, Starter, Lunch, Dinner]",[Nut Free],"[Quick, Easy]",[Fry],[Gourmet],,,,


In [6]:
df.to_csv("expanded_tags_output.csv", index=False)

In [7]:
from itertools import chain

# Drop missing values and flatten the list of lists
valid_cuisines = df["cuisine"].dropna()
flat_cuisines = list(chain.from_iterable(valid_cuisines))

# Get unique cuisines and their count
unique_cuisines = sorted(set(flat_cuisines))
print(f"Total unique cuisines: {len(unique_cuisines)}")
print(unique_cuisines)


Total unique cuisines: 107
['African', 'American', 'Argentinean', 'Armenian', 'Asian', 'Australian', 'Austrian', 'Bangladeshi', 'Basque', 'Belgian', 'Brazilian', 'British', 'Burmese', 'Cajun & Creole', 'California Cuisine', 'Canadian', 'Cantonese', 'Caribbean', 'Central American', 'Chinese', 'Chinese-American', 'Colombian', 'Cuban', 'Danish', 'Dominican', 'East African', 'East Asian', 'Eastern European', 'Egyptian', 'English', 'Ethiopian', 'European', 'Filipino', 'French', 'Georgian', 'German', 'Greek', 'Haitian', 'Hawaiian', 'Hungarian', 'Indian', 'Indonesian', 'Iranian', 'Irish', 'Israeli', 'Italian', 'Italian American', 'Jamaican', 'Japanese', 'Jewish', 'Korean', 'Laotian', 'Latin American', 'Lebanese', 'Levantine', 'Low Country Cuisine', 'Malaysian', 'Mediterranean', 'Mexican', 'Middle Eastern', 'Moroccan', 'Native American', 'New England', 'New Zealand', 'Nigerian', 'North African', 'Norwegian', 'Oaxacan', 'Pakistani', 'Palestinian', 'Persian', 'Peruvian', 'Polish', 'Portuguese', 

In [None]:
from collections import Counter
from itertools import chain
import ast

# Drop missing values and flatten the list of lists
valid_cuisines = df["cuisine"].dropna()
flat_cuisines = list(chain.from_iterable(valid_cuisines))

# Count occurrences
cuisine_counts = Counter(flat_cuisines)

# Sort and display
for cuisine, count in sorted(cuisine_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{cuisine}: {count}")


European: 3083
American: 2481
Italian: 1789
Asian: 1738
Southern: 732
French: 603
Latin American: 571
East Asian: 569
Middle Eastern: 516
Southeast Asian: 488
Italian American: 486
Mexican: 446
South Asian: 358
Indian: 329
Chinese: 246
African: 218
Japanese: 211
Spanish: 197
North African: 188
Vietnamese: 182
Thai: 180
Mediterranean: 174
Jewish: 169
Cajun & Creole: 167
Caribbean: 135
Tex-Mex: 116
Moroccan: 113
British: 112
Southwestern: 110
Greek: 108
Korean: 105
Eastern European: 82
South American: 72
Persian: 69
Turkish: 69
New England: 67
California Cuisine: 60
German: 57
Scandinavian: 53
Irish: 46
Lebanese: 38
Cuban: 37
Sicilian: 31
Chinese-American: 29
Filipino: 29
English: 29
Portuguese: 28
Soul Food: 27
Iranian: 26
Indonesian: 23
Hawaiian: 22
Egyptian: 21
Cantonese: 21
Israeli: 20
Russian: 19
Jamaican: 18
Swedish: 18
Scottish: 16
Sichuanese: 16
Argentinean: 16
Central American: 15
Peruvian: 14
Austrian: 13
Polish: 13
Pakistani: 13
Brazilian: 13
Malaysian: 13
Levantine: 12
Puerto

In [8]:
# Mapping from specific cuisine to grouped region
cuisine_region_map = {
    # Middle Eastern
    'Lebanese': 'Middle Eastern', 'Persian': 'Middle Eastern', 'Iranian': 'Middle Eastern',
    'Israeli': 'Middle Eastern', 'Palestinian': 'Middle Eastern', 'Levantine': 'Middle Eastern',
    'Yemeni': 'Middle Eastern',
 
    # African
    'Moroccan': 'African', 'Egyptian': 'African', 'Nigerian': 'African', 'Ethiopian': 'African',
    'South African': 'African', 'West African': 'African', 'East African': 'African', 'North African': 'African',
    'Tunisian': 'African', 'Somali': 'African',
 
    # Asian
    'Chinese': 'Asian', 'Japanese': 'Asian', 'Korean': 'Asian',
    'Chinese-American': 'Asian', 'Taiwanese': 'Asian', 'Cantonese': 'Asian',
    'Sichuanese': 'Asian', 'Armenian': 'Asian',  'Tibetan': 'Asian', 'Shanghainese': 'Asian',
    'Indian': 'Asian', 'Pakistani': 'Asian', 'Bangladeshi': 'Asian', 'Sri Lankan': 'Asian',
    'Vietnamese': 'Asian', 'Thai': 'Asian', 'Filipino': 'Asian',
    'Indonesian': 'Asian', 'Malaysian': 'Asian', 'Burmese': 'Asian','East Asian':'Asian','Southeast Asian':'Asian','South Asian':'Asian',
 
    # Latin America
    'Mexican': 'Mexican', 'Brazilian': 'Latin American', 'Argentinean': 'Latin American',
    'Colombian': 'Latin American', 'Peruvian': 'Latin American', 'Caribbean': 'Latin American',
    'Cuban': 'Latin American', 'Puerto Rican': 'Latin American', 'Central American': 'Latin American',
    'South American': 'Latin American', 'Trinidadian': 'Latin American', 'Dominican': 'Latin American',
    'Jamaican': 'Latin American', 'Oaxacan': 'Latin American', 'Venezuelan': 'Latin American',
    'Salvadoran': 'Latin American',
 
    # Europe
    'Italian': 'Italian', 'Italian American': 'Italian', 'French': 'French', 'Greek': 'European',
    'German': 'European', 'Spanish': 'European', 'British': 'European', 'Eastern European': 'European',
    'Portuguese': 'European', 'Irish': 'European', 'Russian': 'European', 'Scandinavian': 'European',
    'Austrian': 'European', 'Hungarian': 'European', 'Belgian': 'European', 'Swiss': 'European',
    'Turkish': 'European', 'Sicilian': 'European', 'English': 'European', 'Swedish': 'European',
    'Scottish': 'European', 'Polish': 'European', 'Laotian': 'European', 'Romanian': 'European',
    'Danish': 'European', 'Basque': 'European', 'Georgian': 'European', 'Ukrainian': 'European',
    'Norwegian': 'European',
 
    # North America
    'American': 'North American', 'Southern': 'North American', 'Cajun & Creole': 'North American',
    'Tex-Mex': 'North American', 'California Cuisine': 'North American', 'Soul Food': 'North American',
    'New England': 'North American', 'Canadian': 'North American', 'Southwestern': 'North American',
    'Hawaiian': 'North American', 'Low Country Cuisine': 'North American',
 
    # Other
    'Jewish': 'Middle Eastern', 'Mediterranean': 'Middle Eastern', 'Australian': 'Australian',
    'Native American': 'Latin American', 'Haitian': 'Latin American', 'New Zealand': 'Australian'
}

# Apply the mapping (fallback to original name if not mapped)
def map_to_region(cuisine_list):
    if not isinstance(cuisine_list, list):
        return []
    return list({cuisine_region_map.get(c, c) for c in cuisine_list})

# Add new column
df["cuisine_grouped"] = df["cuisine"].apply(map_to_region)


In [9]:
df["cuisine_grouped"].head()

0    [Italian, European]
1    [Italian, European]
2       [Middle Eastern]
3                     []
4       [North American]
Name: cuisine_grouped, dtype: object

In [1]:
# Drop missing values and flatten the list of lists
valid_cuisines = df["cuisine_grouped"].dropna()
flat_cuisines = list(chain.from_iterable(valid_cuisines))

# Count occurrences
cuisine_counts = Counter(flat_cuisines)

# Sort and display
for cuisine, count in sorted(cuisine_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{cuisine}: {count}")


NameError: name 'df' is not defined

In [58]:
df["cuisine_grouped"].iloc[171]

['Asian']

In [37]:
# Drop missing values and flatten the list of lists
valid_ingredients = df["ingredient"].dropna()
flat_ingredients = list(chain.from_iterable(valid_ingredients))

# Count occurrences
ingredients_counts = Counter(flat_ingredients)

# Sort and display
for ingredient, count in sorted(ingredient_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{ingredient}: {count}")


olive oil: 3874
garlic: 3803
salt: 3739
sugar: 3144
kosher salt: 3112
extravirgin olive oil: 2725
lemon juice: 2293
water: 2029
freshly black pepper: 1959
: 1501
unsalted butter: 1464
vegetable oil: 1459
large eggs: 1279
allpurpose flour: 1163
cinnamon: 1073
black pepper: 971
cumin: 959
lime juice: 907
honey: 869
cilantro: 790
dijon mustard: 756
onion: 756
dry white wine: 755
soy sauce: 744
heavy cream: 725
parsley: 680
mayonnaise: 667
kosher salt freshly pepper: 665
butter: 627
baking powder: 616
vanilla extract: 609
flatleaf parsley: 578
kosher salt and freshly black pepper: 542
salt and freshly black pepper: 533
whole milk: 524
ginger: 511
red wine vinegar: 509
large garlic: 498
medium onion: 494
cornstarch: 492
stick unsalted butter: 489
peeled ginger: 485
chives: 476
cayenne pepper: 467
granulated sugar: 451
lemon zest: 439
garlic sliced: 436
salt and pepper: 434
turmeric: 429
large egg: 428
coarse salt: 419
crushed red pepper flakes: 415
canola oil: 412
all purpose flour: 405
bak

In [63]:
# Remove 'Australian' from all rows in cuisine_grouped
df["cuisine_grouped"] = df["cuisine_grouped"].apply(
    lambda tags: [tag for tag in tags if tag != "Australian"]
)


In [65]:
# Clean the ingredient column: replace NaNs with empty lists
df["ingredient"] = df["ingredient"].apply(lambda x: x if isinstance(x, list) else [])

# Clean the cuisine column too
df["cuisine_grouped"] = df["cuisine_grouped"].apply(lambda x: x if isinstance(x, list) else [])

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Transform ingredients into binary matrix
ingredient_mlb = MultiLabelBinarizer()
X = ingredient_mlb.fit_transform(df["ingredient"])

# Transform cuisine labels
cuisine_mlb = MultiLabelBinarizer()
Y = cuisine_mlb.fit_transform(df["cuisine_grouped"])



In [43]:
X[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [45]:
Y[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [67]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Wrap logistic regression for multi-label classification
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, Y_train)


In [76]:
from sklearn.metrics import classification_report

# Y_pred = model.predict(X_test)

Y_scores = model.predict_proba(X_test)
Y_pred = (Y_scores >= 0.2).astype(int)  # lower threshold

print(classification_report(Y_test, Y_pred, target_names=cuisine_mlb.classes_))


                precision    recall  f1-score   support

       African       0.23      0.12      0.16        50
         Asian       0.51      0.64      0.57       377
      European       0.37      0.57      0.45       653
        French       0.19      0.08      0.11       130
       Italian       0.43      0.57      0.49       359
Latin American       0.36      0.33      0.34       136
       Mexican       0.36      0.33      0.35        84
Middle Eastern       0.41      0.38      0.40       168
North American       0.31      0.42      0.36       477

     micro avg       0.39      0.48      0.43      2434
     macro avg       0.35      0.38      0.36      2434
  weighted avg       0.38      0.48      0.42      2434
   samples avg       0.20      0.21      0.20      2434



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
cuisine_mlb.classes_[27]

'Salvadoran'

In [83]:
input=ingredient_mlb.transform(["Spaghetti", "Egg"])




In [84]:
Y_scores = model.predict_proba(input)
Y_pred = (Y_scores >= 0.1).astype(int) 

In [85]:
print(Y_pred)

[[0 0 1 0 0 0 0 0 1]
 [0 0 1 0 0 0 0 0 1]]


In [86]:
# Get probabilities for each label
Y_proba = model.predict_proba(X_test)
from sklearn.metrics import precision_recall_curve, f1_score
import numpy as np

optimal_thresholds = []

for i in range(Y.shape[1]):
    precision, recall, thresholds = precision_recall_curve(Y_test[:, i], Y_proba[:, i])
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
    best_idx = np.argmax(f1)
    optimal_thresh = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
    optimal_thresholds.append(optimal_thresh)

optimal_thresholds = np.array(optimal_thresholds)


In [87]:
Y_pred_tuned = np.zeros_like(Y_proba)

for i in range(Y_proba.shape[1]):
    Y_pred_tuned[:, i] = (Y_proba[:, i] >= optimal_thresholds[i]).astype(int)


In [88]:
from sklearn.metrics import classification_report

print(classification_report(Y_test, Y_pred_tuned, target_names=cuisine_mlb.classes_))


                precision    recall  f1-score   support

       African       0.20      0.20      0.20        50
         Asian       0.52      0.64      0.58       377
      European       0.39      0.53      0.45       653
        French       0.18      0.17      0.18       130
       Italian       0.53      0.48      0.50       359
Latin American       0.27      0.58      0.37       136
       Mexican       0.33      0.42      0.37        84
Middle Eastern       0.40      0.41      0.41       168
North American       0.38      0.37      0.38       477

     micro avg       0.40      0.47      0.43      2434
     macro avg       0.36      0.42      0.38      2434
  weighted avg       0.41      0.47      0.43      2434
   samples avg       0.20      0.20      0.19      2434



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [89]:
# Get class probabilities
Y_proba = model.predict_proba(X_test)

# For each sample, select the label with highest probability
import numpy as np

Y_pred_top1 = np.zeros_like(Y_proba)
top_indices = np.argmax(Y_proba, axis=1)

for i, idx in enumerate(top_indices):
    Y_pred_top1[i, idx] = 1
    
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred_top1, target_names=cuisine_mlb.classes_))

                precision    recall  f1-score   support

       African       0.21      0.12      0.15        50
         Asian       0.40      0.74      0.52       377
      European       0.23      0.55      0.32       653
        French       0.11      0.04      0.06       130
       Italian       0.44      0.32      0.37       359
Latin American       0.28      0.23      0.25       136
       Mexican       0.36      0.24      0.29        84
Middle Eastern       0.32      0.37      0.35       168
North American       0.24      0.61      0.34       477

     micro avg       0.28      0.48      0.35      2434
     macro avg       0.29      0.36      0.29      2434
  weighted avg       0.30      0.48      0.34      2434
   samples avg       0.28      0.22      0.24      2434



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [92]:
top_k = 2
Y_pred_topk = np.zeros_like(Y_proba)

top_indices_k = np.argsort(Y_proba, axis=1)[:, -top_k:]

for i, row in enumerate(top_indices_k):
    Y_pred_topk[i, row] = 1
    
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred_topk, target_names=cuisine_mlb.classes_))


                precision    recall  f1-score   support

       African       0.10      0.16      0.12        50
         Asian       0.31      0.85      0.45       377
      European       0.20      0.93      0.34       653
        French       0.10      0.15      0.12       130
       Italian       0.28      0.74      0.41       359
Latin American       0.24      0.62      0.34       136
       Mexican       0.26      0.39      0.31        84
Middle Eastern       0.20      0.50      0.29       168
North American       0.17      0.80      0.28       477

     micro avg       0.22      0.74      0.33      2434
     macro avg       0.21      0.57      0.30      2434
  weighted avg       0.22      0.74      0.33      2434
   samples avg       0.22      0.32      0.25      2434



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [93]:
from itertools import chain

# Drop missing values and flatten the list of lists
valid_meals = df["meal"].dropna()
flat_meals = list(chain.from_iterable(valid_meals))

# Get unique cuisines and their count
unique_meals = sorted(set(flat_meals))
print(f"Total unique cuisines: {len(unique_meals)}")
print(unique_meals)


Total unique cuisines: 8
['Breakfast', 'Brunch', 'Dinner', 'Lunch', 'Main', 'Side', 'Snack', 'Starter']


In [98]:
# Transform cuisine labels
df["meal"] = df["meal"].apply(lambda x: x if isinstance(x, list) else [])
meal_mlb = MultiLabelBinarizer()
Y_meal = meal_mlb.fit_transform(df["meal"])

In [99]:
from sklearn.model_selection import train_test_split
# Clean the ingredient column: replace NaNs with empty lists

X_train, X_test, Y_meal_train, Y_meal_test = train_test_split(X, Y_meal, test_size=0.2, random_state=42)

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Wrap logistic regression for multi-label classification
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, Y_meal_train)


In [101]:
top_k = 2
Y_meal_proba = model.predict_proba(X_test)
Y_pred_topk = np.zeros_like(Y_meal_proba)

top_indices_k = np.argsort(Y_meal_proba, axis=1)[:, -top_k:]

for i, row in enumerate(top_indices_k):
    Y_pred_topk[i, row] = 1
    
from sklearn.metrics import classification_report
print(classification_report(Y_meal_test, Y_pred_topk, target_names=meal_mlb.classes_))


              precision    recall  f1-score   support

   Breakfast       0.24      0.12      0.16       206
      Brunch       0.14      0.05      0.08       301
      Dinner       0.81      0.99      0.89      3279
       Lunch       0.40      0.20      0.26      1200
        Main       0.82      0.80      0.81      2173
        Side       0.56      0.59      0.57      1125
       Snack       0.17      0.11      0.13       254
     Starter       0.42      0.05      0.09       736

   micro avg       0.72      0.65      0.68      9274
   macro avg       0.45      0.36      0.37      9274
weighted avg       0.65      0.65      0.63      9274
 samples avg       0.72      0.60      0.64      9274



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [102]:
import xgboost as xgb
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', max_depth=6, n_estimators=100)

# Wrap XGBoost in a OneVsRestClassifier
model = OneVsRestClassifier(xgb_model)

# Train the model
model.fit(X_train, Y_meal_train)

# Predict on the test set
Y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(Y_meal_test, Y_pred, target_names=meal_mlb.classes_))

              precision    recall  f1-score   support

   Breakfast       0.43      0.05      0.09       206
      Brunch       0.53      0.06      0.11       301
      Dinner       0.91      0.95      0.93      3279
       Lunch       0.62      0.34      0.44      1200
        Main       0.83      0.83      0.83      2173
        Side       0.67      0.63      0.65      1125
       Snack       0.42      0.02      0.04       254
     Starter       0.53      0.15      0.24       736

   micro avg       0.82      0.67      0.73      9274
   macro avg       0.62      0.38      0.42      9274
weighted avg       0.76      0.67      0.69      9274
 samples avg       0.68      0.60      0.62      9274



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [103]:
top_k = 2
Y_meal_proba = model.predict_proba(X_test)
Y_pred_topk = np.zeros_like(Y_meal_proba)

top_indices_k = np.argsort(Y_meal_proba, axis=1)[:, -top_k:]

for i, row in enumerate(top_indices_k):
    Y_pred_topk[i, row] = 1
    
from sklearn.metrics import classification_report
print(classification_report(Y_meal_test, Y_pred_topk, target_names=meal_mlb.classes_))

              precision    recall  f1-score   support

   Breakfast       0.29      0.15      0.20       206
      Brunch       0.16      0.08      0.11       301
      Dinner       0.82      0.99      0.90      3279
       Lunch       0.39      0.18      0.25      1200
        Main       0.83      0.81      0.82      2173
        Side       0.61      0.62      0.61      1125
       Snack       0.15      0.17      0.16       254
     Starter       0.57      0.05      0.10       736

   micro avg       0.72      0.65      0.69      9274
   macro avg       0.48      0.38      0.39      9274
weighted avg       0.67      0.65      0.64      9274
 samples avg       0.72      0.60      0.64      9274



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [111]:
from itertools import chain

# Drop missing values and flatten the list of lists
valid_special = df["special-consideration"].dropna()
flat_special = list(chain.from_iterable(valid_special))

# Get unique cuisines and their count
unique_special = sorted(set(flat_special))
print(f"Total unique cuisines: {len(unique_special)}")
print(unique_special)


Total unique cuisines: 6
['Dairy Free', 'Gluten Free', 'Keto', 'Nut Free', 'Vegan', 'Vegetarian']


In [112]:
# Drop missing values and flatten the list of lists

flat_special = list(chain.from_iterable(valid_special))

# Count occurrences
special_counts = Counter(flat_special)

# Sort and display
for special, count in sorted(special_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{special}: {count}")


Nut Free: 14854
Gluten Free: 11007
Dairy Free: 9456
Vegetarian: 8647
Vegan: 3687
Keto: 1204


In [110]:
columns_to_remove = ["Low Fat", "Low Cal", "Paleo", "Low Carb", "Sugar Conscious", "Fat Free", "Kid-Friendly", 
                     "Kosher", "Pescatarian", "Quick & Easy", "Soy Free", "Healthyish", "Raw"]

columns_set = set(columns_to_remove)

# Apply function with additional check for NaN
df["special-consideration"] = df["special-consideration"].apply(
    lambda tags: [tag for tag in tags if isinstance(tags, list) and tag not in columns_set] if isinstance(tags, list) else tags
)


In [116]:
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

df["special-consideration"] = df["special-consideration"].apply(lambda x: x if isinstance(x, list) else [])

# Transform the special consideration column into binary matrix
Y_consideration = mlb.fit_transform(df["special-consideration"])

# Now we have the binary labels
from sklearn.model_selection import train_test_split

# Let's assume that you have your features (X) already prepared
# X = your feature set (e.g., ingredients, other relevant columns)

# Split the data: 80% train, 20% test
X_train, X_test, Y_consideration_train, Y_consideration_test = train_test_split(X, Y_consideration, test_size=0.2, random_state=42)


In [117]:
import xgboost as xgb
from sklearn.multiclass import OneVsRestClassifier

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', max_depth=6, n_estimators=100)

# Wrap XGBoost with OneVsRestClassifier for multi-label classification
model = OneVsRestClassifier(xgb_model)

# Train the model
model.fit(X_train, Y_consideration_train)


In [118]:
import numpy as np

# Get the probabilities of the predictions
Y_proba = model.predict_proba(X_test)

# Initialize an array for storing the top-2 predictions
top2_predictions = np.zeros_like(Y_proba)

# Get the indices of the top-2 labels for each sample
top_indices = np.argsort(Y_proba, axis=1)[:, -2:]

# Set the top-2 predicted labels to 1 (in binary matrix format)
for i, indices in enumerate(top_indices):
    top2_predictions[i, indices] = 1


In [119]:
from sklearn.metrics import classification_report

# Print classification report for top-2 labels prediction
print(classification_report(Y_consideration_test, top2_predictions, target_names=mlb.classes_))


              precision    recall  f1-score   support

  Dairy Free       0.70      0.49      0.58      1932
 Gluten Free       0.57      0.42      0.48      2220
        Keto       0.00      0.00      0.00       252
    Nut Free       0.82      0.92      0.87      3032
       Vegan       0.69      0.06      0.12       729
  Vegetarian       0.75      0.83      0.79      1732

   micro avg       0.73      0.62      0.67      9897
   macro avg       0.59      0.45      0.47      9897
weighted avg       0.70      0.62      0.63      9897
 samples avg       0.73      0.65      0.65      9897



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [120]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, Y_consideration_train)

# Train with a classifier with class weights
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_resampled, y_resampled)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(Y_consideration_test, y_pred))

ValueError: Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclasss targets. Multilabel and multioutput targets are not supported.

In [121]:
from sklearn.multioutput import MultiOutputClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import classification_report

# Assuming Y_consideration_train is a binary matrix where each column is a dietary label
# Get the shape to verify
print(Y_consideration_train.shape)  # Should be (n_samples, 6) for your 6 categories

# Create a separate classifier for each label with its own SMOTE
classifiers = []
for i in range(Y_consideration_train.shape[1]):
    # Extract the binary label for this dietary restriction
    y_single = Y_consideration_train[:, i]
    
    # Apply SMOTE for this single label
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train, y_single)
    
    # Train model on the resampled data
    model = RandomForestClassifier(random_state=42)
    model.fit(X_res, y_res)
    
    # Store the trained model
    classifiers.append(model)

# To predict all labels for a new sample
def predict_all(X_new):
    predictions = np.zeros((X_new.shape[0], len(classifiers)))
    for i, clf in enumerate(classifiers):
        predictions[:, i] = clf.predict(X_new)
    return predictions

# Example evaluation
y_pred = predict_all(X_test)
for i, label in enumerate(['Nut Free', 'Gluten Free', 'Dairy Free', 'Vegetarian', 'Vegan', 'Keto']):
    print(f"Performance for {label}:")
    print(classification_report(Y_consideration_test[:, i], y_pred[:, i]))

(16748, 6)




Performance for Nut Free:
              precision    recall  f1-score   support

           0       0.80      0.71      0.75      2255
           1       0.70      0.79      0.74      1932

    accuracy                           0.75      4187
   macro avg       0.75      0.75      0.75      4187
weighted avg       0.75      0.75      0.75      4187

Performance for Gluten Free:
              precision    recall  f1-score   support

           0       0.64      0.66      0.65      1967
           1       0.69      0.67      0.68      2220

    accuracy                           0.67      4187
   macro avg       0.67      0.67      0.67      4187
weighted avg       0.67      0.67      0.67      4187

Performance for Dairy Free:
              precision    recall  f1-score   support

           0       0.95      0.86      0.90      3935
           1       0.13      0.32      0.18       252

    accuracy                           0.83      4187
   macro avg       0.54      0.59      0.54  

In [122]:
classifiers = []

# Apply SMOTE for each label separately and train a Random Forest classifier for each label
for i in range(Y_consideration_train.shape[1]):
    # Extract the binary label for this dietary restriction
    y_single = Y_consideration_train[:, i]
    
    # Apply SMOTE for this single label
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train, y_single)
    
    # Train model on the resampled data
    model = RandomForestClassifier(random_state=42)
    model.fit(X_res, y_res)
    
    # Store the trained model
    classifiers.append(model)

# Function to predict all labels for a new sample
def predict_all(X_new):
    predictions = np.zeros((X_new.shape[0], len(classifiers)))
    for i, clf in enumerate(classifiers):
        predictions[:, i] = clf.predict(X_new)
    return predictions

# Make predictions on the test set
y_pred = predict_all(X_test)

# Evaluate the performance for each label
for i, label in enumerate(mlb.classes_):
    print(f"Performance for {label}:")
    print(classification_report(Y_consideration_test[:, i], y_pred[:, i]))



Performance for Dairy Free:
              precision    recall  f1-score   support

           0       0.80      0.71      0.75      2255
           1       0.70      0.79      0.74      1932

    accuracy                           0.75      4187
   macro avg       0.75      0.75      0.75      4187
weighted avg       0.75      0.75      0.75      4187

Performance for Gluten Free:
              precision    recall  f1-score   support

           0       0.64      0.66      0.65      1967
           1       0.69      0.67      0.68      2220

    accuracy                           0.67      4187
   macro avg       0.67      0.67      0.67      4187
weighted avg       0.67      0.67      0.67      4187

Performance for Keto:
              precision    recall  f1-score   support

           0       0.95      0.86      0.90      3935
           1       0.13      0.32      0.18       252

    accuracy                           0.83      4187
   macro avg       0.54      0.59      0.54      

In [1]:
!pip show fastapi
!pip show pydantic
!pip show mlflow
!pip show scikit-learn
!pip show numpy
!pip show pandas


Name: fastapi
Version: 0.115.12
Summary: FastAPI framework, high performance, easy to learn, fast to code, ready for production
Home-page: https://github.com/fastapi/fastapi
Author: 
Author-email: =?utf-8?q?Sebasti=C3=A1n_Ram=C3=ADrez?= <tiangolo@gmail.com>
License: 
Location: C:\Users\elion\AppData\Roaming\Python\Python312\site-packages
Requires: pydantic, starlette, typing-extensions
Required-by: 
Name: pydantic
Version: 2.8.2
Summary: Data validation using Python type hints
Home-page: https://github.com/pydantic/pydantic
Author: 
Author-email: Samuel Colvin <s@muelcolvin.com>, Eric Jolibois <em.jolibois@gmail.com>, Hasan Ramezani <hasan.r67@gmail.com>, Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>, Terrence Dorsey <terry@pydantic.dev>, David Montague <david@pydantic.dev>, Serge Matveenko <lig@countzero.co>, Marcelo Trylesinski <marcelotryle@gmail.com>, Sydney Runkle <sydneymarierunkle@gmail.com>, David Hewitt <mail@davidhewitt.io>, Alex Hall <alex.mojaki@gmail.