## Prediction modeling using NLP

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from rapidfuzz import process, fuzz
from collections import Counter,defaultdict


In [32]:
import warnings
warnings.filterwarnings("ignore")

In [13]:
ingredients_df = pd.read_csv('dishesDataset.csv')
ingredients_df= ingredients_df[['TranslatedRecipeName', 'TranslatedIngredients', 'TotalTimeInMins',
       'Cuisine', 'Cleaned-Ingredients', 'Ingredient-count']].copy()
display(ingredients_df.head())

Unnamed: 0,TranslatedRecipeName,TranslatedIngredients,TotalTimeInMins,Cuisine,Cleaned-Ingredients,Ingredient-count
0,Masala Karela Recipe,"1 tablespoon Red Chilli powder,3 tablespoon Gr...",45,Indian,"salt,amchur (dry mango powder),karela (bitter ...",10
1,Spicy Tomato Rice (Recipe),"2 teaspoon cashew - or peanuts, 1/2 Teaspoon ...",15,South Indian Recipes,"tomato,salt,chickpea lentils,green chilli,rice...",12
2,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1 Onion - sliced,1 teaspoon White Urad Dal (Sp...",50,South Indian Recipes,"salt,rice vermicelli noodles (thin),asafoetida...",12
3,Gongura Chicken Curry Recipe - Andhra Style Go...,"1/2 teaspoon Turmeric powder (Haldi),1 tablesp...",45,Andhra,"tomato,salt,ginger,sorrel leaves (gongura),fen...",15
4,Andhra Style Alam Pachadi Recipe - Adrak Chutn...,"oil - as per use, 1 tablespoon coriander seed...",30,Andhra,"tomato,salt,ginger,red chillies,curry,asafoeti...",12


In [14]:
ingredients_df.dropna(subset=['TranslatedRecipeName', 'Cleaned-Ingredients'], inplace=True)
ingredients_df.drop_duplicates(subset='TranslatedRecipeName', inplace=True)

_bracket_patterns = [r"\([^)]*\)", r"\[[^\]]*\]", r"\{[^}]*\}"]

def remove_bracketed(text: str) -> str:
    if not isinstance(text, str):
        return ""
    out = text
    changed = True
    while changed:
        old = out
        for pat in _bracket_patterns:
            out = re.sub(pat, "", out)
        changed = (out != old)
    return out

def clean_ingredients_cell(text: str):
    text = (text or "").lower()
    text = remove_bracketed(text)
    text = re.sub(r"\s+", " ", text).strip()
    parts = [p.strip(" -") for p in text.split(",")]
    parts = [p for p in parts if p]
    seen = set()
    deduped = []
    for p in parts:
        if p not in seen:
            seen.add(p)
            deduped.append(p)
    return ','.join(deduped)

ingredients_df["Cleaned-Ingredients"] = ingredients_df["Cleaned-Ingredients"].apply(clean_ingredients_cell)


In [16]:
from unidecode import unidecode

def clean_recipe_name(name):
    if not isinstance(name, str) or not name.strip():
        return None

    name = unidecode(name)
    name = name.lower().strip()

    noise_patterns = [
        r'\brecipe\b', r'\bhow to make\b', r'\bvideo\b', r'\bquick & spicy', r'\bquick & easy',r'\bhealthy & delicious', r'\beggless &gluten',r'\bdelicious & cheesy',
        r'\bdelicious\b', r'\bhealthy\b', r'\bwholesome\b',
        r'\bin hindi\b', r'\binstant pot\b', r'\bquick\b',
        r'\bminutes?\b', r'\bminute\b', r'\beggless\b'
        
    ]
    for pat in noise_patterns:
        name = re.sub(pat, '', name)

    keep = []
    for match in re.finditer(r'\(([^)]*)\)', name):
        content = match.group(1).strip()
        if 1 <= len(content.split()) <= 5:  
            keep.append(content)
    name = re.sub(r'\([^)]*\)', '', name)

    parts = re.split(r'\s*[-/|]\s*', name)
    parts = [re.sub(r'\s+', ' ', p).strip(' .,;') for p in parts if p.strip()]

    all_parts = list(dict.fromkeys(parts + keep)) 

    if not all_parts:
        return None

    canonical = all_parts[0]
    aliases = all_parts[1:]
    cleaned = ', '.join(all_parts)
    return canonical, aliases, cleaned

ingredients_df[['Canonical_Name', 'Name_Aliases', 'Cleaned_RecipeName']] = (
    ingredients_df['TranslatedRecipeName']
    .apply(lambda x: pd.Series(clean_recipe_name(x) or ('', [], '')))
)


In [17]:
ingredients_df.to_csv('Cleaned_DishesDataset.csv', index=False)

In [18]:
final_df = ingredients_df[["Canonical_Name", "Cleaned_RecipeName", 'Cleaned-Ingredients']].copy()
final_df.head()

Unnamed: 0,Canonical_Name,Cleaned_RecipeName,Cleaned-Ingredients
0,masala karela,masala karela,"salt,amchur,karela,red chilli powder,gram flou..."
1,spicy tomato rice,spicy tomato rice,"tomato,salt,chickpea lentils,green chilli,rice..."
2,ragi semiya upma,"ragi semiya upma, ragi millet vermicelli break...","salt,rice vermicelli noodles,asafoetida,mustar..."
3,gongura chicken curry,"gongura chicken curry, andhra style gongura ch...","tomato,salt,ginger,sorrel leaves,fennel seeds,..."
4,andhra style alam pachadi,"andhra style alam pachadi, adrak chutney","tomato,salt,ginger,red chillies,curry,asafoeti..."


In [21]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def embed_aliases(alias_string):
    aliases = [a.strip() for a in alias_string.split(",") if a.strip()]
    vecs = model.encode(aliases)
    return np.mean(vecs, axis=0)

final_df["embedding"] = final_df["Cleaned_RecipeName"].apply(embed_aliases)


In [59]:
PROTEIN_LEX = {
    "veg": {"paneer","tofu","mushroom","baby corn","cauliflower","matar","kaju","soya","mixed veg"},
    "nonveg": {"chicken","egg","mutton","fish","prawn","shrimp","crab"},
}

def detect_protein(name: str):
    n = " " + name.lower() + " "
    for p in sorted(PROTEIN_LEX["nonveg"], key=len, reverse=True):
        if f" {p} " in n:
            return p, "nonveg"
    for p in sorted(PROTEIN_LEX["veg"], key=len, reverse=True):
        if f" {p} " in n:
            return p, "veg"
    return None, None
final_df["protein"] = final_df["Canonical_Name"].apply(lambda s: detect_protein(s)[0])
final_df["vegflag"] = final_df["Canonical_Name"].apply(lambda s: detect_protein(s)[1] or "unknown")

In [34]:
final_df = final_df.reset_index(drop=True)
final_df['recipe_id'] = final_df.index
emb_matrix = np.vstack(final_df["embedding"].values) 


In [35]:
alias_df = (
    final_df[["recipe_id","Cleaned_RecipeName"]]
    .assign(alias_list=lambda d: d["Cleaned_RecipeName"].str.split(","))
    .explode("alias_list")
    .assign(alias=lambda d: d["alias_list"].str.strip().str.lower())
    .drop(columns=["alias_list"])
    .drop_duplicates(subset=["recipe_id","alias"])
    .query("alias != ''")
)

def find_by_alias(query, score_cutoff=92):
    q = query.strip().lower()
    exact = alias_df[alias_df["alias"] == q]
    if not exact.empty:
        return int(exact.iloc[0]["recipe_id"])
    match = process.extractOne(q, alias_df["alias"].tolist(), scorer=fuzz.WRatio, score_cutoff=score_cutoff)
    if match:
        alias_hit = match[0]
        rid = int(alias_df.loc[alias_df["alias"] == alias_hit, "recipe_id"].iloc[0])
        return rid
    return None


In [40]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_similar(query_vector, top_k=5):
    sims = cosine_similarity([query_vector], emb_matrix)[0]
    idx = sims.argsort()[-top_k:][::-1]
    return idx, sims[idx]


In [None]:
def embed_query(text):
    return model.encode(text, normalize_embeddings=True)


In [None]:
def merge_ingredients_simple(rows):
    bag = []
    for s in rows["Cleaned-Ingredients"]:
        bag.extend([x.strip() for x in s.split(",") if x.strip()])
    seen, out = set(), []
    for ing in bag:
        if ing not in seen:
            seen.add(ing)
            out.append(ing)
    return out

def merge_ingredients_weighted(rows, sims):
    c = Counter()
    for s, w in zip(rows["Cleaned-Ingredients"], sims):
        for ing in s.split(","):
            ing = ing.strip()
            if ing:
                c[ing] += float(w)
    return [k for k,_ in c.most_common()]


In [53]:
def predict_ingredients(query, top_k=5, use_weighted=True, fuzzy_alias=True):
    if fuzzy_alias:
        rid = find_by_alias(query)
        if rid is not None:
            row = final_df.loc[[rid], ["Canonical_Name","Cleaned-Ingredients"]]
            return merge_ingredients_simple(row), row, [1.0]

    qv = embed_query(query)
    idx, sims = retrieve_similar(qv, top_k=top_k)
    rows = final_df.loc[idx, ["Canonical_Name","Cleaned-Ingredients"]]

    if use_weighted:
        pred = merge_ingredients_weighted(rows, sims)
    else:
        pred = merge_ingredients_simple(rows)

    return pred, rows, sims.tolist()


In [56]:
pred, refs, sims = predict_ingredients("hyderabadi chicken biryani", top_k=5)
print(pred[:20]); print(refs.assign(similarity=sims))

pred2, refs2, sims2 = predict_ingredients("tofu butter masala", top_k=5)
print(pred2[:20]); print(refs2.assign(similarity=sims2))


['turmeric powder', 'salt', 'red chilli powder', 'onion', 'ghee', 'green chillies', 'curd', 'black peppercorns', 'black cardamom', 'cardamom podsseeds', 'cinnamon stick', 'cloves', 'basmati rice', 'ginger', 'bay leaves', 'rice', 'kewra water', 'mace', 'bay leaf', 'ginger garlic paste']
                      Canonical_Name  \
202     hyderabadi vegetable biryani   
4089          lucknowi murgh biryani   
4095  calcutta style chicken biryani   
5159      handi biryani with chicken   
3336             chicken dum biryani   

                                    Cleaned-Ingredients  similarity  
202   turmeric powder,salt,ginger,rice,cinnamon,red ...    0.838174  
4089  basmati rice,coriander leaves,ajwain,black pep...    0.819565  
4095  basmati rice,black cardamom,ginger,salt,bonele...    0.808624  
5159  black peppercorns,onion,black pepper powder,tu...    0.802994  
3336  tomato,basmati rice,gram garlic ground paste,s...    0.788902  
['tomato', 'salt', 'garam masala powder', 'kasuri me

In [57]:
from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(final_df.index, test_size=0.2, random_state=42)
train_df = final_df.loc[train_idx].reset_index(drop=True)
test_df  = final_df.loc[test_idx].reset_index(drop=True)

emb_matrix = np.vstack(train_df["embedding"].values)

def jaccard(a, b):
    A, B = set(a), set(b)
    return len(A & B) / len(A | B) if A or B else 0.0

scores = []
for _, row in test_df.iterrows():
    q = row["Canonical_Name"]
    true_ings = [x.strip() for x in row["Cleaned-Ingredients"].split(",") if x.strip()]

    def predict_from_train(query):
        qv = model.encode(query, normalize_embeddings=True)
        idx, sims = retrieve_similar(qv, top_k=5) 
        rows = train_df.loc[idx, ["Cleaned-Ingredients"]]
        return merge_ingredients_weighted(rows, sims)

    pred_ings = predict_from_train(q)
    score = jaccard(true_ings, pred_ings[:len(true_ings) or 10]) 
    scores.append(score)

print("Mean Jaccard on test:", sum(scores)/len(scores))


Mean Jaccard on test: 0.26638305017455577


In [None]:
def reweight_by_protein(query_name, idx, sims,
                        protein_bonus=1.10,
                        veg_penalty=0.60,
                        cross_penalty=0.80):
    qprot, qveg = detect_protein(query_name)
    scores = []

    for i, s in zip(idx, sims):
        prot = final_df.at[i, "protein"]
        vegf = final_df.at[i, "vegflag"]
        w = float(s)

        if qprot and prot == qprot:
            w *= protein_bonus
        else:
            if qveg == "veg" and vegf == "nonveg":
                w *= veg_penalty
            elif qveg == "nonveg" and vegf == "veg":
                w *= cross_penalty

        scores.append((i, w))

    scores.sort(key=lambda x: x[1], reverse=True)
    return scores


In [None]:
def weighted_merge_ingredients(neighbor_scores, max_neighbors=8):
    c = Counter()
    support = defaultdict(int)

    for i, w in neighbor_scores[:max_neighbors]:
        ings = [x.strip() for x in final_df.at[i, "Cleaned-Ingredients"].split(",") if x.strip()]
        for ing in ings:
            c[ing] += w
            support[ing] += 1

    pruned = [k for k, _ in c.most_common() if support[k] >= 2]
    return pruned[:15]


In [None]:
def predict_ingredients(query_name, top_k=20, score_cutoff=92):
    rid = find_by_alias(query_name, score_cutoff=score_cutoff)
    if rid is not None:
        return [x.strip() for x in final_df.at[rid, "Cleaned-Ingredients"].split(",")]

    qv = embed_query(query_name)            
    idx, sims = retrieve_similar(qv, top_k=top_k)

    nbrs = reweight_by_protein(query_name, idx, sims)

    return weighted_merge_ingredients(nbrs, max_neighbors=8)


In [63]:
print("Tofu butter masala →")
print(predict_ingredients("tofu butter masala"))

print("\nChicken biryani →")
print(predict_ingredients("chicken biryani"))


Tofu butter masala →
['salt', 'sunflower oil', 'cloves garlic', 'onion', 'cheese', 'pepper', 'rice', 'coriander leaves', 'eggs', 'coconut', 'green bell pepper', 'green chillies', 'carrot', 'black peppercorns', 'cumin seeds']

Chicken biryani →
['salt', 'ginger', 'coriander leaves', 'bay leaf', 'cardamom podsseeds', 'curd', 'cinnamon stick', 'chicken', 'lemon', 'seeraga samba rice', 'onion', 'cloves', 'mint leaves', 'garlic', 'turmeric powder', 'green chillies', 'star anise', 'sunflower oil']
