## Prediction modeling using NLP

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
import re


In [80]:
# Columns the rest of the notebook works with; any other column in the CSV is dropped.
selected_columns = [
    'TranslatedRecipeName',
    'TranslatedIngredients',
    'TotalTimeInMins',
    'Cuisine',
    'Cleaned-Ingredients',
    'Ingredient-count',
]

# .copy() gives an independent frame so later column assignments do not touch
# the object returned by read_csv.
ingredients_df = pd.read_csv('dishesDataset.csv').loc[:, selected_columns].copy()
display(ingredients_df.head())

Unnamed: 0,TranslatedRecipeName,TranslatedIngredients,TotalTimeInMins,Cuisine,Cleaned-Ingredients,Ingredient-count
0,Masala Karela Recipe,"1 tablespoon Red Chilli powder,3 tablespoon Gr...",45,Indian,"salt,amchur (dry mango powder),karela (bitter ...",10
1,Spicy Tomato Rice (Recipe),"2 teaspoon cashew - or peanuts, 1/2 Teaspoon ...",15,South Indian Recipes,"tomato,salt,chickpea lentils,green chilli,rice...",12
2,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1 Onion - sliced,1 teaspoon White Urad Dal (Sp...",50,South Indian Recipes,"salt,rice vermicelli noodles (thin),asafoetida...",12
3,Gongura Chicken Curry Recipe - Andhra Style Go...,"1/2 teaspoon Turmeric powder (Haldi),1 tablesp...",45,Andhra,"tomato,salt,ginger,sorrel leaves (gongura),fen...",15
4,Andhra Style Alam Pachadi Recipe - Adrak Chutn...,"oil - as per use, 1 tablespoon coriander seed...",30,Andhra,"tomato,salt,ginger,red chillies,curry,asafoeti...",12


In [81]:
# Discard rows with no recipe name or no cleaned ingredient list, then keep
# only the first row seen for each recipe name.
ingredients_df = (
    ingredients_df
    .dropna(subset=['TranslatedRecipeName', 'Cleaned-Ingredients'])
    .drop_duplicates(subset='TranslatedRecipeName')
)

# Each pattern matches one *innermost* bracket pair: the body may not contain
# another bracket of the same kind. Combined with the fixed-point loop below,
# nested groups are peeled from the inside out instead of leaving a dangling
# tail such as " d)" behind.
_bracket_patterns = [r"\([^()]*\)", r"\[[^\[\]]*\]", r"\{[^{}]*\}"]

def remove_bracketed(text: str) -> str:
    """Remove every (...), [...] and {...} group, including nested ones.

    Args:
        text: Input string. Any non-string value is treated as empty.

    Returns:
        ``text`` with all balanced bracket groups and their contents removed.
        Whitespace around a removed group is left untouched, and unmatched
        brackets are left in place. Returns ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    out = text
    # Repeat until a full pass removes nothing: each pass strips the current
    # innermost groups, exposing the next level of nesting.
    while True:
        previous = out
        for pat in _bracket_patterns:
            out = re.sub(pat, "", out)
        if out == previous:
            return out

def clean_ingredients_cell(text: str) -> str:
    """Normalise one comma-separated ingredient cell.

    The text is lower-cased, bracketed asides are removed via
    ``remove_bracketed``, whitespace runs are collapsed, each comma-separated
    item is trimmed of surrounding spaces and hyphens, empty items are
    dropped, and repeated items are removed while keeping first-seen order.

    Args:
        text: Raw cell value. Non-string values (``None``, NaN, numbers) are
            treated as an empty cell.

    Returns:
        The cleaned items joined by ``','`` (no spaces); ``""`` when nothing
        is left.
    """
    if not isinstance(text, str):
        # NaN is truthy, so the former `(text or "")` guard let it through to
        # `.lower()` and raised AttributeError; treat all non-strings as empty.
        return ""
    text = remove_bracketed(text.lower())
    text = re.sub(r"\s+", " ", text).strip()
    items = (item.strip(" -") for item in text.split(","))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return ",".join(dict.fromkeys(item for item in items if item))

# Normalise every ingredient list, overwriting the source column with the result.
cleaned_ingredients = ingredients_df["Cleaned-Ingredients"].map(clean_ingredients_cell)
ingredients_df["Cleaned-Ingredients"] = cleaned_ingredients


In [82]:
from unidecode import unidecode

def clean_recipe_name(name):
    """Split a raw recipe title into a canonical name, aliases and a joined label.

    The title is transliterated with ``unidecode``, lower-cased and stripped
    of noise words ("recipe", "quick", "healthy", ...). Short parenthetical
    asides (1-5 words) are kept as extra aliases and longer ones are
    discarded. The remaining text is split on ``-``, ``/`` and ``|``.

    Args:
        name: Raw recipe title.

    Returns:
        ``None`` if ``name`` is not a non-blank string or nothing is left
        after cleaning. Otherwise a tuple ``(canonical, aliases, cleaned)``:
        ``canonical`` is the first fragment, ``aliases`` is the list of the
        remaining unique fragments (split parts first, then kept
        parentheticals), and ``cleaned`` is all fragments joined by ``', '``.
    """
    if not isinstance(name, str) or not name.strip():
        return None

    name = unidecode(name).lower().strip()

    # Order matters: multi-word phrases must be removed before the single
    # words they contain ("quick & easy" before "quick").
    noise_patterns = [
        r'\brecipe\b', r'\bhow to make\b', r'\bvideo\b', r'\bquick & spicy', r'\bquick & easy',r'\bhealthy & delicious', r'\beggless &gluten',r'\bdelicious & cheesy',
        r'\bdelicious\b', r'\bhealthy\b', r'\bwholesome\b',
        r'\bin hindi\b', r'\binstant pot\b', r'\bquick\b',
        r'\bminutes?\b', r'\beggless\b'
    ]
    for pat in noise_patterns:
        name = re.sub(pat, '', name)

    def _tidy(fragment):
        # Collapse whitespace runs and trim stray separators/punctuation.
        return re.sub(r'\s+', ' ', fragment).strip(' .,;')

    # Keep short parenthetical asides as aliases, then drop all parentheses.
    keep = []
    for match in re.finditer(r'\(([^)]*)\)', name):
        content = _tidy(match.group(1))
        if 1 <= len(content.split()) <= 5:
            keep.append(content)
    name = re.sub(r'\([^)]*\)', '', name)

    # NOTE: a hyphen or slash inside a word (no surrounding spaces) also splits.
    parts = [_tidy(p) for p in re.split(r'\s*[-/|]\s*', name)]
    # Filter AFTER tidying: a fragment that was only punctuation (e.g. the "."
    # left over from "... - Recipe.") must not survive as an empty alias.
    parts = [p for p in parts if p]

    # dict.fromkeys removes repeats while preserving first-seen order.
    all_parts = list(dict.fromkeys(parts + keep))

    if not all_parts:
        return None

    canonical = all_parts[0]
    aliases = all_parts[1:]
    cleaned = ', '.join(all_parts)
    return canonical, aliases, cleaned

name_columns = ['Canonical_Name', 'Name_Aliases', 'Cleaned_RecipeName']

def _name_fields(raw_name):
    """Return the three cleaned-name fields for one title as a Series.

    Falls back to empty values when ``clean_recipe_name`` returns ``None``.
    """
    return pd.Series(clean_recipe_name(raw_name) or ('', [], ''))

# apply() with a Series-returning function expands into one column per field.
ingredients_df[name_columns] = ingredients_df['TranslatedRecipeName'].apply(_name_fields)


In [83]:
# Persist the cleaned table; the DataFrame index is not written.
cleaned_csv_path = 'Cleaned_DishesDataset.csv'
ingredients_df.to_csv(cleaned_csv_path, index=False)

In [89]:
# Modelling view: only the cleaned name and ingredient columns, as an
# independent copy so new columns can be added without touching ingredients_df.
model_columns = ["Canonical_Name", "Cleaned_RecipeName", 'Cleaned-Ingredients']
final_df = ingredients_df.loc[:, model_columns].copy()
final_df.head()

Unnamed: 0,Canonical_Name,Cleaned_RecipeName,Cleaned-Ingredients
0,masala karela,masala karela,"salt,amchur,karela,red chilli powder,gram flou..."
1,spicy tomato rice,spicy tomato rice,"tomato,salt,chickpea lentils,green chilli,rice..."
2,ragi semiya upma,"ragi semiya upma, ragi millet vermicelli break...","salt,rice vermicelli noodles,asafoetida,mustar..."
3,gongura chicken curry,"gongura chicken curry, andhra style gongura ch...","tomato,salt,ginger,sorrel leaves,fennel seeds,..."
4,andhra style alam pachadi,"andhra style alam pachadi, adrak chutney","tomato,salt,ginger,red chillies,curry,asafoeti..."


In [97]:
from sentence_transformers import SentenceTransformer

# NOTE(review): the recorded run of this cell failed with an SSL certificate
# verification error while fetching the model from huggingface.co. That is a
# network/certificate problem in the execution environment, not a code error.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Text to embed: the cleaned recipe name (Cleaned_RecipeName column).
final_df['dish_text'] = final_df['Cleaned_RecipeName'].astype(str)

# encode() accepts a list of sentences and batches them internally; calling it
# once per row through .apply() runs a separate forward pass for every recipe.
dish_embeddings = model.encode(final_df['dish_text'].tolist())

# One embedding vector per row, aligned explicitly on final_df's index.
final_df['embedding'] = pd.Series(list(dish_embeddings), index=final_df.index)


'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1028)')))"), '(Request ID: c8393774-4fdc-4234-bb16-9996a16fe573)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1028)')))"), '(Request ID: 54db05a0-3fec-40ee-b1da-eaa6e98068a5)')' thrown while requesting HEAD https://huggingface.co/sente

SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/adapter_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1028)')))"), '(Request ID: e7952563-deab-47a5-81d9-a70973399625)')

In [98]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the encoder weights from the same checkpoint id.
# NOTE: this rebinds `model`, replacing whatever object the name held before.
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1028)')))"), '(Request ID: be288db2-ead5-40cc-9cb5-6187f3c82661)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1028)')))"), '(Request ID: c330639e-21eb-4e27-b456-d79977cc5be3)')' thrown while requesting HEAD htt

SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1028)')))"), '(Request ID: 1deb4bb9-a50e-4a31-a747-6df5be17cc0f)')