## Prediction modeling using NLP

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
import re


In [49]:
# Load the dishes dataset and keep an independent copy of only the columns
# referenced by the rest of the notebook.
keep_cols = [
    'TranslatedRecipeName',
    'TranslatedIngredients',
    'TotalTimeInMins',
    'Cuisine',
    'Cleaned-Ingredients',
    'Ingredient-count',
]
ingredients_df = pd.read_csv('dishesDataset.csv')[keep_cols].copy()
display(ingredients_df.head())

Unnamed: 0,TranslatedRecipeName,TranslatedIngredients,TotalTimeInMins,Cuisine,Cleaned-Ingredients,Ingredient-count
0,Masala Karela Recipe,"1 tablespoon Red Chilli powder,3 tablespoon Gr...",45,Indian,"salt,amchur (dry mango powder),karela (bitter ...",10
1,Spicy Tomato Rice (Recipe),"2 teaspoon cashew - or peanuts, 1/2 Teaspoon ...",15,South Indian Recipes,"tomato,salt,chickpea lentils,green chilli,rice...",12
2,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1 Onion - sliced,1 teaspoon White Urad Dal (Sp...",50,South Indian Recipes,"salt,rice vermicelli noodles (thin),asafoetida...",12
3,Gongura Chicken Curry Recipe - Andhra Style Go...,"1/2 teaspoon Turmeric powder (Haldi),1 tablesp...",45,Andhra,"tomato,salt,ginger,sorrel leaves (gongura),fen...",15
4,Andhra Style Alam Pachadi Recipe - Adrak Chutn...,"oil - as per use, 1 tablespoon coriander seed...",30,Andhra,"tomato,salt,ginger,red chillies,curry,asafoeti...",12


In [50]:
# Discard rows that lack a recipe name or an ingredient string, then keep a
# single row per recipe name (pandas keeps the first occurrence by default).
required_cols = ['TranslatedRecipeName', 'Cleaned-Ingredients']
ingredients_df = (
    ingredients_df
    .dropna(subset=required_cols)
    .drop_duplicates(subset='TranslatedRecipeName')
)


In [51]:
def clean_text(text):
    """Normalize a recipe name before it is vectorized.

    The text is lowercased, every run of characters other than a-z is turned
    into a single space, the standalone words "recipe"/"recipes" are removed,
    and whitespace is collapsed.

    Args:
        text: Raw recipe name (str).

    Returns:
        The cleaned, single-spaced string; empty if nothing else remains.
    """
    text = text.lower()
    # Substitute a space rather than deleting, so words joined by punctuation
    # ("stir-fry", "recipe-andhra") stay separate tokens instead of fusing.
    text = re.sub(r'[^a-z]+', ' ', text)
    # Whole-word match: the plural must not leave a stray "s" behind.
    text = re.sub(r'\brecipes?\b', ' ', text)
    # Collapse whitespace last, because the removal above leaves gaps.
    return re.sub(r'\s+', ' ', text).strip()

def _split_ingredients(raw):
    """Return the ingredient names in `raw` as a list of stripped, non-empty strings.

    Accepts either a comma-separated string or a list produced by an earlier
    run of this cell, so re-running the cell does not raise on `.split`.
    """
    parts = raw if isinstance(raw, list) else raw.split(',')
    # Skip blanks from stray or trailing commas; they would otherwise become
    # an empty-string ingredient label downstream.
    return [part.strip() for part in parts if part.strip()]


ingredients_df['TranslatedRecipeName'] = ingredients_df['TranslatedRecipeName'].apply(clean_text)
# The ingredient strings are deliberately NOT passed through clean_text: it
# removes the commas that separate individual ingredients.

ingredients_df['Cuisine'] = ingredients_df['Cuisine'].str.lower().str.strip()
ingredients_df['Name_WordCount'] = ingredients_df['TranslatedRecipeName'].str.split().str.len()
ingredients_df['Cleaned-Ingredients'] = ingredients_df['Cleaned-Ingredients'].apply(_split_ingredients)

In [53]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each recipe's ingredient list as one row of a binary indicator matrix.
ingredient_lists = ingredients_df['Cleaned-Ingredients']
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(ingredient_lists)

# Column j of `y` corresponds to ingredient_labels[j].
ingredient_labels = mlb.classes_

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built from the recipe names: English stop words are dropped
# and the vocabulary is capped at 5000 terms.
recipe_names = ingredients_df['TranslatedRecipeName']
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf.fit_transform(recipe_names)

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [21]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# One-vs-rest: an independent binary logistic regression is fitted for every
# ingredient column of y_train.
base_estimator = LogisticRegression(max_iter=1000)
model = OneVsRestClassifier(base_estimator)
model.fit(X_train, y_train)



In [22]:
from sklearn.metrics import f1_score

# Micro-averaged F1 over all ingredient labels on the held-out rows.
# NOTE(review): the saved outputs of this notebook were produced out of order
# (the modeling cells carry lower execution counts than the preprocessing
# cells). Restart the kernel and run all cells top to bottom before trusting
# the printed score.
y_pred = model.predict(X_test)
micro_f1 = f1_score(y_test, y_pred, average='micro')
print("F1 Score:", micro_f1)

F1 Score: 0.0


In [23]:
# Predict the ingredient labels for a single unseen dish name.
new_dish = ["spicy mango curry"]
new_vector = tfidf.transform(new_dish)
predicted = model.predict(new_vector)

# Keep every label whose indicator in the single predicted row equals 1.
ingredients = [
    label
    for label, flag in zip(ingredient_labels, predicted[0])
    if flag == 1
]
print(ingredients)

[]
