<a href="https://colab.research.google.com/github/Kaushigithub/NLP-RecipeRover/blob/main/NLP_RecipeRover.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
import numpy as np
import re

In [3]:
import nltk

In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk import ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import MWETokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from textblob import TextBlob

In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [6]:
from gensim.models.phrases import Phrases, Phraser

In [7]:
df = pd.read_csv("/content/drive/MyDrive/NLP/IndianFoodDatasetCSV.csv")

In [8]:
print(df)

       Srno                                         RecipeName  \
0         1                               Masala Karela Recipe   
1         2  टमाटर पुलियोगरे रेसिपी - Spicy Tomato Rice (Re...   
2         3  Ragi Semiya Upma Recipe - Ragi Millet Vermicel...   
3         4  Gongura Chicken Curry Recipe - Andhra Style Go...   
4         5  आंध्रा स्टाइल आलम पचड़ी रेसिपी - Adrak Chutney ...   
...     ...                                                ...   
6866  14073  गोअन मशरुम जकुटी रेसिपी - Goan Mushroom Xacuti...   
6867  14107  शकरकंदी और मेथी का पराठा रेसिपी - Sweet Potato...   
6868  14165      Ullikadala Pulusu Recipe | Spring Onion Curry   
6869  14167  Kashmiri Style Kokur Yakhni Recipe-Chicken Coo...   
6870  14211              नवरंग दाल रेसिपी - Navrang Dal Recipe   

                                   TranslatedRecipeName  \
0                                  Masala Karela Recipe   
1                            Spicy Tomato Rice (Recipe)   
2     Ragi Semiya Upma Recipe 

In [9]:
print(list(df.columns))

['Srno', 'RecipeName', 'TranslatedRecipeName', 'Ingredients', 'TranslatedIngredients', 'PrepTimeInMins', 'CookTimeInMins', 'TotalTimeInMins', 'Servings', 'Cuisine', 'Course', 'Diet', 'Instructions', 'TranslatedInstructions', 'URL']


In [10]:
print(df.head())

   Srno                                         RecipeName  \
0     1                               Masala Karela Recipe   
1     2  टमाटर पुलियोगरे रेसिपी - Spicy Tomato Rice (Re...   
2     3  Ragi Semiya Upma Recipe - Ragi Millet Vermicel...   
3     4  Gongura Chicken Curry Recipe - Andhra Style Go...   
4     5  आंध्रा स्टाइल आलम पचड़ी रेसिपी - Adrak Chutney ...   

                                TranslatedRecipeName  \
0                               Masala Karela Recipe   
1                         Spicy Tomato Rice (Recipe)   
2  Ragi Semiya Upma Recipe - Ragi Millet Vermicel...   
3  Gongura Chicken Curry Recipe - Andhra Style Go...   
4  Andhra Style Alam Pachadi Recipe - Adrak Chutn...   

                                         Ingredients  \
0  6 Karela (Bitter Gourd/ Pavakkai) - deseeded,S...   
1  2-1/2 कप चावल - पका ले,3 टमाटर,3 छोटा चमच्च बी...   
2  1-1/2 cups Rice Vermicelli Noodles (Thin),1 On...   
3  500 grams Chicken,2 Onion - chopped,1 Tomato -...   
4  1 बड़ा च

In [11]:
columns_to_preprocess = ['TranslatedIngredients', 'TranslatedInstructions']

In [12]:
# Function to clean text
def clean_text(text):
    text = text.lower()   # converts text to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

In [13]:
for column in columns_to_preprocess:
    df[column + '_clean'] = df[column].astype(str).apply(clean_text)
    print(f"Cleaned data for {column}:")
    print(df[column + '_clean'].head())
    print("\n")

Cleaned data for TranslatedIngredients:
0     karela bitter gourd pavakkai  deseededsalt  t...
1       cups rice  cooked  tomatoes  teaspoons bc b...
2     cups rice vermicelli noodles thin onion  slic...
3     grams chicken onion  chopped tomato  chopped ...
4     tablespoon chana dal  tablespoon white urad d...
Name: TranslatedIngredients_clean, dtype: object


Cleaned data for TranslatedInstructions:
0    to begin making the masala karela recipedeseed...
1    to make tomato puliogere first cut the tomatoe...
2    to begin making the ragi vermicelli recipe fir...
3    to begin making gongura chicken curry recipe f...
4    to make andhra style alam pachadi first heat o...
Name: TranslatedInstructions_clean, dtype: object




In [14]:
# Function to tokenize text
def tokenize(text):
    return word_tokenize(text)

In [15]:
for column in columns_to_preprocess:
    df[column + '_tokens'] = df[column + '_clean'].apply(tokenize)
    print(f"Tokenized data for {column}:")
    print(df[column + '_tokens'].head())
    print("\n")

Tokenized data for TranslatedIngredients:
0    [karela, bitter, gourd, pavakkai, deseededsalt...
1    [cups, rice, cooked, tomatoes, teaspoons, bc, ...
2    [cups, rice, vermicelli, noodles, thin, onion,...
3    [grams, chicken, onion, chopped, tomato, chopp...
4    [tablespoon, chana, dal, tablespoon, white, ur...
Name: TranslatedIngredients_tokens, dtype: object


Tokenized data for TranslatedInstructions:
0    [to, begin, making, the, masala, karela, recip...
1    [to, make, tomato, puliogere, first, cut, the,...
2    [to, begin, making, the, ragi, vermicelli, rec...
3    [to, begin, making, gongura, chicken, curry, r...
4    [to, make, andhra, style, alam, pachadi, first...
Name: TranslatedInstructions_tokens, dtype: object




In [16]:
# Function to remove stopwords
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if not token in stop_words]

In [17]:
for column in columns_to_preprocess:
    df[column + '_no_stopwords'] = df[column + '_tokens'].apply(remove_stopwords)
    print(f"Data without stopwords for {column}:")
    print(df[column + '_no_stopwords'].head())
    print("\n")

Data without stopwords for TranslatedIngredients:
0    [karela, bitter, gourd, pavakkai, deseededsalt...
1    [cups, rice, cooked, tomatoes, teaspoons, bc, ...
2    [cups, rice, vermicelli, noodles, thin, onion,...
3    [grams, chicken, onion, chopped, tomato, chopp...
4    [tablespoon, chana, dal, tablespoon, white, ur...
Name: TranslatedIngredients_no_stopwords, dtype: object


Data without stopwords for TranslatedInstructions:
0    [begin, making, masala, karela, recipedeseed, ...
1    [make, tomato, puliogere, first, cut, tomatoes...
2    [begin, making, ragi, vermicelli, recipe, firs...
3    [begin, making, gongura, chicken, curry, recip...
4    [make, andhra, style, alam, pachadi, first, he...
Name: TranslatedInstructions_no_stopwords, dtype: object




In [18]:
# Function to perform lemmatization
def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [19]:
for column in columns_to_preprocess:
    df[column + '_lemmatized'] = df[column + '_no_stopwords'].apply(lemmatize)
    print(f"Lemmatized data for {column}:")
    print(df[column + '_lemmatized'].head())
    print("\n")

Lemmatized data for TranslatedIngredients:
0    [karela, bitter, gourd, pavakkai, deseededsalt...
1    [cup, rice, cooked, tomato, teaspoon, bc, bell...
2    [cup, rice, vermicelli, noodle, thin, onion, s...
3    [gram, chicken, onion, chopped, tomato, choppe...
4    [tablespoon, chana, dal, tablespoon, white, ur...
Name: TranslatedIngredients_lemmatized, dtype: object


Lemmatized data for TranslatedInstructions:
0    [begin, making, masala, karela, recipedeseed, ...
1    [make, tomato, puliogere, first, cut, tomato, ...
2    [begin, making, ragi, vermicelli, recipe, firs...
3    [begin, making, gongura, chicken, curry, recip...
4    [make, andhra, style, alam, pachadi, first, he...
Name: TranslatedInstructions_lemmatized, dtype: object




In [20]:
# Function to perform POS tagging
def pos_tagging(tokens):
    return pos_tag(tokens)

In [21]:
for column in columns_to_preprocess:
    df[column + '_pos_tags'] = df[column + '_lemmatized'].apply(pos_tagging)
    print(f"POS tagged data for {column}:")
    print(df[column + '_pos_tags'].head())
    print("\n")

POS tagged data for TranslatedIngredients:
0    [(karela, NN), (bitter, NN), (gourd, NN), (pav...
1    [(cup, NN), (rice, NN), (cooked, VBD), (tomato...
2    [(cup, NN), (rice, NN), (vermicelli, NN), (noo...
3    [(gram, NN), (chicken, NN), (onion, NN), (chop...
4    [(tablespoon, NN), (chana, NN), (dal, JJ), (ta...
Name: TranslatedIngredients_pos_tags, dtype: object


POS tagged data for TranslatedInstructions:
0    [(begin, VB), (making, VBG), (masala, JJ), (ka...
1    [(make, VB), (tomato, NN), (puliogere, IN), (f...
2    [(begin, VB), (making, VBG), (ragi, JJ), (verm...
3    [(begin, VB), (making, VBG), (gongura, JJ), (c...
4    [(make, VB), (andhra, JJ), (style, NN), (alam,...
Name: TranslatedInstructions_pos_tags, dtype: object




In [22]:
#Function to perform Noun Extraction
def extract_nouns(pos_tagged_text):
    nouns = [word for word, pos in pos_tagged_text if pos in ["NN", "NNS", "NNP", "NNPS"]]
    return nouns

In [23]:
for column in columns_to_preprocess:
    noun_column = column + '_nouns'
    pos_tagged_column = column + '_pos_tags'
    df[noun_column] = df[pos_tagged_column].apply(extract_nouns)
    print(f"Noun data for {column}:")
    print(df[noun_column].head())
    print("\n")

Noun data for TranslatedIngredients:
0    [karela, bitter, gourd, pavakkai, deseededsalt...
1    [cup, rice, teaspoon, bc, belle, powder, salt,...
2    [cup, rice, vermicelli, onion, carrot, gajjar,...
3    [gram, chicken, onion, tomato, chilli, slit, i...
4    [tablespoon, chana, tablespoon, dal, chilli, t...
Name: TranslatedIngredients_nouns, dtype: object


Noun data for TranslatedInstructions:
0    [karela, slice, skin, nutrient, pressure, cook...
1    [tomato, cut, tomato, grinder, puree, heat, oi...
2    [vermicelli, recipe, steam, ragi, vermicelli, ...
3    [chicken, curry, recipe, ingredient, pan, roas...
4    [style, alam, pachadi, oil, pan, cook, till, a...
Name: TranslatedInstructions_nouns, dtype: object




In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

# Assuming 'Ingredients_nouns' column contains lists of nouns, join them into a single string
df['Ingredients_nouns_joined'] = df['TranslatedIngredients_nouns'].apply(lambda x: ' '.join(x))

# Fit and transform the joined ingredients nouns strings
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Ingredients_nouns_joined'])

feature_names = tfidf_vectorizer.get_feature_names_out()
idf_values = tfidf_vectorizer.idf_

print("Feature Names and their IDF Values:")
for noun, idf in zip(feature_names, idf_values):
    print(f"{noun}: {idf}")

df_tfidf = pd.DataFrame(tfidf_matrix.todense(), columns=tfidf_vectorizer.get_feature_names_out())

print("TF-IDF vectors for Ingredients:")
print(df_tfidf.head())


Feature Names and their IDF Values:
aa: 9.142063283104147
aam: 9.142063283104147
aamras: 9.142063283104147
aata: 9.142063283104147
access: 9.142063283104147
achar: 9.142063283104147
achari: 7.1271602625618815
acid: 8.736598174995981
ada: 9.142063283104147
adai: 9.142063283104147
adaisalt: 9.142063283104147
add: 7.532625370670046
adjust: 5.757673019758371
adjustable: 9.142063283104147
adjustablecoriander: 9.142063283104147
adjustablemint: 9.142063283104147
adjustablesaffron: 9.142063283104147
adjustablesalt: 8.22577255122999
adjustablewater: 9.142063283104147
adjustcoriander: 9.142063283104147
adjusthazelnut: 9.142063283104147
adjustsalt: 7.637985886327872
adjutsalt: 9.142063283104147
adobo: 9.142063283104147
afza: 8.736598174995981
agar: 7.4373151908657205
agathi: 9.142063283104147
agavemaple: 9.142063283104147
agedsalt: 9.142063283104147
aid: 9.142063283104147
air: 9.142063283104147
ajwain: 4.375624949519932
al: 9.142063283104147
alcohol: 9.142063283104147
alesugar: 9.142063283104147


In [25]:
from gensim.models import Word2Vec

In [27]:
ingredient_nouns = df['TranslatedIngredients_nouns'].dropna().apply(lambda x: [str(noun) for noun in x]).tolist()

In [28]:
word2vec_model = Word2Vec(sentences=ingredient_nouns, vector_size=100, window=5, min_count=2, sg=1, workers=4)

In [29]:
word2vec_model.save("word2vec_ingredients.model")

In [38]:
word_vector = word2vec_model.wv['karela']  # example for a single word vector

similar_words = word2vec_model.wv.most_similar('karela', topn=10)

for word, similarity in similar_words:
    print(f"{word}: {similarity}")

bitter: 0.992222785949707
pavakkai: 0.9721605181694031
gourd: 0.9409947395324707
ridge: 0.9356967806816101
turai: 0.9287927746772766
bottle: 0.926409900188446
lauki: 0.9129647016525269
peerkangai: 0.9017698168754578
ash: 0.8834407329559326
yam: 0.8810307383537292


In [36]:
# To use Word2Vec word vectors in your ML models, we need to average the vectors for each recipe
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.

    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector


In [37]:
vocabulary = set(word2vec_model.wv.index_to_key)
df['Ingredients_vector'] = df['TranslatedIngredients_nouns'].apply(lambda nouns: average_word_vectors(nouns, word2vec_model, vocabulary, 100))

print("Word2Vec vectors for Ingredients (first few rows):")
print(df['Ingredients_vector'].head())

Word2Vec vectors for Ingredients (first few rows):
0    [-0.1226094254691686, 0.3846301742430244, -0.2...
1    [-0.10331889198949704, 0.4308657170488284, -0....
2    [-0.08765334449708462, 0.3418417362868786, -0....
3    [-0.10855182394734583, 0.35085325250402094, -0...
4    [-0.063250987008214, 0.38269858807325363, -0.4...
Name: Ingredients_vector, dtype: object
