In [1]:
import pandas as pd
import re

df = pd.read_csv("IndianFoodDatasetCSV.csv")

def tokenize(text):
    if not isinstance(text, str):
        return []
    # Split on commas, semicolons, newlines, and spaces
    words = re.split(r'[,;\n\s]+', text.lower())
    # Remove empty and numeric tokens
    return [w for w in words if w and not w.isdigit()]

# Collect all words from both columns
all_words = set()
for col in ["Ingredients", "TranslatedIngredients"]:
    df[col] = df[col].fillna("")
    for line in df[col]:
        all_words.update(tokenize(line))

print(f"Total unique words in ingredients: {len(all_words)}")
print("Sample words:", list(all_words)[:30])

Total unique words in ingredients: 4585
Sample words: ['robusta', 'milagai', 'pani', 'keerai)', 'खस', 'cheetos', '(विनेगर)', 'lengths', 'pesarattu', 'paprika)', 'चमचच', 'toppings', 'two', 'podi', 'ल', 'लिए', 'standard', 'foot', 'semolina', 'prawns', 'ढोले', 'diced', 'sooji', 'bitter', 'ball', '(firm)', '/jam', '(beetroot)', '(mahali', '(baby']


In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

en_stopwords = set(stopwords.words('english'))
hi_stopwords = set([
    "और", "का", "की", "के", "से", "को", "पर", "में", "है", "यह", "था", "थे", "तो", "भी", "हो", "कि", "जो", "तक", "जब", "या", "एक", "लेकिन", "अपने", "लिए", "इसे", "इस", "उस", "उन", "उनके", "उनकी", "उनका"
])

ingredient_stopwords = set()
for word in all_words:
    if word in en_stopwords or word in hi_stopwords:
        ingredient_stopwords.add(word)

print(f"Stopwords found in ingredients: {ingredient_stopwords}")

Stopwords found in ingredients: {'each', 'an', 'was', 'की', 'but', 'we', 'लिए', 'को', 'very', 'been', 'then', 'such', 'into', 'है', 'should', 'i', 'down', 'o', 'about', 'to', 'एक', 'this', 'it', 'in', 'so', 'भी', 'how', 'which', 'can', 'of', 'का', 'your', 'a', 'the', 'now', 'off', 'या', 'and', 'will', 'have', 'more', 'from', 'same', 's', 'after', 'if', 'or', 'few', 'से', 'while', 'तक', 'here', 'that', 'both', 'too', 'at', 'other', 'not', 'do', 'when', 'be', 'up', "don't", 'its', 'are', 'और', 'इस', 'में', 'on', 'any', 'above', 'than', 'पर', 'some', 'all', 'अपने', 'के', 'with', 'for', 'until', 'you', 'as', 'before', 'during', 'over', 'them', 'just', 'no', 'out', 'is', 'only', 'by', 't', 'between'}


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hvish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [7]:
import networkx as nx

# Load the combined graph from your GraphML file
G_all = nx.read_graphml("all_recipes.graphml")
def remove_ingredient_stopwords(text):
    words = re.findall(r'\w+', text.lower())
    filtered = [w for w in words if w not in ingredient_stopwords]
    return ' '.join(filtered)

# Example usage for transitions:
transition_counter = Counter()
for u, v in G_all.edges():
    u_text = G_all.nodes[u].get("text", "").lower()
    v_text = G_all.nodes[v].get("text", "").lower()
    u_clean = remove_ingredient_stopwords(u_text)
    v_clean = remove_ingredient_stopwords(v_text)
    if u_clean and v_clean:
        transition_counter[(u_clean, v_clean)] += 1

print(f"{'From Step':<50} {'→':^3} {'To Step':<50} {'Count':>8}")
print("-" * 120)
for (u, v), count in transition_counter.most_common(15):
    print(f"{u[:48]:<50} → {v[:48]:<50} {count:>8}")

From Step                                           →  To Step                                               Count
------------------------------------------------------------------------------------------------------------------------
keep aside                                         → heat oil pan                                             42
अलग स रख द                                         → अब कढ़ ई म त ल गरम कर                                     11
heat oil pan                                       → add mustard seeds let splutter                            9
heat oil pan                                       → add mustard seeds let cook 10 seconds                     8
heat oil pan                                       → add cumin seeds let splutter                              7
heat oil pan                                       → add onions cook till they become soft                     6
keep separately                                    → heat oil pan                     

In [None]:
import matplotlib.pyplot as plt

# Build directed graph from transitions with weight > 2
G_trans = nx.DiGraph()
for (u, v), w in transition_counter.items():
    if w > 2:
        G_trans.add_edge(u, v, weight=w)

plt.figure(figsize=(14, 10))
pos = nx.spring_layout(G_trans, k=0.7, seed=42)
edges = G_trans.edges()
weights = [G_trans[u][v]['weight'] for u, v in edges]
nx.draw(
    G_trans, pos, with_labels=True, node_size=700, node_color='skyblue',
    arrowsize=20, width=[w/5 for w in weights], font_size=8
)
plt.title("Recipe Step Transition Network (weight > 2)")
plt.show()

KeyboardInterrupt: 

<Figure size 1400x1000 with 0 Axes>