<a href="https://colab.research.google.com/github/JITHIN-ANTONY-JOSEPH/ERP_11358080/blob/main/5_Text_Based_FOOD2VEC_Ingredients_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive so the data files referenced later in
# this notebook (all under '/content/drive/My Drive/ERP/') can be read.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Importing libraries and required datasets

In [None]:
import pandas as pd
import json
import networkx as nx
from gensim.models import Word2Vec
from concurrent.futures import ProcessPoolExecutor
import jellyfish

In [None]:
# Load the flavor graph (adjust the path to point at your own Google Drive copy).
flavor_graph = nx.read_graphml('/content/drive/My Drive/ERP/knowledge_graph.graphml')

# Collect the names of all ingredient nodes in the flavor graph.
# `.get` is used so that a node without a 'node_type' attribute is simply
# skipped instead of aborting the whole cell with a KeyError.
valid_ingredients = {
    node
    for node, attr in flavor_graph.nodes(data=True)
    if attr.get('node_type') == 'ingredient'
}

# Load the processed Recipe1M dataset (adjust the path to your own Drive copy).
# The file is parsed as JSON Lines: one JSON document per line. The encoding is
# pinned to UTF-8 so parsing does not depend on the platform default, and
# blank lines (e.g. a trailing newline at end of file) are skipped because
# json.loads('') would raise.
with open('/content/drive/My Drive/ERP/modified_Processed_Layer1.json', 'r', encoding='utf-8') as file:
    recipe1m_data = [json.loads(line) for line in file if line.strip()]

In [None]:
# Wrap the loaded recipe records in a DataFrame and add an 'ingredients'
# column holding each processed ingredient list joined into one
# space-separated string.
recipe1m_df = pd.DataFrame(recipe1m_data)
recipe1m_df['ingredients'] = recipe1m_df['processed_ingredients'].apply(' '.join)

# Read the substitution pairs (adjust the path to point at your own Google Drive copy).
substitution_pairs_df = pd.read_csv('/content/drive/My Drive/ERP/Recipe1MSubs_full.csv')

# Inner-join recipes with their substitution pairs:
# recipe1m_df.id  <->  substitution_pairs_df.recipe_id
merged_df = recipe1m_df.merge(
    substitution_pairs_df,
    how='inner',
    left_on='id',
    right_on='recipe_id',
)

### Steps to generate Word2Vec Embeddings

In [None]:
# Step 2: Generate Enhanced Ingredient Lists
#
# Each row of merged_df becomes one training "sentence": the set of flavor-graph
# ingredient words found in the recipe's processed ingredients, plus every
# substitution pair of that recipe whose two sides are both flavor-graph
# ingredients.

# Index the valid substitution ingredients by recipe_id in a single pass.
# Filtering substitution_pairs_df once per merged_df row (a full column scan
# each time) is quadratic; a dict lookup gives the same ingredients in the
# same order at constant cost per row.
subs_by_recipe = {}
for sub_recipe_id, ingredient1, ingredient2 in zip(
    substitution_pairs_df['recipe_id'],
    substitution_pairs_df['ingredient1'],
    substitution_pairs_df['ingredient2'],
):
    # A pair is only used when BOTH sides are known flavor-graph ingredients.
    if ingredient1 in valid_ingredients and ingredient2 in valid_ingredients:
        subs_by_recipe.setdefault(sub_recipe_id, []).extend((ingredient1, ingredient2))

# NOTE(review): merged_df comes from joining recipes to substitution pairs, so
# a recipe with several pairs presumably appears on several rows and therefore
# contributes several identical sentences - confirm this weighting is intended.
enhanced_sentences = []
for recipe_id, ingredients in zip(merged_df['id'], merged_df['processed_ingredients']):
    ingredient_set = set()

    # Split each ingredient description into words and keep only the words
    # that are ingredient nodes in the flavor graph.
    for ingredient in ingredients:
        ingredient_set.update(
            word for word in ingredient.split() if word in valid_ingredients
        )

    # Add the pre-validated substitution ingredients for this recipe (if any).
    ingredient_set.update(subs_by_recipe.get(recipe_id, ()))

    # Store the de-duplicated ingredients as a list (order is the set's
    # iteration order, i.e. not meaningful).
    enhanced_sentences.append(list(ingredient_set))


In [None]:
# Step 3: Train the Word2Vec Model
# Fit the Food2Vec embedding on the enhanced ingredient sentences.
# min_count=1 keeps every ingredient, including those seen only once.
# NOTE(review): each sentence is list(set(...)), so token order inside the
# window of 5 is arbitrary; training also uses 4 worker threads, so results
# are presumably not bit-for-bit reproducible between runs - confirm against
# the gensim documentation if exact reproducibility matters.
food2vec_model = Word2Vec(
    sentences=enhanced_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)


In [None]:
import numpy as np

### Evaluation

In [None]:
# Step 4: Generate Predictions and Evaluate
# Function to generate predictions based on cosine similarity
def generate_similarity_predictions(ingredient, model, top_k=10):
    """Return the `top_k` vocabulary entries most similar to `ingredient`.

    Args:
        ingredient: The query key to look up in the model's vocabulary.
        model: An object exposing a `wv` attribute that supports `in` and
            `most_similar(key, topn=...)` (e.g. a trained Word2Vec model).
        top_k: Number of neighbours to request.

    Returns:
        A list with the first element of each `most_similar` result, in the
        order returned, or an empty list when `ingredient` is not in `model.wv`.
    """
    vectors = model.wv
    if ingredient not in vectors:
        # Out-of-vocabulary query: nothing to rank.
        return []

    neighbours = vectors.most_similar(ingredient, topn=top_k)
    return [neighbour[0] for neighbour in neighbours]

# Optimized function to calculate metrics using Jaro-Winkler similarity
def calculate_metrics(predictions, ground_truths, threshold=0.8):
    """Compute MRR and Hit@1/3/10 using fuzzy (Jaro-Winkler) matching.

    For each (prediction list, ground truth) pair, the first candidate whose
    Jaro-Winkler similarity to the ground truth is at least `threshold` counts
    as the hit; its 1-based position is the rank. Pairs with no matching
    candidate contribute 0 to every metric.

    Args:
        predictions: Sequence of ranked candidate lists, one per ground truth.
        ground_truths: Sequence of ground-truth strings. Pairs are matched
            positionally with `zip`, so surplus entries on either side are
            ignored, while the averages are always taken over
            `len(ground_truths)`.
        threshold: Minimum similarity for a candidate to count as a match.

    Returns:
        A tuple `(mrr, hit_1, hit_3, hit_10)` of floats averaged over
        `len(ground_truths)`. All four are 0.0 when `ground_truths` is empty
        (instead of raising ZeroDivisionError).
    """
    total = len(ground_truths)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0, 0.0, 0.0, 0.0

    mrr, hit_1, hit_3, hit_10 = 0.0, 0.0, 0.0, 0.0

    for pred, gt in zip(predictions, ground_truths):
        for rank, candidate in enumerate(pred, start=1):
            if jellyfish.jaro_winkler_similarity(gt, candidate) < threshold:
                continue
            mrr += 1.0 / rank
            if rank == 1:
                hit_1 += 1.0
            if rank <= 3:
                hit_3 += 1.0
            if rank <= 10:
                hit_10 += 1.0
            # Only the best-ranked match counts for this ground truth.
            break

    return mrr / total, hit_1 / total, hit_3 / total, hit_10 / total

# Function to generate predictions for a batch of ingredients
def batch_generate_predictions(batch, model):
    """Generate similarity predictions for every query ingredient in a batch.

    Args:
        batch: A mapping-like object (e.g. a DataFrame slice) whose
            'ingredient1' entry is an iterable of query ingredients.
        model: The embedding model passed through to
            `generate_similarity_predictions`.

    Returns:
        A list with one prediction list per query ingredient, in batch order.
    """
    return [
        generate_similarity_predictions(query, model)
        for query in batch['ingredient1']
    ]

# Extract the query ingredients and their ground-truth substitutes.
# NOTE(review): these are taken from the full substitution_pairs_df, the same
# pairs that were added to the training sentences, so this is not a held-out
# validation set - confirm that is intended for this baseline.
val_ingredient1 = substitution_pairs_df['ingredient1'].tolist()
val_ground_truths = substitution_pairs_df['ingredient2'].tolist()

# Split the pairs into contiguous batches, one per worker process.
num_batches = 8  # Adjust based on your CPU cores
# Split row positions and slice with .iloc instead of passing the DataFrame
# itself to np.array_split: the latter goes through DataFrame.swapaxes, which
# is deprecated and emits a FutureWarning. The resulting batches are the same.
row_chunks = np.array_split(np.arange(len(substitution_pairs_df)), num_batches)
batches = [substitution_pairs_df.iloc[chunk] for chunk in row_chunks]

# Use multiprocessing to generate predictions faster. executor.map returns
# results in submission order, so predictions stay aligned with the rows.
with ProcessPoolExecutor(max_workers=num_batches) as executor:
    results = list(executor.map(batch_generate_predictions, batches, [food2vec_model] * num_batches))

# Flatten the per-batch results into one prediction list per pair
val_predictions = [item for sublist in results for item in sublist]

# Calculate and print the metrics
mrr, hit_1, hit_3, hit_10 = calculate_metrics(val_predictions, val_ground_truths, threshold=0.8)
print(f"Word2Vec: MRR: {mrr:.4f}, Hit@1: {hit_1:.4f}, Hit@3: {hit_3:.4f}, Hit@10: {hit_10:.4f}")

# Qualitative sanity check: print the model's nearest neighbours for "beef".
# generate_similarity_predictions returns an empty list when the ingredient is
# not in the model vocabulary, in which case only the header line is printed.
example_ingredient = 'beef'
substitutions = generate_similarity_predictions(example_ingredient, food2vec_model)
print(f"Top substitutions for '{example_ingredient}':")
# One candidate per line, in the order returned (most similar first).
for substitution in substitutions:
    print(substitution)

  return bound(*args, **kwds)


Word2Vec: MRR: 0.0438, Hit@1: 0.0204, Hit@3: 0.0480, Hit@10: 0.1180
Top substitutions for 'beef':
cooked_rice
ground_pork
small_onion
crushed_tomato
chili_powder
canned_tomato
saltine
mushroom
pea
beef_broth
