### Mounting to connect to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Baseline Generation for BERT on Recipe1M Data

In [None]:
import pandas as pd
import json
import networkx as nx
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from concurrent.futures import ProcessPoolExecutor
import jellyfish
from sklearn.metrics.pairwise import cosine_similarity

# Load the flavor graph
flavor_graph = nx.read_graphml('/content/drive/My Drive/ERP/knowledge_graph.graphml')

# Extract ingredient nodes from the flavor graph
valid_ingredients = {n for n, attr in flavor_graph.nodes(data=True) if attr['node_type'] == 'ingredient'}

# Load the Recipe1M processed dataset
with open('/content/drive/My Drive/ERP/modified_Processed_Layer1.json', 'r') as file:
    recipe1m_data = [json.loads(line) for line in file]

# Convert to DataFrame
recipe1m_df = pd.DataFrame(recipe1m_data)
recipe1m_df['ingredients'] = recipe1m_df['processed_ingredients'].apply(lambda x: ' '.join(x))

# Load the substitution pairs
substitution_pairs_df = pd.read_csv('/content/drive/My Drive/ERP/Recipe1MSubs_full.csv')

# Merge the datasets based on recipe_id (substitution_pairs_df) and id (recipe1m_df)
merged_df = pd.merge(recipe1m_df, substitution_pairs_df, left_on='id', right_on='recipe_id')

# Step 1: Initialize BERT Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Ensure that the model runs on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to generate BERT embeddings
def generate_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Step 2: Generate BERT Embeddings for Ingredients
enhanced_sentences = []

# Iterate over each recipe in merged_df and process
for _, recipe_row in merged_df.iterrows():
    recipe_id = recipe_row['id']
    ingredients = recipe_row['processed_ingredients']

    # Filter valid ingredients
    ingredient_set = set()
    for ingredient in ingredients:
        words = ingredient.split()
        valid_words = [word for word in words if word in valid_ingredients]
        ingredient_set.update(valid_words)

    # Add substitution pairs to the ingredient set
    recipe_subs = substitution_pairs_df[substitution_pairs_df['recipe_id'] == recipe_id]
    for _, sub_row in recipe_subs.iterrows():
        ingredient1 = sub_row['ingredient1']
        ingredient2 = sub_row['ingredient2']
        if ingredient1 in valid_ingredients and ingredient2 in valid_ingredients:
            ingredient_set.update([ingredient1, ingredient2])

    enhanced_sentences.append(list(ingredient_set))

# Step 3: Generate Embeddings for each ingredient using BERT
ingredient_embeddings = {}
for sentence in enhanced_sentences:
    for ingredient in sentence:
        if ingredient not in ingredient_embeddings:
            embedding = generate_bert_embedding(ingredient, tokenizer, model)
            ingredient_embeddings[ingredient] = embedding

# Step 4: Generate Predictions and Evaluate using Cosine Similarity
def generate_similarity_predictions(ingredient, embeddings, top_k=10):
    if ingredient in embeddings:
        similarities = [(ing, cosine_similarity(embeddings[ingredient], embeddings[ing])[0][0]) for ing in embeddings]
        similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
        return [item[0] for item in similarities[:top_k]]
    return []

# Optimized function to calculate metrics using Jaro-Winkler similarity
def calculate_metrics(predictions, ground_truths, threshold=0.8):
    mrr, hit_1, hit_3, hit_10 = 0.0, 0.0, 0.0, 0.0
    total = len(ground_truths)

    for pred, gt in zip(predictions, ground_truths):
        for i, candidate in enumerate(pred):
            sim = jellyfish.jaro_winkler_similarity(gt, candidate)
            if sim >= threshold:
                rank = i + 1
                mrr += 1.0 / rank
                if rank == 1:
                    hit_1 += 1.0
                if rank <= 3:
                    hit_3 += 1.0
                if rank <= 10:
                    hit_10 += 1.0
                break

    mrr /= total
    hit_1 /= total
    hit_3 /= total
    hit_10 /= total
    return mrr, hit_1, hit_3, hit_10

# Generate predictions for a batch of ingredients
def batch_generate_predictions(batch, embeddings):
    predictions = []
    for ingredient in batch['ingredient1']:
        predictions.append(generate_similarity_predictions(ingredient, embeddings))
    return predictions

# Extract ground truths from the validation set
val_ingredient1 = substitution_pairs_df['ingredient1'].tolist()
val_ground_truths = substitution_pairs_df['ingredient2'].tolist()

# Split validation pairs into batches
num_batches = 8  # Adjust based on your CPU cores
batches = np.array_split(substitution_pairs_df, num_batches)

# Use multiprocessing to generate predictions faster
with ProcessPoolExecutor(max_workers=num_batches) as executor:
    results = list(executor.map(batch_generate_predictions, batches, [ingredient_embeddings] * num_batches))

# Flatten the list of results
val_predictions = [item for sublist in results for item in sublist]

# Calculate and print the metrics
mrr, hit_1, hit_3, hit_10 = calculate_metrics(val_predictions, val_ground_truths, threshold=0.8)
print(f"BERT: MRR: {mrr:.4f}, Hit@1: {hit_1:.4f}, Hit@3: {hit_3:.4f}, Hit@10: {hit_10:.4f}")

# Check substitutions for "beef"
example_ingredient = 'beef'
substitutions = generate_similarity_predictions(example_ingredient, ingredient_embeddings)
print(f"Top substitutions for '{example_ingredient}':")
for substitution in substitutions:
    print(substitution)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  return bound(*args, **kwds)


BERT: MRR: 0.0769, Hit@1: 0.0458, Hit@3: 0.0883, Hit@10: 0.1647
Top substitutions for 'beef':
beef
chicken
cabbage
roast
grape
butter
marrow
potato
hamburger
cinnamon
