<a href="https://colab.research.google.com/github/JITHIN-ANTONY-JOSEPH/ERP_11358080/blob/main/11_Recipe_Substitute.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the mount point is
# the directory passed to drive.mount below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import json
import random
import pickle

# Load the main dataset (JSON Lines: one JSON object per line).
# Blank lines are skipped so a stray empty/trailing line does not make
# json.loads raise, and the encoding is pinned because JSON text is UTF-8.
with open('/content/drive/My Drive/ERP/modified_Processed_Layer1.json', 'r', encoding='utf-8') as file:
    recipe1m_data = [json.loads(line) for line in file if line.strip()]

recipe1m_df = pd.DataFrame(recipe1m_data)

# Load the substitution pairs
substitution_pairs_df = pd.read_csv('/content/drive/My Drive/ERP/Recipe1MSubs_full.csv')

# Load the flavor graph node table
flavorgraph_df = pd.read_csv('/content/drive/My Drive/ERP/Dataset/nodes_191120.csv')

# Vocabulary used for token matching: every non-null name whose node_type is 'ingredient'
ingredient_list = set(flavorgraph_df[flavorgraph_df['node_type'] == 'ingredient']['name'].dropna().unique())

# Load the existing graph embeddings from file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('/content/drive/My Drive/ERP/MODEL_BEST_NUMBERS/graph_embeddings.pkl', 'rb') as f:
    graph_embeddings = pickle.load(f)

# Map each ingredient1 to the set of ingredient2 values it is paired with.
# Iterating the two columns directly avoids building a Series per row (iterrows).
valid_substitutes = {}
for ing1, ing2 in zip(substitution_pairs_df['ingredient1'], substitution_pairs_df['ingredient2']):
    valid_substitutes.setdefault(ing1, set()).add(ing2)

# Random embeddings for recipe IDs, drawn uniformly from [0, 1).
# A seeded generator makes the vectors (and everything trained/saved from them)
# reproducible across kernel restarts.
# NOTE(review): these vectors are random, not learned — confirm that is intended.
RECIPE_EMBEDDING_DIM = 100
RECIPE_EMBEDDING_SEED = 42
recipe_id_list = recipe1m_df['id'].unique()  # Get all unique recipe IDs
_recipe_rng = np.random.default_rng(RECIPE_EMBEDDING_SEED)
_recipe_matrix = _recipe_rng.random((len(recipe_id_list), RECIPE_EMBEDDING_DIM))
# One row of the matrix per recipe ID; each value is a 1-D array of length RECIPE_EMBEDDING_DIM
recipe_id_embeddings = dict(zip(recipe_id_list, _recipe_matrix))
# Function to extract ingredients from instructions
def extract_ingredients_from_instructions(instructions, ingredient_list):
    """Collect the whitespace-separated tokens of the instructions that are known ingredients.

    Args:
        instructions: Iterable of instruction strings.
        ingredient_list: Container of ingredient names; membership is tested with ``in``.

    Returns:
        list: Matching tokens in order of appearance, duplicates included.
    """
    return [
        token
        for step in instructions
        for token in step.split()
        if token in ingredient_list
    ]

# Token lists per recipe: rows whose 'processed_instructions' value is not a list get []
instruction_column = recipe1m_df['processed_instructions']
recipe1m_df['extracted_ingredients'] = instruction_column.apply(
    lambda steps: [] if not isinstance(steps, list) else extract_ingredients_from_instructions(steps, ingredient_list)
)

# Training corpus: one "sentence" per recipe ...
sentences = recipe1m_df['extracted_ingredients'].tolist()

# ... plus one two-token sentence per substitution pair, so paired ingredients share a context
sentences.extend(
    [source, target]
    for source, target in zip(substitution_pairs_df['ingredient1'], substitution_pairs_df['ingredient2'])
)

# Train the Word2Vec model
from gensim.models import Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=8)

# Function to combine text, graph, and recipe ID embeddings
def get_combined_embedding(ingredient, recipe_id, text_embeddings, graph_embeddings, recipe_id_embeddings, embedding_dim=100):
    """Concatenate the text, graph and recipe-ID embeddings for one ingredient in one recipe.

    Args:
        ingredient: Key looked up in ``text_embeddings`` and ``graph_embeddings``.
        recipe_id: Key looked up in ``recipe_id_embeddings``.
        text_embeddings: Mapping-like object supporting ``in`` and ``[]``.
        graph_embeddings: Mapping-like object supporting ``in`` and ``[]``.
        recipe_id_embeddings: Mapping-like object supporting ``in`` and ``[]``.
        embedding_dim: Length of the zero vector substituted for a missing key.
            Defaults to 100, the value previously hard-coded for all three
            sources; it should match the length of the stored vectors.

    Returns:
        numpy.ndarray: 1-D concatenation in the order (text, graph, recipe).
    """
    def _lookup(table, key):
        # `in` followed by indexing (not .get) so objects that only implement
        # __contains__/__getitem__ keep working.
        if key in table:
            return table[key]
        return np.zeros(embedding_dim)

    return np.concatenate((
        _lookup(text_embeddings, ingredient),
        _lookup(graph_embeddings, ingredient),
        _lookup(recipe_id_embeddings, recipe_id),
    ))

# Prepare training data with negative samples.
# The Python lists get their own names so that re-running the cell never
# appends to an object that has already been converted to a tensor.
NEGATIVES_PER_PAIR = 100  # upper bound on non-substitutes sampled per positive pair

positive_inputs = []
positive_targets = []
negative_anchor_rows = []
negative_target_rows = []

# The pool of admissible negatives depends only on ing1, so it is computed once
# per distinct ingredient instead of rescanning the vocabulary for every row.
negative_pool_cache = {}

for ing1, ing2, recipe_id in zip(
    substitution_pairs_df['ingredient1'],
    substitution_pairs_df['ingredient2'],
    substitution_pairs_df['recipe_id'],  # Assuming each row in the substitution pairs includes a recipe ID
):
    combined_embedding1 = get_combined_embedding(ing1, recipe_id, model.wv, graph_embeddings, recipe_id_embeddings)
    combined_embedding2 = get_combined_embedding(ing2, recipe_id, model.wv, graph_embeddings, recipe_id_embeddings)

    positive_inputs.append(combined_embedding1)
    positive_targets.append(combined_embedding2)

    # Negatives: any vocabulary ingredient that is neither ing1 nor one of its valid substitutes
    if ing1 not in negative_pool_cache:
        known_substitutes = valid_substitutes.get(ing1, set())
        negative_pool_cache[ing1] = [
            ing for ing in ingredient_list
            if ing != ing1 and ing not in known_substitutes
        ]
    possible_negatives = negative_pool_cache[ing1]
    # Fewer than NEGATIVES_PER_PAIR are drawn when the pool is smaller than that
    selected_negatives = random.sample(possible_negatives, min(NEGATIVES_PER_PAIR, len(possible_negatives)))

    for neg in selected_negatives:
        # Row i of the anchor list is paired with row i of the negative-target list
        negative_target_rows.append(
            get_combined_embedding(neg, recipe_id, model.wv, graph_embeddings, recipe_id_embeddings)
        )
        negative_anchor_rows.append(combined_embedding1)

# Convert to tensors.
# Each list is first stacked into a single ndarray: calling torch.tensor directly
# on a list of ndarrays is very slow and emits a UserWarning.
train_data = torch.tensor(np.asarray(positive_inputs), dtype=torch.float32)
train_labels = torch.tensor(np.asarray(positive_targets), dtype=torch.float32)
negative_samples = torch.tensor(np.asarray(negative_anchor_rows), dtype=torch.float32)
negative_labels = torch.tensor(np.asarray(negative_target_rows), dtype=torch.float32)

# Define the neural network
class CombinedNN(nn.Module):
    """Feed-forward network: Linear(input_dim, 128) -> ReLU -> Dropout(0.5) -> Linear(128, output_dim)."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        # Layers are created in the same order as before (fc1, fc2, dropout) so
        # parameter initialisation and state_dict keys are unchanged.
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, output_dim)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Map a batch of inputs through the hidden layer and return the raw output of fc2."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(self.dropout(hidden))

# Instantiate the model
nn_model = CombinedNN(input_dim=300, output_dim=300)  # Combined embedding dimension is 300 (100 text + 100 graph + 100 recipe)
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.001)

NUM_EPOCHS = 50
# Negative pairs are penalised only while their cosine similarity exceeds this margin.
# NOTE(review): 0.5 is a starting value, not a tuned one — adjust as needed.
NEGATIVE_MARGIN = 0.5

# Training loop with negative sampling (full batch)
for epoch in range(NUM_EPOCHS):
    nn_model.train()
    optimizer.zero_grad()

    # Positive pair loss: the prediction for an ingredient should match its substitute's embedding
    outputs = nn_model(train_data)
    loss = criterion(outputs, train_labels)

    # Negative pair loss (margin-based).
    # negative_samples[i] is the anchor embedding paired with the non-substitute
    # embedding negative_labels[i], so the two tensors are already row-aligned.
    # The previous version never used negative_labels, assumed exactly 100
    # negatives per pair, and its hinge relu(1 - dot) rewarded similarity.
    if negative_samples.shape[0] > 0:
        neg_outputs = nn_model(negative_samples)
        neg_similarity = F.cosine_similarity(neg_outputs, negative_labels, dim=1)
        negative_loss = torch.relu(neg_similarity - NEGATIVE_MARGIN).mean()
    else:
        # No negatives were sampled: avoid the NaN produced by the mean of an empty tensor
        negative_loss = outputs.new_zeros(())

    total_loss = loss + negative_loss
    total_loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {total_loss.item()}')

# Leave the network in inference mode so dropout is disabled for any later use
nn_model.eval()

# Function to extract ingredients from a specific recipe based on recipeID
def extract_ingredients_for_recipe(recipe_id, recipe_df, ingredient_list):
    """Return the ingredient tokens found in the instructions of one recipe.

    Args:
        recipe_id: Value matched against the 'id' column of ``recipe_df``.
        recipe_df: DataFrame with 'id' and 'processed_instructions' columns.
        ingredient_list: Container of ingredient names passed through to
            ``extract_ingredients_from_instructions``.

    Returns:
        list: Extracted tokens for the first row whose id matches; an empty list
        when no row matches or when that row's instructions are not a list.
    """
    matches = recipe_df.loc[recipe_df['id'] == recipe_id]
    if matches.empty:
        return []

    # Only the first matching row is consulted
    steps = matches['processed_instructions'].iloc[0]
    if not isinstance(steps, list):
        return []
    return extract_ingredients_from_instructions(steps, ingredient_list)




  train_data = torch.tensor(train_data, dtype=torch.float32)


Epoch 1, Loss: 0.6585375666618347
Epoch 2, Loss: 0.6332034468650818
Epoch 3, Loss: 0.6154450178146362
Epoch 4, Loss: 0.6027324199676514
Epoch 5, Loss: 0.5927829146385193
Epoch 6, Loss: 0.5849650502204895
Epoch 7, Loss: 0.5780840516090393
Epoch 8, Loss: 0.5722802877426147
Epoch 9, Loss: 0.5666438937187195
Epoch 10, Loss: 0.5614410042762756
Epoch 11, Loss: 0.5562489628791809
Epoch 12, Loss: 0.551291823387146
Epoch 13, Loss: 0.5468875169754028
Epoch 14, Loss: 0.5423744916915894
Epoch 15, Loss: 0.5383607745170593
Epoch 16, Loss: 0.5343826413154602
Epoch 17, Loss: 0.530830979347229
Epoch 18, Loss: 0.5274160504341125
Epoch 19, Loss: 0.5240288376808167
Epoch 20, Loss: 0.5213919878005981
Epoch 21, Loss: 0.5182521343231201
Epoch 22, Loss: 0.5156731605529785
Epoch 23, Loss: 0.5132817625999451
Epoch 24, Loss: 0.5109080672264099
Epoch 25, Loss: 0.5088294148445129
Epoch 26, Loss: 0.5069332718849182
Epoch 27, Loss: 0.5049417018890381
Epoch 28, Loss: 0.5033687949180603
Epoch 29, Loss: 0.5016764998435

In [None]:
# Modified function to filter valid substitutes for a given recipeID and ingredient
def get_valid_substitutes(recipe_id, ingredient, recipe_df, model, graph_embeddings, recipe_id_embeddings, valid_substitutes, top_n=10):
    """Rank the known substitutes of an ingredient that also occur in the given recipe.

    The recipe's ingredients are extracted with the module-level ``ingredient_list``.
    Candidates are the entries of ``valid_substitutes[ingredient]`` that appear among
    those extracted ingredients; they are scored by cosine similarity between the
    combined embeddings of the candidate and of ``ingredient``.

    Args:
        recipe_id: Recipe whose instructions provide the context.
        ingredient: Ingredient to replace; must occur in the recipe's extracted ingredients.
        recipe_df: DataFrame passed to ``extract_ingredients_for_recipe``.
        model: Object whose ``.wv`` attribute supplies the text embeddings.
        graph_embeddings: Graph embedding lookup passed to ``get_combined_embedding``.
        recipe_id_embeddings: Recipe-ID embedding lookup passed to ``get_combined_embedding``.
        valid_substitutes: Mapping from ingredient to its collection of substitutes.
        top_n: Maximum number of substitutes returned.

    Returns:
        list: Up to ``top_n`` substitute names, most similar first; empty when the
        ingredient is not in the recipe or no candidate remains. The scored
        (name, similarity) pairs and both failure messages are printed.
    """
    def embed(name):
        return get_combined_embedding(name, recipe_id, model.wv, graph_embeddings, recipe_id_embeddings)

    # Ingredients mentioned in the recipe form the context
    context_ingredients = extract_ingredients_for_recipe(recipe_id, recipe_df, ingredient_list)
    if ingredient not in context_ingredients:
        print(f"Ingredient {ingredient} not found in recipe {recipe_id}.")
        return []

    ingredient_embedding = embed(ingredient)

    # Keep only the known substitutes that are themselves present in the recipe
    candidates = [
        name for name in valid_substitutes.get(ingredient, [])
        if name in context_ingredients
    ]
    if len(candidates) == 0:
        print(f"No valid substitutes found in context for ingredient {ingredient} in recipe {recipe_id}.")
        return []

    # Score every candidate against the query ingredient
    query = ingredient_embedding.reshape(1, -1)
    scored = [
        (name, cosine_similarity(query, embed(name).reshape(1, -1))[0][0])
        for name in candidates
    ]

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    substitutes = ranked[:top_n]
    print(substitutes)
    return [name for name, _ in substitutes]

# Example query: which known substitutes of one ingredient fit one specific recipe?
recipe_id = "1c448c5d40"  # Recipe to use as context
ingredient_to_replace = "white_chocolate"  # Ingredient to find substitutes for

top_substitutes = get_valid_substitutes(
    recipe_id=recipe_id,
    ingredient=ingredient_to_replace,
    recipe_df=recipe1m_df,
    model=model,
    graph_embeddings=graph_embeddings,
    recipe_id_embeddings=recipe_id_embeddings,
    valid_substitutes=valid_substitutes,
)

print(f"Top substitutes for {ingredient_to_replace} in recipe {recipe_id}: {top_substitutes}")

[('milk_chocolate', 0.7684454057091302), ('chocolate', 0.7018237058410502), ('peanut_butter_chip', 0.4894215049097287), ('butterscotch_chip', 0.45943049943650993), ('orange', 0.23705233600884962)]
Top substitutes for white_chocolate in recipe 1c448c5d40: ['milk_chocolate', 'chocolate', 'peanut_butter_chip', 'butterscotch_chip', 'orange']


In [None]:
import os

import torch

# Define the path where you want to save the model and other components
model_save_path = '/content/drive/My Drive/ERP/RECIPE_SPECIFIC/saved_model.pth'
embeddings_save_path = '/content/drive/My Drive/ERP/RECIPE_SPECIFIC/saved_embeddings.pkl'

# Create the target folder(s) first: torch.save and open(..., 'wb') both fail
# when the parent directory is missing, which would discard the trained model.
for save_path in (model_save_path, embeddings_save_path):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save the model's state dictionary together with what is needed to resume training
torch.save({
    'model_state_dict': nn_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,  # Index of the last epoch that ran (0-based loop variable)
}, model_save_path)

# Save the embeddings (text, graph, and recipe ID embeddings) used to build the model inputs
embeddings_data = {
    'text_embeddings': model.wv,
    'graph_embeddings': graph_embeddings,
    'recipe_id_embeddings': recipe_id_embeddings
}

with open(embeddings_save_path, 'wb') as f:
    pickle.dump(embeddings_data, f)

print(f"Model saved to {model_save_path}")
print(f"Embeddings saved to {embeddings_save_path}")

Model saved to /content/drive/My Drive/ERP/RECIPE_SPECIFIC/saved_model.pth
Embeddings saved to /content/drive/My Drive/ERP/RECIPE_SPECIFIC/saved_embeddings.pkl
