<a href="https://colab.research.google.com/github/JITHIN-ANTONY-JOSEPH/ERP_11358080/blob/main/11_RECIPE_SPECIFIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Input : Recipe Instructions
### Model : Word2Vec (Text) + Node2Vec (Graph) + Attention-based Neural Network (Model) with Negative Sampling -> V1 (BATCH SIZE = 128) + RECIPE SPECIFIC
**PROPOSED BEST FINAL MODEL**

### Mounting to connect to Google Drive

In [None]:
# Colab-only helper that exposes Google Drive to this runtime.
from google.colab import drive
# Mount the Drive at /content/drive; the dataset paths used later in this notebook are rooted there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Importing required libraries, loading datasets and pre-processing

In [None]:
from gensim.models import Word2Vec

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import random
import pickle
import json

# Load the main dataset. The file is read line by line, one JSON document per line.
# All paths below point at the author's personal Google Drive; adjust them for your setup.
with open('/content/drive/My Drive/ERP/modified_Processed_Layer1.json', 'r', encoding='utf-8') as file:
    # Skip blank lines (e.g. a trailing newline) so json.loads never receives an empty string.
    recipe1m_data = [json.loads(line) for line in file if line.strip()]
recipe1m_df = pd.DataFrame(recipe1m_data)

# Load the substitution pairs
substitution_pairs_df = pd.read_csv('/content/drive/My Drive/ERP/Recipe1MSubs_full.csv')

# Load flavor graph nodes
flavorgraph_df = pd.read_csv('/content/drive/My Drive/ERP/Dataset/nodes_191120.csv')

# Load the precomputed graph embeddings.
# SECURITY NOTE: pickle.load can execute arbitrary code, so only load a file you produced yourself.
with open('/content/drive/My Drive/ERP/MODEL_BEST_NUMBERS/graph_embeddings.pkl', 'rb') as f:
    graph_embeddings = pickle.load(f)

# Ingredient list for NER-like extraction: unique, non-null names of nodes typed 'ingredient'
ingredient_list = set(flavorgraph_df[flavorgraph_df['node_type'] == 'ingredient']['name'].dropna().unique())

# Random embeddings for recipe IDs, one vector per unique recipe ID.
# A dedicated seeded generator makes the vectors identical across reruns, so the
# embeddings pickled at the end of the notebook can be regenerated exactly.
RECIPE_EMBEDDING_SEED = 42
RECIPE_EMBEDDING_DIM = 100
recipe_id_list = recipe1m_df['id'].unique()  # Get all unique recipe IDs
_recipe_rng = np.random.default_rng(RECIPE_EMBEDDING_SEED)
recipe_id_embeddings = {
    recipe_id: _recipe_rng.random(RECIPE_EMBEDDING_DIM)  # uniform in [0, 1), like np.random.rand
    for recipe_id in recipe_id_list
}

# Function to extract ingredients from instructions
def extract_ingredients_from_instructions(instructions, ingredient_list):
    """Return the instruction tokens that are known ingredient names.

    Each instruction string is split on whitespace and every token that is a
    member of ``ingredient_list`` is kept.

    Args:
        instructions: Iterable of instruction strings.
        ingredient_list: Container of ingredient names used for membership tests.

    Returns:
        list: Matching tokens in order of appearance; repeated mentions are kept.
    """
    return [
        token
        for step in instructions
        for token in step.split()
        if token in ingredient_list
    ]

# Tag every recipe with the ingredient tokens found in its processed instructions.
def _extract_or_empty(instructions):
    """Run the extractor on list-valued instructions; anything else yields an empty list."""
    if not isinstance(instructions, list):
        return []
    return extract_ingredients_from_instructions(instructions, ingredient_list)

recipe1m_df['extracted_ingredients'] = recipe1m_df['processed_instructions'].apply(_extract_or_empty)

# Word2Vec corpus: one "sentence" per recipe (its extracted ingredients) ...
sentences = recipe1m_df['extracted_ingredients'].tolist()

# ... plus one two-token sentence per substitution pair, so paired ingredients share a context
sentences.extend(
    [first, second]
    for first, second in zip(substitution_pairs_df['ingredient1'], substitution_pairs_df['ingredient2'])
)

# Train the Word2Vec model (100-dimensional vectors, every token kept because min_count=1)
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=8)



### Function to generate combined embedding, defining the neural network and running the model

In [None]:
# Map every ingredient1 to the set of ingredient2 values it is paired with in the substitution table
valid_substitutes = {}
for source, target in zip(substitution_pairs_df['ingredient1'], substitution_pairs_df['ingredient2']):
    # setdefault creates the set the first time a source ingredient is seen
    valid_substitutes.setdefault(source, set()).add(target)

# Function to combine text, graph, and recipe ID embeddings
def get_combined_embedding(ingredient, recipe_id, text_embeddings, graph_embeddings, recipe_id_embeddings, embedding_dim=100):
    """Concatenate the text, graph and recipe-ID vectors for one (ingredient, recipe) pair.

    Each lookup table only needs to support ``key in table`` and ``table[key]``.
    A key that is missing from a table contributes a zero vector of length
    ``embedding_dim`` instead.

    Args:
        ingredient: Key looked up in ``text_embeddings`` and ``graph_embeddings``.
        recipe_id: Key looked up in ``recipe_id_embeddings``.
        text_embeddings: Lookup table of text vectors.
        graph_embeddings: Lookup table of graph vectors.
        recipe_id_embeddings: Lookup table of recipe-ID vectors.
        embedding_dim: Length of the zero vector used for a missing key
            (defaults to 100, the value previously hard-coded).

    Returns:
        numpy.ndarray: 1-D array ``[text | graph | recipe]``.
    """
    def _lookup(table, key):
        # Fall back to zeros so the concatenated length stays constant for unknown keys
        return table[key] if key in table else np.zeros(embedding_dim)

    return np.concatenate((
        _lookup(text_embeddings, ingredient),
        _lookup(graph_embeddings, ingredient),
        _lookup(recipe_id_embeddings, recipe_id),
    ))

# Prepare training data with negative sampling.
# For every substitution pair (ing1 -> ing2 within recipe_id):
#   train_data[j]   = combined embedding of ing1
#   train_labels[j] = combined embedding of ing2
# and, for each sampled negative of that pair (appended consecutively, in pair order):
#   negative_samples[...] = combined embedding of ing1 (the anchor, repeated)
#   negative_labels[...]  = combined embedding of the negative ingredient
NUM_NEGATIVES_PER_PAIR = 100

train_data = []
train_labels = []
negative_samples = []
negative_labels = []

# The candidate negatives depend only on ing1, so each list is built once and reused
# instead of rescanning the whole ingredient vocabulary for every row.
_negative_candidates = {}

for ing1, ing2, recipe_id in zip(
    substitution_pairs_df['ingredient1'],
    substitution_pairs_df['ingredient2'],
    substitution_pairs_df['recipe_id'],  # Use recipe_id in this model
):
    combined_embedding1 = get_combined_embedding(ing1, recipe_id, model.wv, graph_embeddings, recipe_id_embeddings)
    combined_embedding2 = get_combined_embedding(ing2, recipe_id, model.wv, graph_embeddings, recipe_id_embeddings)

    train_data.append(combined_embedding1)
    train_labels.append(combined_embedding2)

    # Negatives are ingredients that are neither ing1 itself nor one of its known substitutes
    possible_negatives = _negative_candidates.get(ing1)
    if possible_negatives is None:
        excluded = valid_substitutes.get(ing1, set())
        possible_negatives = [ing for ing in ingredient_list if ing != ing1 and ing not in excluded]
        _negative_candidates[ing1] = possible_negatives
    selected_negatives = random.sample(possible_negatives, min(NUM_NEGATIVES_PER_PAIR, len(possible_negatives)))

    for neg in selected_negatives:
        neg_embedding = get_combined_embedding(neg, recipe_id, model.wv, graph_embeddings, recipe_id_embeddings)
        negative_labels.append(neg_embedding)
        negative_samples.append(combined_embedding1)

# Convert to tensors. Going through a single float32 NumPy array avoids the very slow
# element-by-element path (and the UserWarning) of torch.tensor(list_of_ndarrays).
train_data = torch.from_numpy(np.asarray(train_data, dtype=np.float32))
train_labels = torch.from_numpy(np.asarray(train_labels, dtype=np.float32))
negative_samples = torch.from_numpy(np.asarray(negative_samples, dtype=np.float32))
negative_labels = torch.from_numpy(np.asarray(negative_labels, dtype=np.float32))

# Define the neural network with attention
class AttentionLayer(nn.Module):
    """Additive attention pooling over the sequence axis.

    A two-layer scorer (``Linear -> tanh -> Linear``) assigns one scalar score to
    every position; the scores are softmax-normalised across positions and used
    to take a weighted sum of the inputs.

    Args:
        input_dim: Size of the last axis of the input.
        attention_dim: Hidden size of the scorer.
    """

    def __init__(self, input_dim, attention_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, attention_dim)
        self.fc2 = nn.Linear(attention_dim, 1)

    def forward(self, x):
        """Pool ``x`` of shape [batch, seq_len, input_dim] into [batch, input_dim]."""
        hidden = torch.tanh(self.fc1(x))
        scores = self.fc2(hidden).squeeze(-1)      # one score per position: [batch, seq_len]
        weights = F.softmax(scores, dim=-1)        # normalise across positions

        # Weight each position and collapse the sequence axis
        return (weights.unsqueeze(-1) * x).sum(dim=1)

class CombinedNN(nn.Module):
    """Attention pooling followed by a two-layer feed-forward head.

    Args:
        input_dim: Size of each input vector.
        output_dim: Size of the produced vector.
        attention_dim: Hidden size of the attention scorer (default 64).
    """

    def __init__(self, input_dim, output_dim, attention_dim=64):
        super().__init__()
        self.attention_layer = AttentionLayer(input_dim, attention_dim)
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map [batch, input_dim] or [batch, seq_len, input_dim] input to [batch, output_dim]."""
        # A 2-D batch is promoted to a length-1 sequence so the attention layer gets 3-D input.
        # NOTE(review): with a single position the softmax weight is exactly 1.0, so the
        # attention layer returns such input unchanged.
        sequence = x.unsqueeze(1) if x.dim() == 2 else x
        pooled = self.attention_layer(sequence)

        # Feed-forward head: Linear -> ReLU -> Dropout -> Linear
        hidden = self.dropout(F.relu(self.fc1(pooled)))
        return self.fc2(hidden)

# Instantiate the model
torch.manual_seed(42)  # fixed seed so weight initialisation and batch shuffling are repeatable
nn_model = CombinedNN(input_dim=300, output_dim=300)  # Combined embedding dimension is 300 (100 text + 100 graph + 100 recipe)
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.001)

# Training loop with negative sampling
batch_size = 128  # Define your batch size here
num_epochs = 50  # Number of epochs
negative_margin = 1.0  # required gap between distance-to-negative and distance-to-positive

# The data-preparation cell appends the negatives of each substitution pair consecutively
# and in pair order. With a constant number k of negatives per pair, the negatives of
# pair j therefore occupy rows [j*k, (j+1)*k) of negative_labels. Divisibility is a
# necessary sanity check for that layout; fail loudly instead of training on misaligned rows.
num_pairs = train_data.size(0)
num_negatives = negative_labels.size(0)
if num_pairs == 0 or num_negatives == 0 or num_negatives % num_pairs != 0:
    raise ValueError(
        f"Cannot align {num_negatives} negative rows with {num_pairs} training pairs; "
        "expected the same non-zero number of negatives for every pair."
    )
negatives_per_pair = num_negatives // num_pairs
negative_offsets = torch.arange(negatives_per_pair)

for epoch in range(num_epochs):
    nn_model.train()
    permutation = torch.randperm(num_pairs)
    epoch_loss = 0.0
    num_batches = 0

    for i in range(0, num_pairs, batch_size):
        indices = permutation[i:i + batch_size]
        batch_data = train_data[indices]
        batch_labels = train_labels[indices]

        optimizer.zero_grad()

        # Forward pass: predicted substitute embedding vs. the true substitute embedding
        outputs = nn_model(batch_data)
        loss = criterion(outputs, batch_labels)

        # Negative-sampling term. Gather the negatives that belong to exactly the pairs in
        # this batch (the previous code indexed the negative tensors with the positive
        # permutation and never used negative_labels), then require each prediction to be
        # closer to its true substitute than to each of its negatives by `negative_margin`.
        neg_indices = (indices.unsqueeze(1) * negatives_per_pair + negative_offsets).reshape(-1)
        batch_neg_labels = negative_labels[neg_indices]
        expanded_outputs = outputs.repeat_interleave(negatives_per_pair, dim=0)
        expanded_labels = batch_labels.repeat_interleave(negatives_per_pair, dim=0)
        negative_loss = F.triplet_margin_loss(
            expanded_outputs, expanded_labels, batch_neg_labels, margin=negative_margin
        )

        total_loss = loss + negative_loss
        total_loss.backward()
        optimizer.step()

        epoch_loss += total_loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than only the last mini-batch
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / num_batches}')

# Function to extract ingredients for a specific recipe based on recipeID
def extract_ingredients_for_recipe(recipe_id, recipe_df, ingredient_list):
    """Return the ingredient tokens mentioned in one recipe's processed instructions.

    Args:
        recipe_id: Value matched against the ``id`` column of ``recipe_df``.
        recipe_df: DataFrame with ``id`` and ``processed_instructions`` columns.
        ingredient_list: Container of known ingredient names.

    Returns:
        list: Extracted ingredient tokens of the first matching row, or an empty
        list when no row matches or its instructions are not a list.
    """
    matches = recipe_df[recipe_df['id'] == recipe_id]
    if matches.empty:
        return []

    steps = matches.iloc[0]['processed_instructions']
    if not isinstance(steps, list):
        return []
    return extract_ingredients_from_instructions(steps, ingredient_list)

# Modified function to filter valid substitutes for a given recipeID and ingredient
def get_valid_substitutes(recipe_id, ingredient, recipe_df, model, graph_embeddings, recipe_id_embeddings, valid_substitutes, top_n=10):
    """Rank the known substitutes of ``ingredient`` that also appear in the given recipe.

    Candidates are the entries of ``valid_substitutes[ingredient]`` that occur among
    the ingredients extracted from the recipe's instructions. They are ordered by
    cosine similarity between their combined embedding and that of ``ingredient``;
    no neural-network forward pass happens in this function. The vocabulary used
    for extraction is the module-level ``ingredient_list``.

    Args:
        recipe_id: ID of the recipe providing the context.
        ingredient: Ingredient to replace; it must occur in the recipe.
        recipe_df: DataFrame with ``id`` and ``processed_instructions`` columns.
        model: Object whose ``.wv`` attribute supplies the text embeddings.
        graph_embeddings: Lookup table of graph embeddings.
        recipe_id_embeddings: Lookup table of recipe-ID embeddings.
        valid_substitutes: Mapping from ingredient to its known substitutes.
        top_n: Maximum number of substitutes returned.

    Returns:
        list: Up to ``top_n`` substitute names, most similar first. Empty when the
        ingredient is not in the recipe or no candidate remains (a message is printed).
    """
    def embed(name):
        # Combined embedding of `name` in the context of this recipe
        return get_combined_embedding(name, recipe_id, model.wv, graph_embeddings, recipe_id_embeddings)

    # Ingredients mentioned in the recipe form the substitution context
    context_ingredients = extract_ingredients_for_recipe(recipe_id, recipe_df, ingredient_list)
    if ingredient not in context_ingredients:
        print(f"Ingredient {ingredient} not found in recipe {recipe_id}.")
        return []

    query_vector = embed(ingredient).reshape(1, -1)

    # Keep only known substitutes that are themselves part of the recipe
    candidates = [
        candidate
        for candidate in valid_substitutes.get(ingredient, [])
        if candidate in context_ingredients
    ]
    if not candidates:
        print(f"No valid substitutes found in context for ingredient {ingredient} in recipe {recipe_id}.")
        return []

    # Score every candidate, then keep the top_n most similar ones
    scored = [
        (candidate, cosine_similarity(query_vector, embed(candidate).reshape(1, -1))[0][0])
        for candidate in candidates
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    substitutes = scored[:top_n]
    print(substitutes)
    return [name for name, _score in substitutes]



  train_data = torch.tensor(train_data, dtype=torch.float32)


Epoch 1/50, Loss: 0.48097413778305054
Epoch 2/50, Loss: 0.4775213599205017
Epoch 3/50, Loss: 0.47219714522361755
Epoch 4/50, Loss: 0.4489995539188385
Epoch 5/50, Loss: 0.4367998540401459
Epoch 6/50, Loss: 0.48261579871177673
Epoch 7/50, Loss: 0.4547628164291382
Epoch 8/50, Loss: 0.4538419246673584
Epoch 9/50, Loss: 0.45715194940567017
Epoch 10/50, Loss: 0.46239030361175537
Epoch 11/50, Loss: 0.46768951416015625
Epoch 12/50, Loss: 0.43980035185813904
Epoch 13/50, Loss: 0.4363708794116974
Epoch 14/50, Loss: 0.44432759284973145
Epoch 15/50, Loss: 0.4821905791759491
Epoch 16/50, Loss: 0.43044084310531616
Epoch 17/50, Loss: 0.45188960433006287
Epoch 18/50, Loss: 0.44796231389045715
Epoch 19/50, Loss: 0.46891504526138306
Epoch 20/50, Loss: 0.4287979006767273
Epoch 21/50, Loss: 0.4378634989261627
Epoch 22/50, Loss: 0.4695764482021332
Epoch 23/50, Loss: 0.4602090120315552
Epoch 24/50, Loss: 0.48366451263427734
Epoch 25/50, Loss: 0.4418071508407593
Epoch 26/50, Loss: 0.45714786648750305
Epoch 2

### Getting valid substitutes - Validation

In [None]:
# Example usage: rank the in-recipe substitutes for one ingredient of one recipe
recipe_id = "0032493a22"  # Replace with the actual recipe ID
ingredient_to_replace = "vanilla"  # Replace with the ingredient you want to substitute

top_substitutes = get_valid_substitutes(
    recipe_id,
    ingredient_to_replace,
    recipe1m_df,
    model,
    graph_embeddings,
    recipe_id_embeddings,
    valid_substitutes,
)

print(f"Top substitutes for {ingredient_to_replace} in recipe {recipe_id}: {top_substitutes}")

[('almond_extract', 0.6400206914192298), ('chocolate', 0.5323698887546771), ('sugar', 0.5191019816008007), ('flour', 0.4456575118104007), ('butter', 0.33350217272881116), ('lemon_rind', 0.33138547555210773), ('cream', 0.2942671854626668), ('lemon_juice', 0.08823140024502203)]
Top substitutes for vanilla in recipe 0032493a22: ['almond_extract', 'chocolate', 'sugar', 'flour', 'butter', 'lemon_rind', 'cream', 'lemon_juice']


### Saving the model and embeddings

In [None]:
import os

import torch

# Define the path where you want to save the model and other components
model_save_path = '/content/drive/My Drive/ERP/RECIPE_SPECIFIC_ATTENTION/saved_model.pth'
embeddings_save_path = '/content/drive/My Drive/ERP/RECIPE_SPECIFIC_ATTENTION/saved_embeddings.pkl'

# Create the target folders up front: neither torch.save nor open(..., 'wb') creates
# missing parent directories, and failing here would lose the trained model.
for _save_path in (model_save_path, embeddings_save_path):
    os.makedirs(os.path.dirname(_save_path), exist_ok=True)

# Save a checkpoint: network weights, optimizer state and the index of the last
# completed epoch (useful if training is resumed later)
torch.save({
    'model_state_dict': nn_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,
}, model_save_path)

# Save the embeddings (text, graph, and recipe ID embeddings) used to build the model inputs
embeddings_data = {
    'text_embeddings': model.wv,
    'graph_embeddings': graph_embeddings,
    'recipe_id_embeddings': recipe_id_embeddings
}

with open(embeddings_save_path, 'wb') as f:
    pickle.dump(embeddings_data, f)

print(f"Model saved to {model_save_path}")
print(f"Embeddings saved to {embeddings_save_path}")

Model saved to /content/drive/My Drive/ERP/RECIPE_SPECIFIC_ATTENTION/saved_model.pth
Embeddings saved to /content/drive/My Drive/ERP/RECIPE_SPECIFIC_ATTENTION/saved_embeddings.pkl
