<a href="https://colab.research.google.com/github/JITHIN-ANTONY-JOSEPH/ERP_11358080/blob/main/9_Experiment8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Input : Recipe Instructions
### Model: Word2Vec (text) + Node2Vec (graph) + Neural Network (model) with Negative Sampling

### Mounting to connect to Google Drive

In [None]:
# Mount Google Drive at /content/drive; every dataset and model path used later
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Installing required libraries

In [None]:
!pip install node2vec



### Importing required libraries, loading datasets, pre-processing, combined embeddings, negative sampling, final model and evaluation

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import json
import jellyfish
import random

# NOTE: every path below points at the original author's personal Google Drive;
# adjust them to wherever the files live in your environment.

# Load the main dataset. The file is read line by line with one JSON object per
# line. The encoding is stated explicitly so the result does not depend on the
# platform default, and blank lines (e.g. a trailing newline) are skipped so
# json.loads never receives an empty string.
with open('/content/drive/My Drive/ERP/modified_Processed_Layer1.json', 'r', encoding='utf-8') as file:
    recipe1m_data = [json.loads(line) for line in file if line.strip()]

recipe1m_df = pd.DataFrame(recipe1m_data)

# Load the substitution pairs (columns 'ingredient1' and 'ingredient2' are used below)
substitution_pairs_df = pd.read_csv('/content/drive/My Drive/ERP/Recipe1MSubs_full.csv')

# Load the flavor graph node table
flavorgraph_df = pd.read_csv('/content/drive/My Drive/ERP/Dataset/nodes_191120.csv')

# Vocabulary of known ingredient names, taken from the rows whose node_type is
# 'ingredient'. Despite the name this is a set, which keeps the per-token
# membership tests during extraction cheap.
ingredient_list = set(flavorgraph_df[flavorgraph_df['node_type'] == 'ingredient']['name'].dropna().unique())

# Dictionary-based ingredient extraction from instruction text
def extract_ingredients_from_instructions(instructions, ingredient_list):
    """Return every whitespace-separated token that is a known ingredient.

    Args:
        instructions: iterable of instruction strings.
        ingredient_list: container of known ingredient names; only exact,
            case-sensitive matches of a whole token count.

    Returns:
        A list of matched tokens in order of appearance; repeats are kept.
    """
    return [
        token
        for step in instructions
        for token in step.split()
        if token in ingredient_list
    ]

# Extract the known ingredient tokens from each recipe's instructions.
# Rows whose 'processed_instructions' value is not a list yield an empty list.
def _ingredients_or_empty(instructions):
    if not isinstance(instructions, list):
        return []
    return extract_ingredients_from_instructions(instructions, ingredient_list)

recipe1m_df['extracted_ingredients'] = recipe1m_df['processed_instructions'].apply(_ingredients_or_empty)

# Word2Vec corpus: one "sentence" of ingredient tokens per recipe ...
sentences = recipe1m_df['extracted_ingredients'].tolist()

# ... plus one two-token sentence per substitution pair, so an ingredient and
# its substitute appear inside the same context window.
sentences.extend(
    [first, second]
    for first, second in zip(substitution_pairs_df['ingredient1'], substitution_pairs_df['ingredient2'])
)

# Train the Word2Vec model (100-dimensional vectors; min_count=1 keeps every token)
from gensim.models import Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=8)

# Map each source ingredient to the set of ingredients listed as its substitute.
# The mapping is directional: only ingredient1 -> ingredient2 is recorded.
valid_substitutes = {}
for source, substitute in zip(substitution_pairs_df['ingredient1'], substitution_pairs_df['ingredient2']):
    valid_substitutes.setdefault(source, set()).add(substitute)

# Import networkx and Node2Vec for the graph
import networkx as nx
from node2vec import Node2Vec
from joblib import Parallel, delayed

# Load the knowledge graph from its GraphML file. Later cells read the
# 'node_type' attribute of each node and pass 'weight' as the edge-weight key
# to Node2Vec.
flavor_graph = nx.read_graphml('/content/drive/My Drive/ERP/knowledge_graph.graphml')

# Predicate used to keep only ingredient nodes of the knowledge graph
def filter_ingredient_nodes(node, attr):
    """Return ``node`` if its attributes mark it as an ingredient, else ``None``.

    Args:
        node: the node identifier.
        attr: the node's attribute mapping.

    Returns:
        ``node`` when ``attr['node_type'] == 'ingredient'``; ``None`` otherwise,
        including when the node has no ``'node_type'`` attribute at all.
    """
    # .get() so a node without a 'node_type' attribute is treated as a
    # non-ingredient instead of aborting the whole filtering pass with KeyError.
    return node if attr.get('node_type') == 'ingredient' else None

# Keep only the ingredient nodes. The predicate is a single attribute lookup per
# node, so a plain comprehension is used: dispatching each node to a joblib
# worker costs far more than the check itself and yields the same list.
ingredient_nodes = [
    node
    for node, attr in flavor_graph.nodes(data=True)
    if filter_ingredient_nodes(node, attr) is not None
]

# Create a subgraph with only ingredient nodes
flavor_graph = flavor_graph.subgraph(ingredient_nodes)

# Generate Node2Vec embeddings considering edge weights
node2vec = Node2Vec(flavor_graph, dimensions=100, walk_length=30, num_walks=200, workers=16, weight_key='weight')
graph_model = node2vec.fit(window=10, min_count=1, batch_words=128)

# Generate graph embeddings for the ingredients, keyed by the node name as a string
graph_embeddings = {str(node): graph_model.wv[str(node)] for node in flavor_graph.nodes()}

# Function to combine text and graph embeddings
def get_combined_embedding(ingredient, text_embeddings, graph_embeddings, text_dim=100, graph_dim=100):
    """Concatenate the text and graph embeddings of one ingredient.

    Args:
        ingredient: key to look up in both embedding containers.
        text_embeddings: mapping-like object supporting ``in`` and ``[]``
            that holds the text embeddings.
        graph_embeddings: mapping-like object supporting ``in`` and ``[]``
            that holds the graph embeddings.
        text_dim: length of the zero vector substituted when ``ingredient`` has
            no text embedding. Should match the text embedding width.
        graph_dim: length of the zero vector substituted when ``ingredient``
            has no graph embedding. Should match the graph embedding width.

    Returns:
        A 1-D numpy array: the text part followed by the graph part.
    """
    # Missing embeddings fall back to all-zero vectors so every ingredient
    # still yields a vector of the same total length.
    if ingredient in text_embeddings:
        text_embedding = text_embeddings[ingredient]
    else:
        text_embedding = np.zeros(text_dim)

    if ingredient in graph_embeddings:
        graph_embedding = graph_embeddings[ingredient]
    else:
        graph_embedding = np.zeros(graph_dim)

    # Combine embeddings by concatenation
    return np.concatenate((text_embedding, graph_embedding))

# Prepare training data with negative samples.
# For every (ingredient1 -> ingredient2) row we store:
#   train_data / train_labels          : embedding of ingredient1 / ingredient2
#   negative_samples / negative_labels : embedding of ingredient1 repeated once
#                                        per negative / embedding of that negative
# The rows of negative_* are grouped per positive pair, NUM_NEGATIVES at a time,
# in the same order as train_data.
NUM_NEGATIVES = 100  # negatives drawn per positive pair

train_data = []
train_labels = []
negative_samples = []
negative_labels = []

# The pool of admissible negatives depends only on ingredient1, so build it once
# per distinct ingredient instead of rescanning the vocabulary for every row.
negative_pool_cache = {}

for _, row in substitution_pairs_df.iterrows():
    ing1 = row['ingredient1']
    ing2 = row['ingredient2']

    combined_embedding1 = get_combined_embedding(ing1, model.wv, graph_embeddings)
    combined_embedding2 = get_combined_embedding(ing2, model.wv, graph_embeddings)

    train_data.append(combined_embedding1)
    train_labels.append(combined_embedding2)

    # Candidate negatives: every known ingredient except ing1 itself and the
    # ingredients recorded as valid substitutes of ing1.
    if ing1 not in negative_pool_cache:
        known_substitutes = valid_substitutes.get(ing1, set())
        negative_pool_cache[ing1] = [
            ing for ing in ingredient_list
            if ing != ing1 and ing not in known_substitutes
        ]
    possible_negatives = negative_pool_cache[ing1]

    if not possible_negatives:
        raise ValueError(f"No negative candidates available for ingredient {ing1!r}")
    if len(possible_negatives) >= NUM_NEGATIVES:
        selected_negatives = random.sample(possible_negatives, NUM_NEGATIVES)
    else:
        # Too few distinct candidates: draw with replacement so that every
        # positive still has exactly NUM_NEGATIVES negatives. The training loop
        # relies on that fixed count to align negatives with positives.
        selected_negatives = random.choices(possible_negatives, k=NUM_NEGATIVES)

    for neg in selected_negatives:
        neg_embedding = get_combined_embedding(neg, model.wv, graph_embeddings)
        negative_labels.append(neg_embedding)
        negative_samples.append(combined_embedding1)

# Convert to tensors. Each list is first packed into a single float32 ndarray:
# building a tensor straight from a list of ndarrays is very slow and triggers
# a PyTorch UserWarning.
train_data = torch.tensor(np.array(train_data, dtype=np.float32), dtype=torch.float32)
train_labels = torch.tensor(np.array(train_labels, dtype=np.float32), dtype=torch.float32)
negative_samples = torch.tensor(np.array(negative_samples, dtype=np.float32), dtype=torch.float32)
negative_labels = torch.tensor(np.array(negative_labels, dtype=np.float32), dtype=torch.float32)

# Define the neural network
class CombinedNN(nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: size of each input vector.
        output_dim: size of each output vector.
        hidden_dim: width of the hidden layer (default 128).
        dropout: dropout probability applied after the hidden activation
            (default 0.5). Dropout is only active in ``train()`` mode, so call
            ``eval()`` before inference.

    The sub-module names ``fc1``, ``fc2`` and ``dropout`` are part of the saved
    ``state_dict`` layout and must not be renamed.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128, dropout=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch of shape (..., input_dim) to shape (..., output_dim)."""
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Instantiate the model
nn_model = CombinedNN(input_dim=200, output_dim=200)  # Combined embedding dimension is 200 (100 + 100)
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.001)

# Negatives are stored grouped per positive pair, so the number of negatives per
# positive is recovered from the tensor sizes rather than hard-coded.
if train_data.shape[0] == 0 or negative_labels.shape[0] % train_data.shape[0] != 0:
    raise ValueError(
        "negative_labels must hold the same whole number of negatives for every training pair"
    )
negatives_per_positive = negative_labels.shape[0] // train_data.shape[0]

# A negative is penalised only while its cosine similarity to the prediction
# exceeds this margin. Tunable hyper-parameter.
negative_cosine_margin = 0.5

# Training loop with negative sampling (full batch)
for epoch in range(300):
    nn_model.train()
    optimizer.zero_grad()

    # Positive pair loss: the prediction for ingredient1 should match the
    # embedding of its substitute.
    outputs = nn_model(train_data)
    loss = criterion(outputs, train_labels)

    # Negative pair loss: the same prediction should NOT be similar to the
    # embeddings of sampled non-substitutes (negative_labels). Each prediction
    # is repeated once per negative so the rows line up with negative_labels.
    # Cosine similarity is used because the evaluation ranks candidates by
    # cosine similarity.
    expanded_outputs = outputs.repeat_interleave(negatives_per_positive, dim=0)
    negative_similarity = F.cosine_similarity(expanded_outputs, negative_labels, dim=1)
    negative_loss = torch.mean(torch.relu(negative_similarity - negative_cosine_margin))

    total_loss = loss + negative_loss
    total_loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {total_loss.item()}')

# Validation
# NOTE(review): these are the first 500 rows of the same substitution table the
# model was trained on, so the resulting metrics are not held-out estimates.
val_data = []
val_labels = []

for _, row in substitution_pairs_df.iterrows():
    if len(val_data) >= 500:
        break
    ing1 = row['ingredient1']
    combined_embedding = get_combined_embedding(ing1, model.wv, graph_embeddings)

    val_data.append(combined_embedding)
    val_labels.append(row['ingredient2'])

val_data = np.array(val_data)
val_data = torch.tensor(val_data, dtype=torch.float32)

# Switch to eval mode so dropout is disabled: the training loop leaves the model
# in train() mode, which would randomly zero hidden units and make the
# predictions noisy and non-deterministic. no_grad() avoids building an
# autograd graph for inference.
nn_model.eval()
with torch.no_grad():
    val_predictions = nn_model(val_data).numpy()

# Function to find the top N most similar ingredients based on cosine similarity
def find_top_similar_ingredients(predicted_embedding, combined_embeddings, top_n=10):
    """Rank candidate ingredients by cosine similarity to a predicted embedding.

    Args:
        predicted_embedding: 1-D array-like query vector.
        combined_embeddings: dict mapping ingredient name -> 1-D embedding of
            the same length as ``predicted_embedding``.
        top_n: maximum number of names to return.

    Returns:
        Up to ``top_n`` ingredient names, most similar first. Candidates with
        equal similarity keep their dictionary order. An empty dict yields [].
    """
    names = list(combined_embeddings.keys())
    if not names:
        return []

    # Score all candidates with one matrix-vector product instead of one
    # library call per candidate.
    matrix = np.stack([np.asarray(combined_embeddings[name], dtype=float).ravel() for name in names])
    query = np.asarray(predicted_embedding, dtype=float).ravel()

    # A zero vector has no direction; give it norm 1 so its similarity is 0
    # rather than a division by zero.
    matrix_norms = np.linalg.norm(matrix, axis=1)
    matrix_norms[matrix_norms == 0] = 1.0
    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        query_norm = 1.0

    similarities = (matrix @ query) / (matrix_norms * query_norm)

    # Stable sort on the negated scores: descending similarity, ties in
    # insertion order.
    ranked = np.argsort(-similarities, kind='stable')[:top_n]
    return [names[index] for index in ranked]

# Function to calculate metrics with Jaro-Winkler similarity threshold
def calculate_metrics(predictions, ground_truths, combined_embeddings, top_n=10, threshold=0.8):
    """Compute MRR and Hit@1/3/10 for predicted embeddings using fuzzy name matching.

    For each prediction the ``top_n`` nearest candidate ingredients are
    retrieved. The first candidate whose Jaro-Winkler similarity to the
    ground-truth name is at least ``threshold`` counts as the hit, and its
    1-based rank feeds the metrics.

    Args:
        predictions: iterable of predicted embedding vectors.
        ground_truths: sequence of ground-truth ingredient names, aligned with
            ``predictions``.
        combined_embeddings: dict of candidate name -> embedding.
        top_n: number of candidates retrieved per prediction.
        threshold: minimum Jaro-Winkler similarity for a candidate to match.

    Returns:
        Tuple ``(mrr, hit_1, hit_3, hit_10)``, each averaged over
        ``len(ground_truths)``. All zeros when ``ground_truths`` is empty.
    """
    mrr, hit_1, hit_3, hit_10 = 0.0, 0.0, 0.0, 0.0
    total = len(ground_truths)

    # Nothing to score: return zeros instead of dividing by zero below.
    if total == 0:
        return mrr, hit_1, hit_3, hit_10

    for pred, gt in zip(predictions, ground_truths):
        top_similar = find_top_similar_ingredients(pred, combined_embeddings, top_n=top_n)
        for rank, candidate in enumerate(top_similar, start=1):
            sim = jellyfish.jaro_winkler_similarity(gt, candidate)
            if sim >= threshold:
                mrr += 1.0 / rank
                if rank == 1:
                    hit_1 += 1.0
                if rank <= 3:
                    hit_3 += 1.0
                if rank <= 10:
                    hit_10 += 1.0
                # Only the best-ranked match counts for this prediction.
                break

    mrr /= total
    hit_1 /= total
    hit_3 /= total
    hit_10 /= total
    return mrr, hit_1, hit_3, hit_10

# Score the validation predictions (at most 500 pairs were collected above).
# The candidate pool is every token in the Word2Vec vocabulary, each
# represented by its combined text + graph embedding.
combined_embeddings = {
    ingredient: get_combined_embedding(ingredient, model.wv, graph_embeddings)
    for ingredient in model.wv.index_to_key
}

# Assuming the labels are ingredient names (strings), they are used as-is.
val_labels_str = val_labels

mrr, hit_1, hit_3, hit_10 = calculate_metrics(
    val_predictions,
    val_labels_str,
    combined_embeddings,
)

print(f"MRR: {mrr:.4f}, Hit@1: {hit_1:.4f}, Hit@3: {hit_3:.4f}, Hit@10: {hit_10:.4f}")

Computing transition probabilities:   0%|          | 0/6651 [00:00<?, ?it/s]

  train_data = torch.tensor(train_data, dtype=torch.float32)


Epoch 1, Loss: 0.8221169710159302
Epoch 2, Loss: 0.8052106499671936
Epoch 3, Loss: 0.7907443642616272
Epoch 4, Loss: 0.7783506512641907
Epoch 5, Loss: 0.767662525177002
Epoch 6, Loss: 0.7581714391708374
Epoch 7, Loss: 0.7509445548057556
Epoch 8, Loss: 0.7440974712371826
Epoch 9, Loss: 0.7390084266662598
Epoch 10, Loss: 0.7336133122444153
Epoch 11, Loss: 0.7291702032089233
Epoch 12, Loss: 0.725024402141571
Epoch 13, Loss: 0.7213757038116455
Epoch 14, Loss: 0.7177773714065552
Epoch 15, Loss: 0.7149329781532288
Epoch 16, Loss: 0.7119714021682739
Epoch 17, Loss: 0.7091203927993774
Epoch 18, Loss: 0.7065960168838501
Epoch 19, Loss: 0.7040128707885742
Epoch 20, Loss: 0.7017800211906433
Epoch 21, Loss: 0.69927978515625
Epoch 22, Loss: 0.6971295475959778
Epoch 23, Loss: 0.6952140927314758
Epoch 24, Loss: 0.6931967735290527
Epoch 25, Loss: 0.6915526390075684
Epoch 26, Loss: 0.6896356344223022
Epoch 27, Loss: 0.688218355178833
Epoch 28, Loss: 0.6866771578788757
Epoch 29, Loss: 0.6853801608085632

In [None]:
# Persist the trained artefacts to Google Drive so they can be reloaded later.
import pickle

# 1) Word2Vec model
model.save("/content/drive/My Drive/ERP/MODEL_BEST_NUMBERS/word2vec_model.model")

# 2) Neural network: only the parameters (state_dict) are stored, so loading
#    requires constructing a CombinedNN first.
nn_state = nn_model.state_dict()
torch.save(nn_state, "/content/drive/My Drive/ERP/MODEL_BEST_NUMBERS/nn_model.pth")

# 3) Graph embeddings dictionary, pickled as-is
with open("/content/drive/My Drive/ERP/MODEL_BEST_NUMBERS/graph_embeddings.pkl", "wb") as out_file:
    pickle.dump(graph_embeddings, out_file)

print("Models saved successfully!")


Models saved successfully!
