<a href="https://colab.research.google.com/github/JITHIN-ANTONY-JOSEPH/ERP_11358080/blob/main/7_Graph_Based_GCN_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive; later cells read their input files
# from paths underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Installing the required libraries

In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m61.4/64.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.5.3


### Loading the prepared knowledge graph

In [None]:
import networkx as nx
import pandas as pd
import torch
from torch_geometric.data import Data

# Read the prepared knowledge graph from its GraphML file on the mounted Drive
graphml_path = '/content/drive/My Drive/ERP/knowledge_graph.graphml'
G = nx.read_graphml(graphml_path)


### Generating a sub-graph containing only ingredient nodes

In [None]:
import torch
from torch_geometric.utils import from_networkx

# Extract subgraph with only ingredient nodes.
# .get() is used so that a node without a 'node_type' attribute is simply
# skipped instead of aborting the cell with a KeyError.
ingredient_nodes = [n for n, attr in G.nodes(data=True) if attr.get('node_type') == 'ingredient']
if not ingredient_nodes:
    # Fail early with a clear message rather than training on an empty graph.
    raise ValueError("No nodes with node_type == 'ingredient' were found in the knowledge graph.")
G_ingredients = G.subgraph(ingredient_nodes).copy()

# Convert NetworkX graph to PyTorch Geometric data object
data = from_networkx(G_ingredients)

# Node features are random placeholders (one row per node).
# Seed the generator first: without it every run draws different features,
# so the embeddings and every metric derived from them are not reproducible.
num_features = 64  # Adjust as needed
torch.manual_seed(42)
data.x = torch.randn((data.num_nodes, num_features), dtype=torch.float)

# Move data to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)


### Defining the Neural Network Architecture

In [None]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import torch
# Setting the Neural Network Architecture
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network that outputs one vector per node.

    The network applies GCNConv -> ReLU -> GCNConv; no activation follows the
    second layer, so the output is the raw ``hidden_channels``-wide embedding.

    Args:
        hidden_channels: Output width of both convolution layers.
        in_channels: Width of the input node features. When omitted, it is
            read from the notebook-level ``data`` object (the original
            behaviour), so existing ``GCN(hidden_channels=...)`` calls keep
            working; passing it explicitly makes the class usable without
            that global.
    """

    def __init__(self, hidden_channels, in_channels=None):
        super().__init__()
        # Fixed seed so the layer weights are initialised identically on every run.
        torch.manual_seed(42)
        if in_channels is None:
            in_channels = data.num_node_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

    def forward(self, data):
        """Return node embeddings computed from ``data.x`` and ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x


In [None]:
# Mean-squared-error objective, GCN placed on the selected device, and an
# Adam optimiser over the GCN's parameters.
criterion = torch.nn.MSELoss()
model = GCN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the GCN model
def train():
    """Run one full-graph optimisation step and return the loss tensor.

    The model output is compared with ``data.x`` (the input node features)
    using the module-level ``criterion``; gradients are then applied through
    the module-level ``optimizer``.
    """
    model.train()
    optimizer.zero_grad()
    step_loss = criterion(model(data), data.x)  # Adjust the target as needed
    step_loss.backward()
    optimizer.step()
    return step_loss

# Optimise for 200 epochs, reporting the loss on every 20th one
for epoch in range(200):
    loss = train()
    if epoch % 20 != 0:
        continue
    print(f'Epoch {epoch}, Loss: {loss.item()}')

# Switch to inference mode and export the node embeddings as a NumPy array
model.eval()
with torch.no_grad():
    gcn_embeddings = model(data).cpu().numpy()


Epoch 0, Loss: 1.002287745475769
Epoch 20, Loss: 0.9143369197845459
Epoch 40, Loss: 0.8254010081291199
Epoch 60, Loss: 0.7786675095558167
Epoch 80, Loss: 0.7559046745300293
Epoch 100, Loss: 0.7434118390083313
Epoch 120, Loss: 0.7362469434738159
Epoch 140, Loss: 0.7328438758850098
Epoch 160, Loss: 0.7305194139480591
Epoch 180, Loss: 0.7289988994598389


### Evaluation of the model

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pair every ingredient node name with its row of the GCN embedding matrix
# (rows are taken in the iteration order of G_ingredients.nodes()).
node_to_embedding = dict(zip(G_ingredients.nodes(), gcn_embeddings))

# Function to get substitutes for a specific ingredient name
def get_gcn_substitutes_by_name(ingredient_name, top_n=10, embeddings=None):
    """Return the ``top_n`` ingredients most similar to ``ingredient_name``.

    Similarity is the cosine similarity between embedding vectors. The query
    ingredient is removed *before* the list is cut to ``top_n``, so the result
    contains ``top_n`` substitutes whenever that many other ingredients exist
    (previously the query occupied one slot and only ``top_n - 1`` came back).

    Args:
        ingredient_name: Name of the ingredient to find substitutes for.
        top_n: Maximum number of substitutes to return.
        embeddings: Optional mapping of ingredient name -> embedding vector.
            Defaults to the notebook-level ``node_to_embedding``.

    Returns:
        List of ingredient names ordered from most to least similar; an empty
        list (after printing a message) if the ingredient is unknown.
    """
    import numpy as np  # local import: this cell runs before numpy is imported

    if embeddings is None:
        embeddings = node_to_embedding
    if ingredient_name not in embeddings:
        print(f"Ingredient '{ingredient_name}' not found in the model.")
        return []

    candidates = [name for name in embeddings if name != ingredient_name]
    if not candidates or top_n <= 0:
        return []

    query = np.asarray(embeddings[ingredient_name], dtype=float)
    matrix = np.asarray([embeddings[name] for name in candidates], dtype=float)

    # One matrix product replaces a separate similarity call per candidate.
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    norms[norms == 0] = 1.0  # zero vectors get similarity 0 instead of NaN
    similarities = matrix @ query / norms

    # Stable sort keeps the original candidate order among exact ties.
    best = np.argsort(-similarities, kind='stable')[:top_n]
    return [candidates[i] for i in best]

# Example usage: look up substitutes for a single ingredient (top_n is left
# at its default) and print the resulting list.
specific_ingredient_name = 'fruit'  # Replace with your specific ingredient name
substitutes = get_gcn_substitutes_by_name(specific_ingredient_name)
print(f"Substitutes for {specific_ingredient_name}: {substitutes}")

Substitutes for fruit: ['liquid_coffee_creamer', 'juice', 'ice_cream', 'fruit_pectin', 'fresh_fruit', 'granola_cereal', 'plum_tomato', 'hp_steak_sauce', 'fresh_cream']


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ProcessPoolExecutor
import numpy as np

# Rebuild the name -> embedding lookup so this evaluation cell can be run on
# its own once gcn_embeddings and G_ingredients exist.
node_to_embedding = dict(zip(G_ingredients.nodes(), gcn_embeddings))

# Function to generate predictions for the validation set
def generate_predictions(validation_pairs, embeddings, top_n=10):
    """Rank substitute candidates for the 'ingredient1' value of every row.

    Candidates are ordered by cosine similarity of their embedding to the
    query's embedding. The query ingredient itself is excluded: it always has
    similarity 1.0 with itself, so keeping it would permanently occupy rank 1
    and push every genuine substitute down by one position.

    Args:
        validation_pairs: Table with an 'ingredient1' column (one query per row).
        embeddings: Mapping of ingredient name -> embedding vector.
        top_n: Maximum number of candidates returned per row.

    Returns:
        One list per row, in row order, holding up to ``top_n`` ingredient
        names from most to least similar. Rows whose query has no embedding
        get an empty list.
    """
    names = list(embeddings.keys())
    if not names:
        return [[] for _ in validation_pairs['ingredient1']]

    # Normalise every embedding once so each query is a single matrix product
    # instead of one similarity call per candidate.
    matrix = np.asarray([embeddings[name] for name in names], dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # zero vectors get similarity 0 instead of NaN
    unit = matrix / norms
    index_of = {name: i for i, name in enumerate(names)}

    ranked_by_query = {}  # the same query usually appears in many rows
    predictions = []
    for ingredient1 in validation_pairs['ingredient1']:
        if ingredient1 not in index_of:
            predictions.append([])
            continue
        if ingredient1 not in ranked_by_query:
            query_idx = index_of[ingredient1]
            similarities = unit @ unit[query_idx]
            similarities[query_idx] = -np.inf  # never recommend the query itself
            # Stable sort keeps the original key order among exact ties.
            best = np.argsort(-similarities, kind='stable')[:top_n]
            ranked_by_query[ingredient1] = [names[i] for i in best if i != query_idx]
        predictions.append(list(ranked_by_query[ingredient1]))
    return predictions

# Load the substitution pairs; the 'ingredient1' and 'ingredient2' columns of
# this table are the ones read by the evaluation code in this notebook.
substitution_pairs_df = pd.read_csv('/content/drive/My Drive/ERP/Recipe1MSubs_full.csv')

# Ground-truth substitute ('ingredient2') for every row, in row order
val_ground_truths = substitution_pairs_df['ingredient2'].tolist()

# Use multiprocessing to generate predictions faster
def batch_generate_predictions(batch):
    """Generate predictions for one batch using the notebook-level embeddings.

    Kept as a module-level function of a single argument so it can be handed
    to ``executor.map``.
    """
    return generate_predictions(validation_pairs=batch, embeddings=node_to_embedding)

# Split validation pairs into batches.
# The row *positions* are split and then selected with .iloc, because calling
# np.array_split directly on a DataFrame goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning. The resulting chunks contain
# the same rows in the same order.
num_batches = 8  # Adjust based on your CPU cores
row_chunks = np.array_split(np.arange(len(substitution_pairs_df)), num_batches)
batches = [substitution_pairs_df.iloc[rows] for rows in row_chunks]

# executor.map returns results in submission order, so the flattened
# predictions stay aligned with val_ground_truths.
with ProcessPoolExecutor(max_workers=num_batches) as executor:
    results = list(executor.map(batch_generate_predictions, batches))

# Flatten the list of results
val_predictions = [item for sublist in results for item in sublist]

  return bound(*args, **kwds)


In [None]:
# Function to calculate MRR, Hit@1, Hit@3, Hit@10
def calculate_metrics(predictions, ground_truths):
    """Compute MRR, Hit@1, Hit@3 and Hit@10 for ranked predictions.

    Args:
        predictions: One ranked candidate list per example (best first).
        ground_truths: The expected item for each example, in the same order.

    Returns:
        Tuple ``(mrr, hit_1, hit_3, hit_10)``, each averaged over
        ``len(ground_truths)``. Examples whose ground truth does not appear in
        the prediction list contribute 0 to every metric. All four values are
        0.0 when ``ground_truths`` is empty (previously a ZeroDivisionError).
    """
    total = len(ground_truths)
    if total == 0:
        return 0.0, 0.0, 0.0, 0.0

    mrr = 0.0
    hit_1 = 0.0
    hit_3 = 0.0
    hit_10 = 0.0
    for pred, gt in zip(predictions, ground_truths):
        for rank, candidate in enumerate(pred, start=1):
            if gt == candidate:
                mrr += 1.0 / rank
                if rank == 1:
                    hit_1 += 1.0
                if rank <= 3:
                    hit_3 += 1.0
                if rank <= 10:
                    hit_10 += 1.0
                break  # only the first occurrence of the ground truth counts

    return mrr / total, hit_1 / total, hit_3 / total, hit_10 / total

# Calculate metrics for the GCN model.
# Both inputs are derived from every row of substitution_pairs_df, so these
# figures cover the whole loaded CSV rather than a separate held-out split.
mrr, hit_1, hit_3, hit_10 = calculate_metrics(val_predictions, val_ground_truths)

# Report all four metrics rounded to four decimal places
print(f"GCN: MRR: {mrr:.4f}, Hit@1: {hit_1:.4f}, Hit@3: {hit_3:.4f}, Hit@10: {hit_10:.4f}")


GCN: MRR: 0.0111, Hit@1: 0.0000, Hit@3: 0.0163, Hit@10: 0.0407
