<a href="https://colab.research.google.com/github/JITHIN-ANTONY-JOSEPH/ERP_11358080/blob/main/7_Graph_Based_Node2Vec_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting Google Drive

In [None]:
# Colab-specific helper module used to attach Google Drive to the runtime.
from google.colab import drive
# Mount Google Drive at /content/drive; the later cells read from and write to paths under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Installing the required libraries

In [None]:
!pip install networkx node2vec gensim

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3070, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 5

### Loading the prepared knowledge graph

In [None]:
import networkx as nx

# Location of the serialized knowledge graph on the mounted Google Drive.
# This is the author's personal Drive path; adjust it for your own environment.
graph_path = '/content/drive/My Drive/ERP/knowledge_graph.graphml'

# Parse the GraphML file into a NetworkX graph.
G = nx.read_graphml(graph_path)


In [None]:
# Extract the subgraph induced by ingredient nodes only.
# `.get` is used so that a node without a 'node_type' attribute is simply
# skipped instead of aborting the cell with a KeyError.
ingredient_nodes = [
    n
    for n, attr in G.nodes(data=True)
    if attr.get('node_type') == 'ingredient'
]
# `.copy()` materializes the subgraph as its own graph object rather than a view of G.
G_ingredients = G.subgraph(ingredient_nodes).copy()

### Generating Graph Embeddings

In [None]:
from node2vec import Node2Vec

# Set up Node2Vec on the ingredient-only graph: 64-dimensional embeddings,
# 200 walks of length 30 per node, 8 workers, walks weighted by the 'weight' edge attribute.
# NOTE(review): no random seed is passed, so results presumably vary between runs — confirm.
node2vec = Node2Vec(
    G_ingredients,
    dimensions=64,
    walk_length=30,
    num_walks=200,
    workers=8,
    weight_key='weight',
)

# Fit the embedding model on the generated walks.
model = node2vec.fit(
    window=10,
    min_count=1,
    batch_words=4,
)


Computing transition probabilities:   0%|          | 0/6651 [00:00<?, ?it/s]

### Evaluating the model

In [None]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import numpy as np

# CSV of ingredient substitution pairs on the mounted Google Drive.
# This is the author's personal Drive path; adjust it for your own environment.
subs_csv_path = '/content/drive/My Drive/ERP/Recipe1MSubs_full.csv'

# Read the substitution pairs into a DataFrame.
substitution_pairs_df = pd.read_csv(subs_csv_path)

def generate_predictions(validation_pairs, model, topn=10):
    """Return the most similar ingredients for the query of every validation row.

    Args:
        validation_pairs: DataFrame with an ``ingredient1`` column holding the
            query ingredient of each pair.
        model: Embedding model exposing ``model.wv``, which must support
            membership tests (``key in model.wv``) and
            ``most_similar(key, topn=...)`` returning ``(name, score)`` pairs.
        topn: Number of candidates requested per query. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A list with one entry per row, in row order. Each entry is a new list
        of candidate names in the order returned by ``most_similar``, or an
        empty list when the query ingredient is not present in ``model.wv``.
    """
    keyed_vectors = model.wv
    # Many rows share the same query ingredient; remember each lookup so the
    # similarity search runs once per distinct ingredient.
    cache = {}
    predictions = []
    # Only one column is needed, so iterate it directly instead of paying for
    # DataFrame.iterrows(), which builds a Series for every row.
    for ingredient1 in validation_pairs['ingredient1']:
        if ingredient1 not in cache:
            if ingredient1 in keyed_vectors:
                neighbours = keyed_vectors.most_similar(ingredient1, topn=topn)
                cache[ingredient1] = [name for name, _ in neighbours]
            else:
                cache[ingredient1] = []
        # Hand out a copy so callers mutating one row's list cannot affect others.
        predictions.append(list(cache[ingredient1]))
    return predictions

# Ground-truth targets for evaluation: the 'ingredient2' column as a plain list, one entry per row
val_ground_truths = substitution_pairs_df['ingredient2'].tolist()

# Worker function handed to the process pool so batches can be processed in parallel
def batch_generate_predictions(batch):
    """Generate predictions for one batch of rows using the module-level ``model``.

    Args:
        batch: Slice of the substitution-pairs DataFrame to process.

    Returns:
        Whatever ``generate_predictions`` returns for ``batch``.
    """
    return generate_predictions(validation_pairs=batch, model=model)

# Split validation pairs into batches
num_batches = 8  # Adjust based on your CPU cores

# Split row positions rather than the DataFrame itself: handing a DataFrame to
# np.array_split emits a pandas FutureWarning (it goes through the deprecated
# DataFrame.swapaxes). Splitting positions gives the same contiguous,
# near-equal chunks in the same order.
row_chunks = np.array_split(np.arange(len(substitution_pairs_df)), num_batches)
batches = [substitution_pairs_df.iloc[rows] for rows in row_chunks]

# One worker per batch. executor.map returns results in input order, so the
# flattened predictions stay aligned with the rows of substitution_pairs_df.
with ProcessPoolExecutor(max_workers=num_batches) as executor:
    results = list(executor.map(batch_generate_predictions, batches))

# Flatten the per-batch lists into a single list of per-row predictions
val_predictions = [item for sublist in results for item in sublist]


  return bound(*args, **kwds)


In [None]:
def calculate_metrics(predictions, ground_truths):
    """Compute MRR, Hit@1, Hit@3 and Hit@10 for ranked predictions.

    Args:
        predictions: Iterable of ranked candidate lists, one per query.
        ground_truths: Sized sequence holding the expected answer for each
            query, aligned with ``predictions``.

    Returns:
        Tuple ``(mrr, hit_1, hit_3, hit_10)``. Each value is averaged over
        ``len(ground_truths)``; a query whose ground truth does not appear
        among its candidates contributes 0 to every metric. When
        ``ground_truths`` is empty, all four values are 0.0.
    """
    total = len(ground_truths)
    if total == 0:
        # Nothing to evaluate; return zeros instead of dividing by zero.
        return 0.0, 0.0, 0.0, 0.0

    mrr = 0.0
    hit_1 = 0.0
    hit_3 = 0.0
    hit_10 = 0.0
    for pred, gt in zip(predictions, ground_truths):
        # 1-based rank of the first candidate equal to the ground truth, if any.
        rank = next(
            (position for position, candidate in enumerate(pred, start=1) if gt == candidate),
            None,
        )
        if rank is None:
            continue
        mrr += 1.0 / rank
        if rank == 1:
            hit_1 += 1.0
        if rank <= 3:
            hit_3 += 1.0
        if rank <= 10:
            hit_10 += 1.0

    return mrr / total, hit_1 / total, hit_3 / total, hit_10 / total

# Score the Node2Vec predictions against the ground-truth list
mrr, hit_1, hit_3, hit_10 = calculate_metrics(
    predictions=val_predictions,
    ground_truths=val_ground_truths,
)

# Report all four metrics to four decimal places
print(f"Node2Vec: MRR: {mrr:.4f}, Hit@1: {hit_1:.4f}, Hit@3: {hit_3:.4f}, Hit@10: {hit_10:.4f}")

Node2Vec: MRR: 0.0133, Hit@1: 0.0069, Hit@3: 0.0155, Hit@10: 0.0334


In [None]:
# Save the trained model to Google Drive so it can be reloaded later without retraining
model.save('/content/drive/My Drive/ERP/node2vec_model')

In [None]:
from gensim.models import Word2Vec

In [None]:
# Reload the saved model from Google Drive; this rebinds `model`, replacing any in-memory instance
model = Word2Vec.load('/content/drive/My Drive/ERP/node2vec_model')
