<a href="https://colab.research.google.com/github/JITHIN-ANTONY-JOSEPH/ERP_11358080/blob/main/4_Statistical_Baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mount Google Drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored on the
# Drive can be read through ordinary file paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Baselines for Traditional Methods

In [None]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Dataset locations on the mounted Google Drive.
# These point at the author's personal Drive layout -- adjust them as needed.
SUBSTITUTION_PAIRS_CSV = '/content/drive/My Drive/ERP/Recipe1MSubs_full.csv'
FLAVORGRAPH_NODES_CSV = '/content/drive/My Drive/ERP/Dataset/nodes_191120.csv'

# Load the substitution pairs DataFrame and the FlavorGraph nodes CSV
substitution_pairs_df = pd.read_csv(SUBSTITUTION_PAIRS_CSV)
flavorgraph_df = pd.read_csv(FLAVORGRAPH_NODES_CSV)

# Drop duplicate rows
substitution_pairs_df = substitution_pairs_df.drop_duplicates()

# Remove entries where ingredient1 == ingredient2
substitution_pairs_df = substitution_pairs_df[substitution_pairs_df['ingredient1'] != substitution_pairs_df['ingredient2']]

# Split the data into training and validation sets (80/20, fixed seed for a repeatable split)
train_df, val_df = train_test_split(substitution_pairs_df, test_size=0.2, random_state=42)

# Train the frequency model: count how often each (ingredient1, ingredient2)
# pair occurs in the training split. Iterating the two columns directly avoids
# DataFrame.iterrows(), which builds a full Series object for every row.
substitution_freq = defaultdict(int)
for pair in zip(train_df['ingredient1'], train_df['ingredient2']):
    substitution_freq[pair] += 1

# Train the mode model: the most frequent ingredient2 value in the training split
# (mode() returns values in sorted order, so ties resolve to the first of them)
mode_ingredient = train_df['ingredient2'].mode()[0]

# Function to calculate MRR, Hit@1, Hit@3, and Hit@10
def calculate_metrics(predictions, ground_truths):
    """Compute MRR, Hit@1, Hit@3 and Hit@10 for ranked predictions.

    Args:
        predictions: Sequence of ranked candidate lists, one per example,
            best candidate first. Each entry must support ``.index``.
        ground_truths: Sequence of the correct items, aligned position by
            position with ``predictions``.

    Returns:
        Tuple ``(mrr, hit_1, hit_3, hit_10)`` of floats, each averaged over
        ``len(ground_truths)``. An example whose ground truth is missing from
        its candidate list contributes 0 to every metric. When
        ``ground_truths`` is empty, all four metrics are ``0.0``.

    Note:
        The two sequences are paired with ``zip``, so surplus entries in the
        longer one are ignored while the averages are still taken over
        ``len(ground_truths)``.
    """
    total = len(ground_truths)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0, 0.0, 0.0, 0.0

    reciprocal_rank_sum = 0.0
    hits_at_1 = 0
    hits_at_3 = 0
    hits_at_10 = 0

    for candidates, target in zip(predictions, ground_truths):
        try:
            # Single scan: .index raises ValueError when target is absent.
            rank = candidates.index(target) + 1
        except ValueError:
            continue
        reciprocal_rank_sum += 1.0 / rank
        if rank == 1:
            hits_at_1 += 1
        if rank <= 3:
            hits_at_3 += 1
        if rank <= 10:
            hits_at_10 += 1

    return (
        reciprocal_rank_sum / total,
        hits_at_1 / total,
        hits_at_3 / total,
        hits_at_10 / total,
    )

# Ground-truth substitutes for every validation pair
val_ground_truths = val_df['ingredient2'].tolist()

# Extract unique ingredients where node_type is "ingredient"
flavorgraph_ingredients = set(flavorgraph_df[flavorgraph_df['node_type'] == 'ingredient']['name'].dropna().unique())

# Generate predictions for the Random model.
# The candidate pool is materialised once (instead of once per validation row)
# and sorted, because the iteration order of a set of strings can differ between
# interpreter runs. Together with the dedicated seeded generator this makes the
# random baseline repeatable without touching the global `random` state.
random_candidates = sorted(flavorgraph_ingredients, key=str)
random_rng = random.Random(42)
val_predictions_random = [[random_rng.choice(random_candidates)] for _ in val_ground_truths]

# Generate predictions for the Mode model.
# NOTE: both baselines emit a single candidate per example, so Hit@1, Hit@3,
# Hit@10 and MRR necessarily coincide for them.
val_predictions_mode = [[mode_ingredient] for _ in val_ground_truths]


# Score every baseline against the same validation ground truths
metrics = {
    baseline: calculate_metrics(baseline_predictions, val_ground_truths)
    for baseline, baseline_predictions in (
        ("Random", val_predictions_random),
        ("Mode", val_predictions_mode),
    )
}

# Report one line per baseline, four decimals per metric
for model_name, (mrr, hit_1, hit_3, hit_10) in metrics.items():
    print(f"{model_name}: MRR: {mrr:.4f}, Hit@1: {hit_1:.4f}, Hit@3: {hit_3:.4f}, Hit@10: {hit_10:.4f}")


Random: MRR: 0.0005, Hit@1: 0.0005, Hit@3: 0.0005, Hit@10: 0.0005
Mode: MRR: 0.0235, Hit@1: 0.0235, Hit@3: 0.0235, Hit@10: 0.0235
