<a href="https://colab.research.google.com/github/MuMind93/4439_COMP_SCI_7318_Assignment1_a1919980/blob/main/new_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Libraries

In [2]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.1+cu117.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.1+cu117.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.0.1+cu117.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_scatter-2.1.2%2Bpt20cu117-cp310-cp310-linux_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt20cu117
Looking in links: https://data.pyg.org/whl/torch-2.0.1+cu117.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_sparse-0.6.18%2Bpt20cu117-cp310-cp310-linux_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt20cu117
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━

In [32]:
import torch
import torch.nn as nn
from torch_geometric.nn import HeteroConv, SAGEConv
from torch_geometric.data import HeteroData
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## 2. Load and Merge Data

In [4]:
# Load datasets
aisles = pd.read_csv("aisles.csv")
departments = pd.read_csv("departments.csv")
order_products_prior = pd.read_csv("order_products__train.csv")
orders = pd.read_csv("orders.csv")
products = pd.read_csv("products.csv")

# Merge orders with order_products_prior
merged_data = orders.merge(order_products_prior, on="order_id", how="inner")

# Merge with products to include product details
merged_data = merged_data.merge(products, on="product_id", how="inner")

# Display merged data for verification
print("Merged Data:")
print(merged_data.head())

Merged Data:
   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   1187899        1    train            11          4                  8   
1   1187899        1    train            11          4                  8   
2   1187899        1    train            11          4                  8   
3   1187899        1    train            11          4                  8   
4   1187899        1    train            11          4                  8   

   days_since_prior_order  product_id  add_to_cart_order  reordered  \
0                    14.0         196                  1          1   
1                    14.0       25133                  2          1   
2                    14.0       38928                  3          1   
3                    14.0       26405                  4          1   
4                    14.0       39657                  5          1   

                       product_name  aisle_id  department_id  
0                              Sod

## 3. Prepare User Features

In [5]:
# Map user_id and product_id to unique indices
unique_users = merged_data["user_id"].unique()
unique_products = merged_data["product_id"].unique()

user_mapping = {user_id: idx for idx, user_id in enumerate(unique_users)}
product_mapping = {product_id: idx for idx, product_id in enumerate(unique_products)}

# Map IDs in the merged data
merged_data["user_idx"] = merged_data["user_id"].map(user_mapping)
merged_data["product_idx"] = merged_data["product_id"].map(product_mapping)

# Create user features
user_features = pd.DataFrame({
    "user_id": unique_users,
    "total_orders": orders.groupby("user_id").size().reindex(unique_users).fillna(0).values,
})
user_features["avg_days_between_orders"] = orders.groupby("user_id")["days_since_prior_order"].mean().reindex(unique_users).fillna(0).values
user_features["weekend_order_ratio"] = orders.groupby("user_id")["order_dow"].apply(
    lambda x: (x >= 5).sum() / len(x)
).reindex(unique_users).fillna(0).values

# Convert to tensor
user_features_tensor = torch.tensor(
    user_features.drop(columns=["user_id"]).values, dtype=torch.float
)

## 4. Prepare Product Features

In [6]:
# Create product features
product_features = pd.DataFrame({
    "product_id": unique_products,
    "total_purchases": merged_data.groupby("product_id").size().values,
    "avg_cart_position": merged_data.groupby("product_id")["add_to_cart_order"].mean().values,
    "reorder_rate": merged_data.groupby("product_id")["reordered"].mean().values,
})

# Convert to tensor
product_features_tensor = torch.tensor(
    product_features.drop(columns=["product_id"]).values, dtype=torch.float
)

## 5. Prepare Edge Features

In [7]:
# Create edge features for user-product interactions
user_product_features = merged_data.groupby(["user_idx", "product_idx"]).agg({
    "add_to_cart_order": "mean",
    "days_since_prior_order": "mean"
}).reset_index()

# Prepare edge index
edge_index = torch.tensor(
    user_product_features[["user_idx", "product_idx"]].values.T, dtype=torch.long
)

# Prepare edge features
edge_features_tensor = torch.tensor(
    user_product_features[["add_to_cart_order", "days_since_prior_order"]].values,
    dtype=torch.float
)

## 6. Create Heterogeneous Data Object

In [27]:
# Create Heterogeneous Data object
data = HeteroData()
data['user'].x = user_features_tensor  # User node features
data['product'].x = product_features_tensor  # Product node features
data['user', 'bought', 'product'].edge_index = edge_index  # Edge index
data['user', 'bought', 'product'].edge_attr = edge_features_tensor  # Edge features
# Add reverse edges ('product', 'bought_by', 'user')
data['product', 'bought_by', 'user'].edge_index = edge_index[[1, 0]]  # Reverse the edge index

## 7. Split Data into Train, Validation, and Test Sets

In [28]:
# Convert edge index and edge attributes to NumPy arrays
edges = data['user', 'bought', 'product'].edge_index.numpy().T
edge_attrs = data['user', 'bought', 'product'].edge_attr.numpy()

# Split edges into training, validation, and test sets
train_edges, val_test_edges, train_attrs, val_test_attrs = train_test_split(
    edges, edge_attrs, test_size=0.3, random_state=42
)
val_edges, test_edges, val_attrs, test_attrs = train_test_split(
    val_test_edges, val_test_attrs, test_size=0.5, random_state=42
)

# Create separate data objects for train, validation, and test
train_data = HeteroData()
train_data['user'].x = data['user'].x
train_data['product'].x = data['product'].x
train_data['user', 'bought', 'product'].edge_index = torch.tensor(train_edges.T, dtype=torch.long)
train_data['user', 'bought', 'product'].edge_attr = torch.tensor(train_attrs, dtype=torch.float)

val_data = HeteroData()
val_data['user'].x = data['user'].x
val_data['product'].x = data['product'].x
val_data['user', 'bought', 'product'].edge_index = torch.tensor(val_edges.T, dtype=torch.long)
val_data['user', 'bought', 'product'].edge_attr = torch.tensor(val_attrs, dtype=torch.float)

test_data = HeteroData()
test_data['user'].x = data['user'].x
test_data['product'].x = data['product'].x
test_data['user', 'bought', 'product'].edge_index = torch.tensor(test_edges.T, dtype=torch.long)
test_data['user', 'bought', 'product'].edge_attr = torch.tensor(test_attrs, dtype=torch.float)

# Add reverse edges to train, validation, and test data
train_data['product', 'bought_by', 'user'].edge_index = train_data['user', 'bought', 'product'].edge_index[[1, 0]]
val_data['product', 'bought_by', 'user'].edge_index = val_data['user', 'bought', 'product'].edge_index[[1, 0]]
test_data['product', 'bought_by', 'user'].edge_index = test_data['user', 'bought', 'product'].edge_index[[1, 0]]

## 8. Define the Heterogeneous GNN

In [29]:
class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('user', 'bought', 'product'): SAGEConv((-1, -1), hidden_dim),
            ('product', 'bought_by', 'user'): SAGEConv((-1, -1), hidden_dim),
        }, aggr='mean')
        self.conv2 = HeteroConv({
            ('user', 'bought', 'product'): SAGEConv((-1, -1), output_dim),
            ('product', 'bought_by', 'user'): SAGEConv((-1, -1), output_dim),
        }, aggr='mean')

    # The forward function should be indented to be part of the HeteroGNN class
    def forward(self, x_dict, edge_index_dict):
        print("Input x_dict Keys:", x_dict.keys())
        print("Input Edge Index Dict Keys:", edge_index_dict.keys())

        x_dict = self.conv1(x_dict, edge_index_dict)
        # print("After conv1:", {key: x.shape for key, x in x_dict.items()})

        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        print("After conv2:", {key: x.shape for key, x in x_dict.items()})

        return x_dict

## 9. Initialize Model and Optimizer

In [30]:
hidden_dim = 64
output_dim = 32
model = HeteroGNN(hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

best_val_loss = float('inf')
patience = 5
patience_counter = 0

## 10. Train the Model

In [35]:
def evaluate_recommendations(data, model, k=5):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        user_emb = out['user']
        product_emb = out['product']
        precision_list, recall_list, f1_list = [], [], []
        for user_idx in range(user_emb.size(0)):
            scores = torch.matmul(user_emb[user_idx], product_emb.T)
            top_k_products = scores.topk(k).indices
            ground_truth = data['user', 'bought', 'product'].edge_index[1][
                data['user', 'bought', 'product'].edge_index[0] == user_idx
            ]
            top_k_set = set(top_k_products.tolist())
            ground_truth_set = set(ground_truth.tolist())
            intersection = top_k_set & ground_truth_set
            precision = len(intersection) / k
            recall = len(intersection) / len(ground_truth_set) if len(ground_truth_set) > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)
        avg_precision = sum(precision_list) / len(precision_list)
        avg_recall = sum(recall_list) / len(recall_list)
        avg_f1 = sum(f1_list) / len(f1_list)
        return avg_precision, avg_recall, avg_f1

for epoch in range(50):
    # Training Phase
    model.train()
    optimizer.zero_grad()
    out = model(train_data.x_dict, train_data.edge_index_dict)
    user_emb = out['user']
    product_emb = out['product']
    edge_emb = torch.cat([
      user_emb[train_data['user', 'bought', 'product'].edge_index[0]],
      product_emb[train_data['user', 'bought', 'product'].edge_index[1]]
    ], dim=1)

    # Apply a linear layer to reduce the dimensionality of edge_emb to match edge_attr
    linear_layer = torch.nn.Linear(edge_emb.shape[1], train_data['user', 'bought', 'product'].edge_attr.shape[1]) #added
    edge_emb = linear_layer(edge_emb) #added

    train_loss = F.mse_loss(edge_emb, train_data['user', 'bought', 'product'].edge_attr)
    train_loss.backward() # Fixed indentation: Aligned with train_loss calculation
    optimizer.step()

    # Validation Phase
    model.eval()
    with torch.no_grad():
        out_val = model(val_data.x_dict, val_data.edge_index_dict)
        user_emb_val = out_val['user']
        product_emb_val = out_val['product']
        edge_emb_val = torch.cat([
            user_emb_val[val_data['user', 'bought', 'product'].edge_index[0]],
            product_emb_val[val_data['user', 'bought', 'product'].edge_index[1]]
        ], dim=1)

        # Apply the same linear layer used in training to reduce dimensionality
        edge_emb_val = linear_layer(edge_emb_val)

        val_loss = F.mse_loss(edge_emb_val, val_data['user', 'bought', 'product'].edge_attr)

    # Evaluate Precision, Recall, and F1 on Validation Data
    precision, recall, f1 = evaluate_recommendations(val_data, model, k=5)

    # Early Stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        print(f"Epoch {epoch + 1}: New best validation loss: {val_loss.item():.4f}")
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch + 1}. Best validation loss: {best_val_loss:.4f}")
        break

    # Print losses and validation metrics
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss.item():.4f}, Validation Loss: {val_loss.item():.4f}")
    print(f"Epoch {epoch + 1}, Validation Precision@5: {precision:.4f}, Recall@5: {recall:.4f}, F1@5: {f1:.4f}")

Input x_dict Keys: dict_keys(['user', 'product'])
Input Edge Index Dict Keys: dict_keys([('user', 'bought', 'product'), ('product', 'bought_by', 'user')])
After conv2: {'product': torch.Size([39123, 32]), 'user': torch.Size([131209, 32])}
Input x_dict Keys: dict_keys(['user', 'product'])
Input Edge Index Dict Keys: dict_keys([('user', 'bought', 'product'), ('product', 'bought_by', 'user')])
After conv2: {'product': torch.Size([39123, 32]), 'user': torch.Size([131209, 32])}
Input x_dict Keys: dict_keys(['user', 'product'])
Input Edge Index Dict Keys: dict_keys([('user', 'bought', 'product'), ('product', 'bought_by', 'user')])
After conv2: {'product': torch.Size([39123, 32]), 'user': torch.Size([131209, 32])}
Epoch 1: New best validation loss: 414.2592
Epoch 1, Train Loss: 330.5005, Validation Loss: 414.2592
Epoch 1, Validation Precision@5: 0.0000, Recall@5: 0.0000, F1@5: 0.0000
Input x_dict Keys: dict_keys(['user', 'product'])
Input Edge Index Dict Keys: dict_keys([('user', 'bought', 'p

In [14]:
# Check edge index shape
print("Edge Index Shape:", train_data['user', 'bought', 'product'].edge_index.shape)

# Check edge attribute shape
print("Edge Attribute Shape:", train_data['user', 'bought', 'product'].edge_attr.shape)

# Optionally print some sample values to confirm the data
print("Sample Edge Index:", train_data['user', 'bought', 'product'].edge_index[:, :5])
print("Sample Edge Attributes:", train_data['user', 'bought', 'product'].edge_attr[:5])

Edge Index Shape: torch.Size([2, 969231])
Edge Attribute Shape: torch.Size([969231, 2])
Sample Edge Index: tensor([[ 45661,  46473,  18420, 102887,  98225],
        [   123,   1243,   1372,   9530,    470]])
Sample Edge Attributes: tensor([[ 7.,  8.],
        [ 5., 30.],
        [26., 30.],
        [ 1., 20.],
        [ 7.,  5.]])


## 11. Evaluate the Model

In [None]:
def evaluate_recommendations(data, model, k=5):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        user_emb = out['user']
        product_emb = out['product']
        precision_list, recall_list, f1_list = [], [], []
        for user_idx in range(user_emb.size(0)):
            scores = torch.matmul(user_emb[user_idx], product_emb.T)
            top_k_products = scores.topk(k).indices
            ground_truth = data['user', 'bought', 'product'].edge_index[1][
                data['user', 'bought', 'product'].edge_index[0] == user_idx
            ]
            top_k_set = set(top_k_products.tolist())
            ground_truth_set = set(ground_truth.tolist())
            intersection = top_k_set & ground_truth_set
            precision = len(intersection) / k
            recall = len(intersection) / len(ground_truth_set) if len(ground_truth_set) > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)
        avg_precision = sum(precision_list) / len(precision_list)
        avg_recall = sum(recall_list) / len(recall_list)
        avg_f1 = sum(f1_list) / len(f1_list)
        print(f"Precision@{k}: {avg_precision:.4f}, Recall@{k}: {avg_recall:.4f}, F1@{k}: {avg_f1:.4f}")
        return avg_precision, avg_recall, avg_f1

# Evaluate on test data
precision, recall, f1 = evaluate_recommendations(test_data, model, k=5)