<a href="https://colab.research.google.com/github/GinuraAdikari/InsightHive/blob/Recommendation_Engine/PyG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns


import os

In [None]:
# Load the events data
event_df=pd.read_csv('drive/MyDrive/Colab Notebooks/events.csv')
event_df

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,
2756097,1438399813142,762376,view,115946,
2756098,1438397820527,1251746,view,78144,
2756099,1438398530703,1184451,view,283392,


In [None]:
# Load the category tree data
category_tree=pd.read_csv('drive/MyDrive/Colab Notebooks/category_tree.csv')

category_tree.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [None]:
# load item-related datasets
item1=pd.read_csv('drive/MyDrive/Colab Notebooks/item_properties_part1.csv')
item2=pd.read_csv('drive/MyDrive/Colab Notebooks/item_properties_part2.csv')

# Combine the item properties dataframes
item_properties = pd.concat([item1, item2])
item_properties.head(10)

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
5,1436065200000,285026,available,0
6,1434250800000,89534,213,1121373
7,1431831600000,264312,6,319724
8,1433646000000,229370,202,1330310
9,1434250800000,98113,451,1141052 n48.000


In [None]:
# Filter for items with at least 'categoryid' OR 'availability'
filtered_items = item_properties[item_properties['property'].isin(['categoryid', 'available'])]

filtered_items.shape

(2291853, 4)

In [None]:
duplicates = filtered_items[filtered_items.duplicated(subset=['itemid', 'property', 'value'], keep=False)]
print(duplicates)
print(f"Number of duplicate rows: {len(duplicates)}")

             timestamp  itemid    property value
5        1436065200000  285026   available     0
15       1437274800000  186518   available     0
82       1434250800000  316253   available     1
96       1437274800000  430459   available     0
101      1433041200000  411262   available     0
...                ...     ...         ...   ...
9275856  1438484400000  225378   available     1
9275873  1435460400000  444741  categoryid   511
9275879  1435460400000   97513  categoryid  1385
9275892  1439089200000  200211   available     0
9275897  1436065200000   74745   available     0

[1534714 rows x 4 columns]
Number of duplicate rows: 1534714


In [None]:
# Convert 'value' column in filtered_items to int64 using .loc
filtered_items.loc[:, 'value'] = pd.to_numeric(filtered_items['value'], errors='coerce').astype('Int64')

# Now perform the merge
items_cats = pd.merge(filtered_items, category_tree, how='left', left_on='value', right_on='categoryid')
items_cats

Unnamed: 0,timestamp,itemid,property,value,categoryid,parentid
0,1435460400000,460429,categoryid,1338,1338.0,1278.0
1,1436065200000,285026,available,0,0.0,605.0
2,1437274800000,186518,available,0,0.0,605.0
3,1433646000000,423682,available,0,0.0,605.0
4,1434250800000,316253,available,1,1.0,854.0
...,...,...,...,...,...,...
2291848,1435460400000,444741,categoryid,511,511.0,724.0
2291849,1435460400000,97513,categoryid,1385,1385.0,1202.0
2291850,1433041200000,356167,available,0,0.0,605.0
2291851,1439089200000,200211,available,0,0.0,605.0


In [None]:
items_cats['categoryid'] = items_cats['categoryid'].fillna('Unknown')
items_cats['parentid'] = items_cats['parentid'].fillna('Unknown')

In [None]:
# Sort by itemid and timestamp
items_cats_sorted = items_cats.sort_values(by=['itemid', 'timestamp'])

# Group by itemid
grouped = items_cats_sorted.groupby('itemid')

# Initialize an empty list to store the results
result_data = []

# Iterate through each group
for itemid, group in grouped:
    # Filter for 'available' property
    available_rows = group[group['property'] == 'available']

    # Iterate through available rows
    for i in range(len(available_rows) - 1):
        # Get current and next row
        current_row = available_rows.iloc[i]
        next_row = available_rows.iloc[i + 1]

        # Create a dictionary to store the new data
        new_row = current_row.to_dict()  # Start with all columns from current_row

        # Add new features
        new_row['from'] = current_row['timestamp']
        new_row['to'] = next_row['timestamp']

        # Determine 'availability' value
        if current_row['value'] == 0 and next_row['value'] == 1:
            new_row['availability'] = 0
        elif current_row['value'] == 1 and next_row['value'] == 0:
            new_row['availability'] = 1

        # Append to the result list
        result_data.append(new_row)

# Create a DataFrame from the result list
result_df = pd.DataFrame(result_data)


In [None]:
import pandas as pd

# Sort by itemid, from, and to
result_df_sorted = result_df.sort_values(by=['itemid', 'from', 'to'])

# Group by itemid and availability
grouped = result_df_sorted.groupby(['itemid', 'availability'])

# Initialize an empty list to store the processed rows
processed_rows = []

# Iterate through each group
for (itemid, availability), group in grouped:
    # If there are multiple rows with the same availability
    if len(group) > 1:
        # Update the 'to' value of the first row with the 'to' value of the last row
        first_row = group.iloc[0].copy()  # Create a copy to avoid modifying the original DataFrame
        first_row['to'] = group.iloc[-1]['to']
        processed_rows.append(first_row)
    else:
        # If only one row, keep it as is
        processed_rows.append(group.iloc[0])

# Create a new DataFrame from the processed rows
final_df = pd.DataFrame(processed_rows)

# Display the final DataFrame
final_df

In [None]:
# 1. Extract itemid, categoryid, and parentid from items_cats where property is 'categoryid'
item_category_mapping = items_cats[items_cats['property'] == 'categoryid'][['itemid', 'categoryid', 'parentid']]

# 2. Convert itemid to string for matching
item_category_mapping['itemid'] = item_category_mapping['itemid'].astype(str)
final_df['itemid'] = final_df['itemid'].astype(str)

# 3. Replace categoryid and parentid in final_df using the mapping
final_df = pd.merge(final_df, item_category_mapping[['itemid', 'categoryid', 'parentid']], on='itemid', how='left', suffixes=('', '_new'))
final_df['categoryid'] = final_df['categoryid_new']  # Directly replace categoryid
final_df['parentid'] = final_df['parentid_new']  # Directly replace parentid
final_df = final_df.drop(columns=['categoryid_new', 'parentid_new'])  # Drop temporary columns

# Display the updated final_df
final_df

In [None]:
item_final = final_df.drop(columns=['property','value','timestamp'])
item_final

In [None]:
event_df['is_transaction'] = event_df['transactionid'].notnull().astype(int)

In [None]:
event_df.drop(columns=['transactionid'], inplace=True)

In [None]:
item_final['itemid'] = item_final['itemid'].astype(str)
item_final['categoryid'] = item_final['categoryid'].astype(str)
item_final['parentid'] = item_final['parentid'].astype(str)
item_final['categoryid'] = item_final['categoryid'].astype(str)

event_df['itemid'] = event_df['itemid'].astype(str)
event_df['visitorid'] = event_df['visitorid'].astype(str)


In [None]:
# Convert timestamp columns to datetime for comparison and merging
event_df['timestamp'] = pd.to_datetime(event_df['timestamp'])
item_final['from'] = pd.to_datetime(item_final['from'])
item_final['to'] = pd.to_datetime(item_final['to'])

# Rename columns to avoid conflicts after merging
item_final = item_final.rename(columns={'categoryid': 'item_categoryid', 'parentid': 'item_parentid'})

# Merge event_df and item_final based on 'itemid'
merged_df = pd.merge(event_df, item_final, on='itemid', how='left')

# Filter the merged DataFrame based on timestamp range
filtered_df = merged_df[
    (merged_df['from'] <= merged_df['timestamp']) & (merged_df['timestamp'] <= merged_df['to'])
]

# Select relevant columns and rename them back to original names
event_df = filtered_df[['visitorid', 'timestamp', 'itemid', 'event', 'is_transaction',
                         'item_categoryid', 'item_parentid', 'availability']]
event_df = event_df.rename(columns={'item_categoryid': 'categoryid', 'item_parentid': 'parentid'})

# Display the updated event_df
print(event_df)

        visitorid                     timestamp  itemid event  is_transaction  \
98        1076270 1970-01-01 00:23:53.222147345  262799  view               0   
99        1076270 1970-01-01 00:23:53.222147345  262799  view               0   
143        692195 1970-01-01 00:23:53.223138106  112792  view               0   
146        392042 1970-01-01 00:23:53.223790254   16813  view               0   
147        392042 1970-01-01 00:23:53.223790254   16813  view               0   
...           ...                           ...     ...   ...             ...   
6108754    639019 1970-01-01 00:23:58.400010258   47138  view               0   
6108764    611270 1970-01-01 00:23:58.397455397  432925  view               0   
6108765    611270 1970-01-01 00:23:58.397455397  432925  view               0   
6108774    701750 1970-01-01 00:23:58.399289446  296172  view               0   
6108775    701750 1970-01-01 00:23:58.399289446  296172  view               0   

        categoryid parentid

In [None]:
# Define a custom encoding for the event column
event_weight = {"view": 1, "addtocart": 2, "transaction": 3}  # Assign weights
event_df["event_encoded"] = event_df["event"].map(event_weight)

# Compute extra visitor-level features directly in event_df before splitting
visitor_features = event_df.groupby("visitorid").agg(
    total_interactions=("event", "count"),
    transaction_frequency=("is_transaction", "sum"),
    distinct_items=("itemid", "nunique")
).reset_index()

# Merge back into the original dataset
event_df = event_df.merge(visitor_features, on="visitorid", how="left")

# Ensure there are no NaN values (should not happen, but just in case)
#event_df.fillna(0, inplace=True)


event_df.head()

Unnamed: 0,visitorid,timestamp,itemid,event,is_transaction,categoryid,parentid,availability,event_encoded,total_interactions,transaction_frequency,distinct_items
0,1076270,1970-01-01 00:23:53.222147345,262799,view,0,324.0,1145.0,0.0,1,12,2,1
1,1076270,1970-01-01 00:23:53.222147345,262799,view,0,324.0,1145.0,1.0,1,12,2,1
2,692195,1970-01-01 00:23:53.223138106,112792,view,0,958.0,1041.0,0.0,1,1,0,1
3,392042,1970-01-01 00:23:53.223790254,16813,view,0,1173.0,805.0,0.0,1,8,0,2
4,392042,1970-01-01 00:23:53.223790254,16813,view,0,1173.0,805.0,1.0,1,8,0,2


In [None]:
# Convert visitorid and itemid to string
event_df['visitorid'] = event_df['visitorid'].astype(str)
event_df['itemid'] = event_df['itemid'].astype(str)

# Convert categoryid and parentid to integers (removing decimals) unless they are 'Unknown'
event_df['categoryid'] = event_df['categoryid'].apply(lambda x: str(int(float(x))) if x != 'Unknown' else x)
event_df['parentid'] = event_df['parentid'].apply(lambda x: str(int(float(x))) if x != 'Unknown' else x)

# Convert availability to integer (ensuring 0 and 1)
event_df['availability'] = event_df['availability'].astype(int)

# Verify dtypes
print(event_df.dtypes)

visitorid                        object
timestamp                datetime64[ns]
itemid                           object
event                            object
is_transaction                    int64
categoryid                       object
parentid                         object
availability                      int64
event_encoded                     int64
total_interactions                int64
transaction_frequency             int64
distinct_items                    int64
dtype: object


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

event_df[['total_interactions', 'transaction_frequency', 'distinct_items']] = scaler.fit_transform(
    event_df[['total_interactions', 'transaction_frequency', 'distinct_items']]
)


In [None]:
filtered_event_df = event_df.copy()

In [None]:
# Randomly shuffle the data
filtered_event_df = filtered_event_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Get all unique visitorids and itemids
all_users = set(filtered_event_df['visitorid'].unique())
all_items = set(filtered_event_df['itemid'].unique())

# Split the data into train/test based on the number of unique users and items
split_fraction = 0.8
split_index = int(len(filtered_event_df) * split_fraction)
train_data = filtered_event_df[:split_index]
test_data = filtered_event_df[split_index:]

# Identify seen users and items in the train data
seen_users = set(train_data['visitorid'].unique())
seen_items = set(train_data['itemid'].unique())

# Filter test data to include only users and items seen in the training data
test_data = test_data[test_data['visitorid'].isin(seen_users) & test_data['itemid'].isin(seen_items)]

# Ensure the test set does not have any new users/items
print("Number of unique visitors in train:", len(seen_users))
print("Number of unique visitors in test:", len(test_data['visitorid'].unique()))
print("Number of unique items in train:", len(seen_items))
print("Number of unique items in test:", len(test_data['itemid'].unique()))

Number of unique visitors in train: 190956
Number of unique visitors in test: 64200
Number of unique items in train: 30429
Number of unique items in test: 16384


In [None]:
# Ensure there are no unseen visitors or items in the test set
unseen_visitors = set(test_data['visitorid'].unique()) - seen_users
unseen_items = set(test_data['itemid'].unique()) - seen_items

# Print the results
if unseen_visitors:
    print(f"Unseen visitors in test data: {unseen_visitors}")
else:
    print("No unseen visitors in test data.")

if unseen_items:
    print(f"Unseen items in test data: {unseen_items}")
else:
    print("No unseen items in test data.")


No unseen visitors in test data.
No unseen items in test data.


In [None]:
train_data.shape

(980286, 12)

In [None]:
test_data.shape

(221316, 12)

In [None]:
pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
import torch
from torch_geometric.data import HeteroData
import pandas as pd

# Create HeteroData object
data = HeteroData()

# --- Encode Identifiers as Integer Indices using train_data---
# Create an explicit copy to avoid potential chaining issues
train_data_encoded = train_data.copy()

# Factorize using .loc to avoid SettingWithCopyWarning on the new copy
train_data_encoded.loc[:, 'visitorid_encoded'], visitor_mapping = pd.factorize(train_data_encoded['visitorid'])
train_data_encoded.loc[:, 'itemid_encoded'], item_mapping = pd.factorize(train_data_encoded['itemid'])
train_data_encoded.loc[:, 'categoryid_encoded'], category_mapping = pd.factorize(train_data_encoded['categoryid'])
train_data_encoded.loc[:, 'parentid_encoded'], parent_mapping = pd.factorize(train_data_encoded['parentid'])

# --- Add Visitor Nodes ---
visitor_data = train_data_encoded.groupby('visitorid_encoded').agg(
    total_interactions=('visitorid_encoded', 'size'),
    transaction_frequency=('is_transaction', 'sum'),
    distinct_items=('itemid_encoded', 'nunique')
).reset_index()

# Adding Visitor Node Features
data['visitor'].x = torch.tensor(visitor_data[['total_interactions', 'transaction_frequency', 'distinct_items']].values, dtype=torch.float)

# --- Add Item Nodes ---
item_data = train_data_encoded[['itemid_encoded', 'availability', 'categoryid_encoded', 'parentid_encoded']].drop_duplicates(subset=['itemid_encoded'])

# Adding Item Node Features
data['item'].x = torch.tensor(item_data[['availability']].values, dtype=torch.float)

# --- Add Category Nodes ---
category_data = train_data_encoded[['categoryid_encoded', 'parentid_encoded']].drop_duplicates(subset=['categoryid_encoded'])

# Adding Category Node Features
data['category'].x = torch.tensor(category_data[['parentid_encoded']].values, dtype=torch.float)

In [None]:
# --- Add Edges ---

# 1. Visitor → Item (interaction edges)
train_data_encoded.loc[:, 'event_encoded'] = pd.to_numeric(train_data_encoded['event_encoded'], errors='coerce').fillna(0).astype(float)
train_data_encoded.loc[:, 'is_transaction'] = pd.to_numeric(train_data_encoded['is_transaction'], errors='coerce').fillna(0).astype(float)
train_data_encoded.loc[:, 'timestamp'] = pd.to_numeric(train_data_encoded['timestamp'], errors='coerce').fillna(0).astype(float)

visitor_item_edge_index = torch.tensor(
    train_data_encoded[['visitorid_encoded', 'itemid_encoded']].values,
    dtype=torch.long
).t().contiguous()

visitor_item_edge_attr = torch.tensor(
    train_data_encoded[['event_encoded', 'is_transaction', 'timestamp']].values,
    dtype=torch.float
)

data['visitor', 'interaction', 'item'].edge_index = visitor_item_edge_index
data['visitor', 'interaction', 'item'].edge_attr = visitor_item_edge_attr

# 2. Item → Category (item-category edges)
item_category_edge_index = torch.tensor(
    item_data[['itemid_encoded', 'categoryid_encoded']].dropna().astype(int).values,
    dtype=torch.long
).t().contiguous()

data['item', 'belongs_to', 'category'].edge_index = item_category_edge_index

# 3. Category → Parent Category (category hierarchy)
category_data['categoryid_encoded'] = pd.to_numeric(category_data['categoryid_encoded'], errors='coerce').fillna(0).astype(int)
category_data['parentid_encoded'] = pd.to_numeric(category_data['parentid_encoded'], errors='coerce').fillna(0).astype(int)

category_parent_edge_index = torch.tensor(
    category_data[['categoryid_encoded', 'parentid_encoded']].dropna().astype(int).values,
    dtype=torch.long
).t().contiguous()

data['category', 'sub_category_of', 'category'].edge_index = category_parent_edge_index

# 4. Visitor → Category (visitor-category edges)
visitor_category_edge_index = torch.tensor(
    train_data_encoded[['visitorid_encoded', 'categoryid_encoded']].dropna().astype(int).values,
    dtype=torch.long
).t().contiguous()

visitor_category_edge_attr = torch.tensor(
    train_data_encoded[['event_encoded', 'is_transaction', 'timestamp']].values,
    dtype=torch.float
)

data['visitor', 'visitor-category', 'category'].edge_index = visitor_category_edge_index
data['visitor', 'visitor-category', 'category'].edge_attr = visitor_category_edge_attr

# 5. Item → Parent Category (item-parent category edges)
item_parent_edge_index = torch.tensor(
    item_data[['itemid_encoded', 'parentid_encoded']].dropna().astype(int).values,
    dtype=torch.long
).t().contiguous()

data['item', 'belongs_to_parent', 'category'].edge_index = item_parent_edge_index


 1.43388115e+12 1.43606983e+12]' has dtype incompatible with datetime64[ns], please explicitly cast to a compatible dtype first.
  train_data_encoded.loc[:, 'timestamp'] = pd.to_numeric(train_data_encoded['timestamp'], errors='coerce').fillna(0).astype(float)


In [None]:
import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.nn import HGTConv
from torch_geometric.nn import Linear as GNNLinear
from torch_geometric.data import HeteroData
from torch_geometric.loader import NeighborLoader
import random

# Define the HGTModel
class HGTModel(torch.nn.Module):
    def __init__(self, hidden_channels, num_classes, metadata):
        super().__init__()

        self.conv1 = HGTConv(
            in_channels=hidden_channels,
            out_channels=hidden_channels,
            metadata=metadata,
            heads=2
        )
        self.conv2 = HGTConv(
            in_channels=hidden_channels,
            out_channels=hidden_channels,
            metadata=metadata,
            heads=2
        )

        # Linear layer for classification
        self.lin = GNNLinear(hidden_channels, num_classes)

    def forward(self, data):
        x_dict, edge_index_dict = data.x_dict, data.edge_index_dict
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}

        # We predict for 'visitor' nodes
        return self.lin(x_dict['visitor'])  # Example: Classification on 'visitor' nodes


In [None]:
!pip install torch-sparse
!pip install torch-scatter
!pip install torch-cluster
!pip install torch-spline-conv
!pip install pyg-lib
!pip install torch-geometric

Collecting torch-sparse
  Downloading torch_sparse-0.6.18.tar.gz (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.0/210.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-sparse
  Building wheel for torch-sparse (setup.py) ... [?25l[?25hdone
  Created wheel for torch-sparse: filename=torch_sparse-0.6.18-cp311-cp311-linux_x86_64.whl size=1122943 sha256=013810be987ca2ceecc47ad2e43bf24fc5bdc3f61be4238d8f9e7784e0b68276
  Stored in directory: /root/.cache/pip/wheels/75/e2/1e/299c596063839303657c211f587f05591891cc6cf126d94d21
Successfully built torch-sparse
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18
Collecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata

In [None]:
!pip install git+https://github.com/pyg-team/pyg-lib.git

Collecting git+https://github.com/pyg-team/pyg-lib.git
  Cloning https://github.com/pyg-team/pyg-lib.git to /tmp/pip-req-build-2kl7m3px
  Running command git clone --filter=blob:none --quiet https://github.com/pyg-team/pyg-lib.git /tmp/pip-req-build-2kl7m3px
  Resolved https://github.com/pyg-team/pyg-lib.git to commit a2c6b7fa386ae64a50e81b38b64a8461f8a3a4db
  Running command git submodule update --init --recursive -q
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyg_lib
  Building wheel for pyg_lib (setup.py) ... [?25l[?25hdone
  Created wheel for pyg_lib: filename=pyg_lib-0.4.0-cp311-cp311-linux_x86_64.whl size=1555123 sha256=fa0958b81ebd00dc45f0e1757b6720f10ef0b21c87621a0370f9d46cf8d29ac8
  Stored in directory: /tmp/pip-ephem-wheel-cache-yy5l4ojw/wheels/6e/86/cc/7b01a1bebb7ed0c9e95b8b7d590e91c052363b9f1ebf446298
Successfully built pyg_lib
Installing collected packages: pyg_lib
Successfully installed pyg_lib-0.4.0


In [None]:
import torch_geometric, torch_sparse, torch_scatter
print(torch_geometric.__version__)
print(torch_sparse.__version__)
print(torch_scatter.__version__)

2.6.1
0.6.18
2.1.2


In [None]:
!pip install torch-geometric==2.3.1 torch-sparse==0.6.17 torch-scatter==2.1.1 torch-cluster==1.6.0

Collecting torch-geometric==2.3.1
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting torch-sparse==0.6.17
  Downloading torch_sparse-0.6.17.tar.gz (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-scatter==2.1.1
  Downloading torch_scatter-2.1.1.tar.gz (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-cluster==1.6.0
  Downloading torch_cluster-1.6.0.tar.gz (43 kB)
[2K  

In [None]:
print(data['visitor'].num_nodes)  # Check if there are any visitors
print(data['item'].num_nodes)  # Check if there are any items

190956
30429


In [None]:
print("Edge Types:", data.edge_types)  # Should print the edge types as a list of tuples
print("Metadata:", data.metadata())  # This will show you the node and edge types

Edge Types: [('visitor', 'interaction', 'item'), ('item', 'belongs_to', 'category'), ('category', 'sub_category_of', 'category'), ('visitor', 'visitor-category', 'category'), ('item', 'belongs_to_parent', 'category')]
Metadata: (['visitor', 'item', 'category'], [('visitor', 'interaction', 'item'), ('item', 'belongs_to', 'category'), ('category', 'sub_category_of', 'category'), ('visitor', 'visitor-category', 'category'), ('item', 'belongs_to_parent', 'category')])


In [None]:
print("Number of visitors:", data['visitor'].num_nodes)
print("Number of items:", data['item'].num_nodes)
print("Number of categories:", data['category'].num_nodes)

Number of visitors: 190956
Number of items: 30429
Number of categories: 849


In [None]:
print("Number of visitor nodes:", data['visitor'].num_nodes)


Number of visitor nodes: 190956


In [None]:
print("Edge index for ('visitor', 'interaction', 'item'):", data['visitor', 'interaction', 'item'].edge_index.shape)


Edge index for ('visitor', 'interaction', 'item'): torch.Size([2, 980286])


In [None]:
import torch_geometric
print(torch_geometric.__version__)


2.6.1


In [None]:
print(data)  # Check the entire structure of the data object


HeteroData(
  visitor={ x=[190956, 3] },
  item={ x=[30429, 1] },
  category={ x=[849, 1] },
  (visitor, interaction, item)={
    edge_index=[2, 980286],
    edge_attr=[980286, 3],
  },
  (item, belongs_to, category)={ edge_index=[2, 30429] },
  (category, sub_category_of, category)={ edge_index=[2, 849] },
  (visitor, visitor-category, category)={
    edge_index=[2, 980286],
    edge_attr=[980286, 3],
  },
  (item, belongs_to_parent, category)={ edge_index=[2, 30429] }
)


In [None]:
print(data['visitor', 'interaction', 'item'].edge_index)


tensor([[    0,     1,     2,  ...,   368,  3452, 11621],
        [    0,     1,     2,  ...,   320,   393,  2218]])


In [None]:
# Define the HGT Model instance
hidden_channels = 64  # You can adjust this parameter
num_classes = 5  # Number of event types (view, addtocart, transaction)
metadata = (data.metadata())

model = HGTModel(hidden_channels, num_classes, metadata)

# Set up optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Include all edge types in num_neighbors
num_neighbors = {
    ('visitor', 'interaction', 'item'): [10],  # List of length 1 for each edge type
    ('item', 'belongs_to', 'category'): [10],
    ('category', 'sub_category_of', 'category'): [10],
    ('visitor', 'visitor-category', 'category'): [10],
    ('item', 'belongs_to_parent', 'category'): [10]
}

train_loader = NeighborLoader(
    data,
    num_neighbors=num_neighbors,  # Ensure the number of neighbors is correctly set
    input_nodes=('visitor', torch.arange(data['visitor'].num_nodes)),  # Use visitor nodes correctly
    batch_size=64,
    shuffle=True
)



  # Convert the graph data into CSC format for sampling:


In [None]:
print("num_neighbors:", num_neighbors)
print("data:", data)


num_neighbors: {('visitor', 'interaction', 'item'): [10], ('item', 'belongs_to', 'category'): [10], ('category', 'sub_category_of', 'category'): [10], ('visitor', 'visitor-category', 'category'): [10], ('item', 'belongs_to_parent', 'category'): [10]}
data: HeteroData(
  visitor={ x=[190956, 3] },
  item={ x=[30429, 1] },
  category={ x=[849, 1] },
  (visitor, interaction, item)={
    edge_index=[2, 980286],
    edge_attr=[980286, 3],
  },
  (item, belongs_to, category)={ edge_index=[2, 30429] },
  (category, sub_category_of, category)={ edge_index=[2, 849] },
  (visitor, visitor-category, category)={
    edge_index=[2, 980286],
    edge_attr=[980286, 3],
  },
  (item, belongs_to_parent, category)={ edge_index=[2, 30429] }
)


In [None]:
print("Edge types in data:", data.edge_types)

Edge types in data: [('visitor', 'interaction', 'item'), ('item', 'belongs_to', 'category'), ('category', 'sub_category_of', 'category'), ('visitor', 'visitor-category', 'category'), ('item', 'belongs_to_parent', 'category')]


In [None]:
for edge_type in data.edge_types:
    print(f"Edge type {edge_type}: {data[edge_type].edge_index.shape}")

Edge type ('visitor', 'interaction', 'item'): torch.Size([2, 980286])
Edge type ('item', 'belongs_to', 'category'): torch.Size([2, 30429])
Edge type ('category', 'sub_category_of', 'category'): torch.Size([2, 849])
Edge type ('visitor', 'visitor-category', 'category'): torch.Size([2, 980286])
Edge type ('item', 'belongs_to_parent', 'category'): torch.Size([2, 30429])


In [None]:
# Training Loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(1, 201):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch)

        # Example target: event_encoded as ground truth
        event_encoded = batch.event_encoded.to(device)
        loss = torch.nn.CrossEntropyLoss()(out, event_encoded)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(train_loader)}')

NameError: name 'torch' is not defined