In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Obtaining dependency information for torch_geometric from https://files.pythonhosted.org/packages/65/4e/6f9a75548a93fedcd4514ae2de9bee1e91bade6b73252b4da32f0e42ac52/torch_geometric-2.4.0-py3-none-any.whl.metadata
  Downloading torch_geometric-2.4.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m681.7 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

import torch
from torch_geometric.nn import LightGCN
from torch_geometric.utils import negative_sampling
import random

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)



In [3]:
def load_dataset(dirname):
    path = f"/kaggle/input/otto-recsys-short/kaggle/working/otto_exploded_dataset/{dirname}/"
    return {
        "train": pd.read_parquet(path + "train"),
        "test": pd.read_parquet(path + "test")
    }

In [4]:
TARGET = "clicks"

In [5]:
d = load_dataset(TARGET)
train_df = d["train"][["session", "aid"]]
train_df.head()

Unnamed: 0,session,aid
54,0,1521766
55,0,1725503
56,0,528847
57,0,1816325
58,0,984597


In [6]:
le = LabelEncoder()
le.fit(train_df.to_numpy().reshape(-1))

In [7]:
edge_index = torch.tensor([
    le.transform(train_df["session"]),
    le.transform(train_df["aid"])
], dtype=torch.int64)

edge_index

  edge_index = torch.tensor([


tensor([[     0,      0,      0,  ..., 241049, 241049, 241049],
        [859711, 944834, 382799,  ..., 773781, 190523, 110764]])

In [8]:
edge_index_ud = torch.cat((edge_index, edge_index[[1, 0]]), axis=1)
edge_index_ud

tensor([[     0,      0,      0,  ..., 773781, 190523, 110764],
        [859711, 944834, 382799,  ..., 241049, 241049, 241049]])

In [9]:
users = edge_index[0].unique()
items = edge_index[1].unique()

len(users), len(items)

(385470, 774068)

In [16]:
class cfg:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedding_dim = 64
    num_layers = 2
    epochs = 2
    batch_size = 1024
    lambda_reg = 100
    learning_rate = 2e-3
    seed = 52

def seed_everything(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

seed_everything(cfg.seed)

# Setting up model

In [17]:
model = LightGCN(
    num_nodes=len(users) + len(items),
    embedding_dim=cfg.embedding_dim,
    num_layers=cfg.num_layers
)
model = model.to(cfg.device)
model.train()

optimizer = torch.optim.SGD(model.parameters(), lr=cfg.learning_rate)
print(cfg.device)

cuda


# Training

In [18]:
for e in range(1, cfg.epochs + 1):
    idx = np.arange(len(edge_index[0]))
    idx = np.random.permutation(idx)
    t_edge_index = edge_index_ud[:,idx]

    index_range = list(range(len(t_edge_index[0]) // cfg.batch_size))
    range_len = len(index_range)
    for i in index_range:
        tem = min(cfg.batch_size * (i+1), len(t_edge_index[0]))

        pos_edge = t_edge_index[:, cfg.batch_size*i : tem].to(cfg.device)
        neg_edge = negative_sampling(
            edge_index,
            num_nodes=(len(users), len(items)),
            num_neg_samples=cfg.batch_size,
            force_undirected=True
        ).to(cfg.device)

        pos_pred = model(pos_edge)
        neg_pred = model(neg_edge)

        loss = model.recommendation_loss(pos_pred, neg_pred, lambda_reg=cfg.lambda_reg)

        if i % 100 == 0:
            print(f"Iteration {i:04}/{range_len}. Loss: {loss.detach().cpu().item()}", flush=True)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


    with torch.no_grad(): 
        print(
            f" * In epoch {(e+1):04}, loss={loss:.03f}"
        )

Iteration 0000/5998. Loss: 13.192195892333984
Iteration 0100/5998. Loss: 12.252700805664062
Iteration 0200/5998. Loss: 11.383824348449707
Iteration 0300/5998. Loss: 10.580257415771484
Iteration 0400/5998. Loss: 9.837087631225586
Iteration 0500/5998. Loss: 9.1497802734375
Iteration 0600/5998. Loss: 8.51413631439209
Iteration 0700/5998. Loss: 7.926270008087158
Iteration 0800/5998. Loss: 7.3825883865356445
Iteration 0900/5998. Loss: 6.879775047302246
Iteration 1000/5998. Loss: 6.414755344390869
Iteration 1100/5998. Loss: 5.984689235687256
Iteration 1200/5998. Loss: 5.586948871612549
Iteration 1300/5998. Loss: 5.219104290008545
Iteration 1400/5998. Loss: 4.878909111022949
Iteration 1500/5998. Loss: 4.564284801483154
Iteration 1600/5998. Loss: 4.27331018447876
Iteration 1700/5998. Loss: 4.004205703735352
Iteration 1800/5998. Loss: 3.7553293704986572
Iteration 1900/5998. Loss: 3.5251598358154297
Iteration 2000/5998. Loss: 3.3122901916503906
Iteration 2100/5998. Loss: 3.1154215335845947
Itera

In [19]:
torch.save(model, f"LightGCN_trained_{TARGET}.pt")

# Inference

In [20]:
model_d = torch.load(f"LightGCN_trained_{TARGET}.pt").to(cfg.device)
model_d

LightGCN(1159538, 64, num_layers=2)

In [21]:
edge_index_udd = edge_index_ud.to(cfg.device)
src_index_d = torch.LongTensor(users)
dst_index_d = torch.LongTensor(items).to(cfg.device)


In [22]:
!mkdir {TARGET}

In [23]:
from tqdm import tqdm
left = 0
right = cfg.batch_size

final_df_list = []
pbar = tqdm(total=len(src_index_d))
i = 0
while right < len(src_index_d):
    src_users = src_index_d[left: min(right, len(src_index_d))].to(cfg.device)

    cpu_users = le.inverse_transform(
        src_users.cpu().numpy()
    )
    
    cpu_recs = model_d.recommend(
        edge_index=edge_index_udd,
        src_index=src_users,
        dst_index=dst_index_d,
        k=2000
    ).cpu().numpy()
    
    cpu_recs = [
        le.inverse_transform(cpu_rec).tolist() for cpu_rec in cpu_recs
    ]
    
    df = pd.DataFrame.from_dict(
        {"session": cpu_users, "aid": cpu_recs},
        orient="columns"
    )
    
    df.to_parquet(f"/kaggle/working/{TARGET}/part_{i}.parquet")

    i += 1
    left += cfg.batch_size
    right += cfg.batch_size
    pbar.update(cfg.batch_size)
pbar.close()

100%|█████████▉| 385024/385470 [1:59:09<00:08, 53.85it/s]


In [24]:
!zip -r {TARGET}.zip /kaggle/working/{TARGET}

  adding: kaggle/working/clicks/ (stored 0%)
  adding: kaggle/working/clicks/part_193.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_225.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_8.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_348.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_33.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_166.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_243.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_149.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_274.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_161.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_306.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_32.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_278.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_5.parquet (deflated 19%)
  adding: kaggle/working/clicks/part_73.parquet (deflated 19%)
  

### TBD:
- Get predictions for each target:
    - `session`, top 2048 `aid`
- Save it in csv
- Leave items that are in test
- Calculate metrics raw
- Calculate sequential metrics