In [1]:
# Tahap 1: Load data
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import time
import random

# Baca file CSV
train = pd.read_csv("train.csv")

# Hapus baris dengan nilai NaN di 'item_id'
initial_len = len(train)
train.dropna(subset=['item_id'], inplace=True)
print(f"Total interactions (after dropping NaNs): {len(train)} (Dropped {initial_len - len(train)} rows)")

Total interactions (after dropping NaNs): 239375 (Dropped 1 rows)


In [2]:
# Tahap 2: Split Data jadi train-test per user
train_list, test_list = [], []

for uid, group in train.groupby("user_id"):
    if len(group) < 2:
        train_list.append(group)
        continue
    tr, te = train_test_split(group, test_size=0.2, random_state=42)
    train_list.append(tr)
    test_list.append(te)

train_split = pd.concat(train_list)
test_split = pd.concat(test_list)
print(f"Train: {len(train_split)}, Test: {len(test_split)}")

Train: 187052, Test: 52323


In [3]:
# Tahap 3: Encode user & item ke index numerik
user_to_index = {u: i for i, u in enumerate(train_split['user_id'].unique())}
item_to_index = {i: j for j, i in enumerate(train_split['item_id'].unique())}
index_to_user = {i: u for u, i in user_to_index.items()}
index_to_item = {j: i for i, j in item_to_index.items()}

user_index = train_split['user_id'].map(user_to_index)
item_index = train_split['item_id'].map(item_to_index)

n_users = len(user_to_index)
n_items = len(item_to_index)
print(f"Users: {n_users}, Items: {n_items}")

Users: 12160, Items: 95211


In [4]:
# Tahap 4: Buat weighted user-item matrix (TF-IDF style)
item_freq = train_split['item_id'].value_counts()
item_weight = 1.0 / np.log1p(item_freq)  # Semakin populer, bobot makin kecil
train_split['weight'] = train_split['item_id'].map(item_weight)

data = train_split['weight'].values.astype(np.float32)

user_item_csr = csr_matrix((data, (user_index, item_index)), shape=(n_users, n_items))
print("CSR matrix shape:", user_item_csr.shape)

CSR matrix shape: (12160, 95211)


In [48]:
# Tahap 5: Persiapan Data untuk BPR
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Membentuk pasangan positif (user_idx, item_idx)
pos_pairs = list(
    zip(train_split['user_id'].map(user_to_index),
        train_split['item_id'].map(item_to_index))
)

# Set item yang pernah di-like per user
user_pos = defaultdict(set)
for u, i in pos_pairs:
    user_pos[u].add(i)

# Hitung frekuensi item untuk popularity negative sampling
item_freq_series = train_split['item_id'].map(item_to_index).value_counts()
item_freq = np.zeros(n_items, dtype=np.float32)
for idx, cnt in item_freq_series.items():
    item_freq[idx] = cnt

# Distribusi peluang sampling (power 0.75)
power = 0.75
freq = item_freq / item_freq.sum()
item_prob_np = freq ** 0.75
item_prob_np = item_prob_np / item_prob_np.sum()
popular_item_indices = np.arange(n_items)
pos_pairs_array = np.array(pos_pairs, dtype=np.int32)
n_pos = len(pos_pairs_array)

In [49]:
# Tahap 6: Defenisi Model BPR
class BPRModel(nn.Module):
    def __init__(self, n_users, n_items, n_factors=128):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, n_factors)
        self.item_emb = nn.Embedding(n_items, n_factors)

        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)

    def forward(self, u, i, j):
        u_vec = self.user_emb(u)
        i_vec = self.item_emb(i)
        j_vec = self.item_emb(j)
        x_ui = torch.sum(u_vec * i_vec, dim=1)
        x_uj = torch.sum(u_vec * j_vec, dim=1)
        return x_ui, x_uj

In [51]:
# Tahap 7:Training BPR
factors = 256
lr = 0.001
warmup_epochs = 5
batch_size = 4096
neg_samples = 2
epochs = 200
reg = 1e-4

model = BPRModel(n_users, n_items, n_factors=factors).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

def sample_batch(batch_size):
    idx = np.random.randint(0, n_pos, size=batch_size)
    batch = pos_pairs_array[idx]

    users = batch[:, 0]
    pos_items = batch[:, 1]

    neg_items = np.random.choice(
        popular_item_indices,
        size=batch_size * neg_samples,
        p=item_prob_np
    ).reshape(batch_size, neg_samples)

    for b in range(batch_size):
        u = users[b]
        for k in range(neg_samples):
            neg = neg_items[b, k]
            attempt = 0
            while neg in user_pos[u] and attempt < 10:
                neg = np.random.choice(popular_item_indices, p=item_prob_np)
                attempt += 1
            neg_items[b, k] = neg

    return users.astype(np.int64), pos_items.astype(np.int64), neg_items[:, 0].astype(np.int64)

In [52]:
print("Mulai training BPR...\n")
start_time = time.time()

for epoch in range(1, epochs + 1):
    epoch_loss = 0
    iters_per_epoch = max(1, n_pos // batch_size)

    # ============================
    # Warmup Learning Rate
    # ============================
    if epoch <= warmup_epochs:
        for g in optimizer.param_groups:
            g['lr'] = lr * (epoch / warmup_epochs)

    # ============================
    # Training Loop per epoch
    # ============================
    for _ in range(iters_per_epoch):
        users_b, pos_b, neg_b = sample_batch(batch_size)

        u = torch.tensor(users_b, dtype=torch.long, device=device)
        i = torch.tensor(pos_b, dtype=torch.long, device=device)
        j = torch.tensor(neg_b, dtype=torch.long, device=device)

        optimizer.zero_grad()

        x_ui, x_uj = model(u, i, j)
        x_diff = x_ui - x_uj

        loss = nn.functional.softplus(-x_diff).mean()

        reg_term = (
            model.user_emb(u).norm(2).pow(2) +
            model.item_emb(i).norm(2).pow(2) +
            model.item_emb(j).norm(2).pow(2)
        ) / batch_size

        loss = loss + reg * reg_term.sum()

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()

        epoch_loss += loss.item()

        scheduler.step()

    print(f"Epoch {epoch}/{epochs} — Loss: {epoch_loss:.4f}")

print("\nTraining selesai.")

Mulai training BPR...

Epoch 1/200 — Loss: 31.1782
Epoch 2/200 — Loss: 31.1231
Epoch 3/200 — Loss: 30.9959
Epoch 4/200 — Loss: 30.7270
Epoch 5/200 — Loss: 30.4555
Epoch 6/200 — Loss: 30.3687
Epoch 7/200 — Loss: 30.0013
Epoch 8/200 — Loss: 28.8662
Epoch 9/200 — Loss: 26.4299
Epoch 10/200 — Loss: 23.0468
Epoch 11/200 — Loss: 19.8981
Epoch 12/200 — Loss: 17.9608
Epoch 13/200 — Loss: 17.2047
Epoch 14/200 — Loss: 17.0878
Epoch 15/200 — Loss: 16.8902
Epoch 16/200 — Loss: 16.0490
Epoch 17/200 — Loss: 14.3193
Epoch 18/200 — Loss: 12.2486
Epoch 19/200 — Loss: 10.3091
Epoch 20/200 — Loss: 8.9200
Epoch 21/200 — Loss: 8.1583
Epoch 22/200 — Loss: 7.9076
Epoch 23/200 — Loss: 7.8626
Epoch 24/200 — Loss: 7.7297
Epoch 25/200 — Loss: 7.2763
Epoch 26/200 — Loss: 6.4751
Epoch 27/200 — Loss: 5.5723
Epoch 28/200 — Loss: 4.7650
Epoch 29/200 — Loss: 4.2128
Epoch 30/200 — Loss: 3.9305
Epoch 31/200 — Loss: 3.8267
Epoch 32/200 — Loss: 3.8160
Epoch 33/200 — Loss: 3.7523
Epoch 34/200 — Loss: 3.5532
Epoch 35/200 — 

In [53]:
# Tahap 8: Rekomendasi
user_embeddings = model.user_emb.weight.detach().cpu().numpy()
item_embeddings = model.item_emb.weight.detach().cpu().numpy()

def recommend_bpr_topk_for_user(u_idx, top_k=10):
    u_vec = user_embeddings[u_idx]
    scores = item_embeddings.dot(u_vec)

    if u_idx in user_pos:
        scores[list(user_pos[u_idx])] = -1e12

    topk_idx = np.argpartition(-scores, top_k)[:top_k]
    topk_sorted = topk_idx[np.argsort(-scores[topk_idx])]
    return [index_to_item[i] for i in topk_sorted]

# Generate rekomendasi
bpr_results = {}
for uid in test_truth:
    if uid in user_to_index:
        u = user_to_index[uid]
        bpr_results[uid] = recommend_bpr_topk_for_user(u, top_k=10)

In [None]:
# Tahap 9: Evaluasi MAP@10
def average_precision_at_k(actual, predicted, k=10):
    if not actual:
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual:
            hits += 1.0
            score += hits / (i + 1.0)
    return score / min(len(actual), k)

test_truth = test_split.groupby('user_id')['item_id'].apply(set).to_dict()

bpr_results = {}
for uid in test_truth:
    if uid in user_to_index:
        bpr_results[uid] = recommend_bpr_topk_for_user(user_to_index[uid], top_k=10)

scores = []
for uid, actual in test_truth.items():
    preds = bpr_results.get(uid, [])
    ap = average_precision_at_k(actual, preds, k=10)
    scores.append(ap)

bpr_map10 = np.mean(scores)
print("MAP@10 BPR:", bpr_map10)


global_top10 = train_split['item_id'].value_counts().head(10).index.tolist()

out_users = sorted(user_to_index.keys())

rows = []
for uid in out_users:
    recs = bpr_results.get(uid, global_top10)
    rows.append({'user_id': uid, 'item_id': " ".join(map(str, recs))})

sub_bpr = pd.DataFrame(rows)
sub_bpr.to_csv("bpr_submission.csv", index=False)

print("Submission saved: bpr_submission.csv")
pd.read_csv("bpr_submission.csv")

MAP@10 BPR: 0.013983759013626163
Submission saved: bpr_submission.csv
