In [1]:
# Tahap 1: Load data
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import time
import random

# Baca file CSV
train = pd.read_csv("train.csv")

# Hapus baris dengan nilai NaN di 'item_id'
initial_len = len(train)
train.dropna(subset=['item_id'], inplace=True)
print(f"Total interactions (after dropping NaNs): {len(train)} (Dropped {initial_len - len(train)} rows)")

Total interactions (after dropping NaNs): 239375 (Dropped 1 rows)


In [2]:
# Tahap 2: Split Data jadi train-test per user
train_list, test_list = [], []

for uid, group in train.groupby("user_id"):
    if len(group) < 2:
        train_list.append(group)
        continue
    tr, te = train_test_split(group, test_size=0.2, random_state=42)
    train_list.append(tr)
    test_list.append(te)

train_split = pd.concat(train_list)
test_split = pd.concat(test_list)
print(f"Train: {len(train_split)}, Test: {len(test_split)}")

Train: 187052, Test: 52323


In [3]:
# Tahap 3: Encode user & item ke index numerik
user_to_index = {u: i for i, u in enumerate(train_split['user_id'].unique())}
item_to_index = {i: j for j, i in enumerate(train_split['item_id'].unique())}
index_to_user = {i: u for u, i in user_to_index.items()}
index_to_item = {j: i for i, j in item_to_index.items()}

user_index = train_split['user_id'].map(user_to_index)
item_index = train_split['item_id'].map(item_to_index)

n_users = len(user_to_index)
n_items = len(item_to_index)
print(f"Users: {n_users}, Items: {n_items}")

Users: 12160, Items: 95211


In [4]:
# Tahap 4: Buat weighted user-item matrix (TF-IDF style)
item_freq = train_split['item_id'].value_counts()
item_weight = 1.0 / np.log1p(item_freq)  # Semakin populer, bobot makin kecil
train_split['weight'] = train_split['item_id'].map(item_weight)

data = train_split['weight'].values.astype(np.float32)

user_item_csr = csr_matrix((data, (user_index, item_index)), shape=(n_users, n_items))
print("CSR matrix shape:", user_item_csr.shape)

CSR matrix shape: (12160, 95211)


In [48]:
# Tahap 5: Persiapan Data untuk BPR
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Membentuk pasangan positif (user_idx, item_idx)
pos_pairs = list(
    zip(train_split['user_id'].map(user_to_index),
        train_split['item_id'].map(item_to_index))
)

# Set item yang pernah di-like per user
user_pos = defaultdict(set)
for u, i in pos_pairs:
    user_pos[u].add(i)

# Hitung frekuensi item untuk popularity negative sampling
item_freq_series = train_split['item_id'].map(item_to_index).value_counts()
item_freq = np.zeros(n_items, dtype=np.float32)
for idx, cnt in item_freq_series.items():
    item_freq[idx] = cnt

# Distribusi peluang sampling (power 0.75)
power = 0.75
freq = item_freq / item_freq.sum()
item_prob_np = freq ** 0.75
item_prob_np = item_prob_np / item_prob_np.sum()
popular_item_indices = np.arange(n_items)
pos_pairs_array = np.array(pos_pairs, dtype=np.int32)
n_pos = len(pos_pairs_array)

In [49]:
# Tahap 6: Defenisi Model BPR
class BPRModel(nn.Module):
    def __init__(self, n_users, n_items, n_factors=128):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, n_factors)
        self.item_emb = nn.Embedding(n_items, n_factors)

        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)

    def forward(self, u, i, j):
        u_vec = self.user_emb(u)
        i_vec = self.item_emb(i)
        j_vec = self.item_emb(j)
        x_ui = torch.sum(u_vec * i_vec, dim=1)
        x_uj = torch.sum(u_vec * j_vec, dim=1)
        return x_ui, x_uj

In [51]:
# Tahap 7:Training BPR
factors = 256
lr = 0.001
warmup_epochs = 5
batch_size = 4096
neg_samples = 2
epochs = 200
reg = 1e-4

model = BPRModel(n_users, n_items, n_factors=factors).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

def sample_batch(batch_size):
    idx = np.random.randint(0, n_pos, size=batch_size)
    batch = pos_pairs_array[idx]

    users = batch[:, 0]
    pos_items = batch[:, 1]

    neg_items = np.random.choice(
        popular_item_indices,
        size=batch_size * neg_samples,
        p=item_prob_np
    ).reshape(batch_size, neg_samples)

    for b in range(batch_size):
        u = users[b]
        for k in range(neg_samples):
            neg = neg_items[b, k]
            attempt = 0
            while neg in user_pos[u] and attempt < 10:
                neg = np.random.choice(popular_item_indices, p=item_prob_np)
                attempt += 1
            neg_items[b, k] = neg

    return users.astype(np.int64), pos_items.astype(np.int64), neg_items[:, 0].astype(np.int64)

In [52]:
print("Mulai training BPR...\n")
start_time = time.time()

for epoch in range(1, epochs + 1):
    epoch_loss = 0
    iters_per_epoch = max(1, n_pos // batch_size)

    # ============================
    # Warmup Learning Rate
    # ============================
    if epoch <= warmup_epochs:
        for g in optimizer.param_groups:
            g['lr'] = lr * (epoch / warmup_epochs)

    # ============================
    # Training Loop per epoch
    # ============================
    for _ in range(iters_per_epoch):
        users_b, pos_b, neg_b = sample_batch(batch_size)

        u = torch.tensor(users_b, dtype=torch.long, device=device)
        i = torch.tensor(pos_b, dtype=torch.long, device=device)
        j = torch.tensor(neg_b, dtype=torch.long, device=device)

        optimizer.zero_grad()

        x_ui, x_uj = model(u, i, j)
        x_diff = x_ui - x_uj

        loss = nn.functional.softplus(-x_diff).mean()

        reg_term = (
            model.user_emb(u).norm(2).pow(2) +
            model.item_emb(i).norm(2).pow(2) +
            model.item_emb(j).norm(2).pow(2)
        ) / batch_size

        loss = loss + reg * reg_term.sum()

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()

        epoch_loss += loss.item()

        scheduler.step()

    print(f"Epoch {epoch}/{epochs} â€” Loss: {epoch_loss:.4f}")

print("\nTraining selesai.")

Mulai training BPR...

Epoch 1/200 â€” Loss: 31.1782
Epoch 2/200 â€” Loss: 31.1231
Epoch 3/200 â€” Loss: 30.9959
Epoch 4/200 â€” Loss: 30.7270
Epoch 5/200 â€” Loss: 30.4555
Epoch 6/200 â€” Loss: 30.3687
Epoch 7/200 â€” Loss: 30.0013
Epoch 8/200 â€” Loss: 28.8662
Epoch 9/200 â€” Loss: 26.4299
Epoch 10/200 â€” Loss: 23.0468
Epoch 11/200 â€” Loss: 19.8981
Epoch 12/200 â€” Loss: 17.9608
Epoch 13/200 â€” Loss: 17.2047
Epoch 14/200 â€” Loss: 17.0878
Epoch 15/200 â€” Loss: 16.8902
Epoch 16/200 â€” Loss: 16.0490
Epoch 17/200 â€” Loss: 14.3193
Epoch 18/200 â€” Loss: 12.2486
Epoch 19/200 â€” Loss: 10.3091
Epoch 20/200 â€” Loss: 8.9200
Epoch 21/200 â€” Loss: 8.1583
Epoch 22/200 â€” Loss: 7.9076
Epoch 23/200 â€” Loss: 7.8626
Epoch 24/200 â€” Loss: 7.7297
Epoch 25/200 â€” Loss: 7.2763
Epoch 26/200 â€” Loss: 6.4751
Epoch 27/200 â€” Loss: 5.5723
Epoch 28/200 â€” Loss: 4.7650
Epoch 29/200 â€” Loss: 4.2128
Epoch 30/200 â€” Loss: 3.9305
Epoch 31/200 â€” Loss: 3.8267
Epoch 32/200 â€” Loss: 3.8160
Epoch 3

In [53]:
# Tahap 8: Rekomendasi
user_embeddings = model.user_emb.weight.detach().cpu().numpy()
item_embeddings = model.item_emb.weight.detach().cpu().numpy()

def recommend_bpr_topk_for_user(u_idx, top_k=10):
    u_vec = user_embeddings[u_idx]
    scores = item_embeddings.dot(u_vec)

    if u_idx in user_pos:
        scores[list(user_pos[u_idx])] = -1e12

    topk_idx = np.argpartition(-scores, top_k)[:top_k]
    topk_sorted = topk_idx[np.argsort(-scores[topk_idx])]
    return [index_to_item[i] for i in topk_sorted]

# Generate rekomendasi
bpr_results = {}
for uid in test_truth:
    if uid in user_to_index:
        u = user_to_index[uid]
        bpr_results[uid] = recommend_bpr_topk_for_user(u, top_k=10)

In [None]:
# Tahap 9: Evaluasi MAP@10
def average_precision_at_k(actual, predicted, k=10):
    if not actual:
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual:
            hits += 1.0
            score += hits / (i + 1.0)
    return score / min(len(actual), k)

test_truth = test_split.groupby('user_id')['item_id'].apply(set).to_dict()

bpr_results = {}
for uid in test_truth:
    if uid in user_to_index:
        bpr_results[uid] = recommend_bpr_topk_for_user(user_to_index[uid], top_k=10)

scores = []
for uid, actual in test_truth.items():
    preds = bpr_results.get(uid, [])
    ap = average_precision_at_k(actual, preds, k=10)
    scores.append(ap)

bpr_map10 = np.mean(scores)
print("MAP@10 BPR:", bpr_map10)


global_top10 = train_split['item_id'].value_counts().head(10).index.tolist()

out_users = sorted(user_to_index.keys())

rows = []
for uid in out_users:
    recs = bpr_results.get(uid, global_top10)
    rows.append({'user_id': uid, 'item_id': " ".join(map(str, recs))})

sub_bpr = pd.DataFrame(rows)
sub_bpr.to_csv("bpr_submission.csv", index=False)

print("Submission saved: bpr_submission.csv")
pd.read_csv("bpr_submission.csv")

MAP@10 BPR: 0.013983759013626163
Submission saved: bpr_submission.csv


In [None]:
def fine_tune_bpr(n_users, n_items, train_data, test_data, user_item_matrix):

    param_grid = {
        "embedding_dim": [32, 64, 128],
        "lr": [0.0005, 0.001, 0.005],
        "lambda_reg": [1e-4, 1e-5],
        "num_negative": [1, 3, 5]
    }

    best_auc = -1
    best_params = None

    for emb in param_grid["embedding_dim"]:
        for lr in param_grid["lr"]:
            for reg in param_grid["lambda_reg"]:
                for neg in param_grid["num_negative"]:

                    print(f"\nðŸ”§ Testing: emb={emb}, lr={lr}, reg={reg}, neg={neg}")

                    model = BPR(n_users, n_items, embedding_dim=emb)
                    train_bpr(
                        model, train_data, user_item_matrix,
                        lr=lr, lambda_reg=reg, num_negative=neg, epochs=5
                    )

                    auc = auc_score(model, test_data)
                    print(f"âž¡ AUC: {auc:.4f}")

                    if auc > best_auc:
                        best_auc = auc
                        best_params = {
                            "embedding_dim": emb,
                            "lr": lr,
                            "lambda_reg": reg,
                            "num_negative": neg
                        }

    print("\n==============================")
    print("ðŸ”¥ BEST HYPERPARAMETERS FOUND")
    print(best_params)
    print(f"Best AUC = {best_auc:.4f}")
    print("==============================")

    return best_params


In [None]:
best_params = fine_tune_bpr(
    n_users, 
    n_items, 
    train_data, 
    test_data, 
    user_item_matrix
)

best_model = BPR(n_users, n_items, embedding_dim=best_params["embedding_dim"])

train_bpr(
    best_model, train_data, user_item_matrix,
    lr=best_params["lr"],
    lambda_reg=best_params["lambda_reg"],
    num_negative=best_params["num_negative"],
    epochs=30
)


In [None]:
def average_precision_at_k(actual, predicted, k=10):
    if not actual:
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual:
            hits += 1.0
            score += hits / (i + 1.0)
    return score / min(len(actual), k)

def recommend_bpr_topk_for_user_tuned(u_idx, user_emb_weights, item_emb_weights, user_positive_items, index_to_item_map, top_k=10):
    u_vec = user_emb_weights[u_idx]
    scores = item_emb_weights.dot(u_vec)

    if u_idx in user_positive_items:
        scores[list(user_positive_items[u_idx])] = -1e12 # Mask already interacted items

    topk_idx = np.argpartition(-scores, top_k)[:top_k]
    topk_sorted = topk_idx[np.argsort(-scores[topk_idx])]
    return [index_to_item_map[i] for i in topk_sorted]

def evaluate_map_at_k(model, test_df, user_to_idx, item_to_idx, idx_to_item, user_pos_dict, top_k=10):
    user_embeddings = model.user_emb.weight.detach().cpu().numpy()
    item_embeddings = model.item_emb.weight.detach().cpu().numpy()

    test_truth = test_df.groupby('user_id')['item_id'].apply(set).to_dict()
    scores_list = []

    for uid, actual_items in test_truth.items():
        if uid in user_to_idx:
            u_idx = user_to_idx[uid]
            preds = recommend_bpr_topk_for_user_tuned(u_idx, user_embeddings, item_embeddings, user_pos_dict, idx_to_item, top_k=top_k)
            ap = average_precision_at_k(actual_items, preds, k=top_k)
            scores_list.append(ap)
    return np.mean(scores_list) if scores_list else 0.0

In [None]:
# Tahap 10: Hyperparameter Tuning for BPR
from tqdm.notebook import tqdm

def fine_tune_bpr(n_users, n_items, train_df, test_df, user_to_idx, item_to_idx, idx_to_user, idx_to_item,
                  user_pos_dict, pos_pairs_arr, popular_item_indices_arr, item_prob_np_arr, n_pos_interactions,
                  device_obj, batch_size, epochs_per_trial=5, eval_top_k=10):

    param_grid = {
        "n_factors": [32, 64, 128], # Embedding dimensions
        "lr": [0.0005, 0.001, 0.005],
        "reg": [1e-4, 1e-5],        # Regularization strength
        "neg_samples": [1, 3, 5]    # Number of negative samples per positive
    }

    best_map_score = -1
    best_params = None

    all_param_combinations = []
    for f in param_grid["n_factors"]:
        for l in param_grid["lr"]:
            for r in param_grid["reg"]:
                for n in param_grid["neg_samples"]:
                    all_param_combinations.append({"n_factors": f, "lr": l, "reg": r, "neg_samples": n})

    for params in tqdm(all_param_combinations, desc="Tuning Hyperparameters"):
        n_factors = params["n_factors"]
        current_lr = params["lr"]
        current_reg = params["reg"]
        current_neg_samples = params["neg_samples"]

        print(f"\nðŸ”§ Testing: Factors={n_factors}, LR={current_lr}, Reg={current_reg}, NegSamples={current_neg_samples}")

        model = BPRModel(n_users, n_items, n_factors=n_factors).to(device_obj)
        optimizer = optim.Adam(model.parameters(), lr=current_lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs_per_trial)

        # Inner training loop for hyperparameter tuning
        for epoch in range(1, epochs_per_trial + 1):
            epoch_loss = 0
            iters_per_epoch = max(1, n_pos_interactions // batch_size)

            for _ in range(iters_per_epoch):
                # Sample batch logic adapted for current_neg_samples
                idx = np.random.randint(0, n_pos_interactions, size=batch_size)
                batch = pos_pairs_arr[idx]

                users_b = batch[:, 0]
                pos_b = batch[:, 1]

                neg_items = np.random.choice(
                    popular_item_indices_arr,
                    size=batch_size * current_neg_samples,
                    p=item_prob_np_arr
                ).reshape(batch_size, current_neg_samples)

                for b in range(batch_size):
                    u = users_b[b]
                    for k in range(current_neg_samples):
                        neg = neg_items[b, k]
                        attempt = 0
                        while neg in user_pos_dict[u] and attempt < 10: # Avoid sampling positive items as negative
                            neg = np.random.choice(popular_item_indices_arr, p=item_prob_np_arr)
                            attempt += 1
                        neg_items[b, k] = neg

                u_tensor = torch.tensor(users_b, dtype=torch.long, device=device_obj)
                i_tensor = torch.tensor(pos_b, dtype=torch.long, device=device_obj)
                j_tensor = torch.tensor(neg_items[:, 0], dtype=torch.long, device=device_obj) # Use only first negative sample for BPR loss

                optimizer.zero_grad()

                x_ui, x_uj = model(u_tensor, i_tensor, j_tensor)
                x_diff = x_ui - x_uj

                loss = nn.functional.softplus(-x_diff).mean()

                reg_term = (
                    model.user_emb(u_tensor).norm(2).pow(2) +
                    model.item_emb(i_tensor).norm(2).pow(2) +
                    model.item_emb(j_tensor).norm(2).pow(2)
                ) / batch_size

                loss = loss + current_reg * reg_term.sum()

                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
                optimizer.step()

            scheduler.step() # Scheduler step per epoch

        # Evaluation
        current_map = evaluate_map_at_k(model, test_df, user_to_idx, item_to_idx, idx_to_item, user_pos_dict, eval_top_k)
        print(f"âž¡ MAP@{eval_top_k}: {current_map:.4f}")

        if current_map > best_map_score:
            best_map_score = current_map
            best_params = params

    print("\n==============================")
    print("ðŸ”¥ BEST HYPERPARAMETERS FOUND")
    print(best_params)
    print(f"Best MAP@{eval_top_k} = {best_map_score:.4f}")
    print("==============================")

    return best_params

In [None]:
best_params = fine_tune_bpr(
    n_users=n_users,
    n_items=n_items,
    train_df=train_split,
    test_df=test_split,
    user_to_idx=user_to_index,
    item_to_idx=item_to_index,
    idx_to_user=index_to_user,
    idx_to_item=index_to_item,
    user_pos_dict=user_pos,
    pos_pairs_arr=pos_pairs_array,
    popular_item_indices_arr=popular_item_indices,
    item_prob_np_arr=item_prob_np,
    n_pos_interactions=n_pos,
    device_obj=device,
    batch_size=batch_size,
    epochs_per_trial=5,
    eval_top_k=10
)

# After finding best_params, train the final model with these parameters for more epochs
best_model = BPRModel(n_users, n_items, n_factors=best_params["n_factors"]).to(device)
optimizer = optim.Adam(best_model.parameters(), lr=best_params["lr"])
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30) # Train for more epochs

# Final training loop for the best model
print("\nMulai training final BPR dengan hyperparameter terbaik...")
for epoch in range(1, 30 + 1):
    epoch_loss = 0
    iters_per_epoch = max(1, n_pos // batch_size)

    for _ in range(iters_per_epoch):
        # Sample batch logic from sample_batch function
        idx = np.random.randint(0, n_pos, size=batch_size)
        batch = pos_pairs_array[idx]

        users_b = batch[:, 0]
        pos_b = batch[:, 1]

        neg_items = np.random.choice(
            popular_item_indices,
            size=batch_size * best_params["neg_samples"],
            p=item_prob_np
        ).reshape(batch_size, best_params["neg_samples"])

        for b in range(batch_size):
            u = users_b[b]
            for k in range(best_params["neg_samples"]):
                neg = neg_items[b, k]
                attempt = 0
                while neg in user_pos[u] and attempt < 10:
                    neg = np.random.choice(popular_item_indices, p=item_prob_np)
                    attempt += 1
                neg_items[b, k] = neg

        u_tensor = torch.tensor(users_b, dtype=torch.long, device=device)
        i_tensor = torch.tensor(pos_b, dtype=torch.long, device=device)
        j_tensor = torch.tensor(neg_items[:, 0], dtype=torch.long, device=device)

        optimizer.zero_grad()

        x_ui, x_uj = best_model(u_tensor, i_tensor, j_tensor)
        x_diff = x_ui - x_uj

        loss = nn.functional.softplus(-x_diff).mean()

        reg_term = (
            best_model.user_emb(u_tensor).norm(2).pow(2) +
            best_model.item_emb(i_tensor).norm(2).pow(2) +
            best_model.item_emb(j_tensor).norm(2).pow(2)
        ) / batch_size

        loss = loss + best_params["reg"] * reg_term.sum()

        loss.backward()
        nn.utils.clip_grad_norm_(best_model.parameters(), max_norm=5.0)
        optimizer.step()

    scheduler.step()
    print(f"Epoch {epoch}/30 â€” Loss: {epoch_loss:.4f}")

print("Training final model selesai.")