In [None]:
# Tahap 1: Load data
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Baca file CSV
train = pd.read_csv("/kaggle/input/data-train-book/train.csv")
print(f"Total interactions: {len(train)}")


Total interactions: 269764


In [2]:
# Tahap 2: Split Data jadi train-test per user
train_list, test_list = [], []

for uid, group in train.groupby("user_id"):
    if len(group) < 2:
        train_list.append(group)
        continue
    tr, te = train_test_split(group, test_size=0.2, random_state=42)
    train_list.append(tr)
    test_list.append(te)

train_split = pd.concat(train_list)
test_split = pd.concat(test_list)
print(f"Train: {len(train_split)}, Test: {len(test_split)}")

Train: 210734, Test: 59030


In [3]:
# Tahap 3: Encode user & item ke index numerik
user_to_index = {u: i for i, u in enumerate(train_split['user_id'].unique())}
item_to_index = {i: j for j, i in enumerate(train_split['item_id'].unique())}
index_to_user = {i: u for u, i in user_to_index.items()}
index_to_item = {j: i for i, j in item_to_index.items()}

user_index = train_split['user_id'].map(user_to_index)
item_index = train_split['item_id'].map(item_to_index)

n_users = len(user_to_index)
n_items = len(item_to_index)
print(f"Users: {n_users}, Items: {n_items}")

Users: 13876, Items: 103454


In [4]:
# Tahap 4: Buat weighted user-item matrix (TF-IDF style)
item_freq = train_split['item_id'].value_counts()
item_weight = 1.0 / np.log1p(item_freq)  # Semakin populer, bobot makin kecil
train_split['weight'] = train_split['item_id'].map(item_weight)

data = train_split['weight'].values.astype(np.float32)

user_item_csr = csr_matrix((data, (user_index, item_index)), shape=(n_users, n_items))
print("CSR matrix shape:", user_item_csr.shape)

CSR matrix shape: (13876, 103454)


In [5]:
# Tahap 5: Fit NearestNeighbors untuk Item Similarity
top_k = 100
item_matrix = user_item_csr.T  # item x user

print("Fitting NearestNeighbors (cosine similarity)...")
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=top_k + 1, n_jobs=-1)
model.fit(item_matrix)

print("Menghitung neighbors untuk semua item...")
distances, neighbors = model.kneighbors(item_matrix, return_distance=True)
similarities = 1.0 - distances
neighbors = neighbors[:, 1:]  # buang diri sendiri
similarities = similarities[:, 1:]

print("Neighbors shape:", neighbors.shape)

Fitting NearestNeighbors (cosine similarity)...
Menghitung neighbors untuk semua item...
Neighbors shape: (103454, 100)


In [6]:
# Tahap 6: Precompute Global Top-10 Item (fallback)
global_top10 = train_split['item_id'].value_counts().head(10).index.tolist()


# Tahap 7: Fungsi Rekomendasi per User
def recommend_for_user(u_idx, top_n=10):
    user_vec = user_item_csr[u_idx]
    interacted = user_vec.indices
    if len(interacted) == 0:
        return global_top10

    scores = {}
    for item in interacted:
        neighs = neighbors[item]
        sims = similarities[item]
        for nbr, sim in zip(neighs, sims):
            if nbr in interacted:
                continue
            scores[nbr] = scores.get(nbr, 0.0) + float(sim)

    if not scores:
        return global_top10

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    top_items_idx = [idx for idx, _ in ranked[:top_n]]
    top_items = [index_to_item[i] for i in top_items_idx]
    return top_items

# Tahap 8: Hasil Rekomendasi untuk Semua User
topN = 10
results = []

for u_idx in tqdm(range(n_users), desc="Membuat rekomendasi"):
    uid = index_to_user[u_idx]
    recs = recommend_for_user(u_idx, top_n=topN)
    results.append({'user_id': uid, 'item_id': " ".join(map(str, recs))})

recommend_df = pd.DataFrame(results)
recommend_df.to_csv("ibcf_top10_submission.csv", index=False)
print("File rekomendasi 'ibcf_top10_submission.csv' telah dibuat.")

Membuat rekomendasi: 100%|██████████| 13876/13876 [01:58<00:00, 116.61it/s]


File rekomendasi 'ibcf_top10_submission.csv' telah dibuat.


In [7]:
# Tahap 9: Evaluasi MAP@10
def average_precision_at_k(actual, predicted, k=10):
    if not actual:
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual:
            hits += 1.0
            score += hits / (i + 1.0)
    return score / min(len(actual), k)

# Buat ground truth dari test set
test_truth = test_split.groupby('user_id')['item_id'].apply(set).to_dict()

map_scores = []
for _, row in recommend_df.iterrows():
    uid = row['user_id']
    preds = row['item_id'].split()
    actual = test_truth.get(uid, set())
    ap = average_precision_at_k(actual, preds, k=10)
    map_scores.append(ap)

map10 = np.mean(map_scores)
print(f"\nMAP@10: {map10:.6f}")


MAP@10: 0.007244
