In [1]:
# --- 1Ô∏è‚É£ Setup & Install Dependencies
!pip install implicit==0.7.2 tqdm --quiet
!pip install implicit==0.7.2 --quiet

In [2]:
# --- Fix OpenBLAS Warning ---
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

# --- 2Ô∏è‚É£ Import Libraries
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from tqdm import tqdm
import implicit
import torch

In [3]:
# --- 3Ô∏è‚É£ Cek GPU Availability
use_gpu = torch.cuda.is_available()
print(f"üîç GPU detected: {use_gpu}")
if use_gpu:
    print("‚úÖ GPU mode aktif (CUDA T4)")
else:
    print("‚öôÔ∏è CPU mode aktif)")

# --- 4Ô∏è‚É£ Load Data
train = pd.read_csv("/kaggle/input/data-train-book/train.csv")
print("Data sample:")
display(train.head())

# --- 5Ô∏è‚É£ Encode user dan item
user_map = {u: i for i, u in enumerate(train['user_id'].unique())}
item_map = {i: j for j, i in enumerate(train['item_id'].unique())}

train['user_idx'] = train['user_id'].map(user_map)
train['item_idx'] = train['item_id'].map(item_map)


üîç GPU detected: True
‚úÖ GPU mode aktif (CUDA T4)
Data sample:


Unnamed: 0,user_id,item_id
0,8,0002005018
1,8,074322678X
2,8,0887841740
3,8,1552041778
4,8,1567407781


In [4]:
# --- 6Ô∏è‚É£ Split Train / Validation

# Fungsi aman untuk sampling per-user
def safe_sample(x, frac=0.8):
    if len(x) == 1:
        return x  # jangan buang kalau cuma 1 interaksi
    else:
        return x.sample(frac=frac, random_state=42)

# Terapkan ke seluruh user
train_data = train.groupby('user_idx', group_keys=False).apply(safe_sample)
val_data = train[~train.index.isin(train_data.index)]

print("üìä Jumlah data:")
print(f"  Total data asli : {len(train):,}")
print(f"  Train data       : {len(train_data):,}")
print(f"  Validation data  : {len(val_data):,}")

# Pastikan jumlah user & item tetap sama
n_users = len(user_map)
n_items = len(item_map)

# Buat matriks sparse
train_mat = sparse.csr_matrix(
    (np.ones(len(train_data)), (train_data['user_idx'], train_data['item_idx'])),
    shape=(n_users, n_items)
)
val_mat = sparse.csr_matrix(
    (np.ones(len(val_data)), (val_data['user_idx'], val_data['item_idx'])),
    shape=(n_users, n_items)
)

print(f"\n‚úÖ Matriks Train: {train_mat.shape[0]} users √ó {train_mat.shape[1]} items")
print(f"‚úÖ Matriks Val  : {val_mat.shape[0]} users √ó {val_mat.shape[1]} items")


üìä Jumlah data:
  Total data asli : 269,764
  Train data       : 215,489
  Validation data  : 54,275

‚úÖ Matriks Train: 13876 users √ó 123069 items
‚úÖ Matriks Val  : 13876 users √ó 123069 items


  train_data = train.groupby('user_idx', group_keys=False).apply(safe_sample)


In [5]:
# --- 7Ô∏è‚É£ Fungsi Evaluasi MAP@K ---
def mapk(actual, predicted, k=10):
    """Mean Average Precision at K"""
    scores = []
    for a, p in zip(actual, predicted):
        if len(a) == 0:
            scores.append(0)
        else:
            relevance = [1 if i in a else 0 for i in p[:k]]
            if sum(relevance) == 0:
                scores.append(0)
            else:
                precisions = [
                    np.mean(relevance[:i+1]) for i, r in enumerate(relevance) if r == 1
                ]
                scores.append(np.mean(precisions))
    return np.mean(scores)

In [None]:
# üî∑ VERSI  IMPROVED IBCF ‚Äî Cosine + BM25 + Cold-Start Handling
import warnings
warnings.filterwarnings("ignore")

from implicit.nearest_neighbours import CosineRecommender, bm25_weight
from tqdm import tqdm
import numpy as np
import pandas as pd

print("\nüöÄ Training Improved IBCF (Cosine + BM25) ...")

# --- 1Ô∏è‚É£ BM25 weighting ---
train_weighted = bm25_weight(train_mat.T, K1=100, B=0.8).T.tocsr()

# --- 2Ô∏è‚É£ Fit model ---
ibcf_model = CosineRecommender(K=100)
ibcf_model.fit(train_weighted.T)

# --- 3Ô∏è‚É£ Generate rekomendasi ---
ibcf_preds = []
top_n = 10

for u in tqdm(range(n_users)):
    # Rekomendasi untuk user u
    recs, _ = ibcf_model.recommend(
        userid=u,
        user_items=train_mat,             # harus users √ó items
        N=top_n,
        filter_already_liked_items=True
    )

    # Jika kosong (cold-start), fallback pakai item populer
    if len(recs) == 0:
        recs, _ = ibcf_model.similar_items(0, N=top_n)

    ibcf_preds.append(recs)

# --- 4Ô∏è‚É£ Evaluasi MAP@10 ---
actual = [val_data[val_data['user_idx'] == u]['item_idx'].tolist() for u in range(n_users)]

def mapk(actual, predicted, k=10):
    scores = []
    for a, p in zip(actual, predicted):
        if len(a) == 0:
            scores.append(0)
        else:
            relevance = [1 if i in a else 0 for i in p[:k]]
            precisions = [np.mean(relevance[:i+1]) for i, r in enumerate(relevance) if r == 1]
            scores.append(np.mean(precisions) if len(precisions) > 0 else 0)
    return np.mean(scores)

map10 = mapk(actual, ibcf_preds, k=10)
print(f"\nüìà MAP@10 : {map10:.4f}")

# --- 5Ô∏è‚É£ Simpan hasil ke CSV ---
rec_df = pd.DataFrame({
    "user_id": [list(user_map.keys())[u] for u in range(n_users)],
    **{f"item_{i+1}": [list(item_map.keys())[r[i]] if i < len(r) else None
                        for r in ibcf_preds] for i in range(10)}
})
rec_df.to_csv("/kaggle/working/ibcf_top10_fixed.csv", index=False)
print("‚úÖ Hasil rekomendasi disimpan ke '/kaggle/working/ibcf_top10_fixed.csv'")

# --- 6Ô∏è‚É£ Contoh 5 user pertama ---
display(rec_df.head())


In [6]:
# üî∑ VERSI 1 ‚Äî MODEL-BASED RECOMMENDER (ALS Optimized & Fixed + MAP@10)
from implicit.als import AlternatingLeastSquares
from tqdm import tqdm
import numpy as np

# Pastikan matriks CSR dan beri bobot (alpha)
alpha = 40
train_weighted = (train_mat * alpha).astype("double").tocsr()
val_mat = val_mat.tocsr()

print(f"\n‚úÖ Train matrix shape: {train_weighted.shape}")
print(f"‚úÖ Val matrix shape  : {val_mat.shape}")
print(f"‚úÖ Matrix weighted with alpha={alpha}")

# Training ALS (PAKSA PAKAI CPU agar tidak error "No CUDA extension ...")
print("\nüöÄ Training ALS model (CPU mode, use_gpu=False) ...")
als_model = AlternatingLeastSquares(
    factors=256,
    regularization=0.1,
    iterations=30,
    num_threads=8,
    use_gpu=False   # <- PENTING: paksa CPU untuk menghindari error CUDA
)


# NOTE: penting -> fit dengan matriks berorientasi (users x items) tanpa .T
als_model.fit(train_weighted)

# Sinkronisasi dimensi (cek cepat)
n_users, n_items = train_weighted.shape
print(f"üìä Model factors shapes: user_factors={als_model.user_factors.shape}, item_factors={als_model.item_factors.shape}")
print(f"üìä Expect: {n_users} users, {n_items} items")

# Generate rekomendasi per-user (pakai satu baris user_items setiap iterasi)
print("\nüéÅ Membuat rekomendasi untuk tiap user ...")
als_preds = []
for u in tqdm(range(n_users)):
    # ambil single-row CSR untuk user u
    user_row = train_weighted[u]            # shape (1, n_items)
    recs, _ = als_model.recommend(
        userid=u,
        user_items=user_row,
        N=10,
        filter_already_liked_items=True
    )
    als_preds.append(list(map(int, recs)))  # pastikan int list

# Siapkan ground-truth dari val_data
val_actual = [
    val_data[val_data['user_idx'] == u]['item_idx'].tolist()
    for u in range(n_users)
]

# Fungsi APK / MAPK (sama seperti yang kamu punya)
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k) if actual else 0.0

def mapk(actual_list, predicted_list, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual_list, predicted_list)])

map10 = mapk(val_actual, als_preds, k=10)
print(f"\nüéØ MAP@10 (ALS, CPU mode): {map10:.4f}")

# Buat mapping inverse (kalau belum ada)
item_inv_map = {v: k for k, v in item_map.items()}
user_inv_map = {v: k for k, v in user_map.items()}

# --- Buat dataframe hasil rekomendasi ---
als_df = pd.DataFrame({
    'user_id': [user_inv_map[u] for u in range(n_users)],
    **{f'item_{i+1}': [
        item_inv_map[recs[i]] if len(recs) > i else None for recs in als_preds
    ] for i in range(10)}
})

display(als_df.head())

# Simpan ke CSV
als_df.to_csv("als_recommendations.csv", index=False)
print("üíæ Hasil rekomendasi disimpan ke: als_recommendations.csv")



‚úÖ Train matrix shape: (13876, 123069)
‚úÖ Val matrix shape  : (13876, 123069)
‚úÖ Matrix weighted with alpha=40

üöÄ Training ALS model (CPU mode, use_gpu=False) ...


  check_blas_config()


  0%|          | 0/30 [00:00<?, ?it/s]

üìä Model factors shapes: user_factors=(13876, 256), item_factors=(123069, 256)
üìä Expect: 13876 users, 123069 items

üéÅ Membuat rekomendasi untuk tiap user ...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13876/13876 [01:27<00:00, 158.02it/s]



üéØ MAP@10 (ALS, CPU mode): 0.0208


Unnamed: 0,user_id,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10
0,8,345459202,038082101X,316601950,440212561,743225406,20811853,2244098,140132708,770428827,068483376X
1,99,385505833,0385335482,312966806,375414053,671786458,60519134,345356535,553569910,316096199,0399144587
2,114,515135062,0345450728,373218192,743411250,440222656,553272837,440234743,440236673,60504110,0671034022
3,160,316096199,0679781587,671027387,375700757,312422156,60934913,452281881,312278586,312195516,0440235162
4,183,9725752228,9722000020,385511612,9725781481,9726954657,9277102980,9723814021,9722018841,9722906542,9722325841


üíæ Hasil rekomendasi disimpan ke: als_recommendations.csv


In [None]:
# üî∑ VERSI Tuning ‚Äî MODEL-BASED RECOMMENDER (ALS Optimized & Fixed + MAP@10)
from implicit.als import AlternatingLeastSquares
from tqdm import tqdm
import numpy as np
import pandas as pd

# --- 1Ô∏è‚É£ Tentukan range hyperparameter ---
factors_list = [256]         # jumlah latent factors
regularization_list = [0.1,0.3, 0.05] # regularisasi
iterations_list =[20,30,40]  # jumlah iterasi
alpha_list = [40,80]  # weighting user-item matrix

best_map10 = -1
best_params = {}
best_preds = None

# --- 2Ô∏è‚É£ Fungsi MAP@10 ---
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k) if actual else 0.0

def mapk(actual_list, predicted_list, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual_list, predicted_list)])

# --- 3Ô∏è‚É£ Mulai grid search tuning ---
for factors in factors_list:
    for reg in regularization_list:
        for iter_num in iterations_list:
            for alpha in alpha_list:
                # Bobot matrix
                train_weighted = train_mat.multiply(alpha).astype("double").tocsr()
                val_mat_csr = val_mat.tocsr()

                # Inisialisasi model ALS
                als_model = AlternatingLeastSquares(
                    factors=factors,
                    regularization=reg,
                    iterations=iter_num,
                    num_threads=8,
                    use_gpu=False
                )
                
                # Fit ALS
                als_model.fit(train_weighted)

                # Generate rekomendasi untuk tiap user
                als_preds = []
                for u in range(n_users):
                    user_row = train_weighted[u]
                    recs, _ = als_model.recommend(
                        userid=u,
                        user_items=user_row,
                        N=10,
                        filter_already_liked_items=True
                    )
                    als_preds.append(list(map(int, recs)))

                # Ground truth
                val_actual = [
                    val_data[val_data['user_idx'] == u]['item_idx'].tolist()
                    for u in range(n_users)
                ]

                # Hitung MAP@10
                map10_score = mapk(val_actual, als_preds, k=10)
                print(f"factors={factors}, reg={reg}, iter={iter_num}, alpha={alpha} => MAP@10={map10_score:.4f}")

                # Simpan kombinasi terbaik
                if map10_score > best_map10:
                    best_map10 = map10_score
                    best_params = {"factors": factors, "regularization": reg, "iterations": iter_num, "alpha": alpha}
                    best_preds = als_preds.copy()

# --- 4Ô∏è‚É£ Tampilkan hasil terbaik ---
print("\n‚úÖ Best hyperparameters:")
print(best_params)
print(f"üéØ Best MAP@10: {best_map10:.4f}")

# --- 5Ô∏è‚É£ Buat DataFrame hasil rekomendasi ---
item_inv_map = {v: k for k, v in item_map.items()}
user_inv_map = {v: k for k, v in user_map.items()}

als_df = pd.DataFrame({
    'user_id': [user_inv_map[u] for u in range(n_users)],
    **{f'item_{i+1}': [
        item_inv_map[recs[i]] if len(recs) > i else None for recs in best_preds
    ] for i in range(10)}
})
display(als_df.head())
als_df.to_csv("als_recommendations_tuned.csv", index=False)

In [None]:
print("üíæ Hasil rekomendasi tuning disimpan ke: als_recommendations_tuned.csv")

  0%|          | 0/20 [00:00<?, ?it/s]

factors=256, reg=0.1, iter=20, alpha=40 => MAP@10=0.0213


  0%|          | 0/20 [00:00<?, ?it/s]

factors=256, reg=0.1, iter=20, alpha=80 => MAP@10=0.0198


  0%|          | 0/30 [00:00<?, ?it/s]

factors=256, reg=0.1, iter=30, alpha=40 => MAP@10=0.0208


  0%|          | 0/30 [00:00<?, ?it/s]

factors=256, reg=0.1, iter=30, alpha=80 => MAP@10=0.0196


  0%|          | 0/40 [00:00<?, ?it/s]

factors=256, reg=0.1, iter=40, alpha=40 => MAP@10=0.0209


  0%|          | 0/40 [00:00<?, ?it/s]

factors=256, reg=0.1, iter=40, alpha=80 => MAP@10=0.0196


  0%|          | 0/20 [00:00<?, ?it/s]

factors=256, reg=0.3, iter=20, alpha=40 => MAP@10=0.0206


  0%|          | 0/20 [00:00<?, ?it/s]

factors=256, reg=0.3, iter=20, alpha=80 => MAP@10=0.0199


  0%|          | 0/30 [00:00<?, ?it/s]

factors=256, reg=0.3, iter=30, alpha=40 => MAP@10=0.0207


  0%|          | 0/30 [00:00<?, ?it/s]

factors=256, reg=0.3, iter=30, alpha=80 => MAP@10=0.0200


  0%|          | 0/40 [00:00<?, ?it/s]

factors=256, reg=0.3, iter=40, alpha=40 => MAP@10=0.0207


  0%|          | 0/40 [00:00<?, ?it/s]

factors=256, reg=0.3, iter=40, alpha=80 => MAP@10=0.0200


  0%|          | 0/20 [00:00<?, ?it/s]

factors=256, reg=0.05, iter=20, alpha=40 => MAP@10=0.0204


  0%|          | 0/20 [00:00<?, ?it/s]

factors=256, reg=0.05, iter=20, alpha=80 => MAP@10=0.0196


  0%|          | 0/30 [00:00<?, ?it/s]

factors=256, reg=0.05, iter=30, alpha=40 => MAP@10=0.0208


  0%|          | 0/30 [00:00<?, ?it/s]

factors=256, reg=0.05, iter=30, alpha=80 => MAP@10=0.0197


  0%|          | 0/40 [00:00<?, ?it/s]

factors=256, reg=0.05, iter=40, alpha=40 => MAP@10=0.0203


  0%|          | 0/40 [00:00<?, ?it/s]

factors=256, reg=0.05, iter=40, alpha=80 => MAP@10=0.0194

‚úÖ Best hyperparameters:
{'factors': 256, 'regularization': 0.1, 'iterations': 20, 'alpha': 40}
üéØ Best MAP@10: 0.0213


Unnamed: 0,user_id,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10
0,8,316601950,345459202,038082101X,440212561,0020811853,0770428827,0002244098,380788152,143012738,0140132708
1,99,385505833,743418174,0312966806,385335482,0375414053,0060976845,038550120X,399144587,671786458,0380714752
2,114,515135062,553576801,0553272837,440234743,0373218192,0553578685,0440236673,61015725,345450728,0446613843
3,160,60175400,446364193,0345378490,385492081,0446532231,0449212602,0440221919,330332775,60926317,0440213525
4,183,9724126056,9721009512,0446523569,440225825,067091021X,972461316X,9724612023,9722023020,8532605087,48804/91


üíæ Hasil rekomendasi tuning disimpan ke: als_recommendations_tuned.csv


In [32]:
# üî∑ VERSI FineTuning ‚Äî MODEL-BASED RECOMMENDER (ALS Optimized & Fixed + MAP@10)
from implicit.als import AlternatingLeastSquares
from tqdm import tqdm
import numpy as np
import pandas as pd

# --- 1Ô∏è‚É£ Siapkan matrix weighted (alpha) ---
alpha = 40
train_weighted = (train_mat * alpha).astype("double").tocsr()
val_mat_csr = val_mat.tocsr()

# --- 2Ô∏è‚É£ Misal model ALS pretrained (dari training sebelumnya) ---
# Kalau belum ada, kita bisa latihan dulu 5 iterasi untuk ‚Äúpretrain‚Äù
pretrain_model = AlternatingLeastSquares(
    factors=256,
    regularization=0.1,
    iterations=20,    # iterasi sedikit untuk pretrain
    num_threads=8,
    use_gpu=False
)
pretrain_model.fit(train_weighted)
print("‚úÖ Pretrained ALS selesai!")

# --- 3Ô∏è‚É£ Finetuning ---
# Kita ambil bobot dari pretrain_model
finetune_model = AlternatingLeastSquares(
    factors=128,
    regularization=0.05,
    iterations=10,   # iterasi tambahan untuk finetuning
    num_threads=8,
    use_gpu=False
)
# Copy bobot dari pretrain_model
finetune_model.user_factors = pretrain_model.user_factors.copy()
finetune_model.item_factors = pretrain_model.item_factors.copy()

# Latih lagi dengan dataset yang sama atau subset baru
finetune_model.fit(train_weighted)
print("üéØ Finetuning ALS selesai!")

# --- 4Ô∏è‚É£ Generate rekomendasi per user ---
n_users, n_items = train_weighted.shape
als_preds = []
for u in tqdm(range(n_users)):
    user_row = train_weighted[u]
    recs, _ = finetune_model.recommend(
        userid=u,
        user_items=user_row,
        N=10,
        filter_already_liked_items=True
    )
    als_preds.append(list(map(int, recs)))

# --- 5Ô∏è‚É£ Evaluasi MAP@10 ---
val_actual = [
    val_data[val_data['user_idx'] == u]['item_idx'].tolist()
    for u in range(n_users)
]

def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k) if actual else 0.0

def mapk(actual_list, predicted_list, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual_list, predicted_list)])

map10 = mapk(val_actual, als_preds, k=10)
print(f"\nüìä MAP@10 (ALS Finetuned): {map10:.4f}")

# --- 6Ô∏è‚É£ Buat DataFrame hasil rekomendasi ---
item_inv_map = {v: k for k, v in item_map.items()}
user_inv_map = {v: k for k, v in user_map.items()}

als_df = pd.DataFrame({
    'user_id': [user_inv_map[u] for u in range(n_users)],
    **{f'item_{i+1}': [
        item_inv_map[recs[i]] if len(recs) > i else None for recs in als_preds
    ] for i in range(10)}
})

display(als_df.head())
als_df.to_csv("als_recommendations_finetuned.csv", index=False)
print("üíæ Hasil rekomendasi finetuning disimpan ke: als_recommendations_finetuned.csv")


  0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ Pretrained ALS selesai!


  0%|          | 0/10 [00:00<?, ?it/s]

üéØ Finetuning ALS selesai!


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13876/13876 [01:27<00:00, 158.25it/s]



üìä MAP@10 (ALS Finetuned): 0.0212


Unnamed: 0,user_id,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10
0,8,316601950,038082101X,345459202,440212561,0020811853,0002244098,743225406,0770428827,0060936363,688077080
1,99,385505833,0385335482,375414053,743418174,0312966806,0060519134,312422156,038529929X,0446677477,385486804
2,114,515135062,0446613436,440235162,440234743,0553272837,0440236673,553576801,0345450728,0061015725,515131741
3,160,446364193,0385508417,140298479,316769487,0312995423,031298328X,61097101,0142004235,0425147622,142001805
4,183,9725752228,9724126056,552998834,9722000020,038542471X,9722018841,9725781481,9723814021,972711167X,9722325841


üíæ Hasil rekomendasi finetuning disimpan ke: als_recommendations_finetuned.csv
