In [24]:
# Tahap 1: Load data
import pandas as pd

# Baca file CSV
train = pd.read_csv("train.csv")
target_users = pd.read_csv("data_target_users_test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Tampilkan beberapa baris pertama
print("Train data:")
display(train.head())

print("\nTarget users:")
display(target_users.head())

print("\nSample submission:")
display(sample_submission.head())

print(f"\nTotal interaksi: {len(train)}")


Train data:


Unnamed: 0,user_id,item_id
0,8,0002005018
1,8,074322678X
2,8,0887841740
3,8,1552041778
4,8,1567407781



Target users:


Unnamed: 0,user_id
0,8
1,99
2,114
3,160
4,183



Sample submission:


Unnamed: 0,user_id,item_id
0,8,item1 item2 item3 item4 item5 item6 item7 item...
1,99,item1 item2 item3 item4 item5 item6 item7 item...



Total interaksi: 269764


In [25]:
# Tahap 2: Eksplorasi data
n_users = train['user_id'].nunique()
n_items = train['item_id'].nunique()

print(f"Jumlah user unik: {n_users}")
print(f"Jumlah item unik: {n_items}")

# Distribusi interaksi per user
interaksi_user = train['user_id'].value_counts()
print("\nRata-rata interaksi per user:", interaksi_user.mean())

# Distribusi interaksi per item
interaksi_item = train['item_id'].value_counts()
print("\nRata-rata interaksi per item:", interaksi_item.mean())


Jumlah user unik: 13876
Jumlah item unik: 123069

Rata-rata interaksi per user: 19.441049293744594

Rata-rata interaksi per item: 2.1919736083010344


In [41]:
# Tahap 3: Baseline - Rekomendasi berdasarkan popularitas item
item_popularity = train['item_id'].value_counts().reset_index()
item_popularity.columns = ['item_id', 'popularity']

# Ambil 10 item terpopuler
top10_items = item_popularity['item_id'].head(10).tolist()
print("Top 10 items:", top10_items)

# Buat rekomendasi 10 item populer untuk semua user target
baseline_recommendations = {
    user: top10_items for user in target_users['user_id']
}

# Konversi ke DataFrame hasil
baseline_df = pd.DataFrame({
    'user_id': list(baseline_recommendations.keys()),
    'item_id': [','.join(map(str, top10_items))] * len(baseline_recommendations)
})

display(baseline_df.head(10))


Top 10 items: ['0316666343', '0385504209', '0312195516', '0142001740', '059035342X', '0060928336', '0446672211', '0345337662', '0452282152', '0316601950']


Unnamed: 0,user_id,item_id
0,8,"0316666343,0385504209,0312195516,0142001740,05..."
1,99,"0316666343,0385504209,0312195516,0142001740,05..."
2,114,"0316666343,0385504209,0312195516,0142001740,05..."
3,160,"0316666343,0385504209,0312195516,0142001740,05..."
4,183,"0316666343,0385504209,0312195516,0142001740,05..."
5,232,"0316666343,0385504209,0312195516,0142001740,05..."
6,242,"0316666343,0385504209,0312195516,0142001740,05..."
7,243,"0316666343,0385504209,0312195516,0142001740,05..."
8,244,"0316666343,0385504209,0312195516,0142001740,05..."
9,254,"0316666343,0385504209,0312195516,0142001740,05..."


In [27]:
!pip install implicit==0.7.2

Collecting implicit==0.7.2
  Downloading implicit-0.7.2.tar.gz (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: implicit
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.7.2-cp312-cp312-linux_x86_64.whl size=10855017 sha256=401a0c1272c65fe6ddb3352ad3c9d9443f6646edd6f0735df95720dc1c32f46b
  Stored in directory: /root/.cache/pip/wheels/b2/00/4f/9ff8af07a0a53ac6007ea5d739da19cfe147a2df542b6899f8
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [28]:
# Tahap 4: Model ALS
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k

from sklearn.preprocessing import LabelEncoder

# Encode user dan item ke ID numerik
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

train['user_idx'] = user_encoder.fit_transform(train['user_id'])
train['item_idx'] = item_encoder.fit_transform(train['item_id'])

# Buat matrix user-item (implicit feedback)
rows = train['item_idx']
cols = train['user_idx']
data = [1] * len(train)

user_item_matrix = sparse.csr_matrix((data, (rows, cols)))
print("Matrix shape (items x users):", user_item_matrix.shape)

# Buat model ALS
model = AlternatingLeastSquares(
    factors=50,          # jumlah latent factors
    regularization=0.01, # regularisasi
    iterations=15,       # jumlah iterasi
    use_gpu=False
)

# Latih model
model.fit(user_item_matrix)
print("Model training selesai ✅")




Matrix shape (items x users): (123069, 13876)


  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

Model training selesai ✅


In [43]:
# ✅ Tahap 5: Generate rekomendasi (filter_already_liked_items=False)
from scipy.sparse import csr_matrix

# ... (Pastikan user_item_matrix adalah Users x Items dari Tahap 4) ...

recommendations = []

for user_id in target_users['user_id']:
    if user_id in user_encoder.classes_:
        user_idx = int(user_encoder.transform([user_id])[0])
        user_interactions = user_item_matrix[user_idx]

        # Dapatkan rekomendasi (N=10)
        recs = model.recommend(
            userid=user_idx,
            user_items=user_interactions,
            N=10,
            # ✅ PERUBAHAN: Sesuai permintaan Anda
            filter_already_liked_items=False
        )

        rec_item_idxs = [int(i[0]) for i in recs]
        rec_items = item_encoder.inverse_transform(rec_item_idxs)
    else:
        rec_items = top10_items

    recommendations.append({
        'user_id': user_id,
        # Menggunakan koma (',') sebagai pemisah
        'item_id': ','.join(map(str, rec_items))
    })

# Buat DataFrame hasil rekomendasi
recommendation_df = pd.DataFrame(recommendations)
display(recommendation_df.head(10))

Unnamed: 0,user_id,item_id
0,8,"0006379583, 9022906116"
1,99,"0060927860, 9022906116"
2,114,"0060574607, 9022906116"
3,160,"0006379583, 9022906116"
4,183,"014027491X, 9022906116"
5,232,"0002246376, 9022906116"
6,242,"0006379583, 9022906116"
7,243,"0060508663, 9022906116"
8,244,"0060574607, 9022906116"
9,254,"0060574607, 9022906116"


In [30]:
# ✅ Tahap 6: Evaluasi MAP@10
from tqdm import tqdm
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# --- 1️⃣ Split data menjadi train_data dan val_data ---
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
print(f"Train size: {len(train_data)} | Validasi size: {len(val_data)}")

# --- 2️⃣ Buat matrix user-item untuk train dan val ---
n_users = train['user_idx'].nunique()
n_items = train['item_idx'].nunique()

train_mat = csr_matrix(
    (np.ones(len(train_data), dtype=np.float32),
     (train_data['user_idx'].to_numpy(), train_data['item_idx'].to_numpy())),
    shape=(n_users, n_items)
)

val_mat = csr_matrix(
    (np.ones(len(val_data), dtype=np.float32),
     (val_data['user_idx'].to_numpy(), val_data['item_idx'].to_numpy())),
    shape=(n_users, n_items)
)

print("Matrix dibuat ✅")
print("train_mat shape:", train_mat.shape)
print("val_mat shape:", val_mat.shape)


# --- 3️⃣ Fungsi MAP@K ---
def mapk(model, train_mat, val_mat, k=10):
    """
    Hitung MAP@K manual untuk model implicit ALS
    """
    average_precisions = []

    n_users = val_mat.shape[0]
    for user_id in tqdm(range(n_users)):
        # Ambil item relevan dari data validasi
        true_items = val_mat[user_id].indices
        if len(true_items) == 0:
            continue

        # Rekomendasi dari model
        recs = model.recommend(
            userid=user_id,
            user_items=train_mat[user_id],
            N=k,
            filter_already_liked_items=False
        )

        # Tangani format keluaran
        if isinstance(recs, np.ndarray):
            rec_items = [int(r[0]) for r in recs] if recs.ndim > 1 else recs.tolist()
        else:
            rec_items = [int(i[0]) for i in recs]

        # Hitung precision kumulatif
        score = 0.0
        hits = 0
        for idx, item in enumerate(rec_items):
            if item in true_items:
                hits += 1
                score += hits / (idx + 1.0)
        average_precisions.append(score / min(len(true_items), k))

    return np.mean(average_precisions)


# --- 4️⃣ Jalankan evaluasi ---
map10 = mapk(model, train_mat, val_mat, k=10)
print(f"\n✅ MAP@10: {map10:.4f}")

Train size: 215811 | Validasi size: 53953
Matrix dibuat ✅
train_mat shape: (13876, 123069)
val_mat shape: (13876, 123069)


100%|██████████| 13876/13876 [00:03<00:00, 4602.94it/s]


✅ MAP@10: 0.0000





In [31]:
# Tahap 7: Simpan hasil sesuai sample_submission
recommendation_df.to_csv("submission.csv", index=False)
print("File submission.csv berhasil disimpan ✅")


File submission.csv berhasil disimpan ✅
