In [2]:
# Tahap 1: Load data
import pandas as pd

# Baca file CSV
train = pd.read_csv("train.csv")
target_users = pd.read_csv("data_target_users_test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Tampilkan beberapa baris pertama
print("Train data:")
display(train.head())

print("\nTarget users:")
display(target_users.head())

print("\nSample submission:")
display(sample_submission.head())

print(f"\nTotal interaksi: {len(train)}")


Train data:


Unnamed: 0,user_id,item_id
0,8,0002005018
1,8,074322678X
2,8,0887841740
3,8,1552041778
4,8,1567407781



Target users:


Unnamed: 0,user_id
0,8
1,99
2,114
3,160
4,183



Sample submission:


Unnamed: 0,user_id,item_id
0,8,item1 item2 item3 item4 item5 item6 item7 item...
1,99,item1 item2 item3 item4 item5 item6 item7 item...



Total interaksi: 269764


In [3]:
# Tahap 2: Eksplorasi data
n_users = train['user_id'].nunique()
n_items = train['item_id'].nunique()

print(f"Jumlah user unik: {n_users}")
print(f"Jumlah item unik: {n_items}")

# Distribusi interaksi per user
interaksi_user = train['user_id'].value_counts()
print("\nRata-rata interaksi per user:", interaksi_user.mean())

# Distribusi interaksi per item
interaksi_item = train['item_id'].value_counts()
print("\nRata-rata interaksi per item:", interaksi_item.mean())


Jumlah user unik: 13876
Jumlah item unik: 123069

Rata-rata interaksi per user: 19.441049293744594

Rata-rata interaksi per item: 2.1919736083010344


In [4]:
# Tahap 3: Baseline - Rekomendasi berdasarkan popularitas item
item_popularity = train['item_id'].value_counts().reset_index()
item_popularity.columns = ['item_id', 'popularity']

# Ambil 10 item terpopuler
top10_items = item_popularity['item_id'].head(10).tolist()
print("Top 10 items:", top10_items)

# Buat rekomendasi 10 item populer untuk semua user target
baseline_recommendations = {
    user: top10_items for user in target_users['user_id']
}

# Konversi ke DataFrame hasil
baseline_df = pd.DataFrame({
    'user_id': list(baseline_recommendations.keys()),
    'item_id': [','.join(map(str, top10_items))] * len(baseline_recommendations)
})

display(baseline_df.head())


Top 10 items: ['0316666343', '0385504209', '0312195516', '0142001740', '059035342X', '0060928336', '0446672211', '0345337662', '0452282152', '0316601950']


Unnamed: 0,user_id,item_id
0,8,"0316666343,0385504209,0312195516,0142001740,05..."
1,99,"0316666343,0385504209,0312195516,0142001740,05..."
2,114,"0316666343,0385504209,0312195516,0142001740,05..."
3,160,"0316666343,0385504209,0312195516,0142001740,05..."
4,183,"0316666343,0385504209,0312195516,0142001740,05..."


In [9]:
!pip install implicit==0.7.2 --only-binary=:all:

[31mERROR: Could not find a version that satisfies the requirement implicit==0.7.2 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for implicit==0.7.2[0m[31m
[0m

In [7]:
# Tahap 4: Model ALS
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k

from sklearn.preprocessing import LabelEncoder

# Encode user dan item ke ID numerik
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

train['user_idx'] = user_encoder.fit_transform(train['user_id'])
train['item_idx'] = item_encoder.fit_transform(train['item_id'])

# Buat matrix user-item (implicit feedback)
rows = train['item_idx']
cols = train['user_idx']
data = [1] * len(train)

user_item_matrix = sparse.csr_matrix((data, (rows, cols)))
print("Matrix shape (items x users):", user_item_matrix.shape)

# Buat model ALS
model = AlternatingLeastSquares(
    factors=50,          # jumlah latent factors
    regularization=0.01, # regularisasi
    iterations=15,       # jumlah iterasi
    use_gpu=False
)

# Latih model
model.fit(user_item_matrix)
print("Model training selesai ✅")


ModuleNotFoundError: No module named 'implicit'

In [None]:
# Tahap 5: Generate rekomendasi untuk 10 item per user target
recommendations = []

for user_id in target_users['user_id']:
    if user_id in user_encoder.classes_:
        user_idx = user_encoder.transform([user_id])[0]
        recs = model.recommend(
            user_idx,
            user_item_matrix.T,  # transpose karena model expect item-user matrix
            N=10
        )
        rec_items = [item_encoder.inverse_transform([i[0]])[0] for i in recs]
    else:
        # jika user baru (tidak ada di train), pakai rekomendasi populer
        rec_items = top10_items
    recommendations.append({
        'user_id': user_id,
        'item_id': ','.join(map(str, rec_items))
    })

recommendation_df = pd.DataFrame(recommendations)
display(recommendation_df.head())


In [None]:
# Tahap 6: Evaluasi MAP@10 (opsional)
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
print("Train size:", len(train_data), "Validasi size:", len(val_data))

# Buat matrix baru dari train_data dan val_data
train_mat = sparse.csr_matrix(
    ([1]*len(train_data),
     (train_data['item_idx'], train_data['user_idx']))
)

val_mat = sparse.csr_matrix(
    ([1]*len(val_data),
     (val_data['item_idx'], val_data['user_idx']))
)

# Hitung MAP@10
map10 = mean_average_precision_at_k(model, train_mat, val_mat, K=10)
print(f"MAP@10: {map10:.4f}")


In [None]:
# Tahap 7: Simpan hasil sesuai sample_submission
recommendation_df.to_csv("submission.csv", index=False)
print("File submission.csv berhasil disimpan ✅")
