In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import mlflow

DATA_DIR = "/usr/mlflow/data"

anime = pd.read_csv(os.path.join(DATA_DIR, "anime_clean.csv"))
ratings = pd.read_csv(os.path.join(DATA_DIR, "ratings_clean.csv"))

print("Anime:", anime.shape)
print("Ratings:", ratings.shape)

Anime: (12294, 7)
Ratings: (6337241, 3)


In [2]:
# 先抽出 15% (train+test)，剩下 85% 存為 back_up
ratings_sample, ratings_back_up = train_test_split(
    ratings, test_size=0.85, random_state=42
)

# 再拆出 train=10% 和 test=5%
ratings_train, ratings_test = train_test_split(
    ratings_sample, test_size=0.3333, random_state=42
)  # 因為 5%/15% ≈ 1/3

print("Train:", ratings_train.shape)
print("Test:", ratings_test.shape)
print("Back-up:", ratings_back_up.shape)

ratings_train.to_csv(os.path.join(DATA_DIR, "ratings_train.csv"), index=False)
ratings_test.to_csv(os.path.join(DATA_DIR, "ratings_test.csv"), index=False)
ratings_back_up.to_csv(os.path.join(DATA_DIR, "ratings_back_up.csv"), index=False)

Train: (633755, 3)
Test: (316831, 3)
Back-up: (5386655, 3)


In [3]:
user_item_matrix = ratings_train.pivot_table(
    index="user_id", 
    columns="anime_id", 
    values="rating"
).fillna(0)

print("User-Item shape:", user_item_matrix.shape)

User-Item shape: (57308, 7800)


In [4]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=6, n_jobs=-1)
knn.fit(user_item_matrix)

def recommend(user_id, top_n=10):
    if user_id not in user_item_matrix.index:
        return pd.DataFrame(columns=["name", "genre"])
    
    # 找出鄰居
    user_vector = user_item_matrix.loc[[user_id]]
    distances, indices = knn.kneighbors(user_vector, n_neighbors=6)
    
    neighbor_ids = user_item_matrix.index[indices.flatten()[1:]]  # 排除自己
    neighbor_ratings = user_item_matrix.loc[neighbor_ids]
    
    # 平均分數作為推薦依據
    mean_scores = neighbor_ratings.mean().sort_values(ascending=False)
    
    # 排除 user 已在 train 看過的動畫
    seen = user_item_matrix.loc[user_id]
    seen = seen[seen > 0].index
    recommendations = mean_scores.drop(seen).head(top_n)
    
    return anime[anime["anime_id"].isin(recommendations.index)][["name", "genre"]]

In [7]:
def precision_recall_at_k(user_id, k=10):
    recs = recommend(user_id, top_n=k)
    if recs.empty:
        return np.nan, np.nan
    
    # 測試集中的喜好動畫
    user_test = ratings_test[ratings_test["user_id"] == user_id]
    liked = set(user_test[user_test["rating"] > 7]["anime_id"])
    
    if len(liked) == 0:
        return np.nan, np.nan
    
    # 命中數
    hit = len(set(recs.index) & liked)
    
    precision = hit / k
    recall = hit / len(liked)
    
    return precision, recall

# 抽樣部分使用者做評估
sample_users = np.random.choice(user_item_matrix.index, 100, replace=False)

precisions, recalls = [], []
for u in sample_users:
    p, r = precision_recall_at_k(u, 10)
    if not np.isnan(p):
        precisions.append(p)
    if not np.isnan(r):
        recalls.append(r)

mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)

print("Baseline Precision@10:", mean_precision)
print("Baseline Recall@10:", mean_recall)


Baseline Precision@10: 0.001282051282051282
Baseline Recall@10: 0.00031269543464665416


In [11]:
import os, mlflow
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://mlflow:5000"))

mlflow.set_experiment("anime-recommender-baseline")

with mlflow.start_run(run_name="user_based_cf_small"):
    mlflow.log_param("train_pct", 10)
    mlflow.log_param("test_pct", 5)
    mlflow.log_param("neighbors", 5)
    mlflow.log_param("top_n", 10)
    
    mlflow.log_metric("precision_at_10", mean_precision)
    mlflow.log_metric("recall_at_10", mean_recall)

2025/09/15 11:54:42 INFO mlflow.tracking.fluent: Experiment with name 'anime-recommender-baseline' does not exist. Creating a new experiment.
