In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import mlflow

DATA_DIR = "/usr/mlflow/data"

anime = pd.read_csv(os.path.join(DATA_DIR, "anime_clean.csv"))
ratings_train = pd.read_csv(os.path.join(DATA_DIR, "ratings_train.csv"))
ratings_test = pd.read_csv(os.path.join(DATA_DIR, "ratings_test.csv"))

print("Anime:", anime.shape)
print("Train:", ratings_train.shape)
print("Test:", ratings_test.shape)

Anime: (12294, 7)
Train: (633755, 3)
Test: (316831, 3)


In [2]:
anime["text"] = anime["genre"].fillna("") + " " + anime["type"].fillna("")
anime[["name", "text"]].head()

Unnamed: 0,name,text
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural Movie"
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."
3,Steins;Gate,"Sci-Fi, Thriller TV"
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S..."


In [3]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(anime["text"])

print("TF-IDF shape:", tfidf_matrix.shape)

TF-IDF shape: (12294, 52)


In [4]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print("Cosine matrix shape:", cosine_sim.shape)

Cosine matrix shape: (12294, 12294)


In [5]:
indices = pd.Series(anime.index, index=anime["anime_id"]).drop_duplicates()

def recommend_item_based(user_id, top_n=10):
    # 找出 user 在 train 裡喜歡的動畫
    user_ratings = ratings_train[ratings_train["user_id"] == user_id]
    liked = user_ratings[user_ratings["rating"] > 7]["anime_id"].tolist()
    
    if len(liked) == 0:
        return pd.DataFrame(columns=["name", "genre"])
    
    sim_scores = np.zeros(cosine_sim.shape[0])
    
    for anime_id in liked:
        if anime_id in indices:
            idx = indices[anime_id]
            sim_scores += cosine_sim[idx]
    
    sim_scores = sim_scores / len(liked)
    sim_indices = sim_scores.argsort()[::-1]
    
    # 排除已看過的動畫
    seen = set(user_ratings["anime_id"])
    rec_ids = [anime.loc[i, "anime_id"] for i in sim_indices if anime.loc[i, "anime_id"] not in seen]
    
    rec_ids = rec_ids[:top_n]
    return anime[anime["anime_id"].isin(rec_ids)][["name", "genre"]]

In [6]:
def precision_recall_at_k_item(user_id, k=10):
    recs = recommend_item_based(user_id, top_n=k)
    if recs.empty:
        return np.nan, np.nan
    
    user_test = ratings_test[ratings_test["user_id"] == user_id]
    liked = set(user_test[user_test["rating"] > 7]["anime_id"])
    
    if len(liked) == 0:
        return np.nan, np.nan
    
    hit = len(set(recs.index) & liked)
    
    precision = hit / k
    recall = hit / len(liked)
    
    return precision, recall

# 抽樣部分使用者
sample_users = np.random.choice(ratings_train["user_id"].unique(), 100, replace=False)

precisions, recalls = [], []
for u in sample_users:
    p, r = precision_recall_at_k_item(u, 10)
    if not np.isnan(p):
        precisions.append(p)
    if not np.isnan(r):
        recalls.append(r)

mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)

print("Item-based Precision@10:", mean_precision)
print("Item-based Recall@10:", mean_recall)

Item-based Precision@10: 0.0013513513513513514
Item-based Recall@10: 0.0015015015015015015


In [10]:
import os, mlflow
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://mlflow:5000"))

mlflow.set_experiment("anime-recommender-item-based")

with mlflow.start_run(run_name="item_based_tfidf"):
    mlflow.log_param("top_n", 10)
    mlflow.log_param("feature", "genre+type TF-IDF")
    
    mlflow.log_metric("precision_at_10", mean_precision)
    mlflow.log_metric("recall_at_10", mean_recall)

In [8]:
# 隨機挑一個 test user
test_user = ratings_test["user_id"].sample(1, random_state=42).iloc[0]
print("測試使用者 ID:", test_user)

# 該使用者在 test set 裡喜歡的動畫
user_test = ratings_test[(ratings_test["user_id"] == test_user) & (ratings_test["rating"] > 7)]
liked_anime = anime[anime["anime_id"].isin(user_test["anime_id"])][["name", "genre"]]

print("\n🎯 Test set 裡這位使用者喜歡的動畫：")
print(liked_anime.head(10))  # 最多列 10 部

# 模型推薦的動畫
recommended = recommend_item_based(test_user, top_n=10)

print("\n🤖 模型推薦的前 10 部動畫：")
print(recommended)

測試使用者 ID: 71942

🎯 Test set 裡這位使用者喜歡的動畫：
                                    name  \
4                          Gintama&#039;   
113                     Noragami Aragoto   
353     Phantom: Requiem for the Phantom   
595                     Plastic Memories   
621            Akagami no Shirayuki-hime   
668     Yamada-kun to 7-nin no Majo (TV)   
724                  High School DxD New   
1333  Bleach Movie 1: Memories of Nobody   
1442                         Date A Live   
2159           Neppuu Kairiku Bushi Road   

                                                  genre  
4     Action, Comedy, Historical, Parody, Samurai, S...  
113            Action, Adventure, Shounen, Supernatural  
353                     Action, Drama, Seinen, Thriller  
595                              Drama, Romance, Sci-Fi  
621                     Drama, Fantasy, Romance, Shoujo  
668   Comedy, Harem, Mystery, Romance, School, Shoun...  
724   Action, Comedy, Demons, Ecchi, Harem, Romance,...  
1333      