In [8]:
import os
import pandas as pd
import numpy as np
import mlflow
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

DATA_DIR = "/usr/mlflow/data"

anime = pd.read_csv(os.path.join(DATA_DIR, "anime_clean.csv"))
ratings_train = pd.read_csv(os.path.join(DATA_DIR, "ratings_train.csv"))
ratings_test = pd.read_csv(os.path.join(DATA_DIR, "ratings_test.csv"))

print("Anime:", anime.shape)
print("Train:", ratings_train.shape)
print("Test:", ratings_test.shape)


Anime: (12294, 7)
Train: (633755, 3)
Test: (316831, 3)


In [9]:
# 建立文字描述欄位
anime["text"] = anime["genre"].fillna("") + " " + anime["type"].fillna("")

# TF-IDF 向量化
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(anime["text"])

# 動畫相似度矩陣
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# anime_id ↔ index 對照表
indices = pd.Series(anime.index, index=anime["anime_id"]).drop_duplicates()


In [10]:
def recommend_item_based(user_id, top_n=10):
    user_ratings = ratings_train[ratings_train["user_id"] == user_id]
    liked = user_ratings[user_ratings["rating"] > 7]["anime_id"].tolist()
    
    if len(liked) == 0:
        return pd.DataFrame(columns=["name", "genre"])
    
    sim_scores = np.zeros(cosine_sim.shape[0])
    for anime_id in liked:
        if anime_id in indices:
            idx = indices[anime_id]
            sim_scores += cosine_sim[idx]
    
    sim_scores = sim_scores / len(liked)
    sim_indices = sim_scores.argsort()[::-1]
    
    seen = set(user_ratings["anime_id"])
    rec_ids = [anime.loc[i, "anime_id"] for i in sim_indices if anime.loc[i, "anime_id"] not in seen]
    rec_ids = rec_ids[:top_n]
    
    return anime[anime["anime_id"].isin(rec_ids)][["name", "genre"]]


In [11]:
sample_users = ratings_test["user_id"].drop_duplicates().sample(5, random_state=42)

results = []
for u in sample_users:
    liked = ratings_test[(ratings_test["user_id"] == u) & (ratings_test["rating"] > 7)]
    liked_list = anime[anime["anime_id"].isin(liked["anime_id"])]["name"].tolist()
    
    recs = recommend_item_based(u, top_n=10)["name"].tolist()
    
    results.append({
        "user_id": u,
        "liked_in_test": liked_list,
        "recommended": recs
    })

df_results = pd.DataFrame(results)
print(df_results.head())


   user_id                                      liked_in_test  \
0    43880                                           [Naruto]   
1    12909  [Binbougami ga!, Dungeon ni Deai wo Motomeru n...   
2    69001  [Higashi no Eden Movie I: The King of Eden, K,...   
3    53456  [Code Geass: Hangyaku no Lelouch R2, Tiger &am...   
4    60002  [Shijou Saikyou no Deshi Kenichi, Clannad: Aft...   

                                         recommended  
0  [Gungrave, Guilty Crown, Kurau Phantom Memory,...  
1  [Wolf&#039;s Rain, Bungou Stray Dogs, GetBacke...  
2  [High School DxD BorN, Trinity Seven, Omamori ...  
3  [xxxHOLiC Kei, Durarara!!x2 Ketsu, Durarara!!x...  
4  [Tsubasa: Shunraiki, Shakugan no Shana II (Sec...  


In [12]:
artifact_path = os.path.join(DATA_DIR, "day7_recommendations.csv")
df_results.to_csv(artifact_path, index=False)

In [15]:
import os, mlflow
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://mlflow:5000"))

mlflow.set_experiment("anime-recommender-artifacts")

with mlflow.start_run(run_name="day7_item_based_artifact"):
    mlflow.log_param("model_type", "item_based_tfidf")
    mlflow.log_param("top_n", 10)
    
    # 假設 Precision/Recall 是 Day 6 算出來的
    mlflow.log_metric("precision_at_10", 0.0013)
    mlflow.log_metric("recall_at_10", 0.0015)
    
    # 上傳推薦清單 CSV
    mlflow.log_artifact(artifact_path, artifact_path="recommendations")
    print("Artifact URI:", mlflow.get_artifact_uri())


2025/09/21 06:22:18 INFO mlflow.tracking.fluent: Experiment with name 'anime-recommender-artifacts' does not exist. Creating a new experiment.


Artifact URI: mlflow-artifacts:/5/8b8b4860ecfd4ef88b18077c5a5f0906/artifacts
