In [1]:
import os
import pandas as pd

DATA_DIR = "/usr/mlflow/data"

anime = pd.read_csv(os.path.join(DATA_DIR, "anime_clean.csv"))
ratings_train = pd.read_csv(os.path.join(DATA_DIR, "ratings_train.csv"))
ratings_test = pd.read_csv(os.path.join(DATA_DIR, "ratings_test.csv"))

print("Anime:", anime.shape)
print("Train:", ratings_train.shape)
print("Test:", ratings_test.shape)

Anime: (12294, 7)
Train: (633755, 3)
Test: (316831, 3)


In [2]:
import numpy as np

def precision_at_k(recommended, relevant, k=10):
    return len(set(recommended[:k]) & set(relevant)) / k

In [3]:
import mlflow
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 抽樣 1000 筆，控制計算時間
anime_sample = anime.sample(1000, random_state=42).reset_index(drop=True)

mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment("anime-recsys-optuna")

def objective(trial):
    # 1️⃣ 抽樣參數
    max_features = trial.suggest_int("max_features", 500, 1500)
    ngram = trial.suggest_categorical("ngram_range", [(1,1), (1,2)])
    min_df = trial.suggest_int("min_df", 1, 3)

    # 2️⃣ 訓練 TF-IDF
    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=max_features,
        ngram_range=ngram,
        min_df=min_df
    )
    tfidf = vectorizer.fit_transform(anime_sample["genre"].fillna(""))

    # 3️⃣ 相似度
    sim_matrix = cosine_similarity(tfidf)

    # 4️⃣ 隨機測試 50 部動畫
    test_idx = np.random.choice(len(anime_sample), 50, replace=False)
    scores = []
    for idx in test_idx:
        sim_scores = list(enumerate(sim_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        top_idx = [i for i, _ in sim_scores[1:11]]
        recommended = anime_sample.iloc[top_idx]["name"].tolist()
        relevant = anime_sample[anime_sample["genre"] == anime_sample.iloc[idx]["genre"]]["name"].tolist()
        if len(relevant) > 1:
            scores.append(precision_at_k(recommended, relevant, k=10))

    avg_precision = np.mean(scores)

    # 5️⃣ 記錄到 MLflow
    with mlflow.start_run(nested=True):
        mlflow.log_params({
            "max_features": max_features,
            "ngram_range": ngram,
            "min_df": min_df
        })
        mlflow.log_metric("precision_at_10", avg_precision)

    return avg_precision

# 只跑 10 trials，約 3 分鐘內完成
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("最佳參數:", study.best_params)


  from .autonotebook import tqdm as notebook_tqdm
2025/09/22 16:52:11 INFO mlflow.tracking.fluent: Experiment with name 'anime-recsys-optuna' does not exist. Creating a new experiment.
[I 2025-09-22 16:52:11,692] A new study created in memory with name: no-name-70912cc4-cdad-4d81-9d57-7efb6223a175
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

[I 2025-09-22 16:52:11,952] Trial 0 finished with value: 0.5642857142857142 and parameters: {'max_features

最佳參數: {'max_features': 1447, 'ngram_range': (1, 1), 'min_df': 3}


In [6]:
from mlflow import pyfunc
from mlflow.tracking import MlflowClient

best_params = study.best_params

vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=best_params["max_features"],
    ngram_range=best_params["ngram_range"],
    min_df=best_params["min_df"]
)
tfidf = vectorizer.fit_transform(anime_sample["genre"].fillna(""))
sim_matrix = cosine_similarity(tfidf)

class ItemBasedTFIDF(pyfunc.PythonModel):
    def __init__(self, df, sim_matrix):
        self.df = df
        self.sim_matrix = sim_matrix
    def predict(self, context, model_input):
        anime_title = model_input[0]
        idx = self.df[self.df["name"] == anime_title].index[0]
        sim_scores = list(enumerate(self.sim_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        top_idx = [i for i, _ in sim_scores[1:11]]
        return self.df.iloc[top_idx]["name"].tolist()

with mlflow.start_run(run_name="best-item-tfidf") as run:
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=ItemBasedTFIDF(anime_sample, sim_matrix),
        registered_model_name="AnimeRecsysModel"
    )
    print("Artifacts URI:", run.info.artifact_uri)

# 把最新版本升級到 Staging
client = MlflowClient()
latest = client.get_latest_versions("AnimeRecsysModel", stages=["None"])[-1]
client.transition_model_version_stage(
    name="AnimeRecsysModel",
    version=latest.version,
    stage="Staging"
)


Registered model 'AnimeRecsysModel' already exists. Creating a new version of this model...
2025/09/22 16:55:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: AnimeRecsysModel, version 4
Created version '4' of model 'AnimeRecsysModel'.
  latest = client.get_latest_versions("AnimeRecsysModel", stages=["None"])[-1]
  client.transition_model_version_stage(


Artifacts URI: mlflow-artifacts:/10/1cf9c1d7ae5145d1acf6d560827ed69a/artifacts


<ModelVersion: aliases=[], creation_timestamp=1758560101564, current_stage='Staging', description='', last_updated_timestamp=1758560101722, name='AnimeRecsysModel', run_id='1cf9c1d7ae5145d1acf6d560827ed69a', run_link='', source='mlflow-artifacts:/10/1cf9c1d7ae5145d1acf6d560827ed69a/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='4'>