In [None]:
# data_clean/movie/build_movie_sample_50_toprated.py
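# Select 50 higher-rated, better-known movies from the TMDB 5000 dataset.
# Run from data_clean/movie/: the relative paths below assume that working directory.
# To run from the repository root instead, change INPUT_PATH to start with raw_data/.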

import pandas as pd
from pathlib import Path
import numpy as np

# —— 相对路径：从 data_clean/movie/ 回到 raw_data/ ——
INPUT_PATH = Path("../../raw_data/tmdb_5000_movies.csv")
OUTPUT_DIR = Path(".")                     # 输出到当前目录
OUTPUT_CSV = OUTPUT_DIR / "movie_sample_50_toprated.csv"

# 可调阈值（你可以根据效果微调）
MIN_OVERVIEW_WORDS = 40     # 简介至少 40 词
MIN_RUNTIME = 70            # 片长下限
MAX_RUNTIME = 190           # 片长上限
MIN_VOTECOUNT = 1000        # 至少多少人评分（提升“知名度”）
MIN_VOTEAVG = 7.0           # 平均分下限（提升“质量感”）
TARGET_N = 50               # 目标样本数

def wc(s: str) -> int:
    """Return the number of whitespace-separated tokens in *s*.

    Anything that is not a ``str`` (e.g. ``None`` or NaN) counts as 0 words.
    """
    return len(s.split()) if isinstance(s, str) else 0

def main():
    """Build the top-rated movie sample and write it to OUTPUT_CSV.

    Steps:
      1. Read INPUT_PATH and coerce the numeric columns.
      2. Keep rows that pass the overview-length, runtime, vote-count and
         vote-average thresholds (and are "Released" when a status column
         exists).
      3. Drop duplicate ``original_title`` rows (first occurrence wins).
      4. Rank by a Bayesian weighted rating plus a small popularity nudge.
      5. Save the top TARGET_N rows as UTF-8 CSV.

    Raises:
        ValueError: if no movie passes the quality filters, so no ranking
            can be computed.
    """
    df = pd.read_csv(INPUT_PATH)

    # Basic cleaning: unparsable numbers become NaN and therefore fail the
    # threshold comparisons below.
    for col in ["runtime", "vote_average", "vote_count", "popularity"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df["overview"] = df["overview"].fillna("").astype(str)
    df["overview_wc"] = df["overview"].apply(wc)

    # Quality gate
    mask = (
        (df["overview_wc"] >= MIN_OVERVIEW_WORDS) &
        (df["runtime"].between(MIN_RUNTIME, MAX_RUNTIME, inclusive="both")) &
        (df["vote_count"] >= MIN_VOTECOUNT) &
        (df["vote_average"] >= MIN_VOTEAVG)
    )
    if "status" in df.columns:
        mask &= (df["status"].fillna("") == "Released")
    df = df[mask].copy()

    # De-duplicate
    if "original_title" in df.columns:
        df = df.drop_duplicates(subset=["original_title"])

    # With zero surviving rows the quantile below is NaN and int(NaN) fails
    # with an unhelpful message; fail early with an actionable one instead.
    if df.empty:
        raise ValueError(
            f"No movies in {INPUT_PATH} passed the quality filters "
            f"(overview words >= {MIN_OVERVIEW_WORDS}, "
            f"runtime {MIN_RUNTIME}-{MAX_RUNTIME}, "
            f"vote_count >= {MIN_VOTECOUNT}, vote_average >= {MIN_VOTEAVG}); "
            "relax the thresholds."
        )

    # --- Weighted rating (IMDb / Bayesian formula) ---
    # WR = (v/(v+m))*R + (m/(v+m))*C
    # R: the movie's average vote; v: its vote count.
    # C: mean vote_average of the rows that survived the filters above
    #    (not of the whole input file).
    # m: 75th percentile of vote_count among those same rows, truncated to
    #    an int (a quantile between 0.60 and 0.80 is also reasonable).
    C = df["vote_average"].mean()
    m = int(df["vote_count"].quantile(0.75))
    v = df["vote_count"]
    R = df["vote_average"]
    df["weighted_rating"] = (v/(v+m))*R + (m/(v+m))*C

    # Blend in popularity lightly: z-score clipped to [-2, 2] and scaled by
    # 0.05 so that popularity alone cannot dominate the ranking. The 1e-9
    # keeps the division defined when every popularity value is identical.
    pop_z = (df["popularity"] - df["popularity"].mean()) / (df["popularity"].std(ddof=0) + 1e-9)
    df["final_score"] = df["weighted_rating"] + 0.05 * np.clip(pop_z, -2, 2)

    # Sort (ties on final_score broken by vote_count) and take the top N.
    df_top = df.sort_values(["final_score", "vote_count"], ascending=False).head(TARGET_N)

    keep_cols = [
        "title", "original_title", "overview", "genres", "keywords",
        "production_countries", "release_date", "runtime",
        "original_language", "vote_average", "vote_count", "popularity",
        "weighted_rating", "final_score"
    ]
    # Only export columns that actually exist in this dataset.
    keep_cols = [c for c in keep_cols if c in df_top.columns]
    out = df_top[keep_cols].reset_index(drop=True)

    # Make sure the destination folder exists in case OUTPUT_DIR is changed
    # to something other than the current directory.
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

    if len(out) < TARGET_N:
        print(f"[WARN] Only {len(out)} movies passed the filters (target {TARGET_N})")
    print(f"[OK] Saved {len(out)} rows -> {OUTPUT_CSV}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
