# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
# data_clean/movie/build_movie_sample_50_toprated.py
import pandas as pd
from pathlib import Path
import numpy as np

# --- File locations ---------------------------------------------------------
# CSV that main() loads.
INPUT_PATH = Path("../../raw_data/tmdb_5000_movies.csv")
# Directory that receives the generated sample, and the sample file itself.
OUTPUT_DIR = Path(".")
OUTPUT_CSV = OUTPUT_DIR.joinpath("movie_sample_50_toprated.csv")

# --- Selection thresholds (all applied inclusively in main()) ---------------
MIN_OVERVIEW_WORDS = 40   # minimum whitespace-separated words in "overview"
MIN_RUNTIME = 70          # lower bound on "runtime"
MAX_RUNTIME = 190         # upper bound on "runtime"
MIN_VOTECOUNT = 1_000     # minimum "vote_count"
MIN_VOTEAVG = 7.0         # minimum "vote_average"
TARGET_N = 50             # number of top-scored rows written to OUTPUT_CSV

def wc(s: str) -> int:
    """Count the whitespace-separated words in ``s``.

    Any value that is not a ``str`` (for example ``None`` or a float NaN)
    counts as zero words instead of raising.
    """
    return len(s.split()) if isinstance(s, str) else 0

def _filter_candidates(df):
    """Return a filtered, de-duplicated copy of ``df``.

    Numeric columns are coerced with ``errors="coerce"`` so unparsable values
    become NaN; NaN in ``runtime``, ``vote_count`` or ``vote_average`` fails
    the comparisons below and the row is dropped. An ``overview_wc`` column
    (word count of ``overview``) is added to the result.
    """
    df = df.copy()
    for col in ("runtime", "vote_average", "vote_count", "popularity"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df["overview"] = df["overview"].fillna("").astype(str)
    df["overview_wc"] = df["overview"].apply(wc)

    mask = (
        (df["overview_wc"] >= MIN_OVERVIEW_WORDS)
        & df["runtime"].between(MIN_RUNTIME, MAX_RUNTIME, inclusive="both")
        & (df["vote_count"] >= MIN_VOTECOUNT)
        & (df["vote_average"] >= MIN_VOTEAVG)
    )
    # The status filter is optional: it is applied only if the column exists.
    if "status" in df.columns:
        mask &= df["status"].fillna("") == "Released"
    df = df[mask].copy()

    # Keep the first occurrence (in input order) of each original title.
    if "original_title" in df.columns:
        df = df.drop_duplicates(subset=["original_title"])
    return df


def _add_scores(df):
    """Add ``weighted_rating`` and ``final_score`` columns to ``df`` in place.

    ``weighted_rating`` blends each movie's own ``vote_average`` with the mean
    ``vote_average`` of ``df``, weighted by its ``vote_count`` relative to the
    75th-percentile vote count. ``final_score`` adds a small popularity bonus:
    0.05 times the popularity z-score clipped to [-2, 2].

    ``df`` must be non-empty; the caller checks this.
    """
    votes = df["vote_count"]
    rating = df["vote_average"]
    mean_rating = rating.mean()
    vote_floor = int(votes.quantile(0.75))
    df["weighted_rating"] = (
        (votes / (votes + vote_floor)) * rating
        + (vote_floor / (votes + vote_floor)) * mean_rating
    )

    popularity = df["popularity"]
    # The epsilon avoids a division by zero when all popularity values are equal.
    pop_z = (popularity - popularity.mean()) / (popularity.std(ddof=0) + 1e-9)
    # Popularity is coerced to NaN but never filtered on. Treat a missing value
    # as neutral (z = 0) so the row keeps its weighted rating instead of getting
    # a NaN final_score and sorting last.
    pop_z = pop_z.fillna(0.0)
    df["final_score"] = df["weighted_rating"] + 0.05 * np.clip(pop_z, -2, 2)
    return df


def main():
    """Build the top-rated movie sample and write it to ``OUTPUT_CSV``.

    Reads ``INPUT_PATH``, keeps rows passing the module-level thresholds,
    scores them, and saves the ``TARGET_N`` highest-scoring rows (ties broken
    by ``vote_count``, both descending).

    Raises:
        ValueError: if no row satisfies the filters, so nothing can be scored.
    """
    df = _filter_candidates(pd.read_csv(INPUT_PATH))

    # Without this guard the 75th-percentile vote count below is NaN and
    # int(NaN) fails with an opaque "cannot convert float NaN to integer".
    if df.empty:
        raise ValueError(
            f"No rows in {INPUT_PATH} satisfy the selection filters; "
            "nothing to score or save."
        )

    df = _add_scores(df)
    df_top = df.sort_values(
        ["final_score", "vote_count"], ascending=False
    ).head(TARGET_N)

    keep_cols = [
        "title", "original_title", "overview", "genres", "keywords",
        "production_countries", "release_date", "runtime",
        "original_language", "vote_average", "vote_count", "popularity",
        "weighted_rating", "final_score",
    ]
    # Tolerate inputs that lack some of the optional descriptive columns.
    keep_cols = [c for c in keep_cols if c in df_top.columns]
    out = df_top[keep_cols].reset_index(drop=True)

    # Ensure the destination exists if OUTPUT_DIR is changed from ".".
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"[OK] Saved {len(out)} rows -> {OUTPUT_CSV}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
