# 🎬 高票房预测模型（使用 SVM / KNN / Voting 等）
基于 `train_movies.csv`, `validation_movies.csv`, `test_movies.csv`

In [1]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
)
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
)
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import warnings
warnings.filterwarnings("ignore")

# 读取数据
train_df = pd.read_csv("train_movies.csv")
valid_df = pd.read_csv("validation_movies.csv")
test_df = pd.read_csv("test_movies.csv")

# 合并用于一致预处理
full_df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

# 转换数据类型
full_df['budget'] = pd.to_numeric(full_df['budget'], errors='coerce')
full_df['revenue'] = pd.to_numeric(full_df['revenue'], errors='coerce')
full_df['runtime'] = pd.to_numeric(full_df['runtime'], errors='coerce')
full_df['popularity'] = pd.to_numeric(full_df['popularity'], errors='coerce')
full_df['vote_average'] = pd.to_numeric(full_df['vote_average'], errors='coerce')
full_df['vote_count'] = pd.to_numeric(full_df['vote_count'], errors='coerce')

# 提取年份作为特征
full_df['release_date'] = pd.to_datetime(full_df['release_date'], errors='coerce')
full_df['release_year'] = full_df['release_date'].dt.year

# 构造目标变量（是否为高票房）
median_revenue = full_df['revenue'].median()
full_df['target'] = (full_df['revenue'] > median_revenue).astype(int)

# 选择特征
features = ['budget', 'runtime', 'popularity', 'vote_average', 'vote_count', 'release_year']
full_df = full_df[features + ['target']].dropna()

# 拆分为 train / valid / test（按原始大小）
train_len = len(train_df)
valid_len = len(valid_df)
test_len = len(test_df)

X_all = full_df[features]
y_all = full_df['target']

X_train = X_all[:train_len]
y_train = y_all[:train_len]
X_valid = X_all[train_len:train_len+valid_len]
y_valid = y_all[train_len:train_len+valid_len]
X_test = X_all[train_len+valid_len:]
y_test = y_all[train_len+valid_len:]

# 标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)


In [3]:

models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    "LightGBM": LGBMClassifier(verbose=-1),
    "CatBoost": CatBoostClassifier(verbose=0),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "NaiveBayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=300)
}


In [4]:

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test_scaled)[:,1]
    else:
        y_prob = y_pred  # fallback

    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob)
    }

# 输出结果
result_df = pd.DataFrame(results).T
display(result_df)


Unnamed: 0,accuracy,precision,recall,f1,roc_auc
RandomForest,0.91561,0.813358,0.624298,0.706397,0.926834
GradientBoosting,0.917437,0.819508,0.63132,0.713209,0.932496
XGBoost,0.915839,0.803177,0.639045,0.711772,0.931844
LightGBM,0.919379,0.820536,0.645365,0.722484,0.933692
CatBoost,0.918922,0.820467,0.641854,0.720252,0.933355
SVM,0.913897,0.863341,0.558989,0.678602,0.885364
KNN,0.907959,0.785055,0.597612,0.678628,0.871162
AdaBoost,0.913555,0.826002,0.593399,0.690642,0.926137
NaiveBayes,0.908987,0.798856,0.588483,0.677719,0.894999
MLP,0.919036,0.817214,0.64677,0.72207,0.930601


In [5]:

# 使用 VotingClassifier 融合多个模型
voting_clf = VotingClassifier(
    estimators=[
        ("rf", models["RandomForest"]),
        ("xgb", models["XGBoost"]),
        ("svm", models["SVM"]),
        ("mlp", models["MLP"]),
        ("knn", models["KNN"]),
    ],
    voting="soft"
)

voting_clf.fit(X_train_scaled, y_train)
y_pred_vote = voting_clf.predict(X_test_scaled)

print("VotingClassifier Accuracy:", accuracy_score(y_test, y_pred_vote))


VotingClassifier Accuracy: 0.9184652278177458
