准备工作：安装必要的库

In [None]:
%pip install pandas 
%pip install scikit-learn 
%pip install nltk



第二步：数据预处理与清洗代码
这一步的目标是将原始文本转换为干净的单词列表

In [3]:
import pandas as pd
import re
import nltk
import ast
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# 下载停用词表
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# 修正后的解析函数
def get_first_genre(genre_str):
    if pd.isna(genre_str):
        return "Unknown"
    try:
        # 将字符串 "['Drama', 'Crime']" 转换为真正的列表 ['Drama', 'Crime']
        genre_list = ast.literal_eval(genre_str)
        if isinstance(genre_list, list) and len(genre_list) > 0:
            return genre_list[0]
        return "Unknown"
    except (ValueError, SyntaxError):
        # 如果解析失败（例如格式不是标准的列表字符串），尝试简单的正则提取
        found = re.findall(r"'\s*([^']+)\s*'", str(genre_str))
        if found:
            return found[0]
        return "Unknown"
    
def clean_text(text):
    if pd.isna(text):
        return ""
    # 1. 移除特殊字符和数字
    text = re.sub(r'[^a-zA-Z]', ' ', str(text))
    # 2. 转换为小写
    text = text.lower()
    # 3. 分词
    words = text.split()
    # 4. 去除停用词并进行词干提取 (Stemming)
    words = [ps.stem(w) for w in words if w not in stop_words]
    return " ".join(words)

# 加载你的 Kaggle 数据集
df = pd.read_csv('movies_info.csv') 

# 应用修正后的函数
df['main_genre'] = df['genres'].apply(get_first_genre)
df['clean_overview'] = df['overview'].apply(clean_text)

print("修正后的数据前几行：")
print(df[['original_title', 'main_genre', 'clean_overview']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


修正后的数据前几行：
             original_title main_genre  \
0  The Shawshank Redemption      Drama   
1             The Godfather      Drama   
2     The Godfather Part II      Drama   
3          Schindler's List      Drama   
4              12 Angry Men      Drama   

                                      clean_overview  
0  imprison doubl murder wife lover upstand banke...  
1  span year chronicl fiction italian american co...  
2  continu saga corleon crime famili young vito c...  
3  true stori businessman oskar schindler save th...  
4  defens prosecut rest juri file juri room decid...  


第三步：模型训练代码
我们将使用 TF-IDF 将文本转为数字，并使用 SVM 进行分类，这是电影类型预测中表现较好的模型

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import joblib

# 1. 特征提取 (TF-IDF)
# 我们将文字转换为计算机能理解的 5000 个重要单词特征
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_overview'])
y = df['main_genre']

# 2. 划分数据集 (80% 训练，20% 测试)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. 训练 SVM 模型
# SVM 在短文本分类（如电影简介）中效果非常好
model = LinearSVC()
model.fit(X_train, y_train)

# 4. 评估模型
y_pred = model.predict(X_test)
print(f"模型准确率: {accuracy_score(y_test, y_pred):.2f}")
print("\n分类详细报告:")
print(classification_report(y_test, y_pred))

# 5. 保存模型 (网站开发时需要用到)
joblib.dump(model, 'movie_genre_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
print("模型已成功保存！")

模型准确率: 0.40

分类详细报告:
                 precision    recall  f1-score   support

         Action       0.40      0.46      0.43       246
      Adventure       0.23      0.21      0.22        80
      Animation       0.32      0.31      0.31       108
         Comedy       0.43      0.57      0.49       392
          Crime       0.17      0.14      0.15        74
          Drama       0.43      0.54      0.48       472
         Family       0.29      0.12      0.17        57
        Fantasy       0.22      0.05      0.08        41
        History       0.00      0.00      0.00        19
         Horror       0.54      0.52      0.53       191
          Music       0.00      0.00      0.00        15
        Mystery       0.29      0.05      0.09        39
        Romance       0.27      0.08      0.12        75
Science Fiction       0.18      0.17      0.18        46
       TV Movie       0.00      0.00      0.00         1
       Thriller       0.19      0.09      0.12       117
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
