# 1. 数据加载和预处理

In [27]:
import re
import pandas as pd

texts = []  # 存储文本内容
labels = []  # 存储标签

with open(r'filtered_cnews.train.txt',  encoding='utf-8') as file:
    for line in file:
        # 去除首尾空白字符（如换行符）
        line = line.strip()
        if line:  # 确保非空行
            # 使用正则表达式提取标签和文本
            match = re.match(r'^(\S+)\s+(.*)', line)
            if match:
                label, text = match.groups()
                labels.append(label)
                texts.append(text)

# 查看数据基本信息
print(f"总样本数: {len(texts)}")
print(f"类别分布: {pd.Series(labels).value_counts()}")

总样本数: 30000
类别分布: 体育    5000
家居    5000
房产    5000
教育    5000
科技    5000
财经    5000
Name: count, dtype: int64


# 2. 数据划分

In [29]:
from sklearn.model_selection import train_test_split

# 划分训练集和临时集 (4000训练, 1000临时)
X_train, X_temp, y_train, y_temp = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels)

# 划分验证集和测试集 (各500)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"训练集大小: {len(X_train)}")
print(f"验证集大小: {len(X_val)}")
print(f"测试集大小: {len(X_test)}")

训练集大小: 24000
验证集大小: 3000
测试集大小: 3000


# 3. 中文分词和特征提取

In [31]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

def chinese_tokenizer(text):
    return list(jieba.cut(text))
# 使用TF-IDF提取特征
vectorizer = TfidfVectorizer(
    tokenizer=chinese_tokenizer,
    max_features=5000,  # 限制特征数量
    ngram_range=(1, 2)  # 包含unigram和bigram
    token_pattern=None
)
print("\n正在提取特征...")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)
print(f"特征维度: {X_train_tfidf.shape[1]}")


正在提取特征...




特征维度: 5000


# 4. 模型训练和评估（函数）

In [33]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def train_and_evaluate(model, model_name, X_train, y_train, X_val, y_val):
    print(f"\n正在训练 {model_name}...")
    model.fit(X_train, y_train)
    
    # 在验证集上评估
    y_pred = model.predict(X_val)
    
    # 计算指标
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred, output_dict=True)
    
    # 微平均和宏平均
    micro_avg = report['micro avg']
    macro_avg = report['macro avg']
    
    print(f"\n{model_name} 验证集准确率: {accuracy:.4f}")
    print("\n分类报告:")
    print(classification_report(y_val, y_pred))
    
    # 混淆矩阵
    cm = confusion_matrix(y_val, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=np.unique(labels), 
                yticklabels=np.unique(labels))
    plt.title(f'{model_name} 混淆矩阵')
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.show()
    
    return {
        'model': model,
        'name': model_name,
        'accuracy': accuracy,
        'micro_avg': micro_avg,
        'macro_avg': macro_avg,
        'confusion_matrix': cm
    }

## 4.1 朴素贝叶斯

In [35]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_results = train_and_evaluate(nb_model, "朴素贝叶斯", 
                               X_train_tfidf, y_train, 
                               X_val_tfidf, y_val)


正在训练 朴素贝叶斯...


KeyError: 'micro avg'

## 4.2 KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_results = train_and_evaluate(knn_model, "KNN", 
                                X_train_tfidf, y_train, 
                                X_val_tfidf, y_val)

## 4.3 GBDT

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbdt_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
gbdt_results = train_and_evaluate(gbdt_model, "GBDT", 
                                 X_train_tfidf, y_train, 
                                 X_val_tfidf, y_val)

# 5. 模型比较和选择

In [None]:
results = [nb_results, knn_results, gbdt_results]

print("\n模型性能比较:")
comparison = pd.DataFrame({
    '模型': [r['name'] for r in results],
    '准确率': [r['accuracy'] for r in results],
    '微平均F1': [r['micro_avg']['f1-score'] for r in results],
    '宏平均F1': [r['macro_avg']['f1-score'] for r in results]
})
print(comparison)

# 选择最佳模型
best_model = gbdt_results['model']

# 6. 测试集评估最佳模型

In [None]:
# print("\n在测试集上评估最佳模型 (GBDT)...")
# y_test_pred = best_model.predict(X_test_tfidf)

# print("\n测试集分类报告:")
# print(classification_report(y_test, y_test_pred))

# # 测试集混淆矩阵
# cm_test = confusion_matrix(y_test, y_test_pred)
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues', 
#             xticklabels=np.unique(labels), 
#             yticklabels=np.unique(labels))
# plt.title('GBDT 测试集混淆矩阵')
# plt.xlabel('预测标签')
# plt.ylabel('真实标签')
# plt.show()