In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import logging
from sklearn.preprocessing import StandardScaler
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer

# 设置日志
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# 下载必要的NLTK数据
nltk.download('stopwords')
nltk.download('wordnet')

# 定义自定义停用词
general_stopwords = {
    'one', 'would', 'could', 'also', 'must', 'wa', 'ha', 'see',
    'think', 'make', 'made', 'like', 'said', 'new', 'use',
    'may', 'might', 'shall', 'get', 'got', 'doe', 'do', 'did',
    'say', 'way', 'day', 'time', 'year', 'week',
    'take', 'give', 'call', 'look', 'come', 'go',
    'many', 'much', 'several', 'other', 'another',
    'even', 'still', 'back', 'well', 'way', 'thing',
    'seem', 'rather', 'sure', 'likely', 'unlikely',
    'although', 'though', 'per','product','including','number',
    'include','using','item','standard','rule','clearly','used',
    'issue','public'
}

pronouns = {
    'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'this', 'that', 'these', 'those',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself'
}

complaint_common = {
    'complaint', 'complaints', 'complainant', 'complainants',
    'advertisement', 'advertisements', 'advertising', 'advertiser', 'advertisers',
    'ad', 'ads', 'advert', 'adverts',
    'concerned', 'concerning', 'concerns',
    'believe', 'believes', 'believed',
    'claim', 'claims', 'claimed',
    'content', 'contents',
    'page', 'pages',
    'show', 'shows', 'shown',
    'view', 'views', 'viewed'
}

class SVMTextClassifier:
    def __init__(self):
        # 合并所有停用词
        self.custom_stopwords = general_stopwords.union(
            pronouns,
            complaint_common
        )
        
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95,
            tokenizer=self.tokenize_and_preprocess
        )
        
        self.scaler = StandardScaler(with_mean=False)
        self.model = SVC(
            kernel='rbf',
            C=1.0,
            probability=True,
            random_state=42
        )

    def clean_text(self, text):
        """基础文本清理"""
        if pd.isna(text):
            return ""
        # 转换为小写
        text = text.lower()
        # 移除特殊字符和多余的空格
        text = re.sub(r'\|', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'http\S+|www.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        # 移除标点符号
        text = text.translate(str.maketrans('', '', string.punctuation))
        return text.strip()

    def lemmatize_text(self, text):
        """词性还原"""
        if pd.isna(text):
            return ""
        lemmatizer = WordNetLemmatizer()
        words = text.split()
        return ' '.join([lemmatizer.lemmatize(word) for word in words])

    def remove_stopwords(self, text):
        """移除停用词"""
        if pd.isna(text):
            return ""
        # 合并NLTK停用词和自定义停用词
        stop_words = set(stopwords.words('english')).union(self.custom_stopwords)
        words = text.split()
        return ' '.join([word for word in words if word.lower() not in stop_words])

    def tokenize_and_preprocess(self, text):
        """文本预处理和分词"""
        text = self.clean_text(text)
        text = self.lemmatize_text(text)
        text = self.remove_stopwords(text)
        return text.split()

    def prepare_features(self, texts, is_training=True):
        """将文本转换为TF-IDF特征"""
        if is_training:
            features = self.vectorizer.fit_transform(texts)
            features = self.scaler.fit_transform(features)
        else:
            features = self.vectorizer.transform(texts)
            features = self.scaler.transform(features)
        return features

    def train_fold(self, X_train, y_train, X_val, y_val, fold_idx):
        """训练单个fold"""
        logger.info(f'Training fold {fold_idx + 1}')
        
        # 训练模型
        self.model.fit(X_train, y_train)
        
        # 评估训练集性能
        train_preds = self.model.predict(X_train)
        train_accuracy = accuracy_score(y_train, train_preds)
        
        # 评估验证集性能
        val_preds = self.model.predict(X_val)
        val_accuracy = accuracy_score(y_val, val_preds)
        val_report = classification_report(y_val, val_preds)
        
        logger.info(f'Train Accuracy: {train_accuracy:.4f}')
        logger.info(f'Validation Accuracy: {val_accuracy:.4f}')
        logger.info(f'Validation Classification Report:\n{val_report}')
        
        return val_accuracy

    def train_kfold(self, csv_path, n_splits=5):
        """执行K折交叉验证训练"""
        # 读取数据
        df = pd.read_csv(csv_path)
        texts = df['text'].values
        labels = df['target'].values
        
        # 检查并打印标签分布
        unique_labels = np.unique(labels)
        logger.info(f"数据集中的标签类别: {unique_labels}")
        logger.info("标签分布:")
        for label in unique_labels:
            count = np.sum(labels == label)
            logger.info(f"标签 {label}: {count} 个样本 ({count/len(labels)*100:.2f}%)")
        
        # 初始化K折交叉验证
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        fold_scores = []
        
        # 对每个fold进行训练
        for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(texts)):
            logger.info(f'\nTraining Fold {fold_idx + 1}/{n_splits}')
            
            # 准备当前fold的数据
            train_texts, val_texts = texts[train_idx], texts[val_idx]
            train_labels, val_labels = labels[train_idx], labels[val_idx]
            
            # 特征提取
            logger.info("Preparing training features...")
            X_train = self.prepare_features(train_texts, is_training=True)
            logger.info("Preparing validation features...")
            X_val = self.prepare_features(val_texts, is_training=False)
            
            # 训练当前fold
            val_accuracy = self.train_fold(
                X_train, 
                train_labels,
                X_val,
                val_labels,
                fold_idx
            )
            
            fold_scores.append(val_accuracy)
            logger.info(f'Fold {fold_idx + 1} Accuracy: {val_accuracy:.4f}')
            
        # 输出总体结果
        mean_score = np.mean(fold_scores)
        std_score = np.std(fold_scores)
        logger.info(f'\nK-Fold Cross Validation Results:')
        logger.info(f'Mean Accuracy: {mean_score:.4f} (+/- {std_score:.4f})')
        logger.info(f'Individual Fold Scores: {fold_scores}')
        
        return mean_score, std_score, fold_scores

def main():
    # 设置随机种子
    np.random.seed(42)
    
    # 初始化分类器
    classifier = SVMTextClassifier()
    
    # 开始5折交叉验证训练
    mean_score, std_score, fold_scores = classifier.train_kfold(
        csv_path='/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/combination.csv',
        n_splits=5
    )

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niwenyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/niwenyu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO:__main__:数据集中的标签类别: [0 1 2 3 4 5]
INFO:__main__:标签分布:
INFO:__main__:标签 0: 289 个样本 (14.49%)
INFO:__main__:标签 1: 256 个样本 (12.84%)
INFO:__main__:标签 2: 641 个样本 (32.15%)
INFO:__main__:标签 3: 178 个样本 (8.93%)
INFO:__main__:标签 4: 94 个样本 (4.71%)
INFO:__main__:标签 5: 536 个样本 (26.88%)
INFO:__main__:
Training Fold 1/5
INFO:__main__:Preparing training features...
INFO:__main__:Preparing validation features...
INFO:__main__:Training fold 1
INFO:__main__:Train Accuracy: 0.9567
INFO:__main__:Validation Accuracy: 0.4962
INFO:__main__:Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.21      0.35        52
           1       1.00      0.18      0.31        55
         

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import logging
from sklearn.preprocessing import StandardScaler
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer

# 设置日志
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# 下载必要的NLTK数据
nltk.download('stopwords')
nltk.download('wordnet')

# 定义自定义停用词
general_stopwords = {
    'one', 'would', 'could', 'also', 'must', 'wa', 'ha', 'see',
    'think', 'make', 'made', 'like', 'said', 'new', 'use',
    'may', 'might', 'shall', 'get', 'got', 'doe', 'do', 'did',
    'say', 'way', 'day', 'time', 'year', 'week',
    'take', 'give', 'call', 'look', 'come', 'go',
    'many', 'much', 'several', 'other', 'another',
    'even', 'still', 'back', 'well', 'way', 'thing',
    'seem', 'rather', 'sure', 'likely', 'unlikely',
    'although', 'though', 'per','product','including','number',
    'include','using','item','standard','rule','clearly','used',
    'issue','public'
}

pronouns = {
    'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'this', 'that', 'these', 'those',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself'
}

complaint_common = {
    'complaint', 'complaints', 'complainant', 'complainants',
    'advertisement', 'advertisements', 'advertising', 'advertiser', 'advertisers',
    'ad', 'ads', 'advert', 'adverts',
    'concerned', 'concerning', 'concerns',
    'believe', 'believes', 'believed',
    'claim', 'claims', 'claimed',
    'content', 'contents',
    'page', 'pages',
    'show', 'shows', 'shown',
    'view', 'views', 'viewed'
}

class SVMTextClassifier:
    def __init__(self):
        # 合并所有停用词
        self.custom_stopwords = general_stopwords.union(
            pronouns,
            complaint_common
        )
        
        self.vectorizer = TfidfVectorizer(
            max_features=3000,  # 减少特征数量
            ngram_range=(1, 1), # 只使用单个词
            min_df=3,          # 增加最小文档频率
            max_df=0.9,        # 降低最大文档频率
            tokenizer=self.tokenize_and_preprocess,
            norm='l2',         # 使用L2归一化
            sublinear_tf=True  # 使用次线性TF缩放
        )
        
        self.scaler = StandardScaler(with_mean=False)
        self.model = SVC(
            kernel='rbf',
            C=0.1,           # 降低C值，增加正则化强度
            gamma='scale',    # 使用自动缩放的gamma
            probability=True,
            class_weight='balanced', # 处理类别不平衡
            random_state=42
        )

    def clean_text(self, text):
        """基础文本清理"""
        if pd.isna(text):
            return ""
        # 转换为小写
        text = text.lower()
        # 移除特殊字符和多余的空格
        text = re.sub(r'\|', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'http\S+|www.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        # 移除标点符号
        text = text.translate(str.maketrans('', '', string.punctuation))
        return text.strip()

    def lemmatize_text(self, text):
        """词性还原"""
        if pd.isna(text):
            return ""
        lemmatizer = WordNetLemmatizer()
        words = text.split()
        return ' '.join([lemmatizer.lemmatize(word) for word in words])

    def remove_stopwords(self, text):
        """移除停用词"""
        if pd.isna(text):
            return ""
        # 合并NLTK停用词和自定义停用词
        stop_words = set(stopwords.words('english')).union(self.custom_stopwords)
        words = text.split()
        return ' '.join([word for word in words if word.lower() not in stop_words])

    def tokenize_and_preprocess(self, text):
        """文本预处理和分词"""
        text = self.clean_text(text)
        text = self.lemmatize_text(text)
        text = self.remove_stopwords(text)
        return text.split()

    def prepare_features(self, texts, is_training=True):
        """将文本转换为TF-IDF特征"""
        if is_training:
            features = self.vectorizer.fit_transform(texts)
            features = self.scaler.fit_transform(features)
        else:
            features = self.vectorizer.transform(texts)
            features = self.scaler.transform(features)
        return features

    def save_model(self, save_dir, fold_idx):
        """保存模型和特征提取器"""
        import os
        import joblib
        
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
            
        # 保存SVM模型
        model_path = os.path.join(save_dir, f'svm_model_fold_{fold_idx}.joblib')
        joblib.dump(self.model, model_path)
        
        # 保存TF-IDF vectorizer
        vectorizer_path = os.path.join(save_dir, f'tfidf_vectorizer_fold_{fold_idx}.joblib')
        joblib.dump(self.vectorizer, vectorizer_path)
        
        # 保存scaler
        scaler_path = os.path.join(save_dir, f'scaler_fold_{fold_idx}.joblib')
        joblib.dump(self.scaler, scaler_path)
        
        logger.info(f'Model and preprocessing objects saved in {save_dir}')

    def load_model(self, save_dir, fold_idx):
        """加载模型和特征提取器"""
        import joblib
        
        # 加载SVM模型
        model_path = os.path.join(save_dir, f'svm_model_fold_{fold_idx}.joblib')
        self.model = joblib.load(model_path)
        
        # 加载TF-IDF vectorizer
        vectorizer_path = os.path.join(save_dir, f'tfidf_vectorizer_fold_{fold_idx}.joblib')
        self.vectorizer = joblib.load(vectorizer_path)
        
        # 加载scaler
        scaler_path = os.path.join(save_dir, f'scaler_fold_{fold_idx}.joblib')
        self.scaler = joblib.load(scaler_path)
        
        logger.info(f'Model and preprocessing objects loaded from {save_dir}')
        
    def train_fold(self, X_train, y_train, X_val, y_val, fold_idx, save_dir):
        """训练单个fold"""
        logger.info(f'Training fold {fold_idx + 1}')
        
        # 训练模型
        self.model.fit(X_train, y_train)
        
        # 评估训练集性能
        train_preds = self.model.predict(X_train)
        train_accuracy = accuracy_score(y_train, train_preds)
        
        # 评估验证集性能
        val_preds = self.model.predict(X_val)
        val_accuracy = accuracy_score(y_val, val_preds)
        val_report = classification_report(y_val, val_preds)
        
        logger.info(f'Train Accuracy: {train_accuracy:.4f}')
        logger.info(f'Validation Accuracy: {val_accuracy:.4f}')
        logger.info(f'Validation Classification Report:\n{val_report}')
        
        # 保存当前fold的模型
        self.save_model(save_dir, fold_idx)
        logger.info(f'Model for fold {fold_idx + 1} saved.')
        
        return val_accuracy

    def train_kfold(self, csv_path, n_splits=5, save_dir='model_saves'):
        """执行K折交叉验证训练"""
        # 读取数据
        df = pd.read_csv(csv_path)
        texts = df['text'].values
        labels = df['target'].values
        
        # 检查并打印标签分布
        unique_labels = np.unique(labels)
        logger.info(f"数据集中的标签类别: {unique_labels}")
        logger.info("标签分布:")
        for label in unique_labels:
            count = np.sum(labels == label)
            logger.info(f"标签 {label}: {count} 个样本 ({count/len(labels)*100:.2f}%)")
        
        # 初始化K折交叉验证
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        fold_scores = []
        
        # 对每个fold进行训练
        for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(texts)):
            logger.info(f'\nTraining Fold {fold_idx + 1}/{n_splits}')
            
            # 准备当前fold的数据
            train_texts, val_texts = texts[train_idx], texts[val_idx]
            train_labels, val_labels = labels[train_idx], labels[val_idx]
            
            # 特征提取
            logger.info("Preparing training features...")
            X_train = self.prepare_features(train_texts, is_training=True)
            logger.info("Preparing validation features...")
            X_val = self.prepare_features(val_texts, is_training=False)
            
            # 训练当前fold
            val_accuracy = self.train_fold(
                X_train, 
                train_labels,
                X_val,
                val_labels,
                fold_idx,
                save_dir
            )
            
            fold_scores.append(val_accuracy)
            logger.info(f'Fold {fold_idx + 1} Accuracy: {val_accuracy:.4f}')
            
        # 输出总体结果
        mean_score = np.mean(fold_scores)
        std_score = np.std(fold_scores)
        logger.info(f'\nK-Fold Cross Validation Results:')
        logger.info(f'Mean Accuracy: {mean_score:.4f} (+/- {std_score:.4f})')
        logger.info(f'Individual Fold Scores: {fold_scores}')
        
        return mean_score, std_score, fold_scores

def main():
    # 设置随机种子
    np.random.seed(42)
    
    # 初始化分类器
    classifier = SVMTextClassifier()
    
    # 开始5折交叉验证训练
    mean_score, std_score, fold_scores = classifier.train_kfold(
        csv_path='combination.csv',
        n_splits=5
    )

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niwenyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/niwenyu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO:__main__:数据集中的标签类别: [0 1 2 3 4 5]
INFO:__main__:标签分布:
INFO:__main__:标签 0: 289 个样本 (14.49%)
INFO:__main__:标签 1: 256 个样本 (12.84%)
INFO:__main__:标签 2: 641 个样本 (32.15%)
INFO:__main__:标签 3: 178 个样本 (8.93%)
INFO:__main__:标签 4: 94 个样本 (4.71%)
INFO:__main__:标签 5: 536 个样本 (26.88%)
INFO:__main__:
Training Fold 1/5
INFO:__main__:Preparing training features...
INFO:__main__:Preparing validation features...
INFO:__main__:Training fold 1
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
INFO:__main__:Train Accuracy: 0.3323
INFO:__main__:Validation Accuracy: 0.2281
I

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import logging
from sklearn.preprocessing import StandardScaler
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import joblib

# 设置日志
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# 下载必要的NLTK数据
nltk.download('stopwords')
nltk.download('wordnet')

class TextClassifier:
    def __init__(self):
        # TF-IDF特征提取器
        self.vectorizer = TfidfVectorizer(
            max_features=1000,        # 减少特征数量
            min_df=2,                 # 最小文档频率
            max_df=0.85,              # 最大文档频率
            ngram_range=(1, 2),       # 使用unigram和bigram
            sublinear_tf=True,        # 使用次线性TF缩放
            stop_words='english'      # 使用英语停用词
        )
        
        # 随机森林分类器
        self.model = RandomForestClassifier(
            n_estimators=200,         # 增加树的数量
            max_depth=20,             # 限制树的深度
            min_samples_split=5,      # 分裂所需的最小样本数
            min_samples_leaf=2,       # 叶节点所需的最小样本数
            max_features='sqrt',      # 特征选择方式
            class_weight='balanced',  # 处理类别不平衡
            n_jobs=-1,               # 使用所有CPU
            random_state=42
        )
        
    def clean_text(self, text):
        """文本预处理"""
        if pd.isna(text):
            return ""
        
        # 转换为小写
        text = text.lower()
        
        # 移除特殊字符和标点符号
        text = re.sub(r'[^\w\s]', ' ', text)
        
        # 移除数字
        text = re.sub(r'\d+', '', text)
        
        # 移除多余空格
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text

    def prepare_features(self, texts, is_training=True):
        """特征提取"""
        # 文本清理
        cleaned_texts = [self.clean_text(text) for text in texts]
        
        # TF-IDF特征提取
        if is_training:
            features = self.vectorizer.fit_transform(cleaned_texts)
        else:
            features = self.vectorizer.transform(cleaned_texts)
            
        return features

    def save_model(self, save_dir, fold_idx):
        """保存模型"""
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        
        # 保存模型和vectorizer
        model_path = os.path.join(save_dir, f'model_fold_{fold_idx}.joblib')
        vectorizer_path = os.path.join(save_dir, f'vectorizer_fold_{fold_idx}.joblib')
        
        joblib.dump(self.model, model_path)
        joblib.dump(self.vectorizer, vectorizer_path)
        
        logger.info(f'Model and vectorizer saved in {save_dir}')
        
    def load_model(self, save_dir, fold_idx):
        """加载模型"""
        model_path = os.path.join(save_dir, f'model_fold_{fold_idx}.joblib')
        vectorizer_path = os.path.join(save_dir, f'vectorizer_fold_{fold_idx}.joblib')
        
        self.model = joblib.load(model_path)
        self.vectorizer = joblib.load(vectorizer_path)
        
        logger.info(f'Model and vectorizer loaded from {save_dir}')

    def train_fold(self, X_train, y_train, X_val, y_val, fold_idx, save_dir):
        """训练单个fold"""
        logger.info(f'Training fold {fold_idx + 1}')
        
        # 训练模型
        self.model.fit(X_train, y_train)
        
        # 评估训练集性能
        train_preds = self.model.predict(X_train)
        train_accuracy = accuracy_score(y_train, train_preds)
        train_report = classification_report(y_train, train_preds)
        
        # 评估验证集性能
        val_preds = self.model.predict(X_val)
        val_accuracy = accuracy_score(y_val, val_preds)
        val_report = classification_report(y_val, val_preds)
        
        # 输出特征重要性
        feature_names = self.vectorizer.get_feature_names_out()
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': self.model.feature_importances_
        })
        feature_importance = feature_importance.sort_values('importance', ascending=False)
        
        logger.info(f'Train Accuracy: {train_accuracy:.4f}')
        logger.info(f'Train Classification Report:\n{train_report}')
        logger.info(f'Validation Accuracy: {val_accuracy:.4f}')
        logger.info(f'Validation Classification Report:\n{val_report}')
        logger.info('\nTop 10 most important features:')
        logger.info(feature_importance.head(10))
        
        # 保存模型
        self.save_model(save_dir, fold_idx)
        return val_accuracy

    def train_kfold(self, csv_path, n_splits=5, save_dir='model_saves'):
        """K折交叉验证训练"""
        # 读取数据
        df = pd.read_csv(csv_path)
        texts = df['text'].values
        labels = df['target'].values
        
        # 打印标签分布
        unique_labels = np.unique(labels)
        logger.info(f"数据集中的标签类别: {unique_labels}")
        logger.info("标签分布:")
        for label in unique_labels:
            count = np.sum(labels == label)
            logger.info(f"标签 {label}: {count} 个样本 ({count/len(labels)*100:.2f}%)")
            
        # K折交叉验证
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        fold_scores = []
        
        for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(texts)):
            logger.info(f'\nTraining Fold {fold_idx + 1}/{n_splits}')
            
            # 准备数据
            train_texts, val_texts = texts[train_idx], texts[val_idx]
            train_labels, val_labels = labels[train_idx], labels[val_idx]
            
            # 特征提取
            logger.info("Preparing training features...")
            X_train = self.prepare_features(train_texts, is_training=True)
            logger.info("Preparing validation features...")
            X_val = self.prepare_features(val_texts, is_training=False)
            
            # 训练和评估
            val_accuracy = self.train_fold(
                X_train, train_labels,
                X_val, val_labels,
                fold_idx, save_dir
            )
            
            fold_scores.append(val_accuracy)
            
        # 输出总体结果
        logger.info('\nK-Fold Cross Validation Results:')
        logger.info(f'Mean Accuracy: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})')
        logger.info(f'Individual Fold Scores: {fold_scores}')

def main():
    # 设置随机种子
    np.random.seed(42)
    
    # 初始化分类器
    classifier = TextClassifier()
    
    # 训练模型
    classifier.train_kfold(
        csv_path='combination.csv',
        n_splits=5,
        save_dir='model_saves'
    )

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niwenyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/niwenyu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO:__main__:数据集中的标签类别: [0 1 2 3 4 5]
INFO:__main__:标签分布:
INFO:__main__:标签 0: 289 个样本 (14.49%)
INFO:__main__:标签 1: 256 个样本 (12.84%)
INFO:__main__:标签 2: 641 个样本 (32.15%)
INFO:__main__:标签 3: 178 个样本 (8.93%)
INFO:__main__:标签 4: 94 个样本 (4.71%)
INFO:__main__:标签 5: 536 个样本 (26.88%)
INFO:__main__:
Training Fold 1/5
INFO:__main__:Preparing training features...
INFO:__main__:Preparing validation features...
INFO:__main__:Training fold 1
INFO:__main__:Train Accuracy: 0.7066
INFO:__main__:Train Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.56      0.70       237
           1       0.90      0.58      0.71       201
           2       0.74      0.67      0.70       530
  

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import logging
from sklearn.preprocessing import StandardScaler
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import joblib
import lightgbm as lgb
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# 设置日志
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class EnsembleTextClassifier:
    def __init__(self):
        # TF-IDF特征提取器配置
        self.vectorizer = TfidfVectorizer(
            max_features=2000,
            ngram_range=(1, 3),       # 使用1-3gram特征
            min_df=3,                 # 最小文档频率
            max_df=0.8,               # 最大文档频率
            sublinear_tf=True,        # 使用次线性TF缩放
            stop_words='english'      # 使用英语停用词
        )
        
        # 随机森林分类器配置
        self.rf = RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            class_weight='balanced',
            n_jobs=-1,
            random_state=42
        )
        
        # LightGBM分类器配置
        self.lgb = lgb.LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            num_leaves=31,
            class_weight='balanced',
            random_state=42
        )
        
        # XGBoost分类器配置
        self.xgb = XGBClassifier(
            n_estimators=200,
            max_depth=7,
            learning_rate=0.1,
            random_state=42
        )
        
        # 集成分类器
        self.model = VotingClassifier(
            estimators=[
                ('rf', self.rf),
                ('lgb', self.lgb),
                ('xgb', self.xgb)
            ],
            voting='soft'
        )
        
        # SMOTE过采样
        self.smote = SMOTE(random_state=42)

    def clean_text(self, text):
        """文本清理"""
        if pd.isna(text):
            return ""
        
        # 转换为小写
        text = text.lower()
        
        # 移除特殊字符和标点
        text = re.sub(r'[^\w\s]', ' ', text)
        
        # 移除数字
        text = re.sub(r'\d+', '', text)
        
        # 移除多余空格
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text

    def prepare_features(self, texts, labels=None, is_training=True):
        """特征提取和处理"""
        # 文本清理
        cleaned_texts = [self.clean_text(text) for text in texts]
        
        # TF-IDF特征提取
        if is_training:
            features = self.vectorizer.fit_transform(cleaned_texts)
            if labels is not None:
                # 使用SMOTE平衡数据
                features, labels = self.smote.fit_resample(features, labels)
        else:
            features = self.vectorizer.transform(cleaned_texts)
            
        return features, labels if labels is not None else None

    def optimize_parameters(self, X_train, y_train):
        """使用网格搜索优化参数"""
        # 随机森林参数网格
        rf_param_grid = {
            'rf__n_estimators': [100, 200, 300],
            'rf__max_depth': [10, 20, 30],
            'rf__min_samples_split': [2, 5, 10]
        }
        
        grid_search = GridSearchCV(
            estimator=self.model,
            param_grid=rf_param_grid,
            cv=3,
            n_jobs=-1,
            verbose=1
        )
        
        grid_search.fit(X_train, y_train)
        logger.info(f"Best parameters: {grid_search.best_params_}")
        logger.info(f"Best score: {grid_search.best_score_:.4f}")
        
        return grid_search.best_estimator_

    def get_feature_importance(self, X_train, y_train):
        """获取特征重要性"""
        # 使用随机森林获取特征重要性
        self.rf.fit(X_train, y_train)
        feature_names = self.vectorizer.get_feature_names_out()
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': self.rf.feature_importances_
        })
        return feature_importance.sort_values('importance', ascending=False)

    def train_fold(self, X_train, y_train, X_val, y_val, fold_idx, save_dir):
        """训练单个fold"""
        logger.info(f'Training fold {fold_idx + 1}')
        
        # 参数优化
        logger.info("Optimizing parameters...")
        self.model = self.optimize_parameters(X_train, y_train)
        
        # 获取特征重要性
        feature_importance = self.get_feature_importance(X_train, y_train)
        
        # 评估训练集性能
        train_preds = self.model.predict(X_train)
        train_accuracy = accuracy_score(y_train, train_preds)
        train_report = classification_report(y_train, train_preds)
        
        # 评估验证集性能
        val_preds = self.model.predict(X_val)
        val_accuracy = accuracy_score(y_val, val_preds)
        val_report = classification_report(y_val, val_preds)
        
        # 输出结果
        logger.info(f'Train Accuracy: {train_accuracy:.4f}')
        logger.info(f'Train Classification Report:\n{train_report}')
        logger.info(f'Validation Accuracy: {val_accuracy:.4f}')
        logger.info(f'Validation Classification Report:\n{val_report}')
        logger.info('\nTop 10 most important features:')
        logger.info(feature_importance.head(10))
        
        # 保存模型
        self.save_model(save_dir, fold_idx)
        
        return val_accuracy

    def save_model(self, save_dir, fold_idx):
        """保存模型"""
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
            
        model_path = os.path.join(save_dir, f'model_fold_{fold_idx}.joblib')
        vectorizer_path = os.path.join(save_dir, f'vectorizer_fold_{fold_idx}.joblib')
        
        joblib.dump(self.model, model_path)
        joblib.dump(self.vectorizer, vectorizer_path)
        
        logger.info(f'Model and vectorizer saved in {save_dir}')

    def load_model(self, save_dir, fold_idx):
        """加载模型"""
        model_path = os.path.join(save_dir, f'model_fold_{fold_idx}.joblib')
        vectorizer_path = os.path.join(save_dir, f'vectorizer_fold_{fold_idx}.joblib')
        
        self.model = joblib.load(model_path)
        self.vectorizer = joblib.load(vectorizer_path)
        
        logger.info(f'Model and vectorizer loaded from {save_dir}')

    def train_kfold(self, csv_path, n_splits=5, save_dir='model_saves'):
        """K折交叉验证训练"""
        # 读取数据
        df = pd.read_csv(csv_path)
        texts = df['text'].values
        labels = df['target'].values
        
        # 打印标签分布
        unique_labels = np.unique(labels)
        logger.info(f"数据集中的标签类别: {unique_labels}")
        logger.info("标签分布:")
        for label in unique_labels:
            count = np.sum(labels == label)
            logger.info(f"标签 {label}: {count} 个样本 ({count/len(labels)*100:.2f}%)")
        
        # K折交叉验证
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        fold_scores = []
        
        for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(texts)):
            logger.info(f'\nTraining Fold {fold_idx + 1}/{n_splits}')
            
            # 准备数据
            train_texts, val_texts = texts[train_idx], texts[val_idx]
            train_labels, val_labels = labels[train_idx], labels[val_idx]
            
            # 特征提取和SMOTE平衡
            logger.info("Preparing training features...")
            X_train, y_train = self.prepare_features(train_texts, train_labels, is_training=True)
            logger.info("Preparing validation features...")
            X_val, _ = self.prepare_features(val_texts, is_training=False)
            
            # 训练和评估
            val_accuracy = self.train_fold(
                X_train, y_train,
                X_val, val_labels,
                fold_idx, save_dir
            )
            
            fold_scores.append(val_accuracy)
        
        # 输出总体结果
        logger.info('\nK-Fold Cross Validation Results:')
        logger.info(f'Mean Accuracy: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})')
        logger.info(f'Individual Fold Scores: {fold_scores}')

def main():
    # 设置随机种子
    np.random.seed(42)
    
    # 初始化分类器
    classifier = EnsembleTextClassifier()
    
    # 训练模型
    classifier.train_kfold(
        csv_path='combination.csv',
        n_splits=5,
        save_dir='model_saves'
    )

if __name__ == "__main__":
    main()

INFO:__main__:数据集中的标签类别: [0 1 2 3 4 5]
INFO:__main__:标签分布:
INFO:__main__:标签 0: 289 个样本 (14.49%)
INFO:__main__:标签 1: 256 个样本 (12.84%)
INFO:__main__:标签 2: 641 个样本 (32.15%)
INFO:__main__:标签 3: 178 个样本 (8.93%)
INFO:__main__:标签 4: 94 个样本 (4.71%)
INFO:__main__:标签 5: 536 个样本 (26.88%)
INFO:__main__:
Training Fold 1/5
INFO:__main__:Preparing training features...
INFO:__main__:Preparing validation features...
INFO:__main__:Training fold 1
INFO:__main__:Optimizing parameters...


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.210472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6351
[LightGBM] [Info] Number of data points in the train set: 2120, number of used features: 410
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.240221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7417
[L

INFO:__main__:Best parameters: {'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
INFO:__main__:Best score: 0.7195
INFO:__main__:Train Accuracy: 0.9962
INFO:__main__:Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       530
           1       1.00      1.00      1.00       530
           2       1.00      0.99      0.99       530
           3       1.00      1.00      1.00       530
           4       1.00      1.00      1.00       530
           5       0.99      0.99      0.99       530

    accuracy                           1.00      3180
   macro avg       1.00      1.00      1.00      3180
weighted avg       1.00      1.00      1.00      3180

INFO:__main__:Validation Accuracy: 0.5464
INFO:__main__:Validation Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.52      0.55        52
           1       0.70      0.47      

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.624648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5392
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.694745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6475
[LightGBM] [Info] Number of data points in the train set: 1988, number of used features: 355
[LightGBM] [Info] Number of data points in the train set: 1988, number of used features: 426
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from

INFO:__main__:Best parameters: {'rf__max_depth': 30, 'rf__min_samples_split': 5, 'rf__n_estimators': 200}
INFO:__main__:Best score: 0.7180
INFO:__main__:Train Accuracy: 0.9953
INFO:__main__:Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       497
           1       1.00      1.00      1.00       497
           2       1.00      0.99      0.99       497
           3       1.00      1.00      1.00       497
           4       1.00      1.00      1.00       497
           5       0.97      1.00      0.99       497

    accuracy                           1.00      2982
   macro avg       1.00      1.00      1.00      2982
weighted avg       1.00      1.00      1.00      2982

INFO:__main__:Validation Accuracy: 0.5138
INFO:__main__:Validation Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.39      0.44        57
           1       0.62      0.57      

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.267779 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7441
[LightGBM] [Info] Number of data points in the train set: 2044, number of used features: 454
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.786244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5646
[L

INFO:__main__:Best parameters: {'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
INFO:__main__:Best score: 0.7156
INFO:__main__:Train Accuracy: 0.9935
INFO:__main__:Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       511
           1       1.00      1.00      1.00       511
           2       0.99      0.99      0.99       511
           3       1.00      1.00      1.00       511
           4       1.00      1.00      1.00       511
           5       0.97      0.99      0.98       511

    accuracy                           0.99      3066
   macro avg       0.99      0.99      0.99      3066
weighted avg       0.99      0.99      0.99      3066

INFO:__main__:Validation Accuracy: 0.5514
INFO:__main__:Validation Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.52      0.54        61
           1       0.56      0.44      

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.331277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7335
[LightGBM] [Info] Number of data points in the train set: 2048, number of used features: 492
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.394616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9853
[L

INFO:__main__:Best parameters: {'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}
INFO:__main__:Best score: 0.7233
INFO:__main__:Train Accuracy: 0.9958
INFO:__main__:Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       512
           1       1.00      1.00      1.00       512
           2       0.99      0.99      0.99       512
           3       1.00      1.00      1.00       512
           4       1.00      1.00      1.00       512
           5       0.99      0.99      0.99       512

    accuracy                           1.00      3072
   macro avg       1.00      1.00      1.00      3072
weighted avg       1.00      1.00      1.00      3072

INFO:__main__:Validation Accuracy: 0.5514
INFO:__main__:Validation Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.36      0.43        61
           1       0.65      0.50      

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8264
[LightGBM] [Info] Number of data points in the train set: 2056, number of used features: 530
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.196097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6140
[L

INFO:__main__:Best parameters: {'rf__max_depth': 30, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
INFO:__main__:Best score: 0.7396
INFO:__main__:Train Accuracy: 0.9935
INFO:__main__:Train Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       514
           1       1.00      1.00      1.00       514
           2       0.99      0.98      0.99       514
           3       1.00      1.00      1.00       514
           4       1.00      1.00      1.00       514
           5       0.98      0.98      0.98       514

    accuracy                           0.99      3084
   macro avg       0.99      0.99      0.99      3084
weighted avg       0.99      0.99      0.99      3084

INFO:__main__:Validation Accuracy: 0.5075
INFO:__main__:Validation Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.33      0.34        58
           1       0.58      0.36      