# In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
import logging
import os
from tqdm import tqdm

# Configure the root logger so the INFO-level progress messages emitted below are shown.
logging.basicConfig(level=logging.INFO)

class AdComplaintDataset(Dataset):
    """Map-style dataset pairing texts with multi-hot label vectors.

    Texts are tokenized lazily, one item at a time, when they are indexed.

    Args:
        texts: Indexable sequence of texts; every element is passed through
            ``str()`` before tokenization.
        labels: Indexable sequence of per-sample label vectors, the same
            length as ``texts``.
        tokenizer: Callable accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors (presumably a Hugging Face
            tokenizer -- confirm against the caller).
        max_length: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Fail at construction time rather than with an IndexError in the
        # middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output flattened to 1-D) and ``'labels'`` (a float32 tensor built
            from ``labels[idx]``).
        """
        text = str(self.texts[idx])

        # Calling the tokenizer directly is the supported entry point;
        # ``encode_plus`` is deprecated in favour of it.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # torch.tensor with an explicit dtype always interprets its
            # argument as data; the legacy torch.FloatTensor constructor
            # treats a bare int as a size and returns uninitialised memory.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float32),
        }

class AdComplaintClassifier(nn.Module):
    """BERT encoder followed by a two-layer multi-label classification head.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        dropout_rate: Dropout probability used before each linear layer of
            the head.
        num_labels: Number of output logits (one per label).
    """

    def __init__(self, bert_model_name='bert-base-uncased', dropout_rate=0.1, num_labels=5):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Read the encoder width from its config instead of assuming 768, so
        # checkpoints with a different hidden size work with this head.
        hidden_size = self.bert.config.hidden_size

        # Multi-label head. The layer order inside the Sequential is kept
        # as-is so state-dict keys stay the same.
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_labels),
        )

    def forward(self, input_ids, attention_mask):
        """Return raw logits, one per label, for each input sequence.

        No sigmoid is applied here; callers are expected to apply it (or use
        a loss that operates on logits).

        Args:
            input_ids: Token ids, assumed shape (batch, seq_len) -- TODO confirm.
            attention_mask: Mask matching ``input_ids``.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Hidden state at sequence position 0 (the [CLS] token) is used as
        # the sequence representation.
        cls_embedding = outputs.last_hidden_state[:, 0]
        return self.classifier(cls_embedding)

def process_labels(label_str, num_labels=5):
    """Convert a comma-separated label string into a multi-hot list.

    Labels are 1-based: label ``k`` sets position ``k - 1`` to 1. Tokens that
    are not whole numbers in ``1..num_labels`` are ignored.

    Args:
        label_str: Raw label cell. May be a string such as ``"1,3"``, a
            number, or a missing value (``None`` / NaN).
        num_labels: Length of the returned vector and the largest valid label.

    Returns:
        list[int] of length ``num_labels`` containing 0/1 flags.
    """
    labels = [0] * num_labels
    if pd.isna(label_str):
        return labels

    # Accept the full-width comma as a separator as well as the ASCII one.
    for token in str(label_str).replace('，', ',').split(','):
        # float() tolerates surrounding whitespace and float-formatted
        # integers such as "3.0" (what pandas yields for a numeric column
        # containing NaN). Anything it cannot parse is skipped rather than
        # raised.
        try:
            value = float(token)
        except ValueError:
            continue
        # is_integer() is False for NaN/inf and for fractional values.
        if value.is_integer() and 1 <= value <= num_labels:
            labels[int(value) - 1] = 1
    return labels

def _run_training_epoch(model, train_loader, criterion, optimizer, scheduler, device):
    """Run one optimisation pass over ``train_loader``; return the mean batch loss."""
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(train_loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        # Clip the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The scheduler is stepped once per batch, not once per epoch.
        scheduler.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError('train_loader produced no batches')
    return total_loss / num_batches


def _evaluate(model, val_loader, criterion, device):
    """Return ``(mean loss, sigmoid probabilities, true labels)`` over ``val_loader``."""
    model.eval()
    total_loss = 0.0
    num_batches = 0
    prob_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            total_loss += criterion(logits, labels).item()
            num_batches += 1

            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            label_chunks.append(labels.cpu().numpy())

    if num_batches == 0:
        raise ValueError('val_loader produced no batches')
    return total_loss / num_batches, np.concatenate(prob_chunks), np.concatenate(label_chunks)


def _compute_metrics(true_labels, probabilities):
    """Compute F1/precision/recall at a 0.5 threshold and macro ROC-AUC."""
    predictions = probabilities > 0.5

    metrics = {
        'micro_f1': f1_score(true_labels, predictions, average='micro'),
        'macro_f1': f1_score(true_labels, predictions, average='macro'),
        'micro_precision': precision_score(true_labels, predictions, average='micro'),
        'micro_recall': recall_score(true_labels, predictions, average='micro'),
    }

    # ROC-AUC is a ranking metric, so it is fed the continuous probabilities;
    # thresholded booleans would collapse each ROC curve to a single point.
    try:
        metrics['macro_auc'] = roc_auc_score(true_labels, probabilities, average='macro')
    except ValueError as err:
        # Raised e.g. when a label has only one class in the validation
        # data; record NaN instead of aborting the whole training run.
        logging.warning(f'macro_auc is undefined for this validation set: {err}')
        metrics['macro_auc'] = float('nan')
    return metrics


def _snapshot_state(model):
    """Return an independent CPU copy of the model's state dict."""
    # state_dict() hands back tensors that share storage with the live
    # parameters, so a shallow dict copy would keep changing as training
    # continues. copy=True forces a real copy even when already on CPU.
    return {
        name: tensor.detach().to('cpu', copy=True)
        for name, tensor in model.state_dict().items()
    }


def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs=3):
    """Train ``model`` and return the weights and metrics of its best epoch.

    The best epoch is the one with the lowest mean validation loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        val_loader: Same batch format, used for validation after each epoch.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(best_model_state, metrics)``: a CPU copy of the state dict taken
        at the best epoch, and the metrics dict measured at that same epoch.

    Raises:
        ValueError: If ``num_epochs`` is less than 1 or a loader yields no
            batches.
    """
    if num_epochs < 1:
        raise ValueError(f'num_epochs must be at least 1, got {num_epochs}')

    best_val_loss = float('inf')
    best_model_state = None
    best_metrics = None
    metrics = None

    for epoch in range(num_epochs):
        logging.info(f'Epoch {epoch + 1}/{num_epochs}')

        # Training phase
        avg_train_loss = _run_training_epoch(
            model, train_loader, criterion, optimizer, scheduler, device
        )

        # Validation phase
        avg_val_loss, probabilities, true_labels = _evaluate(
            model, val_loader, criterion, device
        )
        metrics = _compute_metrics(true_labels, probabilities)

        logging.info(f'Avg train loss: {avg_train_loss:.4f}')
        logging.info(f'Avg val loss: {avg_val_loss:.4f}')
        logging.info(f'Metrics: {metrics}')

        # Keep the weights together with the metrics measured on them.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_state = _snapshot_state(model)
            best_metrics = metrics

    if best_model_state is None:
        # No epoch compared below +inf (e.g. the validation loss was NaN
        # every time); fall back to the final epoch so callers still get a
        # usable state and metrics dict.
        logging.warning('Validation loss never improved; returning the final epoch')
        best_model_state = _snapshot_state(model)
        best_metrics = metrics

    return best_model_state, best_metrics

def main(data_path='../combination_excel_withoutnull_labeled.csv', num_epochs=3,
         n_splits=5, batch_size=8):
    """Run stratified k-fold fine-tuning and log the fold-averaged metrics.

    For each fold a fresh ``AdComplaintClassifier`` is trained, its best
    state is written to ``best_model_fold_{fold}.pt`` in the working
    directory, and its metrics are collected for the final average.

    Args:
        data_path: CSV file with a ``complaints`` text column and a
            ``label`` column parsed by ``process_labels``.
        num_epochs: Training epochs per fold; also sizes the LR schedule.
        n_splits: Number of cross-validation folds.
        batch_size: Batch size for both training and validation loaders.
    """
    # Load the data
    df = pd.read_csv(data_path)
    texts = df['complaints'].values

    # Parse the labels into a (num_samples, num_labels) multi-hot array
    labels = np.array([process_labels(label) for label in df['label']])

    # Initialise the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Select the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logging.info(f'Using device: {device}')

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_metrics = []

    # StratifiedKFold needs a single target, so folds are stratified on the
    # first label column only.
    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels[:, 0])):
        logging.info(f'Fold {fold + 1}/{n_splits}')

        # Build the datasets
        train_labels = labels[train_idx]
        train_dataset = AdComplaintDataset(texts[train_idx], train_labels, tokenizer)
        val_dataset = AdComplaintDataset(texts[val_idx], labels[val_idx], tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Per-label positive weight = negatives / positives, counted on the
        # training fold only so validation labels do not influence the loss.
        # A label with no positives gets weight 1.0 (it is never applied,
        # because the weight only scales the positive-target term).
        pos_counts = train_labels.sum(axis=0)
        neg_counts = len(train_labels) - pos_counts
        pos_weight = torch.tensor(
            np.where(pos_counts > 0, neg_counts / np.maximum(pos_counts, 1), 1.0),
            dtype=torch.float32,
        )

        # Initialise the model
        model = AdComplaintClassifier()
        model.to(device)

        # Loss and optimizer. torch.optim.AdamW replaces the deprecated
        # AdamW shipped with transformers; eps=1e-6 mirrors that
        # implementation's default so the update rule is unchanged.
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
        optimizer = torch.optim.AdamW(
            model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.01
        )

        # Learning-rate schedule: the scheduler is stepped once per batch,
        # so its horizon must equal batches-per-epoch times the epoch count
        # actually passed to train_model.
        total_steps = len(train_loader) * num_epochs
        warmup_steps = total_steps // 10
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        # Train the model
        best_model_state, metrics = train_model(
            model, train_loader, val_loader,
            criterion, optimizer, scheduler, device,
            num_epochs=num_epochs,
        )

        # Save the best model of this fold
        torch.save(best_model_state, f'best_model_fold_{fold}.pt')
        fold_metrics.append(metrics)

        # Free GPU memory: the optimizer, scheduler, criterion and saved
        # state may all hold device tensors, so drop them with the model.
        del model, optimizer, scheduler, criterion, best_model_state
        torch.cuda.empty_cache()

    # Report the metrics averaged over folds
    avg_metrics = {
        name: np.mean([per_fold[name] for per_fold in fold_metrics])
        for name in fold_metrics[0]
    }
    logging.info('\nAverage metrics across folds:')
    for name, value in avg_metrics.items():
        logging.info(f'{name}: {value:.4f}')

# Run the cross-validation pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()

# Notebook output: from .autonotebook import tqdm as notebook_tqdm


# Notebook output: FileNotFoundError: [Errno 2] No such file or directory: 'combination_excel_withoutnull_labeled.csv'