# Starter Notebook

Install and import required libraries

In [1]:
# !pip install transformers datasets evaluate accelerate peft trl bitsandbytes
# !pip install nvidia-ml-py3

In [2]:
import os
import pandas as pd
import torch
import random
import re
import nltk
import pickle
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac

from transformers import (
    RobertaModel,
    RobertaTokenizer, 
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    RobertaForSequenceClassification,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel

# Download required NLTK data
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/viewsetting/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/viewsetting/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# 设置设备
device = torch.device('cuda:1' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using Device: {device}")

Using Device: cuda:1


In [4]:
import wandb
wandb.login(key="2008ab8d896bfc68619ace7f820e0513468b9783", relogin=True)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/viewsetting/.netrc


True

In [5]:
# Get current wandb entity
current_entity = wandb.api.default_entity
print(f"Current wandb entity: {current_entity}")


Current wandb entity: jl10897-new-york-university


## Load Tokenizer and Preprocess Data

In [6]:
# base_model = 'roberta-base'
# # base_model = 'roberta-large'

# dataset = load_dataset('ag_news', split='train')
# tokenizer = RobertaTokenizer.from_pretrained(base_model)

# def clean_text(text):
#     text = re.sub(r"<.*?>", "", text)                 # 去除 HTML 标签
#     text = re.sub(r"http\S+|www\S+", "", text)        # 移除 URL
#     text = re.sub(r"[^A-Za-z0-9.,!?;:'\"()\[\]\s]", "", text)  # 去除特殊字符
#     text = re.sub(r"\s+", " ", text).strip()          # 去除多余空格
#     return text.lower()                               # 可选：统一小写
    
# def preprocess(examples):
#     cleaned_texts = [clean_text(t) for t in examples['text']]
#     return tokenizer(cleaned_texts, truncation=True, padding=True, max_length=256)

# tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [7]:
nltk.download('averaged_perceptron_tagger_eng', download_dir='/home/viewsetting/nltk_data')



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/viewsetting/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [8]:
nltk.data.path.append('/home/viewsetting/nltk_data')

In [9]:
import numpy as np
base_model = 'roberta-base'
# base_model = 'roberta-large'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# 初始化增强器
synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3)
delete_aug = naw.RandomWordAug(action='delete', aug_p=0.1)
swap_aug = naw.RandomWordAug(action='swap', aug_p=0.1)
typo_aug = nac.RandomCharAug(action='swap', aug_char_p=0.05, aug_word_p=0.1)

# 定义情感词列表（按类别）
sentiment_words = {
    'World': ['terrible', 'urgent', 'critical', 'important', 'serious'],
    'Sports': ['exciting', 'thrilling', 'great', 'amazing', 'intense'],
    'Business': ['profitable', 'successful', 'promising', 'risky', 'innovative'],
    'Sci/Tech': ['advanced', 'innovative', 'futuristic', 'complex', 'technical']
}

def sentiment_word_insertion(text, label, aug_p=0.3):
    """随机插入情感词增强"""
    words = text.split()
    if random.random() < aug_p:
        label_name = ['World', 'Sports', 'Business', 'Sci/Tech'][label]
        sentiment_list = sentiment_words[label_name]
        sentiment_word = random.choice(sentiment_list)
        insert_pos = random.randint(0, len(words))
        words.insert(insert_pos, sentiment_word)
    return ' '.join(words)

def clean_text(text):
    """清理文本：去除HTML标签、URL、特殊字符，并统一小写"""
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^A-Za-z0-9.,!?;:'\"()\[\]\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

def simple_paraphrase(text):
    """简单基于规则的释义"""
    replacements = {
        'said': 'stated',
        'big': 'large',
        'small': 'tiny',
        'good': 'excellent',
        'bad': 'poor',
        'buy': 'purchase',
        'sell': 'trade',
        'make': 'create',
        'show': 'display',
        'start': 'begin'
    }
    words = text.split()
    for i, word in enumerate(words):
        if word.lower() in replacements and random.random() < 0.3:
            words[i] = replacements[word.lower()]
    return ' '.join(words)

def process_single_example(example, augmentations=None):
    """处理单个样本：清理文本、应用增强、分词"""
    if augmentations is None:
        augmentations = {
            'synonym': False,
            'delete': False,
            'swap': False,
            'paraphrase': False,
            'noise': False,
            'sentiment': False
        }

    # 检查是否是批处理模式
    if isinstance(example['text'], (list, np.ndarray)):
        # 批处理模式
        texts = example['text']
        labels = example['label']
        
        # 处理每个文本
        cleaned_texts = [clean_text(t) for t in texts]
        aug_texts = []
        
        for text, label in zip(cleaned_texts, labels):
            aug_text = text
            if augmentations.get('sentiment', False):
                aug_text = sentiment_word_insertion(aug_text, label, aug_p=0.3)
            if augmentations.get('synonym', False):
                aug_text = synonym_aug.augment(aug_text)[0]
            if augmentations.get('delete', False):
                aug_text = delete_aug.augment(aug_text)[0]
            if augmentations.get('swap', False):
                aug_text = swap_aug.augment(aug_text)[0]
            if augmentations.get('paraphrase', False):
                aug_text = simple_paraphrase(aug_text)
            if augmentations.get('noise', False):
                aug_text = typo_aug.augment(aug_text)[0]
            aug_texts.append(aug_text)
            
        # 批量分词
        return tokenizer(aug_texts, truncation=True, padding=True, max_length=256)
    else:
        # 单个样本模式
        text = clean_text(example['text'])
        label = example['label'] if isinstance(example, dict) else example.label

        # 应用增强
        aug_text = text
        if augmentations.get('sentiment', False):
            aug_text = sentiment_word_insertion(aug_text, label, aug_p=0.3)
        if augmentations.get('synonym', False):
            aug_text = synonym_aug.augment(aug_text)[0]
        if augmentations.get('delete', False):
            aug_text = delete_aug.augment(aug_text)[0]
        if augmentations.get('swap', False):
            aug_text = swap_aug.augment(aug_text)[0]
        if augmentations.get('paraphrase', False):
            aug_text = simple_paraphrase(aug_text)
        if augmentations.get('noise', False):
            aug_text = typo_aug.augment(aug_text)[0]

        # 分词
        tokenized = tokenizer(aug_text, truncation=True, padding=True, max_length=256)
        tokenized['labels'] = label
        return tokenized
    
def preprocess_dataset(dataset, augmentations=None, use_parallel=False, num_workers=None):
    """
    预处理数据集，支持并行和非并行处理
    
    Args:
        dataset: 要处理的数据集
        augmentations: 数据增强配置
        use_parallel: 是否使用并行处理
        num_workers: 并行处理的工作进程数
    """
    if use_parallel:
        # 并行处理
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        if num_workers is None:
            num_workers = max(1, cpu_count() - 1)
        
        data_list = [example for example in dataset]
        process_func = partial(process_single_example, augmentations=augmentations)
        
        with Pool(num_workers) as pool:
            tokenized_examples = pool.map(process_func, data_list)
            
        tokenized_dataset = Dataset.from_dict({
            'input_ids': [ex['input_ids'] for ex in tokenized_examples],
            'attention_mask': [ex['attention_mask'] for ex in tokenized_examples],
            'labels': [ex['labels'] for ex in tokenized_examples]
        })
    else:
        # 非并行处理
        tokenized_dataset = dataset.map(
            lambda examples: process_single_example(examples, augmentations),
            batched=True,
            remove_columns=["text"]
        )
    
    return tokenized_dataset

# 示例增强配置
augmentation_config = {
    'synonym': True,
    'delete': True,
    'swap': True,
    'paraphrase': True,
    'noise': True,
    'sentiment': True  # 启用情感词插入
}

# 使用时可以选择是否启用并行处理
import time
start_time = time.time()
tokenized_dataset = preprocess_dataset(
    dataset, 
    augmentations=augmentation_config,
    use_parallel=False,  # 设置为 False 则使用非并行处理
    num_workers=8
)




Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [10]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [11]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

## Anything from here on can be modified

In [12]:
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [13]:
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "value"],
    task_type="SEQ_CLS"
)

In [14]:
peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Module

In [15]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 741,124 || all params: 125,389,832 || trainable%: 0.5911


## Training Setup

In [19]:
import wandb
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Dict
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix,
    classification_report
)

def compute_metrics(pred):
    """
    Compute and log comprehensive evaluation metrics
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    probs = torch.nn.functional.softmax(torch.tensor(pred.predictions), dim=-1)
    
    # Basic metrics
    accuracy = accuracy_score(labels, preds)
    precision_macro = precision_score(labels, preds, average='macro')
    precision_micro = precision_score(labels, preds, average='micro')
    precision_per_class = precision_score(labels, preds, average=None)
    
    recall_macro = recall_score(labels, preds, average='macro')
    recall_micro = recall_score(labels, preds, average='micro')
    recall_per_class = recall_score(labels, preds, average=None)
    
    f1_macro = f1_score(labels, preds, average='macro')
    f1_micro = f1_score(labels, preds, average='micro')
    f1_per_class = f1_score(labels, preds, average=None)
    
    # Per-class metrics
    metrics_per_class = {}
    for i, class_name in enumerate(id2label.values()):
        metrics_per_class.update({
            f'precision_class_{class_name}': precision_per_class[i],
            f'recall_class_{class_name}': recall_per_class[i],
            f'f1_class_{class_name}': f1_per_class[i]
        })
    
    # Confusion matrix - only log to wandb
    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(10,8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=list(id2label.values()),
                yticklabels=list(id2label.values()))
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    
    # Log confusion matrix to wandb
    wandb.log({'eval/confusion_matrix': wandb.Image(plt)})
    plt.close()
    
    # Confidence histograms - only log to wandb
    for i in range(len(id2label)):
        plt.figure(figsize=(8,6))
        class_probs = probs[:, i].numpy()
        plt.hist(class_probs, bins=50)
        plt.title(f'Confidence Distribution - Class {id2label[i]}')
        plt.xlabel('Confidence')
        plt.ylabel('Count')
        wandb.log({f'eval/confidence_dist_class_{id2label[i]}': wandb.Image(plt)})
        plt.close()

    # Log metrics to wandb
    wandb.log({
        'eval/accuracy': accuracy,
        'eval/precision_macro': precision_macro,
        'eval/precision_micro': precision_micro,
        'eval/recall_macro': recall_macro,
        'eval/recall_micro': recall_micro,
        'eval/f1_macro': f1_macro,
        'eval/f1_micro': f1_micro,
    })
    
    # Return only JSON-serializable metrics
    return {
        'accuracy': accuracy,
        'precision_macro': precision_macro,
        'precision_micro': precision_micro,
        'recall_macro': recall_macro,
        'recall_micro': recall_micro,
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
        **metrics_per_class
    }

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gradient_norm = 0.0

    def training_step(self, model, inputs):
        """Override training step to compute training metrics"""
        loss = super().training_step(model, inputs)
        
        # Compute training accuracy
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = outputs.logits.argmax(-1)
            accuracy = (predictions == inputs['labels']).float().mean()
            
            # Log training metrics
            if self.state.global_step % self.args.logging_steps == 0:
                wandb.log({
                    'train/loss': loss.item(),
                    'train/accuracy': accuracy.item(),
                    'train/step': self.state.global_step
                })
        
        # Compute gradient norm
        if self.args.gradient_checkpointing:
            self.gradient_norm = torch.norm(
                torch.stack([
                    torch.norm(p.grad.detach())
                    for p in model.parameters()
                    if p.grad is not None
                ])
            ).item()
        
        return loss

    def log(self, logs: Dict[str, float]) -> None:
        """
        Enhanced logging with additional training metrics
        """
        if self.state.global_step % self.args.logging_steps == 0:
            # Log learning rates
            if hasattr(self.optimizer, "param_groups"):
                current_lr = self.optimizer.param_groups[0]['lr']
                wandb.log({
                    'train/learning_rate': current_lr,
                    'train/step': self.state.global_step
                })
            
            # Log gradient norm
            logs["train/gradient_norm"] = self.gradient_norm
            
            # Log batch size
            logs["train/batch_size"] = self.args.per_device_train_batch_size
            
            # Log memory usage if using GPU
            if torch.cuda.is_available():
                logs["system/gpu_memory_allocated"] = torch.cuda.memory_allocated() / 1024**2  # MB
                logs["system/gpu_memory_cached"] = torch.cuda.memory_reserved() / 1024**2  # MB
            
            # Log parameter statistics
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    logs[f"parameters/mean/{name}"] = param.data.mean().item()
                    logs[f"parameters/std/{name}"] = param.data.std().item()
                    if param.grad is not None:
                        logs[f"gradients/mean/{name}"] = param.grad.data.mean().item()
                        logs[f"gradients/std/{name}"] = param.grad.data.std().item()
            
            # Log all metrics to wandb
            wandb.log(logs)
        
        super().log(logs)

# Setup Training args
output_dir = "results"

training_args = TrainingArguments(
    output_dir=output_dir,
    report_to='wandb',
    eval_strategy='steps',
    logging_steps=100,
    eval_steps=200,
    save_steps=400,
    save_total_limit=2,

    learning_rate=1e-4,  # LoRA
    warmup_ratio=0.1,
    num_train_epochs=2,
    
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,

    optim="adamw_torch",  # BETTER THAN SGD
    weight_decay=0.01,

    gradient_checkpointing=False,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    run_name="lora_finetuning_run_2",
    
    logging_first_step=True, 
    logging_nan_inf_filter=False, 
    logging_strategy="steps",  
    label_names=["labels"]
)

def get_trainer(model):
    """
    Create a trainer instance with enhanced logging
    """
    # Initialize wandb with detailed config
    wandb.init(
        project="ag_news_classification",
        name=f"roberta_lora_{wandb.util.generate_id()}",
        config={
            "model_name": base_model,
            "lora_config": {
                "r": peft_config.r,
                "alpha": peft_config.lora_alpha,
                "dropout": peft_config.lora_dropout,
                "target_modules": peft_config.target_modules,
            },
            "training_config": {
                "learning_rate": training_args.learning_rate,
                "batch_size": training_args.per_device_train_batch_size,
                "epochs": training_args.num_train_epochs,
                "warmup_ratio": training_args.warmup_ratio,
                "weight_decay": training_args.weight_decay,
            },
            "augmentation_config": augmentation_config,
            "dataset": "ag_news",
            "train_size": len(train_dataset),
            "eval_size": len(eval_dataset),
        }
    )
    
    model.config.label2id = {label: i for i, label in enumerate(class_names)}
    model.config.id2label = {i: label for i, label in enumerate(class_names)}
    
    return CustomTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
    )

### Start Training

In [20]:
!export CUDA_VISIBLE_DEVICES=3


In [21]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

result = peft_lora_finetuning_trainer.train()

VBox(children=(Label(value='0.266 MB of 0.266 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▁██
eval/f1_class_Business,▁█
eval/f1_class_Sci/Tech,▁█
eval/f1_class_Sports,▁█
eval/f1_class_World,▁█
eval/f1_macro,▁▁██
eval/f1_micro,▁▁██
eval/loss,█▁
eval/precision_class_Business,▁█
eval/precision_class_Sci/Tech,█▁

0,1
eval/accuracy,0.87813
eval/f1_class_Business,0.84039
eval/f1_class_Sci/Tech,0.86387
eval/f1_class_Sports,0.94304
eval/f1_class_World,0.86545
eval/f1_macro,0.87819
eval/f1_micro,0.87813
eval/loss,0.38134
eval/precision_class_Business,0.83766
eval/precision_class_Sci/Tech,0.83333


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113307111534394, max=1.0…

    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Step,Training Loss,Validation Loss,Accuracy,Precision Macro,Precision Micro,Recall Macro,Recall Micro,F1 Macro,F1 Micro,Precision Class World,Recall Class World,F1 Class World,Precision Class Sports,Recall Class Sports,F1 Class Sports,Precision Class Business,Recall Class Business,F1 Class Business,Precision Class Sci/tech,Recall Class Sci/tech,F1 Class Sci/tech,Norm,Size,Memory Allocated,Memory Cached,Model.model.roberta.encoder.layer.0.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.0.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.0.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.0.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.1.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.1.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.1.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.1.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.2.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.2.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.2.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.2.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.3.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.3.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.3.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.3.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.4.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.4.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.4.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.4.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.5.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.5.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.5.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.5.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.6.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.6.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.6.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.6.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.7.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.7.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.7.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.7.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.8.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.8.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.8.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.8.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.9.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.9.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.9.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.9.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.10.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.10.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.10.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.10.attention.self.value.lora B.default.weight,Model.model.roberta.encoder.layer.11.attention.self.query.lora A.default.weight,Model.model.roberta.encoder.layer.11.attention.self.query.lora B.default.weight,Model.model.roberta.encoder.layer.11.attention.self.value.lora A.default.weight,Model.model.roberta.encoder.layer.11.attention.self.value.lora B.default.weight,Model.model.classifier.modules To Save.default.dense.weight,Model.model.classifier.modules To Save.default.dense.bias,Model.model.classifier.modules To Save.default.out Proj.weight,Model.model.classifier.modules To Save.default.out Proj.bias
200,0.3677,0.377625,0.875,0.877097,0.875,0.873184,0.875,0.874608,0.875,0.894737,0.82069,0.856115,0.943038,0.943038,0.943038,0.821656,0.843137,0.832258,0.848958,0.88587,0.867021,0.0,32,509.056641,7180.0,0.023916,0.004431,0.022941,0.003937,0.022371,0.003891,0.02216,0.003464,0.022905,0.00432,0.021249,0.002946,0.02165,0.003276,0.021443,0.003041,0.022254,0.004372,0.021194,0.003239,0.022102,0.004012,0.021672,0.003898,0.023826,0.005395,0.021715,0.004331,0.021839,0.004775,0.02182,0.004666,0.022043,0.005123,0.022162,0.004757,0.022191,0.004993,0.022023,0.004896,0.022559,0.00516,0.022328,0.004714,0.022289,0.005444,0.021913,0.005048,0.020354,0.000616,0.021583,0.000367
400,0.3516,0.365614,0.882812,0.887004,0.882812,0.881078,0.882812,0.883125,0.882812,0.915385,0.82069,0.865455,0.955128,0.943038,0.949045,0.836478,0.869281,0.852564,0.841026,0.891304,0.865435,0.0,32,509.072266,7180.0,0.025336,0.005387,0.023225,0.004299,0.023182,0.004806,0.022405,0.003801,0.023582,0.005088,0.021538,0.003419,0.022599,0.004387,0.02158,0.003351,0.022872,0.005492,0.021373,0.003583,0.022569,0.004677,0.021656,0.004026,0.024548,0.005884,0.021798,0.004497,0.022104,0.005348,0.021833,0.004847,0.022306,0.005509,0.022103,0.004887,0.022778,0.005599,0.02205,0.005066,0.023841,0.006009,0.022259,0.004869,0.023077,0.006331,0.022025,0.005299,0.020416,0.000914,0.021794,0.000707
600,0.3593,0.361966,0.885938,0.888175,0.885938,0.884311,0.885938,0.885896,0.885938,0.884892,0.848276,0.866197,0.960526,0.924051,0.941935,0.851613,0.862745,0.857143,0.85567,0.902174,0.878307,0.0,32,509.072266,7180.0,0.026067,0.005864,0.023418,0.004535,0.023665,0.005273,0.022463,0.003964,0.024093,0.005551,0.021611,0.003605,0.022989,0.004907,0.021752,0.003606,0.023124,0.005966,0.021432,0.003737,0.02272,0.004959,0.021704,0.004106,0.02459,0.006059,0.021882,0.004649,0.022329,0.005692,0.021922,0.005019,0.022613,0.005835,0.022117,0.00499,0.023124,0.006059,0.022194,0.005256,0.025783,0.006849,0.022296,0.005012,0.024912,0.007345,0.022205,0.005558,0.020461,0.001096,0.021867,0.000909
800,0.3451,0.329512,0.89375,0.896211,0.89375,0.893569,0.89375,0.894313,0.89375,0.911765,0.855172,0.882562,0.961039,0.936709,0.948718,0.836364,0.901961,0.867925,0.875676,0.880435,0.878049,0.0,32,509.072266,7180.0,0.02697,0.006346,0.023687,0.00479,0.024192,0.005727,0.022692,0.004167,0.024528,0.005922,0.02175,0.003747,0.023364,0.00537,0.021892,0.003794,0.023395,0.006319,0.021503,0.003881,0.02304,0.005248,0.021756,0.004192,0.025509,0.006423,0.021961,0.00477,0.022414,0.005866,0.021924,0.005093,0.022967,0.006201,0.0222,0.005118,0.023807,0.00659,0.022294,0.00539,0.028356,0.00769,0.022268,0.005062,0.025721,0.00781,0.022221,0.005644,0.020503,0.001193,0.022022,0.000865
1000,0.3383,0.326981,0.889062,0.89104,0.889062,0.887157,0.889062,0.888751,0.889062,0.892857,0.862069,0.877193,0.948387,0.93038,0.939297,0.865772,0.843137,0.854305,0.857143,0.913043,0.884211,0.0,32,509.072266,7180.0,0.027617,0.006675,0.023633,0.004834,0.024384,0.005889,0.022531,0.004188,0.024739,0.006146,0.021777,0.003815,0.023695,0.005725,0.021988,0.003882,0.023558,0.006577,0.021563,0.003971,0.023259,0.0055,0.021795,0.004281,0.025968,0.006573,0.022036,0.004867,0.022514,0.005996,0.02205,0.005215,0.023245,0.006494,0.022276,0.005214,0.024492,0.006967,0.022423,0.00551,0.028612,0.007952,0.022425,0.00521,0.026304,0.008111,0.022362,0.005796,0.020535,0.001244,0.022139,0.000885
1200,0.3236,0.317786,0.895312,0.897626,0.895312,0.894377,0.895312,0.895673,0.895312,0.898551,0.855172,0.876325,0.96732,0.936709,0.951768,0.860759,0.888889,0.874598,0.863874,0.896739,0.88,0.0,32,509.072266,7180.0,0.028151,0.006931,0.023884,0.005002,0.024746,0.006147,0.02267,0.004284,0.02491,0.006306,0.021958,0.003961,0.023917,0.005937,0.022105,0.003992,0.023728,0.006799,0.021638,0.004055,0.023554,0.005716,0.021836,0.004342,0.026391,0.006726,0.022086,0.004937,0.022664,0.006172,0.022059,0.005239,0.023509,0.006704,0.022272,0.005251,0.024987,0.007229,0.022485,0.005578,0.029424,0.008216,0.022457,0.005279,0.026591,0.008292,0.022407,0.005888,0.020557,0.00134,0.022211,0.001021
1400,0.318,0.312512,0.896875,0.898523,0.896875,0.896557,0.896875,0.897341,0.896875,0.900709,0.875862,0.888112,0.96732,0.936709,0.951768,0.849057,0.882353,0.865385,0.877005,0.891304,0.884097,0.0,32,509.072266,7180.0,0.028567,0.007109,0.023947,0.005054,0.024915,0.006251,0.022761,0.004346,0.024917,0.006356,0.022042,0.004037,0.024131,0.006105,0.022188,0.004069,0.023851,0.006965,0.021718,0.004134,0.02378,0.005874,0.021861,0.004365,0.026519,0.006813,0.022123,0.004947,0.022719,0.006253,0.022078,0.005263,0.023689,0.006801,0.022304,0.005296,0.025275,0.007372,0.022544,0.005627,0.02954,0.008304,0.022486,0.005311,0.026854,0.008401,0.022449,0.005927,0.020574,0.001365,0.022325,0.001071
1600,0.3089,0.317747,0.898438,0.900387,0.898438,0.897602,0.898438,0.898724,0.898438,0.9,0.868966,0.884211,0.967105,0.93038,0.948387,0.860759,0.888889,0.874598,0.873684,0.902174,0.887701,0.0,32,509.072266,7180.0,0.028816,0.007207,0.02398,0.00508,0.025018,0.006329,0.022786,0.004364,0.025065,0.00646,0.022112,0.004081,0.024304,0.006231,0.022253,0.004121,0.023941,0.007075,0.021778,0.004178,0.023921,0.005959,0.021889,0.0044,0.026827,0.006895,0.022159,0.004981,0.022798,0.006333,0.02211,0.005273,0.023707,0.006858,0.022345,0.005321,0.025419,0.007439,0.022601,0.005671,0.029588,0.008341,0.022517,0.005337,0.027042,0.00846,0.022493,0.005973,0.020581,0.001402,0.022355,0.00115
1800,0.3098,0.307275,0.89375,0.895682,0.89375,0.893065,0.89375,0.894195,0.89375,0.888112,0.875862,0.881944,0.967105,0.93038,0.948387,0.863636,0.869281,0.86645,0.863874,0.896739,0.88,0.0,32,509.072266,7180.0,0.028949,0.007259,0.024025,0.005105,0.025055,0.006355,0.022807,0.004375,0.025122,0.006499,0.022143,0.004096,0.024339,0.006254,0.022271,0.004133,0.023965,0.007102,0.021792,0.004191,0.023949,0.005977,0.021901,0.004407,0.026817,0.006906,0.022172,0.004993,0.022818,0.006364,0.022122,0.005283,0.023731,0.006875,0.022363,0.005334,0.025419,0.007449,0.022613,0.005671,0.029661,0.008366,0.0225,0.005326,0.027065,0.008468,0.022491,0.005966,0.020583,0.001401,0.02236,0.001152


    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

In [22]:
# Save Model
peft_model_path = os.path.join(output_dir, "peft_model_sentiment")
peft_model.save_pretrained(peft_model_path)
# Save Tokenizer
tokenizer.save_pretrained(peft_model_path)
print(f"Model saved to {peft_model_path}")


Model saved to results/peft_model_sentiment


## Evaluate Finetuned Model


### Run Inference on eval_dataset

In [23]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=32, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [24]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 32, data_collator)

100%|██████████| 20/20 [00:04<00:00,  4.28it/s]

Evaluation Metric: {'accuracy': 0.9015625}





In [28]:
def process_unlabelled_data(data, tokenizer, augmentations=None):
    """
    专门处理未标记数据的函数
    
    Args:
        data: 包含文本数据的DataFrame/Dataset/list
        tokenizer: 使用的tokenizer
        augmentations: 数据增强配置字典
    
    Returns:
        Dataset: 处理后的数据集
    """
    if augmentations is None:
        augmentations = {
            'synonym': True,
            'delete': True,
            'swap': True,
            'paraphrase': True,
            'noise': True
        }
    
    # 首先打印数据类型和前几条数据来调试
    print(f"Input data type: {type(data)}")
    print("First few items:")
    print(data.head() if hasattr(data, 'head') else data[:5])
    
    # 获取文本列表
    texts = []
    if isinstance(data, pd.DataFrame):
        if 'text' in data.columns:
            texts = data['text'].tolist()
        else:
            print("Available columns:", data.columns)
            raise ValueError("No 'text' column found in DataFrame")
    elif isinstance(data, (list, np.ndarray)):
        texts = data
    else:
        try:
            texts = list(data)
        except:
            raise ValueError(f"Unsupported data type: {type(data)}")
    
    # 确保所有文本都是字符串
    texts = [str(t) if t is not None else "" for t in texts]
    
    # 清理和增强文本
    processed_texts = []
    for text in tqdm(texts, desc="Processing texts"):
        # 清理文本
        try:
            clean = clean_text(text)
        except Exception as e:
            print(f"Error cleaning text: {text}")
            print(f"Error message: {str(e)}")
            clean = text  # 如果清理失败，使用原始文本
        
        # 应用数据增强
        aug_text = clean
        if augmentations.get('synonym', False):
            try:
                aug_text = synonym_aug.augment(aug_text)[0]
            except:
                pass
        if augmentations.get('delete', False):
            try:
                aug_text = delete_aug.augment(aug_text)[0]
            except:
                pass
        if augmentations.get('swap', False):
            try:
                aug_text = swap_aug.augment(aug_text)[0]
            except:
                pass
        if augmentations.get('paraphrase', False):
            aug_text = simple_paraphrase(aug_text)
        if augmentations.get('noise', False):
            try:
                aug_text = typo_aug.augment(aug_text)[0]
            except:
                pass
        
        processed_texts.append(aug_text)
    
    # 批量tokenize
    tokenized = tokenizer(
        processed_texts,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt"
    )
    
    # 转换为Dataset格式
    dataset = Dataset.from_dict({
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask']
    })
    
    return dataset

# 使用示例：
# 读取未标记数据
unlabelled_dataset = pd.read_pickle("/home/viewsetting/ssd_2T/test_unlabelled.pkl")

# 让我们先看看数据的结构
print("Dataset info:")
print(type(unlabelled_dataset))
if isinstance(unlabelled_dataset, pd.DataFrame):
    print("\nColumns:", unlabelled_dataset.columns)
    print("\nFirst few rows:")
    print(unlabelled_dataset.head())

# 处理数据
test_dataset = process_unlabelled_data(
    unlabelled_dataset,
    tokenizer,
    augmentations={
        'synonym': True,
        'delete': True,
        'swap': True,
        'paraphrase': True,
        'noise': True
    }
)

# 使用示例：
# 读取未标记数据
unlabelled_dataset = pd.read_pickle("/home/viewsetting/ssd_2T/test_unlabelled.pkl")

# 处理数据
test_dataset = process_unlabelled_data(
    unlabelled_dataset,
    tokenizer,
    augmentations={
        'synonym': True,
        'delete': True,
        'swap': True,
        'paraphrase': True,
        'noise': True
    }
)

# 运行推理
preds = evaluate_model(peft_model, test_dataset, labelled=False, batch_size=32, data_collator=data_collator)

# 保存结果
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()
})
df_output.to_csv(os.path.join(output_dir, "inference_output_sentiment.csv"), index=False)
print("Inference complete. Predictions saved to inference_output_sentiment.csv")

Dataset info:
<class 'datasets.arrow_dataset.Dataset'>
Input data type: <class 'datasets.arrow_dataset.Dataset'>
First few items:
{'text': ['Remains of New Species of Hobbit-Sized Human Found Scientists in Australia have found a new species of hobbit-sized humans who lived about 18,000 years ago on an Indonesian island in a discovery that adds another piece to the complex puzzle of human evolution.', 'Iran to cease negotiations with EU in case of dead end A top Iranian official said Sunday that Iran would withdraw from the negotiations with the European Union (EU) if the upcoming talks in Brussels turned into a dead-end, the official IRNA news agency reported.', 'Israel levels new accusations against Syria Without acknowledging responsibility for the car-bombing death of a Hamas activist in Syria, Israeli Deputy Defense Minister Zeev Boim yesterday issued a toughly worded ', 'Enevo a Silicon Valley startup create self-powered battery and another new company building project creating lo

Processing texts: 100%|██████████| 8000/8000 [00:29<00:00, 267.00it/s]


Input data type: <class 'datasets.arrow_dataset.Dataset'>
First few items:
{'text': ['Remains of New Species of Hobbit-Sized Human Found Scientists in Australia have found a new species of hobbit-sized humans who lived about 18,000 years ago on an Indonesian island in a discovery that adds another piece to the complex puzzle of human evolution.', 'Iran to cease negotiations with EU in case of dead end A top Iranian official said Sunday that Iran would withdraw from the negotiations with the European Union (EU) if the upcoming talks in Brussels turned into a dead-end, the official IRNA news agency reported.', 'Israel levels new accusations against Syria Without acknowledging responsibility for the car-bombing death of a Hamas activist in Syria, Israeli Deputy Defense Minister Zeev Boim yesterday issued a toughly worded ', 'Enevo a Silicon Valley startup create self-powered battery and another new company building project creating low-C systems so is probably said in reality also include

Processing texts: 100%|██████████| 8000/8000 [00:30<00:00, 260.33it/s]
100%|██████████| 250/250 [00:28<00:00,  8.72it/s]

Inference complete. Predictions saved to inference_output_sentiment.csv



