# Starter Notebook

Install and import required libraries

In [32]:
# !pip install transformers datasets evaluate accelerate peft trl bitsandbytes
# !pip install nvidia-ml-py3

In [3]:
import os
import pandas as pd
import torch
import random
import re
import nltk
import pickle
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac

from transformers import (
    RobertaModel,
    RobertaTokenizer, 
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    RobertaForSequenceClassification,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel

# Download required NLTK data
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/viewsetting/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/viewsetting/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# 设置设备
device = torch.device('cuda:1' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(f"使用设备: {device}")

使用设备: cuda:1


In [3]:
import wandb
wandb.login(key="2008ab8d896bfc68619ace7f820e0513468b9783", relogin=True)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/viewsetting/.netrc


True

In [4]:
# Get current wandb entity
current_entity = wandb.api.default_entity
print(f"Current wandb entity: {current_entity}")


Current wandb entity: jl10897-new-york-university


## Load Tokenizer and Preprocess Data

In [37]:
# base_model = 'roberta-base'
# # base_model = 'roberta-large'

# dataset = load_dataset('ag_news', split='train')
# tokenizer = RobertaTokenizer.from_pretrained(base_model)

# def clean_text(text):
#     text = re.sub(r"<.*?>", "", text)                 # 去除 HTML 标签
#     text = re.sub(r"http\S+|www\S+", "", text)        # 移除 URL
#     text = re.sub(r"[^A-Za-z0-9.,!?;:'\"()\[\]\s]", "", text)  # 去除特殊字符
#     text = re.sub(r"\s+", " ", text).strip()          # 去除多余空格
#     return text.lower()                               # 可选：统一小写
    
# def preprocess(examples):
#     cleaned_texts = [clean_text(t) for t in examples['text']]
#     return tokenizer(cleaned_texts, truncation=True, padding=True, max_length=256)

# tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [None]:
nltk.download('averaged_perceptron_tagger_eng', download_dir='/home/viewsetting/nltk_data')



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/viewsetting/nltk_data...


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [45]:
nltk.data.path.append('/home/viewsetting/nltk_data')

In [5]:
import numpy as np
base_model = 'roberta-base'
# base_model = 'roberta-large'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# 初始化增强器
synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3)
delete_aug = naw.RandomWordAug(action='delete', aug_p=0.1)
swap_aug = naw.RandomWordAug(action='swap', aug_p=0.1)
typo_aug = nac.RandomCharAug(action='swap', aug_char_p=0.05, aug_word_p=0.1)

# 定义情感词列表（按类别）
sentiment_words = {
    'World': ['terrible', 'urgent', 'critical', 'important', 'serious'],
    'Sports': ['exciting', 'thrilling', 'great', 'amazing', 'intense'],
    'Business': ['profitable', 'successful', 'promising', 'risky', 'innovative'],
    'Sci/Tech': ['advanced', 'innovative', 'futuristic', 'complex', 'technical']
}

def sentiment_word_insertion(text, label, aug_p=0.3):
    """随机插入情感词增强"""
    words = text.split()
    if random.random() < aug_p:
        label_name = ['World', 'Sports', 'Business', 'Sci/Tech'][label]
        sentiment_list = sentiment_words[label_name]
        sentiment_word = random.choice(sentiment_list)
        insert_pos = random.randint(0, len(words))
        words.insert(insert_pos, sentiment_word)
    return ' '.join(words)

def clean_text(text):
    """清理文本：去除HTML标签、URL、特殊字符，并统一小写"""
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^A-Za-z0-9.,!?;:'\"()\[\]\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

def simple_paraphrase(text):
    """简单基于规则的释义"""
    replacements = {
        'said': 'stated',
        'big': 'large',
        'small': 'tiny',
        'good': 'excellent',
        'bad': 'poor',
        'buy': 'purchase',
        'sell': 'trade',
        'make': 'create',
        'show': 'display',
        'start': 'begin'
    }
    words = text.split()
    for i, word in enumerate(words):
        if word.lower() in replacements and random.random() < 0.3:
            words[i] = replacements[word.lower()]
    return ' '.join(words)

def process_single_example(example, augmentations=None):
    """处理单个样本：清理文本、应用增强、分词"""
    if augmentations is None:
        augmentations = {
            'synonym': False,
            'delete': False,
            'swap': False,
            'paraphrase': False,
            'noise': False,
            'sentiment': False
        }

    # 检查是否是批处理模式
    if isinstance(example['text'], (list, np.ndarray)):
        # 批处理模式
        texts = example['text']
        labels = example['label']
        
        # 处理每个文本
        cleaned_texts = [clean_text(t) for t in texts]
        aug_texts = []
        
        for text, label in zip(cleaned_texts, labels):
            aug_text = text
            if augmentations.get('sentiment', False):
                aug_text = sentiment_word_insertion(aug_text, label, aug_p=0.3)
            if augmentations.get('synonym', False):
                aug_text = synonym_aug.augment(aug_text)[0]
            if augmentations.get('delete', False):
                aug_text = delete_aug.augment(aug_text)[0]
            if augmentations.get('swap', False):
                aug_text = swap_aug.augment(aug_text)[0]
            if augmentations.get('paraphrase', False):
                aug_text = simple_paraphrase(aug_text)
            if augmentations.get('noise', False):
                aug_text = typo_aug.augment(aug_text)[0]
            aug_texts.append(aug_text)
            
        # 批量分词
        return tokenizer(aug_texts, truncation=True, padding=True, max_length=256)
    else:
        # 单个样本模式
        text = clean_text(example['text'])
        label = example['label'] if isinstance(example, dict) else example.label

        # 应用增强
        aug_text = text
        if augmentations.get('sentiment', False):
            aug_text = sentiment_word_insertion(aug_text, label, aug_p=0.3)
        if augmentations.get('synonym', False):
            aug_text = synonym_aug.augment(aug_text)[0]
        if augmentations.get('delete', False):
            aug_text = delete_aug.augment(aug_text)[0]
        if augmentations.get('swap', False):
            aug_text = swap_aug.augment(aug_text)[0]
        if augmentations.get('paraphrase', False):
            aug_text = simple_paraphrase(aug_text)
        if augmentations.get('noise', False):
            aug_text = typo_aug.augment(aug_text)[0]

        # 分词
        tokenized = tokenizer(aug_text, truncation=True, padding=True, max_length=256)
        tokenized['labels'] = label
        return tokenized
    
def preprocess_dataset(dataset, augmentations=None, use_parallel=False, num_workers=None):
    """
    预处理数据集，支持并行和非并行处理
    
    Args:
        dataset: 要处理的数据集
        augmentations: 数据增强配置
        use_parallel: 是否使用并行处理
        num_workers: 并行处理的工作进程数
    """
    if use_parallel:
        # 并行处理
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        if num_workers is None:
            num_workers = max(1, cpu_count() - 1)
        
        data_list = [example for example in dataset]
        process_func = partial(process_single_example, augmentations=augmentations)
        
        with Pool(num_workers) as pool:
            tokenized_examples = pool.map(process_func, data_list)
            
        tokenized_dataset = Dataset.from_dict({
            'input_ids': [ex['input_ids'] for ex in tokenized_examples],
            'attention_mask': [ex['attention_mask'] for ex in tokenized_examples],
            'labels': [ex['labels'] for ex in tokenized_examples]
        })
    else:
        # 非并行处理
        tokenized_dataset = dataset.map(
            lambda examples: process_single_example(examples, augmentations),
            batched=True,
            remove_columns=["text"]
        )
    
    return tokenized_dataset

# 示例增强配置
augmentation_config = {
    'synonym': True,
    'delete': True,
    'swap': True,
    'paraphrase': True,
    'noise': True,
    'sentiment': True  # 启用情感词插入
}

# 使用时可以选择是否启用并行处理
import time
start_time = time.time()
tokenized_dataset = preprocess_dataset(
    dataset, 
    augmentations=augmentation_config,
    use_parallel=False,  # 设置为 False 则使用非并行处理
    num_workers=8
)
print(f"预处理耗时: {time.time() - start_time} 秒")



Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

预处理耗时: 366.195396900177 秒


In [11]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [10]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

## Anything from here on can be modified

In [6]:
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [12]:
# PEFT Config
# peft_config = LoraConfig(
#     r=2,
#     lora_alpha=4,
#     lora_dropout=0.05,
#     bias = 'none',
#     target_modules = ['query'],
#     task_type="SEQ_CLS",
# )
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "value"],
    task_type="SEQ_CLS"
)

In [12]:
# !module load gcc
# !which gcc


In [13]:
# import os
# os.environ["CC"] = "/share/apps/NYUAD5/gcc/9.2.0/bin/gcc"


In [13]:
peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Module

In [15]:
# print("Trainable parameters:")
# for name, param in peft_model.named_parameters():
#     if param.requires_grad:
#         print(name)

In [14]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 741,124 || all params: 125,389,832 || trainable%: 0.5911


## Training Setup

In [15]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [10]:
# Setup Training args
output_dir = "results"
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     report_to=None,
#     eval_strategy='steps',
#     logging_steps=100,
#     learning_rate=5e-6,
#     num_train_epochs=1,
#     max_steps=1200,
#     use_cpu=False,
#     dataloader_num_workers=4,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     optim="sgd",
#     gradient_checkpointing=False,
#     gradient_checkpointing_kwargs={'use_reentrant':True}
# )
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    logging_steps=100,
    eval_steps=200,
    save_steps=400,
    save_total_limit=2,

    # # device
    # device='cuda:3',

    learning_rate=1e-4,  # 对 LoRA 来说比较合理
    warmup_ratio=0.1,
    fp16=True,

    num_train_epochs=2,
    
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,

    optim="adamw_torch",  # 比 sgd 效果好很多
    weight_decay=0.01,

    # use tensorboard
    # use_tensorboard=True,

    gradient_checkpointing=True,
    gradient_accumulation_steps=2,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    run_name="lora_finetuning_run_1",
    
     # 添加进度条显示
    logging_first_step=True,  # 显示第一步的日志
    logging_nan_inf_filter=False,  # 显示所有日志，包括 NaN 和 Inf
    logging_strategy="steps",  # 按步数记录日志
    label_names=["labels"]
)


def get_trainer(model):
    # 设置模型的标签名称
    model.config.label2id = {label: i for i, label in enumerate(class_names)}
    model.config.id2label = {i: label for i, label in enumerate(class_names)}
    return  Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] #早停
    )

### Start Training

In [17]:
!export CUDA_VISIBLE_DEVICES=3


In [25]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

result = peft_lora_finetuning_trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy
200,0.6895,0.620559,0.83125
400,0.6299,0.570813,0.825
600,0.6129,0.544243,0.825
800,0.5985,0.528592,0.826562
1000,0.6017,0.521703,0.821875
1200,0.6102,0.514241,0.825
1400,0.5868,0.507734,0.83125
1600,0.5732,0.508855,0.825
1800,0.5769,0.508148,0.828125


  return fn(*args, **kwargs)
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
  return fn(*args, **kwargs)
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
  return fn(*args, **kwargs)
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
  return fn(*args, **kwargs)
    There is an imbalance between your GPUs. You may want to exclude GP

In [26]:
# 保存模型
peft_model_path = os.path.join(output_dir, "peft_model_sentiment")
peft_model.save_pretrained(peft_model_path)
# 保存tokenizer
tokenizer.save_pretrained(peft_model_path)
print(f"Model saved to {peft_model_path}")


Model saved to results/peft_model_sentiment


## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [15]:
# def classify(model, tokenizer, text):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
#     output = model(**inputs)

#     prediction = output.logits.argmax(dim=-1).item()

#     print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
#     return id2label[prediction]

In [16]:
# classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
# classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [27]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=32, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [28]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 32, data_collator)

100%|██████████| 20/20 [00:02<00:00,  9.08it/s]

Evaluation Metric: {'accuracy': 0.828125}





## 重新导入模型

In [7]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 加载模型
peft_model_path = "/home/viewsetting/ssd_2T/results/peft_model_sentiment"
config = PeftConfig.from_pretrained(peft_model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=4  # 确保与训练时的类别数量一致
)
peft_model = PeftModel.from_pretrained(model, peft_model_path)

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=32, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [12]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 32, data_collator)

100%|██████████| 20/20 [00:02<00:00,  7.93it/s]

Evaluation Metric: {'accuracy': 0.8390625}





### Run Inference on unlabelled dataset

In [16]:
# #Load your unlabelled data
# unlabelled_dataset = pd.read_pickle("/home/viewsetting/ssd_2T/test_unlabelled.pkl")
# test_dataset = unlabelled_dataset.map(preprocess_dataset, batched=True, remove_columns=["text"])
# unlabelled_dataset

# 首先检查加载的数据格式
unlabelled_dataset = pd.read_pickle("/home/viewsetting/ssd_2T/test_unlabelled.pkl")
print("Type of unlabelled_dataset:", type(unlabelled_dataset))
print("\nStructure of unlabelled_dataset:")
print(unlabelled_dataset)

# 如果已经是 Dataset 对象，直接使用
if isinstance(unlabelled_dataset, Dataset):
    test_dataset = unlabelled_dataset
else:
    # 如果是 DataFrame，转换为 Dataset
    test_dataset = Dataset.from_pandas(unlabelled_dataset)

# 应用预处理
test_dataset = test_dataset.map(
    lambda examples: process_single_example(examples, augmentations=None),
    batched=True,
    remove_columns=["text"]
)

# 运行推理并保存预测结果
preds = evaluate_model(peft_model, test_dataset, False, 32, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()
})
df_output.to_csv(os.path.join(output_dir, "inference_output_sentiment.csv"), index=False)
print("推理完成。预测结果已保存到 inference_output.csv")

Type of unlabelled_dataset: <class 'datasets.arrow_dataset.Dataset'>

Structure of unlabelled_dataset:
Dataset({
    features: ['text'],
    num_rows: 8000
})


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

KeyError: 'label'

In [17]:
def process_single_example(example, augmentations=None):
    """处理单个样本：清理文本、应用增强、分词"""
    if augmentations is None:
        augmentations = {
            'synonym': False,
            'delete': False,
            'swap': False,
            'paraphrase': False,
            'noise': False,
            'sentiment': False
        }

    # 检查是否是批处理模式
    if isinstance(example['text'], (list, np.ndarray)):
        # 批处理模式
        texts = example['text']
        
        # 处理每个文本
        cleaned_texts = [clean_text(t) for t in texts]
        aug_texts = []
        
        for text in cleaned_texts:
            aug_text = text
            if augmentations.get('synonym', False):
                aug_text = synonym_aug.augment(aug_text)[0]
            if augmentations.get('delete', False):
                aug_text = delete_aug.augment(aug_text)[0]
            if augmentations.get('swap', False):
                aug_text = swap_aug.augment(aug_text)[0]
            if augmentations.get('paraphrase', False):
                aug_text = simple_paraphrase(aug_text)
            if augmentations.get('noise', False):
                aug_text = typo_aug.augment(aug_text)[0]
            aug_texts.append(aug_text)
            
        # 批量分词
        return tokenizer(aug_texts, truncation=True, padding=True, max_length=256)
    else:
        # 单个样本模式
        text = clean_text(example['text'])

        # 应用增强
        aug_text = text
        if augmentations.get('synonym', False):
            aug_text = synonym_aug.augment(aug_text)[0]
        if augmentations.get('delete', False):
            aug_text = delete_aug.augment(aug_text)[0]
        if augmentations.get('swap', False):
            aug_text = swap_aug.augment(aug_text)[0]
        if augmentations.get('paraphrase', False):
            aug_text = simple_paraphrase(aug_text)
        if augmentations.get('noise', False):
            aug_text = typo_aug.augment(aug_text)[0]

        # 分词
        return tokenizer(aug_text, truncation=True, padding=True, max_length=256)

# 使用修改后的函数处理测试数据
test_dataset = test_dataset.map(
    lambda examples: process_single_example(examples, augmentations=None),
    batched=True,
    remove_columns=["text"]
)

# 运行推理并保存预测结果
preds = evaluate_model(peft_model, test_dataset, False, 32, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()
})
df_output.to_csv(os.path.join(output_dir, "inference_output_sentiment.csv"), index=False)
print("推理完成。预测结果已保存到 inference_output.csv")

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

100%|██████████| 250/250 [00:25<00:00,  9.65it/s]

推理完成。预测结果已保存到 inference_output.csv





In [None]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 32, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output_sentiment.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")