# Starter Notebook

Install and import required libraries

In [41]:
# !pip install transformers datasets evaluate accelerate peft trl bitsandbytes
# !pip install nvidia-ml-py3

In [42]:
import os
import pandas as pd
import torch
import random
import re
import nltk
import pickle
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac

from transformers import (
    RobertaModel,
    RobertaTokenizer, 
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    RobertaForSequenceClassification,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel

# Download the NLTK resources used by the augmentation step further down.
# WordNet backs the synonym augmenter (it is created with aug_src='wordnet').
nltk.download('wordnet')
# POS-tagger data — presumably needed by nlpaug when choosing synonyms; TODO confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/viewsetting/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/viewsetting/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [43]:
# Select the compute device: prefer CUDA, then Apple MPS, then CPU.
# The second GPU (cuda:1) stays the preferred card, but only when it actually
# exists; on a single-GPU machine we fall back to cuda:0 instead of handing
# out a device ordinal that fails the first time a tensor is moved to it.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"使用设备: {device}")

使用设备: cuda:1


In [44]:
import wandb
# Never commit an API key to the notebook. The key is taken from the
# WANDB_API_KEY environment variable; when that variable is unset, key=None is
# passed and wandb is left to resolve credentials itself (typically an
# interactive prompt).
# NOTE(review): the key that used to be hardcoded on this line is exposed in
# the file history and should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"), relogin=True)




True

In [45]:
# Get current wandb entity
# Reads the default entity reported by the wandb API object and prints it.
current_entity = wandb.api.default_entity
print(f"Current wandb entity: {current_entity}")


Current wandb entity: jl10897-new-york-university


## Load Tokenizer and Preprocess Data

In [46]:
# base_model = 'roberta-base'
# # base_model = 'roberta-large'

# dataset = load_dataset('ag_news', split='train')
# tokenizer = RobertaTokenizer.from_pretrained(base_model)

# def clean_text(text):
#     text = re.sub(r"<.*?>", "", text)                 # 去除 HTML 标签
#     text = re.sub(r"http\S+|www\S+", "", text)        # 移除 URL
#     text = re.sub(r"[^A-Za-z0-9.,!?;:'\"()\[\]\s]", "", text)  # 去除特殊字符
#     text = re.sub(r"\s+", " ", text).strip()          # 去除多余空格
#     return text.lower()                               # 可选：统一小写
    
# def preprocess(examples):
#     cleaned_texts = [clean_text(t) for t in examples['text']]
#     return tokenizer(cleaned_texts, truncation=True, padding=True, max_length=256)

# tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [47]:
# Fetch the English averaged-perceptron tagger data into an explicit directory.
# NOTE(review): the download directory is an absolute, user-specific path.
nltk.download('averaged_perceptron_tagger_eng', download_dir='/home/viewsetting/nltk_data')



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/viewsetting/nltk_data...


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [48]:
# Add the custom download directory to NLTK's resource search path.
nltk.data.path.append('/home/viewsetting/nltk_data')

In [None]:
# Backbone checkpoint; the commented alternative switches to the larger variant.
base_model = 'roberta-base'
# base_model = 'roberta-large'

# Training split of AG News and the tokenizer that matches the chosen backbone.
dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# Initialize augmenters
# One augmenter per perturbation: WordNet synonym replacement, random word
# deletion, random word swap, and character-swap noise.
# The aug_p / aug_char_p / aug_word_p arguments are nlpaug's augmentation
# rates — presumably the fraction of words/characters touched; TODO confirm.
synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3)
delete_aug = naw.RandomWordAug(action='delete', aug_p=0.1)
swap_aug = naw.RandomWordAug(action='swap', aug_p=0.1)
typo_aug = nac.RandomCharAug(action='swap', aug_char_p=0.05, aug_word_p=0.1)

def clean_text(text):
    """Normalize raw text before augmentation and tokenization.

    Applies, in order: HTML-tag removal, URL removal, removal of every
    character outside the allowed set (ASCII letters, digits, basic
    punctuation, brackets and whitespace), collapsing of whitespace runs,
    trimming, and finally lowercasing.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    substitutions = (
        (r"<.*?>", ""),                            # strip HTML tags
        (r"http\S+|www\S+", ""),                   # remove URLs
        (r"[^A-Za-z0-9.,!?;:'\"()\[\]\s]", ""),    # drop special characters
        (r"\s+", " "),                             # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip().lower()

def sentence_shuffle(text):
    """Return `text` with its sentences placed in a random order.

    Sentences are found by splitting the stripped text on whitespace that
    follows '.', '!' or '?'. When fewer than two sentences are found the
    original `text` is returned unchanged (not stripped). Otherwise the
    sentences are shuffled in place with `random.shuffle` and joined with
    single spaces, skipping any piece that is empty after stripping.
    """
    parts = re.split(r'(?<=[.!?])\s+', text.strip())
    if len(parts) >= 2:
        random.shuffle(parts)
        return ' '.join(piece.strip() for piece in parts if piece.strip())
    # Nothing to reorder: hand back the input exactly as received.
    return text

def _apply_augmenter(augmenter, text):
    """Run one nlpaug augmenter on `text` and return a single string.

    Falls back to the unmodified input when the augmenter yields nothing
    (for instance on an empty string), and also accepts augmenters that
    return a bare string rather than a list.
    """
    result = augmenter.augment(text)
    if isinstance(result, str):
        return result
    if not result:
        return text
    return result[0]


def preprocess_with_augmentation(examples, augmentations=None):
    """Clean, optionally augment, and tokenize a batch of examples.

    Args:
        examples: Batch mapping with a 'text' entry holding the raw strings.
        augmentations: Optional dict of flags keyed by 'synonym', 'delete',
            'swap', 'shuffle' and 'noise'. Missing keys count as False; when
            the argument is None every augmentation is disabled.

    Returns:
        The tokenizer output for the processed texts (truncated to 256
        tokens and padded within the batch).
    """
    if augmentations is None:
        augmentations = {
            'synonym': False,
            'delete': False,
            'swap': False,
            'shuffle': False,
            'noise': False
        }

    # Clean texts first
    cleaned_texts = [clean_text(t) for t in examples['text']]
    augmented_texts = []

    for text in cleaned_texts:
        aug_text = text
        # The enabled augmentations are chained in a fixed order; each step
        # goes through _apply_augmenter so an empty augmenter result cannot
        # raise an IndexError and abort the whole map.
        if augmentations.get('synonym', False):
            aug_text = _apply_augmenter(synonym_aug, aug_text)
        if augmentations.get('delete', False):
            aug_text = _apply_augmenter(delete_aug, aug_text)
        if augmentations.get('swap', False):
            aug_text = _apply_augmenter(swap_aug, aug_text)
        if augmentations.get('shuffle', False):
            aug_text = sentence_shuffle(aug_text)
        if augmentations.get('noise', False):
            aug_text = _apply_augmenter(typo_aug, aug_text)
        augmented_texts.append(aug_text)

    # Tokenize with same parameters as original
    return tokenizer(augmented_texts, truncation=True, padding=True, max_length=256)

# Example augmentation configuration
# Every augmentation is switched on, so each example goes through all five
# perturbations in sequence.
augmentation_config = {
    'synonym': True,
    'delete': True,
    'swap': True,
    'shuffle': True,  # Enable sentence shuffling
    'noise': True
}



# Clean, augment and tokenize the full training split in batches; the raw
# "text" column is dropped once it has been tokenized.
# NOTE(review): augmentation runs here on the whole dataset, before the
# train/eval split made later in the notebook, so the held-out rows are
# augmented as well — confirm this is intended.
tokenized_dataset = dataset.map(
    lambda examples: preprocess_with_augmentation(examples, augmentation_config),
    batched=True,
    remove_columns=["text"]
)
# Rename the label column to "labels".
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")



Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [50]:
# Extract the number of classes and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

# Collator built on the tokenizer; batches are returned as PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up the config for the pretrained model and download it from Hugging Face.

In [51]:
# Load the sequence-classification model from the chosen checkpoint, passing
# the id -> class-name mapping built from the dataset.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
# Bare expression so the notebook displays the module structure.
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

## Anything from here on can be modified

In [52]:
# Split the original training set
# 640 examples are held out for evaluation; the split is seeded (seed=42).
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

## Setup LoRA Config
Set up the PEFT config and wrap the model with PEFT for fine-tuning.

In [53]:
# PEFT Config
# peft_config = LoraConfig(
#     r=2,
#     lora_alpha=4,
#     lora_dropout=0.05,
#     bias = 'none',
#     target_modules = ['query'],
#     task_type="SEQ_CLS",
# )
# Active LoRA configuration: r=4, lora_alpha=16, dropout 0.05, bias="none",
# applied to the modules named "query" and "value", for a
# sequence-classification task.
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "value"],
    task_type="SEQ_CLS"
)

In [54]:
# !module load gcc
# !which gcc


In [55]:
# import os
# os.environ["CC"] = "/share/apps/NYUAD5/gcc/9.2.0/bin/gcc"


In [56]:
# Wrap the base classifier according to peft_config.
peft_model = get_peft_model(model, peft_config)
# Bare expression so the notebook displays the wrapped module structure.
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Module

In [57]:
# print("Trainable parameters:")
# for name, param in peft_model.named_parameters():
#     if param.requires_grad:
#         print(name)

In [58]:
# Report how many parameters are trainable versus the total.
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 741,124 || all params: 125,389,832 || trainable%: 0.5911


## Training Setup

In [59]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute evaluation accuracy for the Trainer.

    Args:
        pred: Object exposing `label_ids` (the reference labels) and
            `predictions` (per-class scores); the arg-max over the last
            axis of `predictions` is taken as the predicted class.

    Returns:
        A dict with the single key 'accuracy'.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [60]:
# Setup Training args
# Checkpoints and other Trainer outputs are written under this directory.
output_dir = "results"
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     report_to=None,
#     eval_strategy='steps',
#     logging_steps=100,
#     learning_rate=5e-6,
#     num_train_epochs=1,
#     max_steps=1200,
#     use_cpu=False,
#     dataloader_num_workers=4,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     optim="sgd",
#     gradient_checkpointing=False,
#     gradient_checkpointing_kwargs={'use_reentrant':True}
# )
# Active configuration: evaluate every 200 steps, checkpoint every 400 steps
# (keeping at most 2), and reload the checkpoint with the best accuracy when
# training ends.
# NOTE(review): report_to=None may not disable experiment reporting in every
# transformers version (None has been treated as "all installed
# integrations") — confirm the intended behaviour.
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    logging_steps=100,
    eval_steps=200,
    save_steps=400,
    save_total_limit=2,

    # # device
    # device='cuda:3',

    learning_rate=1e-4,  # a reasonable value for LoRA
    warmup_ratio=0.1,
    num_train_epochs=2,
    
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,

    optim="adamw_torch",  # works much better than sgd
    weight_decay=0.01,


    gradient_checkpointing=False,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    run_name="lora_finetuning_run_1",
    
     # Logging / progress display
    logging_first_step=True,  # log the very first step
    logging_nan_inf_filter=False,  # log every value, including NaN and Inf
    logging_strategy="steps",  # log on a step schedule
    label_names=["labels"]
)


def get_trainer(model):
    """Build a Trainer for `model` from the notebook's training setup.

    Side effect: overwrites `model.config.label2id` and
    `model.config.id2label` with mappings derived from `class_names`.

    The trainer uses the module-level `training_args`, `train_dataset`,
    `eval_dataset`, `data_collator` and `compute_metrics`, plus an
    early-stopping callback with a patience of 5.
    """
    # Set the label names on the model config.
    model.config.label2id = {name: index for index, name in enumerate(class_names)}
    model.config.id2label = dict(enumerate(class_names))

    # Early stopping
    early_stopping = EarlyStoppingCallback(early_stopping_patience=5)
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

### Start Training

In [61]:
# Restrict this process to GPU 3.
# A "!export ..." line only sets the variable inside a throwaway subshell, so
# the kernel never sees it; writing to os.environ changes the kernel's own
# environment instead.
# NOTE(review): CUDA reads this variable when it is first initialised, so the
# assignment is most reliable when executed before any CUDA call (ideally
# right after the imports) — confirm the placement.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


In [62]:
# Build the trainer around the LoRA-wrapped model.
peft_lora_finetuning_trainer = get_trainer(peft_model)

# Run fine-tuning; whatever train() returns is kept in `result`.
result = peft_lora_finetuning_trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Step,Training Loss,Validation Loss,Accuracy
200,0.6026,0.399347,0.86875
400,0.3915,0.383159,0.86875
600,0.3796,0.367866,0.875
800,0.3631,0.348966,0.8875
1000,0.3585,0.341553,0.889062
1200,0.3515,0.335361,0.8875
1400,0.3359,0.332197,0.889062
1600,0.3384,0.326703,0.890625
1800,0.3347,0.324699,0.890625


    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

In [63]:
# Save the model
peft_model_path = os.path.join(output_dir, "peft_model_昨天")
peft_model.save_pretrained(peft_model_path)
# Save the tokenizer into the same directory
tokenizer.save_pretrained(peft_model_path)
print(f"Model saved to {peft_model_path}")


Model saved to results/peft_model_昨天


## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment the following functions to run inference on custom inputs.

In [64]:
# def classify(model, tokenizer, text):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
#     output = model(**inputs)

#     prediction = output.logits.argmax(dim=-1).item()

#     print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
#     return id2label[prediction]

In [65]:
# classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
# classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [66]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=32, data_collator=None):
    """
    Run batched inference with a model and optionally score it.

    Args:
        inference_model: The model to run; it is moved to CUDA when
            available (otherwise CPU) and switched to eval mode.
        dataset: The dataset to iterate over.
        labelled (bool): When True, each batch must carry a "labels" entry
            and accuracy is computed; when False only predictions are made.
        batch_size (int): Number of examples per inference batch.
        data_collator: Collate function handed to the DataLoader; None
            selects the DataLoader default.

    Returns:
        (metrics, predictions) when labelled is True, otherwise just the
        predictions. Predictions are a single CPU tensor of class indices.
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    inference_model.to(target_device)
    inference_model.eval()

    if labelled:
        metric = evaluate.load('accuracy')
    batch_predictions = []

    for batch in tqdm(loader):
        # Every tensor of the batch has to live on the model's device.
        batch = {key: tensor.to(target_device) for key, tensor in batch.items()}
        with torch.no_grad():
            logits = inference_model(**batch).logits
        predicted = logits.argmax(dim=-1)
        batch_predictions.append(predicted.cpu())

        if labelled:
            # Reference labels are expected under the "labels" key.
            metric.add_batch(
                predictions=predicted.cpu().numpy(),
                references=batch["labels"].cpu().numpy()
            )

    # Stitch the per-batch predictions into one tensor.
    all_predictions = torch.cat(batch_predictions, dim=0)

    if not labelled:
        return all_predictions

    eval_metric = metric.compute()
    print("Evaluation Metric:", eval_metric)
    return eval_metric, all_predictions

In [71]:
# Check evaluation accuracy
# Both return values are discarded; evaluate_model prints the metric itself.
_, _ = evaluate_model(peft_model, eval_dataset, True, 32, data_collator)

KeyboardInterrupt: 

In [73]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the model
# NOTE(review): absolute, machine-specific path — confirm it points at the
# directory the model was saved to earlier in the notebook.
peft_model_path = "/home/viewsetting/ssd_2T/results/peft_model_昨天"
# The saved PEFT config records which base checkpoint to load.
config = PeftConfig.from_pretrained(peft_model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=4  # must match the number of classes used during training
)
# Load the saved PEFT weights on top of the freshly loaded base model.
peft_model = PeftModel.from_pretrained(model, peft_model_path)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
# Check evaluation accuracy
# Both return values are discarded; evaluate_model prints the metric itself.
_, _ = evaluate_model(peft_model, eval_dataset, True, 32, data_collator)

100%|██████████| 20/20 [00:02<00:00,  9.05it/s]

Evaluation Metric: {'accuracy': 0.890625}





### Run Inference on unlabelled dataset

In [68]:
#Load your unlabelled data
# NOTE(review): unpickling can execute arbitrary code from the file — only
# load pickles that come from a trusted source.
unlabelled_dataset = pd.read_pickle("/home/viewsetting/ssd_2T/test_unlabelled.pkl")
# No augmentation config is passed, so preprocess_with_augmentation uses its
# all-disabled default: the text is only cleaned and tokenized.
test_dataset = unlabelled_dataset.map(preprocess_with_augmentation, batched=True, remove_columns=["text"])
# Bare expression so the notebook displays the dataset summary.
unlabelled_dataset

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 8000
})

In [69]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 32, data_collator)
# One row per test example: its positional ID and the predicted class index.
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
# Build the destination once so the confirmation message always names the
# file that was actually written (the previous message cited a different name).
inference_output_path = os.path.join(output_dir, "inference_outpu_shuffle.csv")
df_output.to_csv(inference_output_path, index=False)
print(f"Inference complete. Predictions saved to {inference_output_path}")

100%|██████████| 250/250 [00:26<00:00,  9.50it/s]

Inference complete. Predictions saved to inference_output.csv



