# The Data Collators for Contrastive Learning
### This notebook is for reference only and does NOT the correspond to the final submission in the Kaggle Competition!
### To run this notebook, move this to the master directory of the project, otherwise it may NOT run correctly!
### The notebook is INCOMPLETE. We had no time for training the regression model for contrastive learning.

In [1]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle
import nlpaug.augmenter.word as naw
import nlpaug.flow as nafc

  from .autonotebook import tqdm as notebook_tqdm





In [None]:
base_model = 'roberta-base'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

pipeline = nafc.Sometimes([
    naw.SynonymAug(aug_src='wordnet'),
    naw.RandomWordAug(action='delete', aug_p=0.1),
    naw.RandomWordAug(action='swap', aug_p=0.05),
])

def create_views(text):
    aug1 = pipeline.augment(text)
    aug2 = pipeline.augment(text)

    # If pipeline returns list of one string instead of string, fix it
    if isinstance(aug1, list): aug1 = aug1[0]
    if isinstance(aug2, list): aug2 = aug2[0]
    return [aug1, aug2]

def preprocess_with_augmentation(examples):
    views1 = []
    views2 = []
    for text in examples['text']:
        aug1, aug2 = create_views(text)
        views1.append(aug1)
        views2.append(aug2)

    max_seq_len = 363  # Maximum length for tokenized AG news texts is 362

    tokens1 = tokenizer(views1, truncation=True, padding="max_length", max_length=max_seq_len)
    tokens2 = tokenizer(views2, truncation=True, padding="max_length", max_length=max_seq_len)

    return {
        'input_ids_view1': tokens1['input_ids'],
        'attention_mask_view1': tokens1['attention_mask'],
        'input_ids_view2': tokens2['input_ids'],
        'attention_mask_view2': tokens2['attention_mask'],
        'labels': examples['label']
    }

tokenized_dataset = dataset.map(
    preprocess_with_augmentation,
    batched=True,
    remove_columns=["text"],
)


Map: 100%|██████████| 120000/120000 [05:02<00:00, 396.04 examples/s]


In [3]:
class DataCollatorForContrastiveLearning:
    def __init__(self, tokenizer, max_length=None):
        self.tokenizer = tokenizer
        self.pad = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=max_length)

    def __call__(self, features):
        # Split inputs by view
        features_view1 = [
            {'input_ids': f['input_ids_view1'], 'attention_mask': f['attention_mask_view1']} for f in features
        ]
        features_view2 = [
            {'input_ids': f['input_ids_view2'], 'attention_mask': f['attention_mask_view2']} for f in features
        ]

        # Pad both views
        batch1 = self.pad(features_view1)
        batch2 = self.pad(features_view2)

        # Stack them: [batch_size, 2, seq_len]
        input_ids = torch.stack([batch1['input_ids'], batch2['input_ids']], dim=1)
        attention_mask = torch.stack([batch1['attention_mask'], batch2['attention_mask']], dim=1)

        # Get labels
        labels = torch.tensor([f['labels'] for f in features], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }
    
class DataCollatorForClassification:
    def __init__(self, tokenizer, max_length=None):
        self.pad = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=max_length)

    def __call__(self, features):
        # import pdb
        # pdb.set_trace()
        features_view1 = [
            {
                'input_ids': f['input_ids_view1'],
                'attention_mask': f['attention_mask_view1']
            }
            for f in features
        ]

        batch = self.pad(features_view1)
        batch['labels'] = torch.tensor([f['labels'] for f in features], dtype=torch.long)
        return batch


In [None]:
class DebugCollator:
    def __init__(self, base_collator):
        self.base_collator = base_collator

    def __call__(self, batch):
        print("Collator received batch of size:", len(batch))
        return self.base_collator(batch)

In [4]:
data_collator = DataCollatorForContrastiveLearning(tokenizer, max_length=363)

In [4]:
data_collator = DataCollatorForClassification(tokenizer, max_length=363)
# data_collator = DebugCollator(data_collator)

In [17]:
from torch.utils.data import DataLoader
batch = next(iter(DataLoader(tokenized_dataset, batch_size=4, collate_fn=data_collator)))

🔍 Collator received batch of size: 4


In [None]:
# Extract the number of classess and their names
class_names = dataset.features["label"].names
num_labels = len(class_names)
id2label = {i: label for i, label in enumerate(class_names)}

print(f"✔️ Augmented and tokenized dataset ready. Number of labels: {num_labels}, Classes: {class_names}")



✔️ Augmented and tokenized dataset ready. Number of labels: 4, Classes: ['World', 'Sports', 'Business', 'Sci/Tech']


In [6]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label) 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']
train_dataset = train_dataset.remove_columns(['label'])
eval_dataset = eval_dataset.remove_columns(['label'])

In [36]:
train_dataset

Dataset({
    features: ['input_ids_view1', 'attention_mask_view1', 'input_ids_view2', 'attention_mask_view2', 'labels'],
    num_rows: 119360
})

In [23]:
for i, f in enumerate(train_dataset):
    if 'input_ids_view1' not in f:
        print(f"🚫 Sample {i} missing 'input_ids_view1'")

In [8]:
# PEFT Config
peft_config = LoraConfig(
    r=2,
    lora_alpha=4,
    lora_dropout=0.05,
    bias = 'none',
    target_modules = ['query'],
    task_type="SEQ_CLS",
)

In [9]:
peft_model = get_peft_model(model, peft_config)
# peft_model

In [10]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 630,532 || all params: 125,279,240 || trainable%: 0.5033


### Test the data collators

In [18]:
batch = next(iter(DataLoader(train_dataset, batch_size=4, collate_fn=data_collator)))

peft_model.train()
output = peft_model(
    input_ids=batch['input_ids'].to(peft_model.device),
    attention_mask=batch['attention_mask'].to(peft_model.device),
    labels=batch['labels'].to(peft_model.device)
)
print(output.loss)

🔍 Collator received batch of size: 4
tensor(1.3891, device='cuda:0', grad_fn=<NllLossBackward0>)


In [29]:
batch = next(iter(DataLoader(eval_dataset, batch_size=4, collate_fn=data_collator)))

peft_model.eval()
output = peft_model(
    input_ids=batch['input_ids'].to(peft_model.device),
    attention_mask=batch['attention_mask'].to(peft_model.device),
    labels=batch['labels'].to(peft_model.device)
)
print(output.loss)

tensor(1.4454, grad_fn=<NllLossBackward0>)


## Training Setup

In [11]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [12]:
from transformers import EarlyStoppingCallback
# Setup Training args
output_dir = "results7"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    eval_steps=1000,
    save_strategy='steps',
    save_steps=1000,
    logging_steps=1000,
    learning_rate=1e-4,
    num_train_epochs=5,
    use_cpu=False,
    dataloader_num_workers=0,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant':True},
    load_best_model_at_end=True,
    remove_unused_columns = False
)

def get_trainer(model):
      return  Trainer(
          model=model,
          args=training_args,
          compute_metrics=compute_metrics,
          train_dataset=train_dataset,
          eval_dataset=eval_dataset,
          data_collator=data_collator,
          callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
      )

### Start Training

In [13]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

result = peft_lora_finetuning_trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mly2414[0m ([33mly2414-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
1000,0.5697,0.336833,0.896875
2000,0.3473,0.332048,0.8875
3000,0.3255,0.328261,0.895312
4000,0.3062,0.320629,0.890625
5000,0.3067,0.308067,0.885938
6000,0.2893,0.317132,0.885938
7000,0.2895,0.30403,0.895312
8000,0.2878,0.287343,0.898438
9000,0.2776,0.281952,0.898438
10000,0.2686,0.283971,0.903125
