<a href="https://colab.research.google.com/github/GavinButts/MATH470Final/blob/main/fullfinetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import all necessary packages.
- If you have all packages installed already, you may skip the first code block
- If you do not have all packages installed already, run both code blocks.


If an error occurs while installing or importing the packages, remove `%%capture` and restart the runtime. Then follow the error messages to resolve the issue.

In [None]:
%%capture
!pip install transformers
!pip install torch
!pip install datasets
!pip install evaluate
!pip install collections

In [None]:
import json
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from datasets import load_dataset
import evaluate
import os
from collections import defaultdict
import random
# Debugging aid: with CUDA_LAUNCH_BLOCKING=1, CUDA kernel launches run synchronously so
# errors are reported at the call that caused them.
# NOTE(review): this typically slows GPU training — consider removing once the run is stable.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

Now, we will prepare the dataset. This process includes the following steps.

- Remove data points with low inter-rater reliability (these are data points where the annotators could not agree on a correct label).
- Map categorical variable to numerical values.
  - entailment → 0
  - neutral → 1
  - contradiction → 2

In [None]:
# Load dataset directly with HuggingFace `datasets`
# (the result holds the train, validation and test splits)
dataset = load_dataset("snli")

# Load tokenizer and model
# `num_labels=3` sizes the classification head for the three classes listed above
# (entailment, neutral, contradiction).
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

# Tokenize the dataset
def preprocess(examples):
    """Tokenize a batch of premise/hypothesis pairs and attach integer labels.

    Args:
        examples: A batch (mapping of column name to list) with the columns
            "premise", "hypothesis" and "label".

    Returns:
        The tokenizer output for the sentence pairs, truncated and padded to
        exactly 128 tokens, with an added "label" entry holding the labels as
        plain Python ints.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]

    # Each premise is encoded together with its hypothesis as one sequence pair.
    encoded = tokenizer(
        premises,
        hypotheses,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    # Coerce every label to a built-in int.
    encoded["label"] = list(map(int, examples["label"]))
    return encoded

# Apply preprocessing to every split, one batch of examples at a time
encoded_dataset = dataset.map(preprocess, batched=True)

# Remove unnecessary columns (the raw text is no longer needed once tokenized)
encoded_dataset = encoded_dataset.remove_columns(["premise", "hypothesis"])
# Expose only the model inputs and the label, formatted as PyTorch tensors
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Split the dataset
train_dataset = encoded_dataset["train"]
val_dataset = encoded_dataset["validation"]

# Filter out invalid labels from the train and validation datasets:
# only examples labelled 0, 1 or 2 are kept.
# NOTE(review): presumably the dropped rows are the ones without annotator consensus — confirm.
# NOTE(review): labels may arrive here as tensors; keep the membership test on a list,
# since a set lookup relies on hashing and would not match tensors by value — confirm.
train_dataset = train_dataset.filter(lambda example: example["label"] in [0, 1, 2])
val_dataset = val_dataset.filter(lambda example: example["label"] in [0, 1, 2])

# Stratified sampling for the training dataset
def stratified_sample(dataset, fraction=0.5, seed=24):
    """Return a label-stratified random subset of ``dataset``.

    Every label keeps ``int(count * fraction)`` of its examples, so the label
    distribution of the subset matches the original up to rounding.

    Only the "label" column is read. Iterating whole rows would also convert
    every example's token tensors, which is very slow on a large dataset.
    Sampling uses a private ``random.Random(seed)``, so the global ``random``
    state is left untouched while the result stays reproducible.

    Args:
        dataset: Object supporting ``dataset["label"]`` (an iterable of labels,
            either plain values or scalars exposing ``.item()``) and
            ``dataset.select(indices)``.
        fraction: Share of each label group to keep, between 0 and 1.
        seed: Seed for the sampler.

    Returns:
        ``dataset.select(...)`` applied to the sampled indices. The indices
        are grouped by label, in order of each label's first appearance.

    Raises:
        ValueError: If ``fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    # Group example indices by label, reading only the label column.
    label_to_indices = defaultdict(list)
    for idx, label in enumerate(dataset["label"]):
        # Tensor/NumPy scalars are unwrapped so they can serve as dictionary keys.
        key = label.item() if hasattr(label, "item") else label
        label_to_indices[key].append(idx)

    # Randomly sample from each group with a local, seeded generator.
    rng = random.Random(seed)
    sampled_indices = []
    for indices in label_to_indices.values():
        sample_size = int(len(indices) * fraction)
        sampled_indices.extend(rng.sample(indices, sample_size))

    # Select the sampled examples
    return dataset.select(sampled_indices)

# Reduce the training dataset to 50% of its original size while preserving label distribution.
# The default seed of `stratified_sample` is used, so the same subset is drawn on every run.
train_dataset = stratified_sample(train_dataset, fraction=0.5)

# Verify the label distribution after sampling
#train_labels = [example["label"].item() for example in train_dataset]
#val_labels = [example["label"].item() for example in val_dataset]

#print(f"Reduced training dataset size: {len(train_dataset)}")
#print(f"Validation dataset size: {len(val_dataset)}")
#print(f"Train label distribution: { {label: train_labels.count(label) for label in set(train_labels)} }")
#print(f"Validation label distribution: { {label: val_labels.count(label) for label in set(val_labels)} }")

# Verify the labels
#train_labels = [example["label"].item() for example in train_dataset]
#val_labels = [example["label"].item() for example in val_dataset]

#print("Unique train labels after filtering:", set(train_labels))
#print("Unique validation labels after filtering:", set(val_labels))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

KeyboardInterrupt: 

Now, we will specify how to train the model. Since this is the first time we are running this model, we are not too concerned about learning rate, batch size, epochs, etc. We do, however, want to reduce logging, because logging is slow when running on CPU.

In [None]:
# Training configuration passed to the Trainer below.
training_args = TrainingArguments(
    output_dir="./snli_model",  # Checkpoints are written here
    # NOTE(review): recent transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",  # Evaluate once per epoch
    save_strategy="epoch",  # Save a checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # Adjust based on GPU memory
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # Keep at most two checkpoints on disk
    load_best_model_at_end=True,  # Reload the best checkpoint when training finishes
    logging_dir=None,
    report_to=[],  # Disable external logging integrations
    fp16=False,  # Mixed precision training is disabled; set to True to enable it
)

This code is only needed to evaluate the model.

In [None]:
# Accuracy scorer, loaded once and reused for every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score model outputs with the accuracy metric.

    Args:
        eval_pred: Unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits, compared against ``labels``.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = torch.tensor(raw_scores).argmax(dim=-1)
    return metric.compute(predictions=predicted, references=gold)

Most importantly-- train the model!

Model usage is linked to my account. To run this code, you will likely need to use my API key. The key is to be used solely for Loyola Marymount University's Machine Learning course offered by Dr. Junyuan Lin, FA2024.

API key: `<redacted: request it from the author and enter it when prompted; do not commit secrets to the notebook>`

In [None]:
# Show the GPU(s) visible to this runtime before starting training.
!nvidia-smi

In [None]:
# Trainer: ties together the model, the training configuration, the train/validation
# datasets, the tokenizer and the metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model and the tokenizer side by side in "./snli_model"
trainer.save_model("./snli_model")
tokenizer.save_pretrained("./snli_model")

Run this cell to evaluate the fine-tuned models on the SNLI test set.

In [None]:
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate

# Paths to your fine-tuned models
# NOTE(review): these are Google Drive paths, while the training cell saves to "./snli_model";
# presumably Drive must be mounted and the checkpoints copied there first — confirm.
fullfinetuned_model_path = "/content/drive/My Drive/ML Final/fullfinetune_ML_Final/checkpoint-3"
lorafinetuned_model_path = "/content/drive/My Drive/ML Final/lorafinetune_ML_Final/checkpoint-3"

# Load SNLI dataset again
dataset = load_dataset("snli")

# Load the tokenizer (same as during training)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Preprocessing function (same as during training)
def preprocess(examples):
    """Tokenize a batch of premise/hypothesis pairs exactly as done for training.

    Args:
        examples: A batch (mapping of column name to list) with the columns
            "premise", "hypothesis" and "label".

    Returns:
        The tokenizer output for the sentence pairs, truncated and padded to
        exactly 128 tokens, with an added "label" entry holding the labels as
        plain Python ints.
    """
    pair_encoding = tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    int_labels = []
    for raw_label in examples["label"]:
        int_labels.append(int(raw_label))
    pair_encoding["label"] = int_labels

    return pair_encoding

# Apply preprocessing to test set (tokenize in batches, then drop the raw text columns)
encoded_test = dataset["test"].map(preprocess, batched=True)
encoded_test = encoded_test.remove_columns(["premise", "hypothesis"])
# Expose only the model inputs and the label, formatted as PyTorch tensors
encoded_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Filter out invalid labels from the test dataset: only labels 0, 1 and 2 are kept
encoded_test = encoded_test.filter(lambda example: example["label"] in [0, 1, 2])

# Load evaluation metrics (one scorer per reported metric)
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy plus weighted precision, recall and F1.

    Args:
        eval_pred: Unpacks into ``(logits, labels)``.

    Returns:
        A dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are computed with ``average="weighted"``.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = torch.tensor(raw_scores).argmax(dim=-1)

    scores = {
        "accuracy": accuracy_metric.compute(predictions=predicted, references=gold)["accuracy"],
    }

    # The remaining metrics share one call signature, so they are computed in a loop.
    weighted_scorers = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for key, scorer in weighted_scorers:
        outcome = scorer.compute(predictions=predicted, references=gold, average="weighted")
        scores[key] = outcome[key]

    return scores

def evaluate_model(model_path):
    """Evaluate the checkpoint at ``model_path`` on the prepared test set.

    Args:
        model_path: Path passed to
            ``RobertaForSequenceClassification.from_pretrained``.

    Returns:
        Whatever ``Trainer.evaluate()`` returns for ``encoded_test``, scored
        with ``compute_metrics``.
    """
    # Load the fine-tuned classifier from disk.
    classifier = RobertaForSequenceClassification.from_pretrained(model_path)

    # Evaluation-only configuration: no external reporting, keep the last partial batch.
    eval_args = TrainingArguments(
        output_dir="./evaluation",
        per_device_eval_batch_size=16,
        dataloader_drop_last=False,
        report_to=[]
    )

    # The Trainer is used here purely as an evaluation harness.
    evaluator = Trainer(
        model=classifier,
        args=eval_args,
        tokenizer=tokenizer,
        eval_dataset=encoded_test,
        compute_metrics=compute_metrics
    )
    return evaluator.evaluate()

# Evaluate full fine-tuned model
print("Evaluating fully fine-tuned model:")
full_results = evaluate_model(fullfinetuned_model_path)
print(full_results)

# Evaluate LoRA fine-tuned model
# NOTE(review): `evaluate_model` loads the checkpoint with
# RobertaForSequenceClassification.from_pretrained; presumably this checkpoint holds
# weights that class can load directly (e.g. merged LoRA weights) — confirm.
print("Evaluating LoRA fine-tuned model:")
lora_results = evaluate_model(lorafinetuned_model_path)
print(lora_results)