<a href="https://colab.research.google.com/github/GavinButts/MATH470Final/blob/main/nofinetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install transformers
!pip install torch
!pip install datasets
!pip install evaluate
!pip install collections

In [None]:
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
import os
from collections import defaultdict
import random

# Debugging aid: with CUDA_LAUNCH_BLOCKING=1, CUDA kernel launches run
# synchronously, so a device-side error is reported at the call that caused it.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Load tokenizer and model
# num_labels=3 attaches a 3-way classification head. The load warning in the
# cell output reports that the classifier weights are newly initialized, so
# the evaluation further down measures an untuned head.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

# Load dataset directly with HuggingFace `datasets`
# The recorded output shows train, validation and test splits being generated.
dataset = load_dataset("snli")

# Tokenize the dataset
def preprocess(examples):
    """Tokenize a batch of premise/hypothesis pairs and attach integer labels.

    Args:
        examples: Batch mapping that provides "premise", "hypothesis" and
            "label" columns.

    Returns:
        The tokenizer output for the sentence pairs (truncated and padded to
        128 tokens) with an added "label" entry holding the labels as ints.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    batch = tokenizer(examples["premise"], examples["hypothesis"], **encoding_options)
    # Coerce every label to a plain int before handing the batch back.
    batch["label"] = list(map(int, examples["label"]))
    return batch

# Apply preprocessing
# batched=True passes `preprocess` whole batches (columns as lists) instead of
# single rows, which is what its list comprehension over labels expects.
encoded_dataset = dataset.map(preprocess, batched=True)

# Remove unnecessary columns
# Drop the raw text and expose only the model inputs and the label, returned
# as torch tensors.
encoded_dataset = encoded_dataset.remove_columns(["premise", "hypothesis"])
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Split the dataset
# Only the train and validation splits are used below.
train_dataset = encoded_dataset["train"]
val_dataset = encoded_dataset["validation"]

# Filter out invalid labels from the train and validation datasets
# Keeps only rows labelled 0, 1 or 2 (presumably SNLI marks pairs without a
# gold label as -1 — confirm against the dataset card).
# NOTE(review): filtering happens after tokenization, so the dropped rows were
# tokenized for nothing; filtering first would avoid that work.
train_dataset = train_dataset.filter(lambda example: example["label"] in [0, 1, 2])
val_dataset = val_dataset.filter(lambda example: example["label"] in [0, 1, 2])

# Stratified sampling for the training dataset
def stratified_sample(dataset, fraction=0.5, seed=24):
    """Return a label-stratified random subset of ``dataset``.

    Rows are grouped by their "label" value and the same ``fraction`` of rows
    is drawn (without replacement) from every group, so the label distribution
    of the result matches the input up to rounding.

    Args:
        dataset: Iterable of rows supporting ``row["label"]`` and exposing a
            ``select(indices)`` method. Labels may be plain hashable values or
            objects with an ``.item()`` method (e.g. 0-d tensors).
        fraction: Share of each label group to keep, between 0 and 1. The
            per-group count is ``int(len(group) * fraction)`` (rounded down).
        seed: Seed for the sampling RNG; the same seed yields the same subset.

    Returns:
        ``dataset.select(indices)`` for the sampled row indices. The indices
        are grouped by label, in order of each label's first appearance.

    Raises:
        ValueError: If ``fraction`` is not within [0, 1].
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    # Group row indices by label.
    label_to_indices = defaultdict(list)
    for idx, example in enumerate(dataset):
        label = example["label"]
        # Tensor-formatted rows carry 0-d tensors; unwrap them so the label is
        # a hashable Python scalar. Plain values are used as they are.
        if hasattr(label, "item"):
            label = label.item()
        label_to_indices[label].append(idx)

    # A private generator produces the same draws as seeding the module-level
    # one, without resetting the global `random` state for the rest of the
    # program.
    rng = random.Random(seed)
    sampled_indices = []
    for indices in label_to_indices.values():
        sample_size = int(len(indices) * fraction)
        sampled_indices.extend(rng.sample(indices, sample_size))

    # Select the sampled examples
    return dataset.select(sampled_indices)

# Reduce the training dataset to 50% of its original size while preserving label distribution
# NOTE(review): in this cell the reduced train set is only handed to Trainer;
# trainer.train() is never called, so this pass over the data does not affect
# the reported evaluation.
train_dataset = stratified_sample(train_dataset, fraction=0.5)

# Define training arguments
# Only trainer.evaluate() is called in this cell, so the training-specific
# settings (train batch size, epoch count, evaluation schedule) are not
# exercised here.
training_args = TrainingArguments(
    output_dir="./snli_base_model",
    evaluation_strategy="epoch",  # NOTE(review): newer transformers releases name this `eval_strategy` — confirm for the installed version
    save_strategy="no",  # Disable saving
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # Not used in this cell: trainer.train() is never called
    logging_dir=None,
    report_to=[],  # Disable external logging integrations
    fp16=False,
)

# Load metrics
# One `evaluate` metric per score reported by compute_metrics, loaded in the
# order accuracy, precision, recall, f1.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision, recall and F1 for an evaluation run.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class for each row
            is the argmax of ``logits`` along the last axis.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    # zero_division=0: a class that is never predicted (or never present) has
    # an undefined score. It already counted as 0 in the weighted average;
    # stating it explicitly keeps the value and avoids the per-evaluation
    # "ill-defined" warning seen when the model predicts a single class.
    precision = precision_metric.compute(
        predictions=predictions, references=labels, average="weighted", zero_division=0
    )
    recall = recall_metric.compute(
        predictions=predictions, references=labels, average="weighted", zero_division=0
    )
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

# Trainer for evaluation only
# NOTE(review): train_dataset is passed but trainer.train() is never called in
# this cell, so only eval_dataset and compute_metrics influence the results.
# NOTE(review): the recorded output contains a warning pointing at this
# Trainer(...) call — presumably a deprecation of the `tokenizer=` argument;
# confirm against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Evaluate the base model
# The classification head is newly initialized, so these scores serve as an
# untuned baseline (the recorded run reports accuracy of about 0.338).
print("\nEvaluating the Base Model:")
eval_results = trainer.evaluate(eval_dataset=val_dataset)
print(eval_results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]



Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

  trainer = Trainer(



Evaluating the Base Model:


{'eval_loss': 1.10783851146698, 'eval_model_preparation_time': 0.0035, 'eval_accuracy': 0.3382442592968909, 'eval_precision': 0.11440917894730235, 'eval_recall': 0.3382442592968909, 'eval_f1': 0.17098400109321232, 'eval_runtime': 25.681, 'eval_samples_per_second': 383.241, 'eval_steps_per_second': 23.987}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
