<a href="https://colab.research.google.com/github/Jagoda222/LoLa---group-8/blob/Jagoda/model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports





In [4]:
!pip install accelerate -U
!pip install datasets==2.14.5




In [5]:
import torch
from os import path as op
import os
import numpy as np
from collections import Counter
import datasets
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# GPT2:
## First 1000 rows
- accuracy: +/- 40%
- time: 30s (GPU)

## Random 1000 rows with balanced labels
- accuracy: +/- 42%
- time: 30s (GPU)

In [17]:
# META Variables
# it is good to have certain directories for saving model checkpoints (e.g., on google drive)
MODEL_DIR = 'model_checkpoints'
MODEL_CHECKPOINT = "gpt2"
BATCH_SIZE = 16

snli_data = load_dataset("snli")
print(Counter(snli_data['train']['label']))

# SNLI data needs to be cleaned as it contains -1s as a label
for k in snli_data:
    snli_data[k] = snli_data[k].filter( lambda prob: prob['label'] >= 0 )


metric = load_metric('glue', "mnli")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Add a padding token for GPT-2 since it doesn’t have one by default
tokenizer.pad_token = tokenizer.eos_token

# https://huggingface.co/transformers/preprocessing.html
def preprocess_function(d):
    return tokenizer(d['premise'], d['hypothesis'], truncation=True)


model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)
model.config.pad_token_id = model.config.eos_token_id

encoded_snli_data = snli_data.map(preprocess_function, batched=True, load_from_cache_file=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

args = TrainingArguments(
    MODEL_DIR, # to save models
    # evaluation_strategy = "epoch", # 1 epoch for training takes too long for colab
    evaluation_strategy = "steps",
    eval_steps = 500, # evaluate and save after training on every next 500x16 examples
    save_steps=500, # saves model after every 500 steps. save_steps should be divisible on eval_steps
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1, # going throught the training data only once
    weight_decay=0.01,
    load_best_model_at_end=True, # after fine-tuning trainer.model will keep the best model
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_snli_data["train"].select(range(1000)),
    eval_dataset=encoded_snli_data["validation"],
    # You could use "test" here but it will be cheating then
    # to select the model checkpoint which gets highest score on test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Counter({0: 183416, 2: 183187, 1: 182764, -1: 785})


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss


TrainOutput(global_step=63, training_loss=1.098713587200831, metrics={'train_runtime': 19.9809, 'train_samples_per_second': 50.048, 'train_steps_per_second': 3.153, 'total_flos': 22141016764416.0, 'train_loss': 1.098713587200831, 'epoch': 1.0})

In [19]:
# evaluation of a particular model

# if you want to load a model from a checkpoint for evaluation
# ft_model = AutoModelForSequenceClassification.from_pretrained(op.join(MODEL_DIR, 'checkpoint-5000'))

trainer_eval = Trainer(
    trainer.model, # model that you want to evaluate, In this case this is the best model based on the fine-tuning
    args,
    train_dataset=encoded_snli_data["train"].select(range(1000)),
    eval_dataset=encoded_snli_data["test"], # you want to evaluate on test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer_eval.evaluate()

  trainer_eval = Trainer(


{'eval_loss': 1.0584849119186401,
 'eval_model_preparation_time': 0.0045,
 'eval_accuracy': 0.4551099348534202,
 'eval_runtime': 27.6495,
 'eval_samples_per_second': 355.304,
 'eval_steps_per_second': 22.207}

# DeBERTA:

## First 1000 rows
- accuracy: +/- 34%
- time: +/- 20 minutes (CPU), 35s (GPU)

## Random 1000 rows with balanced labels
- accuracy: +/- 33%
- time: 35s (GPU)

In [10]:
# META Variables
# it is good to have certain directories for saving model checkpoints (e.g., on google drive)
MODEL_DIR = 'model_checkpoints'
MODEL_CHECKPOINT = "microsoft/deberta-base"
BATCH_SIZE = 16

snli_data = load_dataset("snli")
print(Counter(snli_data['train']['label']))

# SNLI data needs to be cleaned as it contains -1s as a label
for k in snli_data:
    snli_data[k] = snli_data[k].filter( lambda prob: prob['label'] >= 0 )

metric = load_metric('glue', "mnli")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


# https://huggingface.co/transformers/preprocessing.html
def preprocess_function(d):
    return tokenizer(d['premise'], d['hypothesis'], truncation=True)

encoded_snli_data = snli_data.map(preprocess_function, batched=True, load_from_cache_file=True)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

args = TrainingArguments(
    MODEL_DIR, # to save models
    # evaluation_strategy = "epoch", # 1 epoch for training takes too long for colab
    evaluation_strategy = "steps",
    eval_steps = 500, # evaluate and save after training on every next 500x16 examples
    save_steps=500, # saves model after every 500 steps. save_steps should be divisible on eval_steps
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1, # going throught the training data only once
    weight_decay=0.01,
    load_best_model_at_end=True, # after fine-tuning trainer.model will keep the best model
    metric_for_best_model="accuracy",
)


trainer = Trainer(
    model,
    args,
    train_dataset=encoded_snli_data["train"].select(range(1000)),
    eval_dataset=encoded_snli_data["validation"],
    # You could use "test" here but it will be cheating then
    # to select the model checkpoint which gets highest score on test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Counter({0: 183416, 2: 183187, 1: 182764, -1: 785})


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

TrainOutput(global_step=63, training_loss=1.1051064143105158, metrics={'train_runtime': 25.5885, 'train_samples_per_second': 39.08, 'train_steps_per_second': 2.462, 'total_flos': 27776040271200.0, 'train_loss': 1.1051064143105158, 'epoch': 1.0})

In [11]:
# evaluation of a particular model

# if you want to load a model from a checkpoint for evaluation
# ft_model = AutoModelForSequenceClassification.from_pretrained(op.join(MODEL_DIR, 'checkpoint-5000'))

trainer_eval = Trainer(
    trainer.model, # model that you want to evaluate, In this case this is the best model based on the fine-tuning
    args,
    train_dataset=encoded_snli_data["train"].select(range(1000)),
    eval_dataset=encoded_snli_data["test"], # you want to evaluate on test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer_eval.evaluate()

  trainer_eval = Trainer(


{'eval_loss': 1.1019401550292969,
 'eval_model_preparation_time': 0.0035,
 'eval_accuracy': 0.34283387622149836,
 'eval_runtime': 30.1552,
 'eval_samples_per_second': 325.782,
 'eval_steps_per_second': 20.361}

# RoBERTA

## First 1000 rows
- accuracy: 32%
- time: 20 min (CPU), 30s (CPU)

## Random 1000 rows with balanced labels
- accuracy: 33%
- time: 30s (CPU)

In [13]:
# META Variables
# it is good to have certain directories for saving model checkpoints (e.g., on google drive)
MODEL_DIR = 'model_checkpoints'
MODEL_CHECKPOINT = "roberta-base"
BATCH_SIZE = 16

snli_data = load_dataset("snli")
print(Counter(snli_data['train']['label']))

# SNLI data needs to be cleaned as it contains -1s as a label
for k in snli_data:
    snli_data[k] = snli_data[k].filter( lambda prob: prob['label'] >= 0 )

metric = load_metric('glue', "mnli")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# https://huggingface.co/transformers/preprocessing.html
def preprocess_function(d):
    return tokenizer(d['premise'], d['hypothesis'], truncation=True)

encoded_snli_data = snli_data.map(preprocess_function, batched=True, load_from_cache_file=True)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

args = TrainingArguments(
    MODEL_DIR, # to save models
    # evaluation_strategy = "epoch", # 1 epoch for training takes too long for colab
    evaluation_strategy = "steps",
    eval_steps = 500, # evaluate and save after training on every next 500x16 examples
    save_steps=500, # saves model after every 500 steps. save_steps should be divisible on eval_steps
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1, # going throught the training data only once
    weight_decay=0.01,
    load_best_model_at_end=True, # after fine-tuning trainer.model will keep the best model
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_snli_data["train"].select(range(1000)),
    eval_dataset=encoded_snli_data["validation"],
    # You could use "test" here but it will be cheating then
    # to select the model checkpoint which gets highest score on test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Counter({0: 183416, 2: 183187, 1: 182764, -1: 785})


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss


TrainOutput(global_step=63, training_loss=1.1041773235987102, metrics={'train_runtime': 18.4116, 'train_samples_per_second': 54.314, 'train_steps_per_second': 3.422, 'total_flos': 24350324581296.0, 'train_loss': 1.1041773235987102, 'epoch': 1.0})

In [15]:
# evaluation of a particular model

# if you want to load a model from a checkpoint for evaluation
# ft_model = AutoModelForSequenceClassification.from_pretrained(op.join(MODEL_DIR, 'checkpoint-5000'))

trainer_eval = Trainer(
    trainer.model, # model that you want to evaluate, In this case this is the best model based on the fine-tuning
    args,
    train_dataset=encoded_snli_data["train"].select(range(1000)),
    eval_dataset=encoded_snli_data["test"], # you want to evaluate on test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer_eval.evaluate()

  trainer_eval = Trainer(


{'eval_loss': 1.0979324579238892,
 'eval_model_preparation_time': 0.0297,
 'eval_accuracy': 0.32797231270358307,
 'eval_runtime': 27.0222,
 'eval_samples_per_second': 363.553,
 'eval_steps_per_second': 22.722}

# RoBERTA 2 (from homework):

## First 1000 rows
- accuracy: 92%
- time: 1min (GPU)

## Random 1000 rows with the balanced labels
- accuracy: 91%
- time: 1min (GPU)

In [None]:
# META Variables
# it is good to have certain directories for saving model checkpoints (e.g., on google drive)
MODEL_DIR = 'model_checkpoints'
MODEL_CHECKPOINT = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
BATCH_SIZE = 16

metric = load_metric('glue', "mnli")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# https://huggingface.co/transformers/preprocessing.html
def preprocess_function(d):
    return tokenizer_r(d['premise'], d['hypothesis'], truncation=True)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

args = TrainingArguments(
    MODEL_DIR, # to save models
    # evaluation_strategy = "epoch", # 1 epoch for training takes too long for colab
    evaluation_strategy = "steps",
    eval_steps = 500, # evaluate and save after training on every next 500x16 examples
    save_steps=500, # saves model after every 500 steps. save_steps should be divisible on eval_steps
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1, # going throught the training data only once
    weight_decay=0.01,
    load_best_model_at_end=True, # after fine-tuning trainer.model will keep the best model
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_snli_data["train"].select(range(1000)),
    eval_dataset=encoded_snli_data["validation"],
    # You could use "test" here but it will be cheating then
    # to select the model checkpoint which gets highest score on test
    tokenizer=tokenizer_r,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# evaluation of a particular model

# if you want to load a model from a checkpoint for evaluation
# ft_model = AutoModelForSequenceClassification.from_pretrained(op.join(MODEL_DIR, 'checkpoint-5000'))

trainer_eval = Trainer(
    trainer.model, # model that you want to evaluate, In this case this is the best model based on the fine-tuning
    args,
    train_dataset=encoded_snli_data["train"].select(range(1000)),
    eval_dataset=encoded_snli_data["validation"], # you want to evaluate on test
    tokenizer=tokenizer_r,
    compute_metrics=compute_metrics
)

trainer_eval.evaluate()

  trainer_eval4 = Trainer(


{'eval_loss': 0.8884444832801819,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.6581995529363951,
 'eval_runtime': 25.8065,
 'eval_samples_per_second': 381.377,
 'eval_steps_per_second': 23.87}

# Extracting 1000 random rows with balanced labels

In [None]:
label_0 = encoded_snli_data["train"].filter(lambda x: x['label'] == 0).shuffle(seed=42).select(range(333))
label_1 = encoded_snli_data["train"].filter(lambda x: x['label'] == 1).shuffle(seed=42).select(range(333))
label_2 = encoded_snli_data["train"].filter(lambda x: x['label'] == 2).shuffle(seed=42).select(range(334))

random_1000 = concatenate_datasets([label_0, label_1, label_2])

random_1000 = random_1000.shuffle(seed=42)

print(random_1000)
print(Counter(random_1000['label']))
