In [1]:
# Run this to download necessary libraries (If you have done this in the past in any notebook, you don't need to do this)
!pip install transformers[sentencepiece,torch] # HG Transformer’s Library
!pip install datasets # HG Dataset’s Library
!pip install huggingface_hub # HG sharing Library
!pip install torch # Pytorch
# Remember to turn on T4 GPU Accelerator under 'Runtime'->'Change runtime type'
import torch

if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("Using CPU")

# ***Download These Libraries As Well***
!pip install numpy
!pip install evaluate
!pip install accelerate -U

Collecting accelerate>=0.21.0 (from transformers[sentencepiece,torch])
  Using cached accelerate-0.29.3-py3-none-any.whl (297 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[sentencepiece,torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch->transformers[sentencepiece,torch])
  Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)
Installing collected packages: nvidia-cudnn-cu12, nvidia-cusolver-cu12, accelerate
Successfully installed accelerate-0.29.3 nvidia-cudnn-cu12-8.9.2.26 nvidia-cusolver-cu12-11.4.5.107
Collecting datasets
  Using cached datasets-2.19.0-py3-none-any.whl (542 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Dow

In [2]:
# James's 3.2 Preprocessing/Tokenization for Trainer API process
# Import Datasets, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the MRPC configuration of the GLUE benchmark
raw_datasets = load_dataset("glue", name="mrpc")

# The same checkpoint name is reused later when the model is created
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Create Function to Map
def tokenize_function(example):
    """Tokenize the sentence pair(s) held in ``example``.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair.
    """
    # No padding here: padding is left to the DataCollatorWithPadding defined in this notebook.
    encoding_options = {
        "truncation": True,
        "padding": False,
        "return_token_type_ids": True,
    }
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, **encoding_options)

# Apply tokenize_function to the dataset; batched=True passes many examples per call
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)

# Dynamic padding: the collator pads examples when a batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [3]:
# James's 3.3 Using Trainer API
# Define Model with specific head for its designed task

# Import AutoModel
from transformers import AutoModelForSequenceClassification
# num_labels=2 gives the two categories of the task:
# whether a pair of sentences are paraphrases of each other (equivalent) or not
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# OPTIONAL Keep Track of Progress

# Import Necessary Libraries
from transformers import TrainingArguments # For intializing TrainingArguments
import numpy as np # For the argmax
# Define the training arguments
training_args = TrainingArguments(
    output_dir="/content/results",   # Directory where training outputs are written
    evaluation_strategy="steps",     # Evaluate at step intervals ('steps') instead of once per 'epoch';
                                     # no eval_steps is given, so the library's default interval applies
    learning_rate=5e-5,              # Initial learning rate, i.e. the size of each optimizer update
    per_device_train_batch_size=16,  # Number of training examples per batch on each device
    num_train_epochs=3,              # Number of full passes over the training data
)

# Using evaluate.load (HG course's way)
import evaluate
def compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the class with the highest
            logit along the last axis is taken as the prediction.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        ``labels``.
    """
    # Load the metric only on the first call and cache it on the function:
    # this function runs once per evaluation pass, and reloading the metric
    # every time repeats the same lookup for no benefit.
    metric = getattr(compute_metrics, "_metric", None)
    if metric is None:
        metric = evaluate.load("glue", "mrpc")
        compute_metrics._metric = metric

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

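# # Using load_metric (alternative method that needs no extra package and gives the same results).
# # NOTE: `datasets.load_metric` is deprecated in favour of `evaluate.load` and has been removed
# # from newer `datasets` releases (3.0+), so this only works with older versions of `datasets`.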
# from datasets import load_metric
# def compute_metrics(eval_preds):
#     metric = load_metric("glue", "mrpc")
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [None]:
# Training the Model
# Import Trainer API Library
from transformers import Trainer

# Bundle the model, training arguments, data, and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the actual training.
# If this takes more than 5 minutes on Google Colab, check your runtime type
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 