In [6]:
"""!pip3 install datasets"""

'!pip3 install datasets'

In [1]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.tensor([1, 1])

tensor([1, 1])

In [3]:
# from previous notebook summary for 1 epoch
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
batch["labels"] = torch.tensor([1, 1]) # as both are positive
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from datasets import load_dataset

raw_datasets = load_dataset("paws", "labeled_final")
raw_datasets

Downloading readme: 100%|██████████| 9.79k/9.79k [00:00<00:00, 32.0MB/s]
Downloading data:   0%|          | 0.00/8.43M [00:00<?, ?B/s]'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 22b0f778-1cbc-46fc-9416-f5636a1e92b9)')' thrown while requesting GET https://huggingface.co/datasets/paws/resolve/161ece9501cf0a11f3e48bd356eaa82de46d6a09/labeled_final/train-00000-of-00001.parquet
Retrying in 1s [Retry 1/5].
Downloading data: 100%|██████████| 8.43M/8.43M [00:02<00:00, 3.80MB/s]
Downloading data: 100%|██████████| 1.24M/1.24M [00:00<00:00, 2.06MB/s]
Downloading data: 100%|██████████| 1.23M/1.23M [00:00<00:00, 2.89MB/s]
Generating train split: 100%|██████████| 49401/49401 [00:00<00:00, 867607.17 examples/s]
Generating test split: 100%|██████████| 8000/8000 [00:00<00:00, 889094.65 examples/s]
Generating validation split: 100%|██████████| 8000/8000 [00:00<00:00, 929331.19 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
})

In [11]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'id': 1,
 'sentence1': 'In Paris , in October 1560 , he secretly met the English ambassador , Nicolas Throckmorton , asking him for a passport to return to England through Scotland .',
 'sentence2': 'In October 1560 , he secretly met with the English ambassador , Nicolas Throckmorton , in Paris , and asked him for a passport to return to Scotland through England .',
 'label': 0}

In [13]:
raw_train_dataset.features # 0 corresponds to not-equivalent and 1 to equivalent

{'id': Value(dtype='int32', id=None),
 'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['0', '1'], id=None)}

In [None]:
# this will load all in memory
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [14]:
# this we can do in batches but without padding
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True) # truncation so sequences are not bigger than the model

""" This is because padding all the samples to the maximum length is not efficient: it’s better to 
pad the samples when we’re building a batch, as then we only need to pad to the maximum length in 
that batch, and not the maximum length in the entire dataset."""

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map: 100%|██████████| 49401/49401 [00:01<00:00, 26897.11 examples/s]
Map: 100%|██████████| 8000/8000 [00:00<00:00, 30945.02 examples/s]
Map: 100%|██████████| 8000/8000 [00:00<00:00, 31361.92 examples/s]


In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
})

In [16]:
# DYNAMIC PADDING
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


2024-03-10 16:10:52.534155: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-10 16:10:52.564807: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]] # we get samples of varying length, from 32 to 67

[69, 40, 30, 29, 35, 64, 44, 68]

In [18]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'id': torch.Size([8]),
 'input_ids': torch.Size([8, 69]),
 'token_type_ids': torch.Size([8, 69]),
 'attention_mask': torch.Size([8, 69]),
 'labels': torch.Size([8])}

In [24]:
from transformers import TrainingArguments
"""TrainingArguments class that will contain all the hyperparameters the Trainer will use for 
training and evaluation"""
training_args = TrainingArguments("/") #to automatically upload your model to the Hub during training, pass along push_to_hub=True in the TrainingArguments.

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [25]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

NameError: name 'training_args' is not defined

In [None]:
trainer.train()
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import evaluate
import numpy as np
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [27]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`