In [1]:
!pip install -q --upgrade transformers torch torchvision torchaudio
!pip install -q tokenizers==0.13.3
!pip install -q bitsandbytes transformers accelerate gradio thread6

[0m

In [2]:
# continuing from the previous chapter, here is how we would train a sequence classifier on one batch in pytorch
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# same as last chapter
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# sentences
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

batch = tokenizer(sequences, padding = True, truncation = True, return_tensors = "pt")

# This part is new
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### The above code trains the model on just 2 sentences. 
#### As you can tell, this won't do any good as its just 2 sentences
#### To get better results, we need to prepare a bigger dataset
#### In this next cell, we will use the MRPC dataset, it consists of 5,801 pairs of sentences, with a label indicating whether they are paraphrased or not

In [4]:
# The huggingface library provides a simple command to download and cache the dataset on the hub

# we can download teh MRPC dataset like this:
from datasets import load_dataset

# note that we use 'glue'
# 'glue' is the benchmark that is composed of 10 datasets including the 'mrpc' dataset
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

#### The 'DatasetDict' object above contains the training, validation, and test sets. Each of those contains several columns ->
#### -> (sentence1, sentence2, label, and idx) and a variable number of rows(num_rows)
#### The command downloads and caches the dataset by default in '~/.cache/huggingface/dataset'

In [9]:
# We can access each pair of sentences in our raw_datasets object by indexing, like in a dictionary
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [15]:
# Exercise
print(raw_train_dataset[14]) # element 15 of the training set


raw_validation_dataset = raw_datasets["validation"]
print("\n",raw_validation_dataset[86]) # element 87 of validation set

# Note: the validation is 87 but since its previously split it'll mark a big number (796) since its technically that number if it wasn't split between train, val, test

{'sentence1': 'Gyorgy Heizler , head of the local disaster unit , said the coach was carrying 38 passengers .', 'sentence2': 'The head of the local disaster unit , Gyorgy Heizler , said the coach driver had failed to heed red stop lights .', 'label': 0, 'idx': 15}

 {'sentence1': 'He was arrested Friday night at an Alpharetta seafood restaurant while dining with his wife , singer Whitney Houston .', 'sentence2': 'He was arrested again Friday night at an Alpharetta restaurant where he was having dinner with his wife .', 'label': 1, 'idx': 796}


#### To preprocess the dataset, we need to convert the text to numbers teh model can make sense of.
#### This can be done with a tokenizer
#### We can feed the tokenizer one sentence or a list of sentences, so we can directly tokenize all the first sentences and all the second sentences of each pair like this

In [None]:
# tokenize sentences

# we already tokenized called the model and AutoTokenizer previously so we'll comment that part out

# from transformers import AutoTokenizer

# checkpoint = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# remember that the ouput of our dataset is a dictionary with sentence1 and sentence2 as keys 
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

# the above tokenizes all instances in the dataset of sentence1 and sentence2 (don't print it lol)

In [20]:
# we can't just pass 2 sequences to the model and get a prediction of whether the sentences are paraphrased or not

# Fortunately, the tokenizer can take a pair of sequences and prepare it the way our BERT model expects
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

# notice the input_ids key and the attention_mask key

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

### 'token_type_ids': This is what tells the model which part of the input is the first sentence and which is the second sentence.

In [34]:
# Exercise
ex_element = raw_train_dataset[14]

exsent1 = tokenizer(ex_element['sentence1'])
exsent2 = tokenizer(ex_element['sentence2'])

ex_both = tokenizer([ex_element['sentence1'],ex_element[ 'sentence2']])

print(exsent1)
print("\n")
print(exsent2)
print("\n")
print(ex_both)

{'input_ids': [101, 1043, 7677, 22637, 2002, 10993, 3917, 1010, 2132, 1997, 1996, 2334, 7071, 3131, 1010, 2056, 1996, 2873, 2001, 4755, 4229, 5467, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


{'input_ids': [101, 1996, 2132, 1997, 1996, 2334, 7071, 3131, 1010, 1043, 7677, 22637, 2002, 10993, 3917, 1010, 2056, 1996, 2873, 4062, 2018, 3478, 2000, 18235, 2094, 2417, 2644, 4597, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


{'input_ids': [[101, 1043, 7677, 22637, 2002, 10993, 3917, 1010, 2132, 1997, 1996, 2334, 7071, 3131, 1010, 2056, 1996, 2873, 2001, 4755, 4229, 5467, 1012, 102], [101, 1996, 2132, 1997, 1996, 2334, 7071, 3131, 1010, 1043, 7677, 22637, 2

In [36]:
# we can decode the IDs inside input_ids back to words
tokenizer.convert_ids_to_tokens(inputs["input_ids"])


['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

#### The parts of the above input correspond to [CLS] sentence1 [SEP] all have a token type ID of 0.
#### While the other parts, corresponding to sentence2 [SEP], all have a token type ID of 1.

#### Note that if you select a different checkpoint, you'll probably have different token_type_ids in your tokenized inputs.
   * For example, they're not returned if you used a DistilBERT model

#### They are only returned when the model knows what to do with them
   * This is because the model has seen them during its pretraining

#### The model were using above is BERT, it has 2 objectives(things its trained to do)
   * Masked language modeling (predicting a missing word)
   * Next sentence predicition (Model the relationship between pairs of sentences)
      * This is done by providing MASKED sentences to the model in pairs, with half being from the same paragraph and the rest are random. 
      * The model is then tasked with predicting if the second sentence follows the first
      
#### You don't need to worry about token_type_ids in your tokenized input
   * This is as long as you use the same checkpoint for the tokenizer and the model
   * Everything will be fine as the tokenizer knows what to provide to its model

In [37]:
# Now we can tokenize the whole dataset

# we'll feed the tokenizer a list of pairs of sentences, we can include 'padding' and 'truncation'
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding = True,
    truncation = True,
)
# This works well but has the disadvantage of returning a dictionary
# Another disadvantage is that it will attempt to store the whole dataset in the RAM during tokenization

In [40]:
# To keep the data as a dataset, we'll use the 'Dataset.map' method

# This provides more flexibility if we need more preprocessing done other than tokenization

# The map method works by applying a function on each element of the dataset, so let's define a function that tokenizes our input
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation = True)

#### The funciton above takes a dictionary and returns a new dictionary with the keys input_ids, attention_mask, and token_type_ids.
#### It also works if the example dictionary contains several samples since the tokenizer works on lists of pairs of sentences
#### This alows us to use the option 'batched = True' in our call to 'map()', which will greatly speed up tokenization

#### We've left the padding argument out of our function for now. This is because padding all the samples to the max length is not efficient
   * Its better to pad the samples when we're building a batch, as then we only need to pad to the maximum length in that batch, and not the maximum length of the dataset
       * This can save time and processing power


In [41]:
# This is how we apply the tokenization function to all our datasets at once.
# we're using 'batched=True' in our call to 'map()' so that the function is applied to multiple elements of our dataset at once and not seperatly.
# This allows for faster processing
tokenized_datasets = raw_datasets.map(tokenize_function, batched = True)
tokenized_datasets

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

#### The function that is reponsible for putting together samples inside a batch is called the 'collate function'
#### It's an argument you can pass when building a 'Data Loader'
   * The default being a function that will just convert your samples to PyTorch tensors and concatenate them
   
#### We have deliberately postponed the padding, to only apply it as necessary on each batch and avoid having over-long inputs with a lot of padding
   * Example: a input_id that has like 3 digits and 20 0's since it had a few words in the original sentence but the longest sentence in the batch was 23 tokens long
  
#### To do this in practice, we have to define a collate function that will apply the correct amount of padding to the items of the dataset we want to batch together.
   * The 🤗 Transformers library provides such a function via 'DataCollatorWithPadding' and will do everything we need
       * It takes a tokenizer when you instantiate it(to know which padding token to use, and whether the model expects padding to be on the left or on the rightof the inputs)
       

In [44]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [45]:
# Lets test it by grabbing a few samples of our training set that we would like to batch together

# We remove idx, sentence1, and sentence2 as they won't be needed
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ['idx', 'sentence1', 'sentence2']}

[len(x) for x in samples['input_ids']]

# our output was between 32-67, dynmaic paddin means all samples in the batch should all be padded to a length of 67, the max length inside the batch

[50, 59, 47, 67, 59, 50, 62, 32]

In [46]:
# Let's double check that our 'data_collator' is dynamically padding the batch properly
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [47]:
# Exercise

# 'glue' is the benchmark that is composed of 10 datasets including the 'sst-2' dataset
sst_raw_datasets = load_dataset("glue", "sst2")
sst_raw_datasets

Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})