<a href="https://colab.research.google.com/github/Nghiauet/Using_LLaMA_FAISS_and_LangChain_for_Question-Answering/blob/master/Hugging_face_full_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A full training

Install the Transformers, Datasets, Evaluate, and Accelerate libraries to run this notebook.

In [1]:
# Install the notebook's dependencies. The extras specifier is quoted because
# shells such as zsh otherwise treat the square brackets as a glob pattern.
# !pip install datasets evaluate "transformers[sentencepiece]"
# !pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

zsh:1: no matches found: transformers[sentencepiece]


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import time

# "glue" / "mrpc" selects the dataset and its configuration; the result is kept
# untouched under raw_datasets so later cells can derive from it.
raw_datasets = load_dataset(path="glue", name="mrpc")

# One checkpoint name is shared by the tokenizer and by every model loaded later.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)


def tokenize_function(example):
    """Tokenize the two sentences of ``example`` together as a pair.

    Reads the ``"sentence1"`` and ``"sentence2"`` entries of ``example`` and
    hands them to the notebook-level ``tokenizer`` with ``truncation=True``.
    Whatever the tokenizer returns is returned unchanged.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Apply tokenize_function across raw_datasets in batched mode.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

# Collator built from the same tokenizer; the DataLoaders use it as collate_fn.
data_collator = DataCollatorWithPadding(tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 28.8k/28.8k [00:00<00:00, 10.9MB/s]
Downloading metadata: 100%|██████████| 28.7k/28.7k [00:00<00:00, 19.6MB/s]
Downloading readme: 100%|██████████| 27.9k/27.9k [00:00<00:00, 15.4MB/s]
Downloading data: 6.22kB [00:00, 4.55MB/s]/3 [00:00<?, ?it/s]
Downloading data: 1.05MB [00:00, 13.7MB/s]/3 [00:00<00:01,  1.78it/s]
Downloading data: 441kB [00:00, 30.9MB/s]2/3 [00:00<00:00,  2.22it/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  2.27it/s]
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 29071.25 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 15631.52 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 77745.74 examples/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 2.21kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 48.1kB/s]
Downloading (…)

In [3]:
# Drop the raw-text and index columns, then expose the target column as "labels".
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
).rename_column("label", "labels")

# Switch the output format to "torch" (set_format works in place).
tokenized_datasets.set_format(type="torch")

# Show which columns remain in the training split.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [4]:
# Verify (instead of merely echoing) the columns left in the training split,
# ignoring their order, then display the expected list as this cell did before.
expected_columns = ["attention_mask", "input_ids", "labels", "token_type_ids"]
assert sorted(tokenized_datasets["train"].column_names) == expected_columns, (
    "unexpected columns in the tokenized training split"
)
expected_columns

['attention_mask', 'input_ids', 'labels', 'token_type_ids']

In [5]:
from torch.utils.data import DataLoader

# Training batches are shuffled; the validation loader keeps the default order.
# Both use batches of 8 and delegate batch assembly to data_collator.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [6]:
# Take a single batch from the training loader and stop iterating immediately;
# `batch` stays bound afterwards and is reused by the forward-pass cell.
for batch in train_dataloader:
    break

# Map each field of that batch to the shape of its tensor.
{field: tensor.shape for field, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 81]),
 'token_type_ids': torch.Size([8, 81]),
 'attention_mask': torch.Size([8, 81])}

In [7]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint as a sequence classifier configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading model.safetensors: 100%|██████████| 440M/440M [00:07<00:00, 59.2MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Run the sample batch through the model once and report the loss together
# with the shape of the logits.
outputs = model(**batch)
sample_loss = outputs.loss
sample_logits_shape = outputs.logits.shape
print(sample_loss, sample_logits_shape)

tensor(0.6127, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [9]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation,
# so the optimizer is taken from torch.optim instead.
from torch.optim import AdamW

# weight_decay and eps are set explicitly to the defaults of the deprecated
# transformers class (torch defaults differ), so runs stay comparable.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0, eps=1e-6)



In [10]:
from transformers import get_scheduler

# The scheduler is stepped once per batch, so the total number of steps is
# (batches per epoch) x (number of epochs).
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

1377


In [11]:
import torch

# Use CUDA when it is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Move the model onto the chosen device, then display the device.
model.to(device)
device

device(type='cuda')

In [12]:
from tqdm.auto import tqdm

# Fine-tune `model` for num_epochs passes over train_dataloader and report the
# elapsed time. Relies on `time`, `model`, `optimizer`, `lr_scheduler`,
# `device`, `num_epochs` and `num_training_steps` from the cells above.
model.train()
start_time = time.perf_counter()  # monotonic clock meant for measuring durations

# The context manager closes the bar when the loop ends, which flushes its
# final state and keeps a stale bar from showing up in later cell outputs.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Move every tensor of the batch to the device the model is on.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            # Reset gradients so they do not accumulate into the next step.
            optimizer.zero_grad()
            progress_bar.update(1)

end_time = time.perf_counter()
print("Training complete!")
print(f"Time taken: {end_time - start_time} seconds")

100%|█████████▉| 1374/1377 [00:46<00:00, 29.63it/s]

In [13]:
import evaluate

# Load the metric for the "glue" / "mrpc" configuration and put the model in
# evaluation mode before scoring the validation loader.
metric = evaluate.load("glue", "mrpc")
model.eval()

for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # No gradients are needed for scoring.
    with torch.no_grad():
        logits = model(**batch).logits

    # The predicted class is the index of the largest logit in each row.
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 8.74MB/s]


{'accuracy': 0.8455882352941176, 'f1': 0.8934010152284263}

In [22]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation,
# so the optimizer is taken from torch.optim instead.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

# Start again from the pretrained checkpoint, this time with lr=3e-5.
# Relies on `checkpoint`, `train_dataloader`, `torch`, `time` and `tqdm`
# from the cells above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# weight_decay and eps are set explicitly to the defaults of the deprecated
# transformers class (torch defaults differ), so runs stay comparable.
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.0, eps=1e-6)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

num_epochs = 3
# One scheduler step per batch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()
start_time = time.perf_counter()  # monotonic clock meant for measuring durations

# The context manager closes the bar when the loop ends, which flushes its
# final state and keeps a stale bar from showing up in later cell outputs.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Move every tensor of the batch to the device the model is on.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            # Reset gradients so they do not accumulate into the next step.
            optimizer.zero_grad()
            progress_bar.update(1)

end_time = time.perf_counter()
print("Training complete!")
print(f"Time taken: {end_time - start_time} seconds")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1377/1377 [09:13<00:00,  2.49it/s]
100%|██████████| 1377/1377 [00:46<00:00, 30.75it/s]

Training complete!
Time taken: 46.17062568664551 seconds


In [21]:
# Peek at the first training batch only: report its field names and the shape
# of the tensor stored under each field, then stop iterating.
for batch in train_dataloader:
    field_shapes = {name: tensor.shape for name, tensor in batch.items()}
    print('key in batch: ', batch.keys())
    print('shape of each key: ', field_shapes)
    break

key in batch:  dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])
shape of each key:  {'labels': torch.Size([8]), 'input_ids': torch.Size([8, 89]), 'token_type_ids': torch.Size([8, 89]), 'attention_mask': torch.Size([8, 89])}


In [23]:
from accelerate import Accelerator
# transformers.AdamW is deprecated in favour of the PyTorch implementation,
# so the optimizer is taken from torch.optim instead.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

# Same fine-tuning run as the manual loop, driven through Accelerate.
# Relies on `checkpoint`, `train_dataloader`, `eval_dataloader`, `time` and
# `tqdm` from the cells above.
accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# weight_decay and eps are set explicitly to the defaults of the deprecated
# transformers class (torch defaults differ), so runs stay comparable.
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.0, eps=1e-6)

# Hand the dataloaders, model and optimizer to the accelerator and keep the
# objects it returns; the loop below uses only those.
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
# The step count is taken from the prepared dataloader, not the original one.
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()
start_time = time.perf_counter()  # monotonic clock meant for measuring durations

# The context manager closes the bar when the loop ends, which flushes its
# final state and keeps a stale bar from showing up in later cell outputs.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dl:
            # Batches from the prepared loader go to the model as-is; this
            # cell has no manual .to(device) call.
            outputs = model(**batch)
            loss = outputs.loss
            # The backward pass is routed through the accelerator here.
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            # Reset gradients so they do not accumulate into the next step.
            optimizer.zero_grad()
            progress_bar.update(1)

end_time = time.perf_counter()
print("Training complete!")
print(f"Time taken: {end_time - start_time} seconds")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1377/1377 [00:49<00:00, 27.65it/s]


Training complete!
Time taken: 50.7180118560791 seconds


