# Perform fine-tuning

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=8.0.0 (from datasets)
  Downloading pyarrow-14.0.1-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.1.4-cp310-cp310-win_amd64.whl.metadata (18 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.9.1-cp310-cp310-win_amd64.whl.metadata (7.6 kB)


In [1]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# This dataset contains 2 sentence and will be labeled 0 and 1 based on similarity
def tokenize_func(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [6]:
raw_datasets["train"][0]['label']

1

In [7]:
print(raw_datasets["train"][0]['sentence1'])
print(raw_datasets["train"][0]['sentence2'])

Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .


In [8]:
print(raw_datasets["train"][1]['label'])
print(raw_datasets["train"][1]['sentence1'])
print(raw_datasets["train"][1]['sentence2'])

0
Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .
Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .


So, `1` means same meaning while `0` means different meaning. 

In [2]:
# map the data to suitable format for fine-tuning
# define the collator as well
tokenized_dataset = raw_datasets.map(tokenize_func, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|█████████████████████████████████████████████████████████████████| 408/408 [00:00<00:00, 14557.24 examples/s]


In [3]:
# now lets define the model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
   ---------------------------------------- 0.0/265.7 kB ? eta -:--:--
   -------------------------------------- - 256.0/265.7 kB 5.2 MB/s eta 0:00:01
   ---------------------------------------- 265.7/265.7 kB 4.1 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [4]:
from transformers import TrainingArguments

train_args = TrainingArguments("train_classification1")

In [5]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [7]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [8]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.5069
1000,0.279


TrainOutput(global_step=1377, training_loss=0.32450293802054964, metrics={'train_runtime': 94.3653, 'train_samples_per_second': 116.611, 'train_steps_per_second': 14.592, 'total_flos': 405626802939840.0, 'train_loss': 0.32450293802054964, 'epoch': 3.0})

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [10]:
tokenized_dataset["train"][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0,
 'input_ids': [101,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1010,
  3183,
  2002,
  2170,
  1000,
  1996,
  7409,
  1000,
  1010,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102,
  7727,
  2000,
  2032,
  2004,
  2069,
  1000,
  1996,
  7409,
  1000,
  1010,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
 

In [11]:
tokenized_dataset["validation"]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 408
})

In [12]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
   ---------------------------------------- 0.0/84.1 kB ? eta -:--:--
   ---------------------------------------- 84.1/84.1 kB 4.6 MB/s eta 0:00:00
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [13]:
!pip install numpy



In [14]:
# define compute metrics
import evaluate
import numpy as np

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
# redefine the training arguments, how metrics is computed and where is the directory
train_args = TrainingArguments("train_classification1", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
!pip install scikit-learn scipy

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting scipy
  Downloading scipy-1.11.4-cp310-cp310-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     ---------------------------------------- 60.4/60.4 kB 1.6 MB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp310-cp310-win_amd64.whl (9.3 MB)
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   -- ------------------------------------- 0.5/9.3 MB 15.5 MB/s eta 0:00:01
   -------- ------------------------------- 2.0/9.3 MB 25.1 MB/s eta 0:00:01
   -------------------- ------------------- 4.7/9.3 MB 37.7 MB/s eta 0:00:01
   ------------------------------------ --- 8.6/9.3 MB 45.7 MB/s e

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.406799,0.862745,0.90106
2,0.333900,0.712412,0.828431,0.878472
3,0.170200,0.892179,0.838235,0.888514


TrainOutput(global_step=1377, training_loss=0.2069919228813487, metrics={'train_runtime': 102.5504, 'train_samples_per_second': 107.303, 'train_steps_per_second': 13.428, 'total_flos': 541038552024960.0, 'train_loss': 0.2069919228813487, 'epoch': 3.0})

In [23]:
raw_datasets["train"][1]['idx']

1

In [24]:
type(raw_datasets)

datasets.dataset_dict.DatasetDict

In [27]:
# new prediction (custom data)
new_data = tokenizer("I like apple.", "I love apple.", truncation=True, return_tensors="pt")
new_data

{'input_ids': tensor([[ 101, 1045, 2066, 6207, 1012,  102, 1045, 2293, 6207, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [29]:
model(new_data)

AttributeError: 

# full training

In [30]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

# load the dataset and define the tokenizer
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_func(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

# tokenized data and data collator
tokenized_dataset = raw_datasets.map(tokenize_func, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|███████████████████████████████████████████████████████████████| 1725/1725 [00:00<00:00, 20395.88 examples/s]


## Post processing of the tokenize data 

In [31]:
tokenized_dataset = tokenized_dataset.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
tokenized_dataset.column_names

{'train': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 'validation': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 'test': ['labels', 'input_ids', 'token_type_ids', 'attention_mask']}

In [32]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, collate_fn=data_collator, batch_size=8
)

eval_dataloader = DataLoader(
    tokenized_dataset["validation"], collate_fn=data_collator, batch_size=8 
)

In [33]:
for batch in train_dataloader:
    break

{k:v.shape for k,v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 70]),
 'token_type_ids': torch.Size([8, 70]),
 'attention_mask': torch.Size([8, 70])}

In [36]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.6582, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [38]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)



In [39]:
# check for cuda (if there is GPU acceleration)
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [40]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|█████████████████████████████████████████████████████████████████████████████▉| 1376/1377 [01:23<00:00, 16.85it/s]

In [41]:
import evaluate

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8651960784313726, 'f1': 0.9043478260869565}

100%|██████████████████████████████████████████████████████████████████████████████| 1377/1377 [01:41<00:00, 16.85it/s]

# New task (token classification)

## ner

In [42]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")


Downloading builder script: 100%|█████████████████████████████████████████████████| 9.57k/9.57k [00:00<00:00, 9.56MB/s][A

Downloading metadata: 100%|███████████████████████████████████████████████████████████████| 3.73k/3.73k [00:00<?, ?B/s][A

Downloading readme: 100%|█████████████████████████████████████████████████████████| 12.3k/12.3k [00:00<00:00, 13.1MB/s][A

Downloading data: 100%|█████████████████████████████████████████████████████████████| 983k/983k [00:00<00:00, 9.96MB/s][A

Generating train split:   0%|                                                         | 0/14041 [00:00<?, ? examples/s][A
Generating train split:   8%|███▏                                       | 1054/14041 [00:00<00:01, 10260.38 examples/s][A
Generating train split:  17%|███████▌                                    | 2400/14041 [00:00<00:01, 9281.98 examples/s][A
Generating train split:  27%|███████████▉                                | 3824/14041 [00:00<00:01, 9320.59 examples/s][A
Generating 

In [43]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [44]:
idx_sample = 0
print(raw_datasets["train"][idx_sample]["id"])
print(raw_datasets["train"][idx_sample]["tokens"])
print(raw_datasets["train"][idx_sample]["pos_tags"])
print(raw_datasets["train"][idx_sample]["chunk_tags"])
print(raw_datasets["train"][idx_sample]["ner_tags"])

0
['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[22, 42, 16, 21, 35, 37, 16, 21, 7]
[11, 21, 11, 12, 21, 22, 11, 12, 0]
[3, 0, 7, 0, 0, 0, 7, 0, 0]


In [45]:
ner_features = raw_datasets["train"].features["ner_tags"]
ner_features

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [46]:
chunk_features = raw_datasets["train"].features["chunk_tags"]

In [47]:
chunk_features

Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None)

In [48]:
label_names = ner_features.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [50]:
words = raw_datasets["train"][idx_sample]["tokens"]
labels = raw_datasets["train"][idx_sample]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


**Warning⚠️**
---
We have to pass a specific argument to `tokenizer` API when we are dealing with pre-tokenized text.

In [53]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████| 29.0/29.0 [00:00<?, ?B/s][A

config.json: 100%|████████████████████████████████████████████████████████████████████████████| 570/570 [00:00<?, ?B/s][A

vocab.txt:   0%|                                                                            | 0.00/213k [00:00<?, ?B/s][A
vocab.txt: 100%|█████████████████████████████████████████████████████████████████████| 213k/213k [00:00<00:00, 422kB/s][A

tokenizer.json:   0%|                                                                       | 0.00/436k [00:00<?, ?B/s][A
tokenizer.json: 100%|████████████████████████████████████████████████████████████████| 436k/436k [00:00<00:00, 590kB/s][A


In [54]:
tokenizer.is_fast

True

In [55]:
inputs = tokenizer(raw_datasets["train"][idx_sample]["tokens"], is_split_into_words=True)
inputs

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [56]:
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [57]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]