In [1]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import Trainer
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


# Preprocessing

In [2]:
checkpoint = 'bert-base-uncased' #Trained model weights
tokenizer = AutoTokenizer.from_pretrained(checkpoint) # from raw text to numerical IDs
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
seq = ["I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",] #We will fine tune with these sentence but we will use a dataset from hub soon.

batch = tokenizer(seq, padding= True, truncation=True, return_tensors='pt')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [3]:
batch
#padding to make all of them same size because models only accept triangular
#token_type_ids : to distunguish between different sentence in a single input seq. if we were use a pair it would contain ones and zeros instead of only zeros.
#attention_mask: shows us which ones is real input(1) which ones are padding(0)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2023,  2607,  2003,  6429,   999,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [4]:
batch["labels"] = torch.tensor([1,1]) # we are adding labels for our inputs.
optimizer = AdamW(model.parameters()) #with transformars AdamW better than Adam mostly. But these are similar.
loss = model(**batch).loss # model forward pass (takes inputs and produces outputs)
loss.backward() #calculade gradients without this step model can't know how to update the weights
optimizer.step() # applies those gradients to update the model parameters.

okay, now we see how it works so we can use a real dataset:

# Input Dataset

In [5]:
raw_datasets = load_dataset("glue", "mrpc")
#glue: it is a benchmark name with 9 natural langunage understanding tasks
#mrpc: is the one of the glue task: microsoft reasearch paraphrase corpus. each sample has 2 sentence sentence 1 and sentence2
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

`label = 1`: two sentece is same meaning


`label = 0`: diffeent meaning

In [6]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[13] # an example from the dataset

{'sentence1': "He told The Sun newspaper that Mr. Hussein 's daughters had British schools and hospitals in mind when they decided to ask for asylum .",
 'sentence2': '" Saddam \'s daughters had British schools and hospitals in mind when they decided to ask for asylum -- especially the schools , " he told The Sun .',
 'label': 1,
 'idx': 14}

In [7]:
raw_train_dataset.features 

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [8]:
tokenized_sent1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sent2 = tokenizer(raw_datasets["train"]["sentence2"])

In [9]:
inputs = tokenizer("This is the first sentence.", "This is the second one.","Gokhan ergul") #only 2 pairs are allowed
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [101, 2175, 26370, 9413, 24848, 102]}

In [10]:
tokenizer.convert_ids_to_tokens(inputs['input_ids'])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [11]:
#we didn't give whole dataset to tokenizer direktly because of ram.
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [12]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) #batched=True in our call to map so the function is applied to multiple elements of our dataset at once
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [13]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) #creats mini batchs and padding them.

In [14]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]] #token_type_ids or attention_mask.

[50, 59, 47, 67, 59, 50, 62, 32]

In [15]:
batch = data_collator(samples) # takes the sample and turns it to a batch of tensors
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

# Training

In [16]:
training_args = TrainingArguments("test-trainer") # the parameter is the output_dir for record everything about the training.

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [18]:
trainer = Trainer(model,
                 training_args,
                 train_dataset=tokenized_datasets['train'],
                 eval_dataset=tokenized_datasets['validation'],
                 data_collator=data_collator,
                 tokenizer = tokenizer)

In [19]:
trainer.train() #fit

[34m[1mwandb[0m: Currently logged in as: [33mgokhannergull[0m ([33mgokhannergull-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.5023
1000,0.292


TrainOutput(global_step=1377, training_loss=0.3214979040163881, metrics={'train_runtime': 225.4865, 'train_samples_per_second': 48.801, 'train_steps_per_second': 6.107, 'total_flos': 406183858377360.0, 'train_loss': 0.3214979040163881, 'epoch': 3.0})

In [21]:
pred = trainer.predict(tokenized_datasets['validation'])

In [22]:
print(pred.predictions.shape,pred.label_ids.shape)

(408, 2) (408,)


In [23]:
preds = np.argmax(pred.predictions, axis = -1)#return the highest index

In [24]:
preds

array([1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,

In [29]:
metric = evaluate.load('glue','mrpc')

In [31]:
metric.compute(predictions = preds , references = pred.label_ids) #these values will change on your machine because of random seed

{'accuracy': 0.8774509803921569, 'f1': 0.9134948096885813}

In [26]:
#instead of the codes that is above, we could use:
#acc = accuracy_score(labels, preds)
#f1 = f1_score(labels, preds)
#as well

In [32]:
tokenized_datasets['validation'].column_names

['sentence1',
 'sentence2',
 'label',
 'idx',
 'input_ids',
 'token_type_ids',
 'attention_mask']

In [34]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    
    predictions = np.argmax(logits, axis=-1)
    print(f"Predictions shape: {np.array(predictions).shape}")
    return metric.compute(predictions=predictions, references=labels)


In [35]:
training_args = TrainingArguments("test-trainer",evaluation_strategy='epoch')
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels = 2)
trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics #the different is it will show as the accuarcy metrices after the training automatically
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [43]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.357547,0.848039,0.88968
2,0.491700,0.478387,0.85049,0.897133
3,0.259100,0.63839,0.870098,0.909091


Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)


TrainOutput(global_step=1377, training_loss=0.3007839039433409, metrics={'train_runtime': 229.1099, 'train_samples_per_second': 48.029, 'train_steps_per_second': 6.01, 'total_flos': 406183858377360.0, 'train_loss': 0.3007839039433409, 'epoch': 3.0})

# A full training loop




In [36]:
tokenized_datasets.shape

{'train': (3668, 7), 'validation': (408, 7), 'test': (1725, 7)}

In [37]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [38]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1','sentence2','idx'])

In [39]:
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')

In [40]:
tokenized_datasets.set_format('torch')

In [41]:
tokenized_datasets['train'].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [42]:
from torch.utils.data import DataLoader

In [43]:
train_dataloader = DataLoader(
    tokenized_datasets['train'],shuffle=True,batch_size=8,collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_datasets['validation'], batch_size= 8, collate_fn=data_collator
)

In [55]:
for batch in train_dataloader:
    break#take the first batch
{k:v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 75]),
 'token_type_ids': torch.Size([8, 75]),
 'attention_mask': torch.Size([8, 75])}

In [56]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [57]:
outputs = model (**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.1290, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [58]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(),lr = 5e-5)

In [59]:
from transformers import get_scheduler

In [60]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps = 0, 
    num_training_steps = num_training_steps
)
print(num_training_steps)

1377


In [61]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [62]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


100%|███████████████████████████████████████████████████████████████████████████████| 1377/1377 [07:57<00:00,  2.88it/s][A

  0%|                                                                                  | 1/1377 [00:00<12:41,  1.81it/s][A
  0%|▏                                                                                 | 3/1377 [00:00<04:59,  4.58it/s][A
  1%|▍                                                                                 | 7/1377 [00:00<02:08, 10.66it/s][A
  1%|▌                                                                                 | 9/1377 [00:01<02:34,  8.84it/s][A
  1%|▋                                                                                | 11/1377 [00:01<03:00,  7.57it/s][A
  1%|▊                                                                                | 13/1377 [00:01<03:15,  6.99it/s][A
  1%|▊                                                                                | 14/1377 [00:02<03:25,  6.64it/s][A
  1%|▉

In [63]:
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()


100%|███████████████████████████████████████████████████████████████████████████████| 1377/1377 [03:53<00:00,  6.63it/s][A

{'accuracy': 0.8333333333333334, 'f1': 0.8855218855218855}

# Loss Curves

In [64]:
# Example of tracking loss during training with the Trainer
from transformers import Trainer, TrainingArguments
import wandb

# Initialize Weights & Biases for experiment tracking

In [65]:
import sys
#!{sys.executable} -m pip install wandb

In [66]:
wandb.init(project="transformer-fine-tuning", name="bert-mrpc-analysis")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=100,
    logging_steps=10,  # Log metrics every 10 steps
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="wandb",  # Send logs to Weights & Biases
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train and automatically log metrics
trainer.train()

0,1
eval/accuracy,▁▁▅▅▆▇▆▇▇██▇▇
eval/f1,▁▁▄▅▆▇▆▇▆██▇▇
eval/loss,█▇▅▃▃▆▄▂▁▃▄▇▆
eval/runtime,▄▅▅▅▅▆▁█▇▁██▂
eval/samples_per_second,▄▄▄▄▃▃█▁▁▇▁▁▇
eval/steps_per_second,▄▄▄▄▃▃█▁▁▇▁▁▇
train/epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇███
train/learning_rate,██▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,█▇▇▇██▇█▆▇▅▆▆▆▅▄▅▄▃▄▅▃▃▄▂▃▅▄▂▃▁▂▂▂▂▂▂▁▃▁

0,1
eval/accuracy,0.85784
eval/f1,0.90236
eval/loss,0.54568
eval/runtime,1.5947
eval/samples_per_second,255.848
eval/steps_per_second,16.304
train/epoch,3.0
train/global_step,690.0
train/learning_rate,0.0
train/loss,0.0859




Step,Training Loss,Validation Loss,Accuracy,F1
50,0.1581,0.715121,0.848039,0.891986
100,0.1911,0.667575,0.840686,0.890017
150,0.0956,0.864052,0.838235,0.888514
200,0.2407,0.514255,0.845588,0.883978
250,0.1054,0.879439,0.835784,0.889984
300,0.1157,0.853561,0.835784,0.88057
350,0.0284,0.724919,0.843137,0.892256
400,0.1573,0.791553,0.857843,0.903333
450,0.1087,0.737043,0.823529,0.875
500,0.0042,0.845043,0.833333,0.883562


Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)
Predictions shape: (408,)


TrainOutput(global_step=690, training_loss=0.09419950324286154, metrics={'train_runtime': 229.2179, 'train_samples_per_second': 48.007, 'train_steps_per_second': 3.01, 'total_flos': 430291408824720.0, 'train_loss': 0.09419950324286154, 'epoch': 3.0})

the result of the output of the above codes We can see there is a "**confidence overfitting**" there.

######
The model is generally progressing with accuracy, but its losses are getting worse because it is making mistakes with overconfidence.