<a href="https://colab.research.google.com/github/KashishV999/nlp-transformers-journey/blob/main/finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning
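- Fine-tuning adapts a pre-trained model to a new task by adding a classification layer on top of it. The new layer can be trained while the pre-trained weights are kept frozen; note that the `Trainer` setup used in this notebook does not freeze any weights, so the whole model is updated.
- The model retains the rich linguistic understanding gained from pre-training, while the newly added layer learns task-specific patterns from the labeled dataset.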

## LOAD DATASET

In [None]:
!pip install -U datasets

In [5]:
from datasets import load_dataset


In [None]:
# Load the MRPC task of the GLUE benchmark (all available splits).
raw_datasets = load_dataset(path="glue", name="mrpc")


In [6]:
# Show the splits, column names and row counts of the loaded dataset.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [7]:
# Take the training split and display its first example
# (two sentences, a label and an index).
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

## TOKENIZE THE DATASET TO FEED TO THE MODEL

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the classification model later on.
checkpoint = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
def tokenize_function(example):
    """Tokenize the sentence pair(s) contained in ``example``.

    ``example`` must provide the keys ``"sentence1"`` and ``"sentence2"``.
    Both values are handed to the notebook-level ``tokenizer`` as a pair with
    truncation enabled, and the tokenizer's result is returned unchanged.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)




In [None]:
# Apply tokenize_function to every split; with batched=True it receives
# batches of rows instead of one row per call.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)


In [12]:
# Confirm that tokenization added the input_ids, token_type_ids and
# attention_mask columns to every split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [13]:
from transformers import DataCollatorWithPadding

# Batch collator that pads tokenized examples with `tokenizer`; it is handed to
# the Trainer further down as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
from transformers import TrainingArguments

## To compute the metrics

In [None]:
!pip install evaluate


In [16]:
import numpy as np
import evaluate


# GLUE/MRPC metric object; loaded lazily on the first evaluation and then
# reused, so the metric is not re-loaded on every evaluation pass.
_mrpc_metric = None


def compute_metrics(eval_preds):
    """Turn raw model outputs into GLUE/MRPC metrics.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        Whatever the GLUE/MRPC metric's ``compute`` returns for these
        predictions and reference labels (a dict with accuracy and F1 in
        this notebook's output).
    """
    global _mrpc_metric
    if _mrpc_metric is None:
        _mrpc_metric = evaluate.load("glue", "mrpc")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _mrpc_metric.compute(predictions=predictions, references=labels)


## Training arguments

In [17]:

# Trainer configuration: outputs go to "test-trainer", evaluation runs once per
# epoch, and the empty `report_to` list requests no logging integrations.
# Every other setting keeps its library default.
training_args = TrainingArguments(
    output_dir="test-trainer",
    eval_strategy="epoch",
    report_to=[],
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint as a sequence-classification model with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

In [19]:
from transformers import Trainer

# Wire the model, training configuration, tokenized splits, padding collator
# and metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning and keep the returned summary for the next cell.
train_output = trainer.train()



In [25]:
# Print the training summary (global step, training loss and run metrics).
print(train_output)

TrainOutput(global_step=1377, training_loss=0.09268783536092388, metrics={'train_runtime': 255.4258, 'train_samples_per_second': 43.081, 'train_steps_per_second': 5.391, 'total_flos': 405114969714960.0, 'train_loss': 0.09268783536092388, 'epoch': 3.0})


In [21]:
# Run the fine-tuned model over the validation split; the result carries the
# raw predictions and the reference label ids used in the scoring cell.
predictions = trainer.predict(test_dataset=tokenized_datasets["validation"])


## TESTING (scored on the validation split)

In [23]:
# Score the validation predictions with the notebook's `compute_metrics`
# helper (argmax over the logits, then the GLUE/MRPC metric) instead of
# re-importing numpy/evaluate and repeating that logic in this cell.
results = compute_metrics((predictions.predictions, predictions.label_ids))
print(results)


{'accuracy': 0.8651960784313726, 'f1': 0.9053356282271945}


In [10]:

import torch

# Tokenize one sentence pair and move the tensors to the device the model is
# on: after training the model may sit on an accelerator while fresh tokenizer
# output is created on the CPU, which would make the forward pass fail.
inputs = tokenizer("The weather today is sunny and warm",
                   "It’s a bright and pleasant day outside", return_tensors="pt").to(model.device)

model.eval()  # inference mode for layers such as dropout
with torch.no_grad():  # a single forward pass needs no gradient tracking
    outputs = model(**inputs)
logits = outputs.logits

# The predicted class is the index of the largest logit.
pred = logits.argmax(dim=-1).item()
print("Predicted label:", pred)  # 1 for paraphrase, 0 for not

Predicted label: 1
