## Fine Tuning / Transfer Learning
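- Adapts a pre-trained model to a new task by adding a new classification layer on top. (Keeping the pre-trained weights frozen is the feature-extraction variant; the code below does not freeze any layers, so the pre-trained weights are updated during training as well.)
- The model retains the language understanding gained from pre-training while learning task-specific patterns from the labeled dataset.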

## Load Dataset

In [1]:
from datasets import load_dataset

In [None]:
# Load the "mteb/emotion" dataset; the returned object holds the splits used below.
raw_datasets = load_dataset("mteb/emotion")

In [None]:
raw_datasets

In [5]:
# Pull out the training split and show its first record to see the raw fields.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'text': 'i didnt feel humiliated', 'label': 0, 'label_text': 'sadness'}

## Tokenize the dataset to feed to model

In [None]:
from transformers import AutoTokenizer

# Name of the pre-trained checkpoint; reused further down when the model is loaded.
checkpoint = "bert-base-uncased"
# Load the tokenizer for the same checkpoint so tokenization matches the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook's tokenizer.

    Truncation is enabled on the tokenizer call. The tokenizer's output is
    returned unchanged, so it can be used directly as a ``Dataset.map`` callback.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

### Why `map`? Why not just tokenize directly?
- Batch processing -> fast
- Keeps the dataset structure -> the tokenizer outputs are added as new columns

In [None]:
# Apply tokenize_function across the dataset; batched=True hands it groups of examples per call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15956
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1988
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1986
    })
})

In [None]:
# Dynamic padding: each batch is padded to the length of its longest sequence.
# The collator is handed to the Trainer further down.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## To compute the metrics

In [None]:
!pip install evaluate

In [None]:
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics below calls its .compute().
metric = evaluate.load("accuracy")

## Compare the predictions with the correct answers (labels)
## - logits are the raw scores predicted by the model

def compute_metrics(eval_preds):
    """Score a batch of evaluation predictions with the module-level ``metric``.

    ``eval_preds`` unpacks into ``(logits, labels)``. The predicted class for
    each row is the index of the largest logit along the last axis; those
    indices are compared against ``labels`` and the metric's result is returned.
    """
    raw_scores, gold_labels = eval_preds
    # argmax over the last axis picks the highest-scoring class per example.
    predicted_ids = np.argmax(raw_scores, axis=-1)
    result = metric.compute(predictions=predicted_ids, references=gold_labels)
    return result

## Training

In [16]:
from transformers import TrainingArguments

In [20]:
training_args = TrainingArguments(
    output_dir="trainer-emotion",
    # Evaluate at the end of every epoch (one epoch = one full pass through the dataset).
    eval_strategy="epoch",
    # Empty list of reporting integrations.
    report_to=[],
    # Hub upload settings.
    # NOTE(review): pushing to the Hub needs an authenticated Hugging Face session, and the
    # notebook_login() cell sits further down in this notebook — run it first on a fresh kernel.
    push_to_hub=True,
    hub_model_id="Kash123aa/emotion-classify",
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Single source of truth for the label mapping; the reverse map and the label
# count are derived from it so the three values cannot drift apart.
ID2LABEL = {0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}
LABEL2ID = {label_name: label_id for label_id, label_name in ID2LABEL.items()}

# Load the pre-trained checkpoint with a sequence-classification head sized for these labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

In [22]:
from transformers import Trainer

# Wire model, arguments, data, collator and metric function into one Trainer.
# Training uses the "train" split; evaluation uses the "validation" split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training and keep the returned TrainOutput so it can be inspected in the next cell.
train_output = trainer.train()

In [24]:
train_output

TrainOutput(global_step=2994, training_loss=0.20496291579129938, metrics={'train_runtime': 610.0612, 'train_samples_per_second': 78.464, 'train_steps_per_second': 4.908, 'total_flos': 1157358152421936.0, 'train_loss': 0.20496291579129938, 'epoch': 3.0})

## PUSH TO HUGGING FACE 🤗

In [19]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login widget.
# NOTE(review): the execution counter shows this cell was run before the
# TrainingArguments(push_to_hub=True) cell, although it is placed after training here —
# on a fresh kernel, run it before the Training section.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the trained model via the Trainer; the target repo is set by hub_model_id in TrainingArguments.
trainer.push_to_hub()

## TESTING

In [26]:
tokenized_datasets["test"]

Dataset({
    features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1986
})

In [None]:
# Run the model on the held-out test split; the result carries metrics (shown via output.metrics).
output = trainer.predict(tokenized_datasets["test"])

In [28]:
output.metrics

{'test_loss': 0.15343958139419556,
 'test_accuracy': 0.9370594159113796,
 'test_runtime': 8.0646,
 'test_samples_per_second': 246.263,
 'test_steps_per_second': 15.5}

## TRYING MY OWN FINETUNED MODEL 🤗

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Load the model by the same repo id that was set as hub_model_id during training.
classify = pipeline("text-classification", model="Kash123aa/emotion-classify")



In [31]:
classify("I wanted to got to karan aujla's concert but i was unable to get tickets")

[{'label': 'fear', 'score': 0.4427618682384491}]

In [32]:
classify("I went finally after 4 years to aujla's concert")

[{'label': 'joy', 'score': 0.9822591543197632}]

In [33]:
classify("I smiled at the photo, but deep down, I felt hollow.")

[{'label': 'sadness', 'score': 0.9996618032455444}]

In [34]:
classify("I booked tht ticktes for concert but at th end my brother canceelled the plan")

[{'label': 'anger', 'score': 0.48287907242774963}]