<a href="https://colab.research.google.com/github/KashishV999/nlp-transformers-journey/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning / Transfer learning
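- Add a new classification layer on top of the pre-trained model and train it on a labeled dataset. The pre-trained weights can optionally be kept frozen; note that the code in this notebook does not freeze them, so all weights are updated during training.
- The model retains the language understanding gained during pre-training while learning task-specific patterns from the labeled dataset.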

## LOAD DATASET

In [None]:
!pip install -U datasets

In [2]:
from datasets import load_dataset

In [None]:
# Load the "SetFit/sst2" dataset; the result is a DatasetDict with
# train / validation / test splits (see the output of the next cell).
raw_datasets = load_dataset("SetFit/sst2")

In [4]:
# Show the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 6920
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 872
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1821
    })
})

In [5]:
# Pick the training split and look at its first example to see the raw fields.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'text': 'a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films',
 'label': 1,
 'label_text': 'positive'}

## TOKENIZE THE DATASET TO FEED TO MODEL

In [6]:
from transformers import AutoTokenizer

# Name of the pre-trained checkpoint; reused later when the classification model is loaded.
checkpoint = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with truncation enabled.

    ``example`` is indexed with the key ``"text"``; whatever the module-level
    ``tokenizer`` returns for that value is passed straight back to the caller.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

### Why use `map` instead of tokenizing directly?
- Batch processing -> faster
- Keeps the dataset structure -> the tokenizer outputs are added as new columns


In [None]:
# Apply tokenize_function to every split; batched=True hands it batches of
# examples instead of one example at a time. The tokenizer outputs show up as
# new columns next to the original ones (see the next cell's output).
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [10]:
# Inspect the splits again to see the columns added by tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6920
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [11]:
## Dynamic padding: each batch is padded to its longest sequence; the collator is passed to the Trainer.
from transformers import DataCollatorWithPadding

# Build the collator from the same tokenizer that produced the token ids.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## To compute the metrics

In [None]:
!pip install evaluate

In [None]:
import numpy as np
import evaluate

# Load the "accuracy" metric; compute_metrics uses it to score predictions.
metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compare the model's predictions with the correct answers (labels).

    ``eval_preds`` is unpacked into ``(logits, labels)``, where the logits are
    the raw scores predicted by the model. The predicted class is the index of
    the largest logit along the last axis; the module-level ``metric`` then
    scores those predictions against the labels and its result is returned.
    """
    raw_scores, gold_labels = eval_preds
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_ids)

## TRAINING

In [14]:
from transformers import TrainingArguments

In [15]:

# Training configuration handed to the Trainer.
# NOTE(review): push_to_hub=True needs Hugging Face credentials; the
# notebook_login() cell currently sits further down in the notebook — confirm
# that authentication is already available when this runs on a fresh kernel.
training_args = TrainingArguments(
    output_dir="test-trainer-sentiment",
    # Evaluate once per epoch (one epoch = one full pass through the dataset).
    eval_strategy="epoch",
    # Empty list: no reporting integrations are requested.
    report_to=[],
    # Hub upload settings: enable pushing and name the target repository.
    push_to_hub=True,
    hub_model_id="Kash123aa/sentiment-bert",
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained checkpoint for sequence classification with two labels.
# The two label maps give the classes readable names (0 <-> NEGATIVE, 1 <-> POSITIVE).
# Nothing in this cell freezes any of the model's parameters.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    num_labels=2,
)


In [27]:
from transformers import Trainer

# Wire everything together: model, training configuration, train/validation
# splits, padding collator, tokenizer, and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training and keep the returned summary (step count, loss, runtime metrics) for inspection.
train_output = trainer.train()

In [29]:
# Show the training summary.
train_output

TrainOutput(global_step=2595, training_loss=0.20611073065814714, metrics={'train_runtime': 693.099, 'train_samples_per_second': 29.952, 'train_steps_per_second': 3.744, 'total_flos': 458828680555680.0, 'train_loss': 0.20611073065814714, 'epoch': 3.0})

## PUSH TO HUGGING FACE 🤗

In [None]:
# Upload the trained model to the Hub; the target repository is the hub_model_id set in training_args.
# NOTE(review): this needs Hugging Face credentials, but the notebook_login() cell
# appears later in the notebook — confirm the intended execution order.
trainer.push_to_hub()

## TESTING

In [None]:
# Inspect the tokenized test split before running predictions on it.
tokenized_datasets["test"]


Dataset({
    features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1821
})

In [None]:
# Run the trained model on the test split; the result's metrics are shown in the next cell.
output = trainer.predict(tokenized_datasets["test"])


In [None]:

# Test-set metrics: loss, accuracy (from compute_metrics), and runtime statistics.
output.metrics

{'test_loss': 0.4576760530471802,
 'test_accuracy': 0.9132344865458539,
 'test_runtime': 5.8218,
 'test_samples_per_second': 312.789,
 'test_steps_per_second': 39.163}

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): this cell comes after training and trainer.push_to_hub(); on a fresh
# kernel the login presumably has to happen before those steps — confirm and move if so.
notebook_login()

## TRYING MY OWN FINETUNED MODEL 🤗

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-classification pipeline from "Kash123aa/sentiment-bert", the same
# repository id used as hub_model_id during training.
classifier = pipeline("text-classification", model="Kash123aa/sentiment-bert")

In [32]:
# Quick sanity check on a single sentence; the output holds the predicted label and its score.
classifier("I am so excited to attend the concert!")

[{'label': 'POSITIVE', 'score': 0.999800980091095}]