# BERT FINETUNING
We will fine-tune a BERT-family model (`distilbert-base-uncased`) on the Yelp Review Full dataset for text classification.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

In [None]:
# Load the "yelp_review_full" dataset; its 'train' and 'test' splits are used below.
dataset = load_dataset("yelp_review_full")

In [None]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def tokenize(sentence):
    """Tokenize the ``'text'`` field of ``sentence`` with the notebook's tokenizer.

    Inputs are truncated and padded with ``padding="max_length"``, so every
    encoded row comes out with the same length.

    Args:
        sentence: Mapping with a ``'text'`` entry. This function is passed to
            ``Dataset.map(..., batched=True)``, so the entry is presumably a
            list of strings (one per row in the batch).

    Returns:
        The tokenizer's output for the given text.
    """
    texts = sentence['text']
    return tokenizer(texts, padding="max_length", truncation=True)

In [None]:
# Subsample *before* tokenizing. Only 1000 rows per split are used, so there is
# no reason to tokenize (and pad to max_length) every row of the full splits
# first. shuffle(seed=42) draws its permutation from the seed and the split
# length only, so these are the same rows the tokenize-then-subsample order
# would have produced.
tokenized_train = (
    dataset['train']
    .shuffle(seed=42)
    .select(range(1000))
    .map(tokenize, batched=True)
)
tokenized_test = (
    dataset['test']
    .shuffle(seed=42)
    .select(range(1000))
    .map(tokenize, batched=True)
)

## Fine Tuning using Hugging Face
We will use the `Trainer` and `TrainingArguments` classes to fine-tune the model.

In [None]:
# Pretrained "distilbert-base-uncased" checkpoint with a sequence-classification
# head configured for 5 labels.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

In [None]:
from transformers import DataCollatorWithPadding
# Collator that pads each batch using this tokenizer.
# NOTE(review): `tokenize` already pads with padding="max_length", so the rows
# presumably arrive here at equal length and little extra padding is done.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import numpy as np
import evaluate

# Accuracy metric from the `evaluate` library; used by compute_metrics and by
# the manual PyTorch loop further down.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook's accuracy metric.

    Args:
        eval_pred: Two-item iterable that unpacks as ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted class ids against
        the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints/outputs go to "bert-training", 3 epochs,
# learning rate 5e-5, and an evaluation pass at the end of every epoch.
train_args = TrainingArguments(
    output_dir="bert-training",
    num_train_epochs=3,
    learning_rate=5e-5,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch"
)

# Trainer wiring together the model, the 1000-row train/test subsets, the
# padding collator and the accuracy callback defined above.
trainer = Trainer(
    model=model,
    args = train_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # NOTE(review): newer transformers releases prefer `processing_class` over
    # `tokenizer` here -- confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (tokenized_test); the reported metrics
# include whatever compute_metrics returns.
trainer.evaluate()

## Fine Tuning using Native PyTorch
We will write a native PyTorch training loop to fine-tune the model.

In [None]:
import torch
from torch import nn

In [None]:
# Keep only what the model consumes: drop the raw 'text' column and rename
# 'label' to 'labels', so that `model(**batch)` in the loop below receives the
# targets under the `labels` keyword.
tokenized_train = tokenized_train.remove_columns(['text']).rename_column("label", "labels")
tokenized_test = tokenized_test.remove_columns(['text']).rename_column("label", "labels")

# Make both datasets return torch tensors when indexed.
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

In [None]:
from torch.utils.data import DataLoader

# Wrap both datasets in DataLoaders with batches of 8; only the training loader
# shuffles.
# NOTE(review): this rebinds tokenized_train / tokenized_test from datasets to
# DataLoaders, so earlier cells that call dataset methods on these names
# presumably cannot be re-run without rebuilding the datasets first.
tokenized_train = DataLoader(tokenized_train, shuffle=True, batch_size=8)
tokenized_test = DataLoader(tokenized_test, batch_size=8)

In [None]:
# Reload the pretrained checkpoint and rebind `model`, so the manual loop below
# starts from the pretrained weights rather than from the model trained by the
# Trainer above.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 5)

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU, then move the
# model's parameters onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from torch.optim import Adam

# Adam over all model parameters, using the same 5e-5 learning rate that was
# given to TrainingArguments in the Trainer section.
optimizer = Adam(model.parameters(), lr = 5e-5)

In [None]:
num_epoch = 3

# --- Training loop ---------------------------------------------------------
# model.train() enables training-only behavior such as dropout.
model.train()
for epoch in range(num_epoch):
    for batch in tokenized_train:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains "labels", so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()

# --- Evaluation loop -------------------------------------------------------
# Score the held-out test loader once, after training has finished. Doing this
# on the training batches inside the loop above would report training accuracy
# (averaged over all epochs, with dropout active) and leave tokenized_test
# unused.
model.eval()
for batch in tokenized_test:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for scoring; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(**batch)

    predictions = torch.argmax(outputs.logits, dim=-1)
    # Accumulate per-batch results; metric.compute() aggregates them afterwards.
    metric.add_batch(predictions=predictions, references=batch["labels"])

In [None]:
# Aggregate every batch accumulated via metric.add_batch into the final accuracy.
metric.compute()