In [None]:
%pip install transformers[sentencepiece]
%pip install datasets
%pip install evaluate
%pip install accelerate
%pip show transformers

In [None]:
import torch

# Prepare and preprocess data from Hugging Face


## Load the dataset from Hugging Face

In [None]:
from datasets import load_dataset, DatasetDict
dataset = load_dataset("finkztah/youtube_trailer_comment_sentiment")

# The dataset only has a training set, so we have to split it ourselves:
# 70% stays as training data, 30% is held out (seeded so the split is reproducible)
train_testvalid = dataset['train'].train_test_split(test_size=0.3, shuffle=True, seed=42)

# Split the held-out 30% in half: 15% validation, 15% test
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, shuffle=True, seed=42)

# Gather everything to have a single DatasetDict
dataset_split = DatasetDict({
    'train': train_testvalid['train'],
    'validation': test_valid['train'],
    'test': test_valid['test'],
})


## Explore and preprocess the data

In [None]:
# Pull out the training split; the bare expression on the last line displays it
train = dataset_split['train']
train

In [None]:
# Inspect the columns and feature types of the training split
train.features

In [None]:
# Look at the 'comment' field of the first training example
train[0]['comment']

### Preprocess

### Tokenize the data

In [None]:

from transformers import AutoTokenizer


# Checkpoint identifier used to load the tokenizer here and reused by later cells
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess(checkpoint, dataset: DatasetDict):
    """Tokenize the 'comment' column and drop the columns the model does not consume.

    Args:
        checkpoint: identifier of the checkpoint whose tokenizer is loaded.
        dataset: the DatasetDict to tokenize; it must contain the columns
            'comment', '__index_level_0__', 'title' and 'video_id'.
    Returns:
        The tokenized dataset with those four columns removed and its
        output format set to 'torch'.
    """
    unused_columns = ['comment', '__index_level_0__', 'title', 'video_id']
    tok = AutoTokenizer.from_pretrained(checkpoint)

    def _encode(batch: dict):
        # Only truncation is applied here; no padding is requested at this stage
        return tok(batch['comment'], truncation = True)

    encoded = dataset.map(_encode, batched=True).remove_columns(unused_columns)
    # set_format works in place, so it cannot be part of the chain above
    encoded.set_format('torch')
    return encoded



# Tokenize every split at once; the bare expression on the last line displays the result
tokenized_dataset = preprocess(checkpoint, dataset_split)
tokenized_dataset






### Load data into DataLoaders

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


def load(tokenized_dataset, checkpoint, batch_size):
    """Build the train / validation / test DataLoaders with per-batch padding.

    Args:
        tokenized_dataset: mapping with 'train', 'validation' and 'test'
            splits of already-tokenized examples.
        checkpoint: identifier of the checkpoint whose tokenizer the
            padding collator uses.
        batch_size: number of examples per batch for all three loaders.
    Returns:
        A tuple (train_dataloader, valid_dataloader, test_dataloader).
    """
    # The collator pads each batch with this tokenizer when the batch is assembled
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

    def _make_loader(split, shuffle):
        # One place for the settings shared by all three loaders
        return DataLoader(
            tokenized_dataset[split],
            shuffle = shuffle,
            batch_size = batch_size,
            collate_fn = data_collator,
        )

    # Only the training data is shuffled; evaluation loaders keep a fixed order
    # so validation/test runs are repeatable and don't draw from the RNG needlessly.
    train_dataloader = _make_loader('train', shuffle = True)
    valid_dataloader = _make_loader('validation', shuffle = False)
    test_dataloader = _make_loader('test', shuffle = False)
    return train_dataloader, valid_dataloader, test_dataloader

# Seed the CPU and CUDA generators before the loaders are built and iterated
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# Batch size of 8 for all three loaders
train_dataloader, valid_dataloader, test_dataloader = load(tokenized_dataset, checkpoint, 8)
# Sanity check: grab only the first training batch
for batch in train_dataloader:
    break
# Show the shape of every field in that batch
{k: v.shape for k, v in batch.items()}

# Train the model on the dataset

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from transformers import logging
from accelerate import Accelerator
# Create the Accelerator that handles device placement and the backward pass
accelerator = Accelerator()

# Set the logging level to 'ERROR' to suppress all warnings
logging.set_verbosity_error()

# Set random seeds
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Prepare model, optimizer and scheduler (linearly decaying learning rate).
# ignore_mismatched_sizes lets the classification head be re-initialised if its
# shape in the checkpoint does not match num_labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, ignore_mismatched_sizes=True, num_labels = 3)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-06)
epochs = 3
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
# Hand the loaders, model and optimizer to the accelerator
train_dataloader, valid_dataloader, model, optimizer = accelerator.prepare(train_dataloader, valid_dataloader, model, optimizer)

# Progress bar over all optimisation steps
progress_bar = tqdm(range(num_training_steps))

# Average loss per epoch, consumed by the plotting cell
train_losses = []
valid_losses = []

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    loss_per_epoch = 0
    for batch in train_dataloader:
        # 1. Forward pass
        outputs = model(**batch)
        # 2. Calculate the loss
        loss = outputs.loss
        loss_per_epoch += loss.item()
        # 3. Loss backward
        accelerator.backward(loss)
        # 4. Update the weights
        optimizer.step()
        # 5. Zero grad
        optimizer.zero_grad()
        # 6. Update the learning rate
        lr_scheduler.step()
        # 7. Update the progress bar
        progress_bar.update(1)
    # Average training loss over the batches of this epoch
    loss_per_epoch /= len(train_dataloader)

    # ---- Validation pass ----
    model.eval()
    valid_loss_per_epoch = 0
    with torch.inference_mode():
        # Iterate the validation loader (not the training loader) so the sum
        # matches the len(valid_dataloader) denominator below; one forward
        # pass per batch is enough.
        for vbatch in valid_dataloader:
            valid_outputs = model(**vbatch)
            valid_loss_per_epoch += valid_outputs.loss.item()
    valid_loss_per_epoch /= len(valid_dataloader)

    train_losses.append(loss_per_epoch)
    valid_losses.append(valid_loss_per_epoch)
    print(f"Epoch: {epoch}, Loss: {loss_per_epoch}, Validation Loss: {valid_loss_per_epoch}")

In [None]:
# Plot the train and valid curve
def plot_loss(train_losses, valid_losses):
    """Draw the training and validation loss curves on one figure and show it.

    Args:
        train_losses: sequence of training-loss values, one per epoch.
        valid_losses: sequence of validation-loss values, one per epoch.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    curves = (
        (train_losses, "Training Loss"),
        (valid_losses, "Validation Loss"),
    )
    for values, curve_label in curves:
        ax.plot(values, label = curve_label)
    ax.set_ylabel("Losses")
    ax.set_xlabel("Epochs")
    ax.set_title('Training and Validation Loss')
    ax.legend()
    plt.show()



# Plot the per-epoch losses collected by the training loop
plot_loss(train_losses, valid_losses)

# Validate on the validation set

In [None]:
# Import the module rather than its bare `load` function: `from evaluate import load`
# would silently replace the notebook's own `load()` DataLoader factory.
import evaluate

# Load the individual metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

# Bundle passed to eval_loop
metrics = [accuracy, f1, precision, recall]
def eval_loop(model, valid_dataloader, metrics):
    """Score a model on every batch of a dataloader with the given metrics.

    Args:
        model: a torch module that accepts each batch as keyword arguments
            and returns an object with a `logits` attribute.
        valid_dataloader: iterable of batches (dicts of tensors) that include
            a 'labels' entry.
        metrics: metric objects exposing `name`, `add_batch(predictions=...,
            references=...)` and `compute(...)`.
    Returns:
        A dict mapping each metric's name to its computed value. The metric
        named "accuracy" is computed with default arguments; every other
        metric is computed with `average=None`.
    """
    # Move batches to wherever the model's weights live instead of relying on a
    # global `device` (which the notebook never defines).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # 1. Set to eval() mode
    model.eval()
    # 2. Set to inference mode
    with torch.inference_mode():
        for batch in valid_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim = -1)
            for metric in metrics:
                metric.add_batch(predictions = predictions, references = batch['labels'])

    result = {}
    for metric in metrics:
        # Dispatch on the metric's own name rather than comparing against a
        # global `accuracy` object, so the function is self-contained.
        if metric.name == "accuracy":
            result[metric.name] = metric.compute()[metric.name]
        else:
            result[metric.name] = metric.compute(average = None)[metric.name]
    return result


# Score the model on the validation split
eval_loop(model, valid_dataloader, metrics)


In [None]:
# Score the model on the test split
eval_loop(model, test_dataloader, metrics)

In [None]:
# Score the model on the training split as well, to check for overfitting
eval_loop(model, train_dataloader, metrics)

In [None]:
from sklearn.metrics import classification_report


y_true = []
y_pred = []

# Move batches to wherever the model's weights live; the notebook never defines
# a global `device`, so referencing one here would raise NameError.
device = next(model.parameters()).device

# Make sure dropout is disabled even if this cell is run right after training code
model.eval()
with torch.inference_mode():
    for batch in valid_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        # Predicted class = index of the largest logit
        y_pred.extend(torch.argmax(logits, dim=-1).tolist())
        y_true.extend(batch['labels'].tolist())

# Generate the classification report
report = classification_report(y_true, y_pred)
print(report)

        
        

In [None]:
# Try on the test set
from sklearn.metrics import classification_report

y_true = []
y_pred = []

# Move batches to wherever the model's weights live; the notebook never defines
# a global `device`, so referencing one here would raise NameError.
device = next(model.parameters()).device

# Make sure dropout is disabled even if this cell is run right after training code
model.eval()
with torch.inference_mode():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        # Predicted class = index of the largest logit
        y_pred.extend(torch.argmax(logits, dim=-1).tolist())
        y_true.extend(batch['labels'].tolist())

# Generate the classification report
report = classification_report(y_true, y_pred)
print(report)

# Save the model 

In [None]:
from pathlib import Path

# 1. Make sure the output directory exists (no error if it is already there)
MODEL_PATH = Path("models")
MODEL_PATH.mkdir(parents=True, exist_ok=True)

# 2. Build the full path of the file to write
MODEL_NAME = "version1.pth"
MODEL_SAVE_PATH = MODEL_PATH.joinpath(MODEL_NAME)

# 3. Write the state dict, i.e. only the model's learned parameters
print(f"Saving model to: {MODEL_SAVE_PATH}")
learned_parameters = model.state_dict()
torch.save(learned_parameters, MODEL_SAVE_PATH)