In [6]:
import torch
from tqdm.auto import tqdm
import evaluate
from datasets import load_dataset
from torch.utils.data import DataLoader
import datasets

from torch.optim import AdamW

from transformers import AutoTokenizer    
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler

In [78]:
# Download (or load from the local cache) the sentiment dataset from the Hugging Face Hub.
DATASET_ID = "imsoumyaneel/sentiment-analysis-llama2"
dataset = load_dataset(DATASET_ID)

In [84]:
# Split the single "train" split into train/test portions (80% / 20%).
# A fixed seed keeps the partition identical across kernel restarts, so the
# seeded subsets selected later always come from the same rows.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [85]:
# Display the resulting DatasetDict to check the column names and the row
# counts of the train and test splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'text'],
        num_rows: 478638
    })
    test: Dataset({
        features: ['sentence', 'label', 'text'],
        num_rows: 119660
    })
})

In [86]:
# Checkpoint identifier; the same name is reused further down when the
# classification model is loaded, so tokenizer and model stay in sync.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer that matches the pre-trained checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [87]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is padded or truncated to exactly 128 tokens so that all
    rows have the same length.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by ``Dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    # NOTE(review): the dataset also exposes a "sentence" column; confirm that
    # "text" is the intended model input.
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encoding_options)

In [88]:
# Run the tokenizer over both splits; batched=True hands tokenize_function
# a batch of rows at a time instead of one example per call.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/478638 [00:00<?, ? examples/s]

Map:   0%|          | 0/119660 [00:00<?, ? examples/s]

In [89]:
# Drop the raw string columns. Once tokenized they are no longer needed, and
# string values cannot be moved to the device: the training loop calls
# .to(device) on every value of each batch.
tokenized_dataset = tokenized_dataset.remove_columns(["sentence", "text"])

In [90]:
# The model receives each batch as keyword arguments and expects the targets
# under the name "labels", so rename the "label" column accordingly.
# NOTE(review): the classifier needs integer class ids in [0, num_labels) -
# confirm the dtype/values of this column before training.
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [91]:
# Have the dataset hand back PyTorch tensors rather than plain Python lists
# (this changes the output format of tokenized_dataset in place).
tokenized_dataset.set_format(type="torch")

In [92]:
# Work on small, reproducibly shuffled subsets so fine-tuning finishes quickly.
def _take_subset(split, size=1000, seed=42):
    """Shuffle ``split`` with a fixed seed and keep its first ``size`` rows."""
    return split.shuffle(seed=seed).select(range(size))


small_train_dataset = _take_subset(tokenized_dataset["train"])
small_eval_dataset = _take_subset(tokenized_dataset["test"])

In [None]:
# Batch iterators over the two subsets; only the training data is reshuffled
# each epoch, evaluation keeps its order.
batch_size = 8
train_dataloader = DataLoader(small_train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=batch_size)

In [None]:
# Load the pre-trained checkpoint with a 5-class sequence-classification head.
# Auto* classes cannot be instantiated directly; the weights have to be loaded
# through from_pretrained().
# ignore_mismatched_sizes=True lets the classification head be re-initialised
# when the checkpoint's head has a different number of labels than num_labels,
# instead of failing on the shape mismatch.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
    ignore_mismatched_sizes=True,
)

In [None]:
# AdamW over all model parameters; this is the optimizer that minimises the
# training loss.
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Linear learning-rate schedule with no warmup, stepped once per batch.
num_epochs = 3
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * num_epochs

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Train on an NVIDIA GPU through CUDA when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

In [None]:
# Fine-tuning loop: num_epochs passes over the training batches.
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch is passed as keyword arguments; the loss is read from
        # the returned outputs.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the
        # gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

In [None]:
# Measure accuracy of the fine-tuned model on the evaluation subset.
metric = evaluate.load("accuracy")
model.eval()

for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**batch)

    # The predicted class is the index of the largest logit.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
# Save the fine-tuned model together with its tokenizer.
# An empty path cannot be written to, so use a named output directory;
# save_pretrained() stores the weights and config in a form that
# from_pretrained() can load back later.
output_dir = "roberta-sentiment-finetuned"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)