In [1]:
from datasets import load_dataset
from torch.utils.data import DataLoader
import datasets
from transformers import AutoTokenizer                 

In [78]:
# Load the sentiment dataset identified by this repository id.
dataset = load_dataset(path="imsoumyaneel/sentiment-analysis-llama2")

In [84]:
# Split the original "train" split into train/test portions (80% / 20%).
# A fixed seed keeps the partition identical across kernel restarts, matching
# the seeded shuffles used further down when the small subsets are drawn.
# NOTE: this rebinds `dataset`; re-running this cell without re-running the
# load cell first would split the already-reduced train portion again.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [85]:
# Display the split names, column names and row counts of the split dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'text'],
        num_rows: 478638
    })
    test: Dataset({
        features: ['sentence', 'label', 'text'],
        num_rows: 119660
    })
})

In [86]:
# Borrow the tokenizer that belongs to a pre-trained sentiment checkpoint.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [87]:
def tokenize_function(examples, max_length=128):
    """Tokenize the "text" field of a batch with the notebook-level `tokenizer`.

    Every sequence is truncated and padded to exactly `max_length` tokens.

    Args:
        examples: Mapping with a "text" entry. When called through
            `dataset.map(..., batched=True)` this is a batch of rows.
        max_length: Fixed token length used for both padding and truncation.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Whatever `tokenizer` returns for the batch.
    """
    # NOTE(review): the dataset also exposes a "sentence" column; confirm that
    # "text" (and not "sentence") is the intended model input.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [88]:
# Tokenize every split; batched=True passes tokenize_function a batch of rows per call.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/478638 [00:00<?, ? examples/s]

Map:   0%|          | 0/119660 [00:00<?, ? examples/s]

In [89]:
# Drop the raw "text" column now that it has been tokenized.
# (Reads from `tokenized_dataset`; the previous `tokenized_datasets` name is
# never defined in this notebook and raised NameError on a fresh kernel.)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

In [90]:
# Rename the "label" column to "labels", the name the model is expected to read.
# (Reads from `tokenized_dataset`; the previous `tokenized_datasets` name is
# never defined in this notebook and raised NameError on a fresh kernel.)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [91]:
# Have the dataset hand back PyTorch tensors instead of Python lists.
tokenized_dataset.set_format(type="torch")

In [92]:
# Draw a 1000-row sample from each split (seeded shuffle, then take the
# first rows) so that fine-tuning and evaluation run quickly.
small_train_dataset, small_eval_dataset = (
    tokenized_dataset[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
)