In [None]:
!pip install accelerate -U
!pip install transformers[torch]
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install evaluate
!pip install transformers

In [1]:
import pandas as pd
import numpy as np
import evaluate

import torch

In [2]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, texts, labels=None):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=512)
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.encodings["input_ids"])

In [3]:
def load_dataset():
    # load documents
    df = pd.read_csv("IMDB Dataset.csv",sep=",")
    df = df.drop_duplicates()
    df = df.apply(lambda x: x.astype(str).str.lower() if x.dtype == "object" else x)
    df["review"] = df["review"].str.replace('br','')
    df['sentiment'] = df['sentiment'].replace({'positive': 0, 'negative': 1})
    subset = df.iloc[0:3300]

    return subset["review"].tolist(), subset["sentiment"].tolist()


In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers.optimization import get_scheduler
from sklearn.model_selection import train_test_split
from datasets import load_metric

In [5]:
# load the dataset
docs, labels = load_dataset()
X_train, X_test, y_train, y_test = train_test_split(docs, labels, test_size=0.2, random_state=42)

In [6]:
# create model and tokenizers
bert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')

train_docs_dataset = Dataset(bert_tokenizer, X_train, y_train)
eval_docs_dataset = Dataset(bert_tokenizer, X_test, y_test)

batch_size = 8
epochs = 2

weight_decay = 0.01


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
    #########################################
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}
    #########################################

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    weight_decay=weight_decay,
    logging_dir='./logs'
)

In [8]:
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_docs_dataset,
    eval_dataset=eval_docs_dataset,
    compute_metrics=compute_metrics,

)

trainer.train()
trainer.evaluate()

trainer.save_model()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.399903,0.871212
2,0.311000,0.368192,0.901515
