In [None]:
!pip install accelerate -U
!pip install transformers[torch]
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# !pip install evaluate
# !pip install transformers

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers.optimization import get_scheduler
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import evaluate

import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset holding pre-tokenized texts and optional labels.

    All texts are tokenized once in the constructor; ``__getitem__`` then
    only converts the stored values for one example into tensors.
    """

    def __init__(self, tokenizer, texts, labels=None):
        """Tokenize ``texts`` and remember ``labels``.

        Args:
            tokenizer: Callable invoked as
                ``tokenizer(texts, padding=True, truncation=True, max_length=512)``.
                Its result must offer ``.items()`` and ``["input_ids"]``
                (in this file a Hugging Face tokenizer is passed in).
            texts: The documents to encode.
            labels: Optional sequence with one label per text. ``None`` (or
                an empty sequence) produces items without a ``"labels"`` key.
        """
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields of example ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length test instead of truthiness: `if self.labels:`
        # raises ValueError for multi-element NumPy arrays / pandas Series.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.encodings["input_ids"])


# load and clean a dataset
def load_dataset(path="IMDB Dataset.csv"):
    """Load the review CSV and return cleaned texts with integer labels.

    Cleaning steps: exact duplicate rows are dropped, reviews and sentiments
    are lower-cased, and HTML line-break tags (``<br>``, ``<br/>``,
    ``<br />``) in the reviews are replaced by a single space.

    Args:
        path: CSV file with ``review`` and ``sentiment`` columns. Defaults to
            the file name this script has always used.

    Returns:
        A ``(reviews, labels)`` tuple of equally long lists. Labels are
        ``0`` for ``"positive"`` and ``1`` for ``"negative"``.

    Raises:
        ValueError: If the ``sentiment`` column holds a value other than
            ``positive`` / ``negative`` (case-insensitive).
    """
    # load documents
    df = pd.read_csv(path, sep=",")
    df = df.drop_duplicates()

    reviews = df["review"].astype(str).str.lower()
    # Strip only real <br> tags. Removing the bare substring "br" would also
    # mangle ordinary words ("brilliant" -> "illiant") and leave "< />" behind.
    reviews = reviews.str.replace(r"<br\s*/?>", " ", regex=True)

    sentiments = df["sentiment"].astype(str).str.lower()
    label_ids = sentiments.map({"positive": 0, "negative": 1})
    unmapped = label_ids.isna()
    if unmapped.any():
        # Fail here rather than later inside torch.tensor() during training.
        unknown = sorted(set(sentiments[unmapped]))
        raise ValueError(f"unexpected sentiment value(s): {unknown}")

    return reviews.tolist(), label_ids.astype(int).tolist()


# Load the cleaned reviews and labels, then hold out 20% of them for
# evaluation. random_state is fixed so the split is the same on every run.
docs, labels = load_dataset()
X_train, X_test, y_train, y_test = train_test_split(docs, labels, test_size=0.2, random_state=42)



# Create the tokenizer and the sequence-classification model from the
# 'bert-base-uncased' checkpoint. The captured output below reports the
# classifier weights as newly initialized, i.e. they are learned during
# the fine-tuning run in this notebook.
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

# Tokenize both splits up front (the Dataset constructor does the encoding).
train_docs_dataset = Dataset(bert_tokenizer, X_train, y_train)
eval_docs_dataset = Dataset(bert_tokenizer, X_test, y_test)

# Hyperparameters passed to TrainingArguments further down.
batch_size = 8
epochs = 2


weight_decay = 0.01


# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    scores = metric.compute(references=gold_labels, predictions=predicted_classes)
    return scores

# Training configuration: checkpoints go to ./results, logs to ./logs, and
# an evaluation pass runs at the end of every epoch.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version, since the install cell above does not pin one.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    weight_decay=weight_decay,
    logging_dir='./logs'

)

# Wire the model, both dataset splits and the accuracy callback together.
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_docs_dataset,
    eval_dataset=eval_docs_dataset,
    compute_metrics=compute_metrics,

)

# Fine-tune, then run one more evaluation on the held-out split.
# NOTE(review): the value returned by trainer.evaluate() is not stored or
# printed here; capture it if the final metrics are needed programmatically.
trainer.train()
trainer.evaluate()

# Persist the fine-tuned model. No path is given, so this presumably writes
# to training_args.output_dir ('./results') — confirm against the Trainer docs.
trainer.save_model()




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.267,0.21274,0.927801
2,0.1214,0.312541,0.934557
