In [1]:
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import re

# Fetch the IMDB reviews corpus through the Hugging Face `datasets` library.
dataset = load_dataset("imdb")

# Convert the "train" split to a DataFrame, then hold out 20% of its rows as
# `test_data`. The fixed random_state keeps the shuffle reproducible.
# (The dataset's own "test" split is not used in this cell.)
imdb_train_frame = dataset["train"].to_pandas()
train_data, test_data = train_test_split(
    imdb_train_frame,
    test_size=0.2,
    random_state=42,
)

# Preprocess text
def preprocess_text(text):
    """Normalise a raw review into lowercase, space-separated alphabetic words.

    Steps, applied in order:
      1. HTML line-break tags (``<br>``, ``<br/>``, ``<br />``, in any letter
         case) are replaced by a space. Matching every variant matters because
         the next step would otherwise strip only the angle brackets and leave
         a stray "br" word in the text.
      2. Every character that is not an ASCII letter becomes a space
         (digits, punctuation and non-ASCII letters are all dropped).
      3. The text is lowercased.
      4. Runs of whitespace are collapsed to single spaces and the ends are
         trimmed.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string; empty when the input contains no ASCII letters.
    """
    text = re.sub(r"<br\s*/?>", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # split() with no argument drops leading/trailing and repeated whitespace.
    return " ".join(text.lower().split())

# Clean the review text of both splits. `.assign` returns a new frame with the
# "text" column replaced (same column position and index) instead of writing
# into the objects produced by train_test_split; in-place column assignment on
# those frames can raise pandas' SettingWithCopyWarning on some versions.
train_data = train_data.assign(text=train_data["text"].apply(preprocess_text))
test_data = test_data.assign(text=test_data["text"].apply(preprocess_text))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# Instantiate the pretrained uncased BERT tokenizer and a
# sequence-classification model configured for two output labels.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Encode both splits with the same settings: truncation and padding enabled,
# with a maximum sequence length of 512.
tokenize_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(list(train_data["text"]), **tokenize_options)
test_encodings = tokenizer(list(test_data["text"]), **tokenize_options)

# Convert data to torch tensors
class IMDbDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Indexing returns one example as a dict of tensors: one entry per key of
    ``encodings`` plus a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        """Store the inputs without copying or converting them.

        Args:
            encodings: Mapping whose values are indexable by example position
                (it must provide ``.items()``).
            labels: Sequence of labels indexable by example position; its
                length defines the dataset length.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for the example at position ``idx``."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        # The label is added last, under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and its "label" column in an IMDbDataset.
train_labels = list(train_data["label"])
test_labels = list(test_data["label"])
train_dataset = IMDbDataset(train_encodings, train_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


: 

In [3]:
import accelerate

# Fine-tuning hyper-parameters, gathered in one mapping so they can be read at
# a glance: 3 epochs, batch size 8 per device for both training and
# evaluation, 500 warmup steps, weight decay 0.01, a log record every 10 steps.
training_config = {
    "output_dir": './results',
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": './logs',
    "logging_steps": 10,
}
training_args = TrainingArguments(**training_config)

# The held-out split is registered as the evaluation dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Kept as the last expression so the cell still displays the training result.
trainer.train()


  0%|          | 0/7500 [00:00<?, ?it/s]

In [None]:
import matplotlib.pyplot as plt

# Evaluate the model on the dataset registered with the Trainer as eval_dataset.
eval_result = trainer.evaluate()

# Plot training loss.
# log_history holds one dict per logging event. Only training-loss records have
# a "loss" key, and there are far more of them than epochs (one every
# `logging_steps` steps). Plotting them against range(1, num_train_epochs + 1)
# fails with a shape mismatch, so each record's own "epoch" value is used as x.
train_losses = trainer.state.log_history
loss_records = [entry for entry in train_losses if "loss" in entry and "epoch" in entry]
epochs = [entry["epoch"] for entry in loss_records]
train_loss_values = [entry["loss"] for entry in loss_records]

fig, ax = plt.subplots()
ax.plot(epochs, train_loss_values, 'b', label='Training loss')

# Draw the final evaluation loss as a reference line when evaluate() reported it.
eval_loss = eval_result.get("eval_loss")
if eval_loss is not None:
    ax.axhline(eval_loss, color='r', linestyle='--', label='Validation loss')

ax.set_title('Training loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()
