# In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import pickle
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm
import pandas as pd

class SentimentDataset(Dataset):
    """Map-style dataset over an already-tokenized collection of examples.

    The wrapped collection is stored as-is (no copy); length and indexing
    are delegated straight to it.
    """

    def __init__(self, tokenized_data):
        """Keep a reference to ``tokenized_data`` for later lookups."""
        self.tokenized_data = tokenized_data

    def __len__(self):
        """Return how many examples the wrapped collection holds."""
        examples = self.tokenized_data
        return len(examples)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        examples = self.tokenized_data
        return examples[idx]

def _cache_is_consistent(tokenized_data, expected_count, max_len):
    """Return True if a loaded cache plausibly matches the current request.

    Checks that the number of cached examples equals ``expected_count`` and
    that the first example's ``'input_ids'`` has length ``max_len``. Any
    structural surprise (wrong type, missing key, ...) counts as a mismatch.
    """
    try:
        if len(tokenized_data) != expected_count:
            return False
        if expected_count == 0:
            return True
        return tokenized_data[0]['input_ids'].shape[0] == max_len
    except (AttributeError, IndexError, KeyError, TypeError):
        return False


def preprocess_and_tokenize(reviews, labels, tokenizer, max_len, cache_file=None):
    """Tokenize reviews into per-example tensor dicts, with an optional pickle cache.

    Args:
        reviews: Sequence of review texts; must support ``len``.
        labels: Iterable of integer labels, paired with ``reviews`` by
            position (pairing stops at the shorter of the two).
        tokenizer: Callable invoked as ``tokenizer(text, max_length=max_len,
            padding='max_length', truncation=True, return_tensors='pt')``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors; dimension 0 is squeezed away (assumed to be a batch
            dimension of size 1 -- confirm for non-Hugging Face tokenizers).
        max_len: Sequence length passed to the tokenizer as ``max_length``.
        cache_file: Optional path. If the file exists and matches the current
            inputs it is loaded instead of tokenizing; otherwise the freshly
            tokenized data is written to it.

    Returns:
        A list of dicts with keys ``'input_ids'``, ``'attention_mask'`` and
        ``'label'`` (a ``torch.long`` tensor).
    """
    if cache_file and os.path.exists(cache_file):
        print(f"Loading tokenized dataset from {cache_file}")
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # cache_file at files this function wrote itself.
        with open(cache_file, 'rb') as f:
            tokenized_data = pickle.load(f)

        # A cache built from a different dataset or a different max_len would
        # otherwise be returned silently; detect that and rebuild instead.
        if _cache_is_consistent(tokenized_data, len(reviews), max_len):
            return tokenized_data
        print(f"Cached dataset in {cache_file} does not match the current inputs; re-tokenizing")

    print("Tokenizing dataset...")
    tokenized_data = []
    for review, label in tqdm(zip(reviews, labels), total=len(reviews), desc="Tokenizing"):
        encoding = tokenizer(
            review,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        tokenized_data.append({
            # Drop dimension 0 so each example is a 1-D tensor that the
            # DataLoader can stack into a batch.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        })

    if cache_file:
        print(f"Saving tokenized dataset to {cache_file}")
        with open(cache_file, 'wb') as f:
            pickle.dump(tokenized_data, f)

    return tokenized_data

# --- Load the raw data -------------------------------------------------------
# The first CSV column is used as the review text and the second as the
# sentiment label. Only the exact string 'positive' is encoded as 1; every
# other value becomes 0.
data = pd.read_csv('IMDB_reviews.csv')
reviews = data.iloc[:, 0].tolist()
labels = [1 if sentiment == 'positive' else 0 for sentiment in data.iloc[:, 1].tolist()]

# --- 70 / 15 / 15 train / validation / test split ----------------------------
# 30% is held out first, then that held-out part is halved.
train_reviews, temp_reviews, train_labels, temp_labels = train_test_split(
    reviews,
    labels,
    test_size=0.3,
    random_state=42,
)
val_reviews, test_reviews, val_labels, test_labels = train_test_split(
    temp_reviews,
    temp_labels,
    test_size=0.5,
    random_state=42,
)

# --- Tokenization (cached on disk per split) ---------------------------------
max_len = 512
batch_size = 16
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_cache_file = "train_tokenized.pkl"
val_cache_file = "val_tokenized.pkl"
test_cache_file = "test_tokenized.pkl"

train_tokenized = preprocess_and_tokenize(
    train_reviews, train_labels, tokenizer, max_len, train_cache_file
)
val_tokenized = preprocess_and_tokenize(
    val_reviews, val_labels, tokenizer, max_len, val_cache_file
)
test_tokenized = preprocess_and_tokenize(
    test_reviews, test_labels, tokenizer, max_len, test_cache_file
)

# --- Datasets and loaders ----------------------------------------------------
# Only the training loader shuffles.
train_dataset = SentimentDataset(train_tokenized)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = SentimentDataset(val_tokenized)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = SentimentDataset(test_tokenized)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# --- Model, device, optimizer ------------------------------------------------
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fn = torch.nn.CrossEntropyLoss()

def train_model(model, data_loader, optimizer, loss_fn, device):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``.loss`` tensor.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` entries; must support ``len``.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Not used in this function; the loss is read from the model
            output. Kept so the call signature stays the same.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for minibatch in tqdm(data_loader, desc="Training"):
        ids = minibatch['input_ids'].to(device)
        mask = minibatch['attention_mask'].to(device)
        targets = minibatch['label'].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        batch_loss = model(ids, attention_mask=mask, labels=targets).loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    num_batches = len(data_loader)
    return running_loss / num_batches

def evaluate_model(model, data_loader, device):
    """Collect predicted and true labels for every example in ``data_loader``.

    The model is switched to eval mode and run under ``torch.no_grad()``.
    It is left in eval mode on return.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``; the prediction is the argmax over
            dimension 1.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` entries.
        device: Device the model inputs are moved to.

    Returns:
        A ``(predictions, true_labels)`` tuple of lists, in loader order.
    """
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            predictions.extend(preds)

            # Labels are only collected on the host, so they are not copied to
            # `device` first; .cpu() is a no-op for tensors already on the CPU.
            true_labels.extend(batch['label'].cpu().numpy())

    return predictions, true_labels

# --- Training loop -----------------------------------------------------------
# Each epoch runs one training pass and then reports validation accuracy.
# NOTE(review): no checkpoint is saved and there is no early stopping, so the
# test metrics below come from the weights after the final epoch. 100 epochs
# looks high for fine-tuning -- confirm this is intended.
epochs = 100
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    print(f"Training loss: {train_loss}")

    val_predictions, val_true_labels = evaluate_model(model, val_loader, device)
    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f"Validation accuracy: {val_accuracy}")

# --- Final evaluation on the held-out test split -----------------------------
test_predictions, test_true_labels = evaluate_model(model, test_loader, device)
test_accuracy = accuracy_score(test_true_labels, test_predictions)
print(f"Test accuracy: {test_accuracy}")

conf_matrix = confusion_matrix(test_true_labels, test_predictions)
print("Confusion Matrix:", conf_matrix, sep="\n")

print("\nClassification Report:")
print(
    classification_report(
        test_true_labels,
        test_predictions,
        target_names=['Negative', 'Positive'],
    )
)