In [None]:
import torch
import torch.nn as nn

# Fix the PyTorch RNG so weight initialisation and shuffling are repeatable
torch.manual_seed(42)

# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
import os

# Disable the Hugging Face Hub symlink warning.
# huggingface_hub reads this flag when it is first imported, so it must be set
# before `transformers` (which imports huggingface_hub) is loaded.
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Specify the model's name
model_name = "google-bert/bert-base-uncased"

# Load the tokenizer to turn text into numbers
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with a classification head: 2 labels (positive, negative).
# The dropout applied in front of the head is configured through the config's
# `classifier_dropout`; `model.classifier` itself is a plain linear layer, so a
# dropout module attached to it would never be called in the forward pass.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    classifier_dropout=0.2,  # head dropout raised to 0.2
)

# Move model to device (assigning the result keeps the cell from echoing the
# full model repr as its output; `Module.to` returns the same module)
model = model.to(device)

In [None]:
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Fetch the IMDB reviews through the `datasets` library
dataset = load_dataset("imdb")

# Training uses the complete "train" split and validation uses the original
# "test" split; both are shuffled with the same fixed seed.
# NOTE(review): the validation set drives early stopping and best-checkpoint
# selection in the Trainer, so scores on the "test" split are not an unbiased
# estimate — consider carving a hold-out from "train" instead.
train_dataset, val_dataset = (
    dataset[split_name].shuffle(seed=42) for split_name in ("train", "test")
)

# Tokenize without padding; padding is left to the data collator
def tokenize_function(data):
    """Tokenize the ``"text"`` field of ``data`` with the shared tokenizer.

    Args:
        data: Mapping with a ``"text"`` entry (a single string or a batch of
            strings when used with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for those texts, with truncation enabled and no
        explicit ``max_length`` (so the tokenizer's own limit applies).
    """
    texts = data["text"]
    return tokenizer(texts, truncation=True)

# Metrics reported at every evaluation
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            one row of class scores per example and ``labels`` the reference
            class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``; the last three use binary averaging.
    """
    scores, gold = eval_pred
    # Highest-scoring class per row is the predicted label
    predicted = scores.argmax(axis=1)

    metrics = {"accuracy": accuracy_score(gold, predicted)}
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    # The fourth element (support) is not reported
    metrics.update(zip(("precision", "recall", "f1"), prf[:3]))
    return metrics

# Tokenize both splits in batches
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Collator that pads dynamically, batch by batch, instead of padding up front
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Optionally freeze the first N encoder layers of BERT to limit overfitting.
# 0 (the default) keeps every layer trainable; e.g. 3 freezes layers 0-2.
NUM_FROZEN_ENCODER_LAYERS = 0
for frozen_layer in model.base_model.encoder.layer[:NUM_FROZEN_ENCODER_LAYERS]:
    frozen_layer.requires_grad_(False)

# Trainer does not apply TrainingArguments.weight_decay to an optimizer that
# is handed to it ready-made, so the decay has to be configured here.
# Keep this in sync with the value passed to TrainingArguments.
WEIGHT_DECAY = 0.02

# Define parameter groups with different learning rates.
# Every trainable parameter must land in exactly one group: anything left out
# of the optimizer is never updated. The backbone group therefore holds all
# non-classifier weights (embeddings, encoder layers and pooler), not only
# those whose name contains "encoder".
encoder_params = []     # Trainable BERT backbone parameters
classifier_params = []  # Classifier head
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue  # frozen above; keep it out of the optimizer
    if 'classifier' in name:
        classifier_params.append(param)
    else:
        encoder_params.append(param)

# Custom optimizer with different LRs
optimizer = AdamW(
    [
        {'params': encoder_params, 'lr': 5e-5},     # Lower LR for the pretrained backbone
        {'params': classifier_params, 'lr': 1e-4},  # Higher LR for the freshly initialised head
    ],
    weight_decay=WEIGHT_DECAY,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert-sentiment",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Effective batch size per device: 32 * 2 accumulation steps = 64
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    # Only used when Trainer builds its own optimizer; an optimizer passed to
    # Trainer directly carries its own weight-decay setting.
    weight_decay=0.02,
    logging_steps=100,
    # Mixed precision needs a GPU: enable it only when CUDA is present so the
    # notebook still runs on the CPU fallback selected at the top.
    fp16=torch.cuda.is_available(),
    max_grad_norm=1.0,  # Gradient clipping
    lr_scheduler_type="cosine",
    warmup_steps=100,
    label_smoothing_factor=0.2
)

In [None]:
# Early stopping with a patience of 2 evaluations on the tracked metric
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Assemble the Trainer from the pieces prepared in the previous cells
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,      # pads each batch on the fly
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    # Custom optimizer with per-group learning rates; passing None for the
    # scheduler slot leaves scheduler creation to the Trainer.
    optimizers=(optimizer, None),
)

# Run fine-tuning
trainer.train()

In [None]:
# Load downloaded test data
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Text sentiment -> integer class id expected by the model
SENTIMENT_TO_LABEL = {'positive': 1, 'negative': 0}

# Rename the columns to what tokenize_function ('text') and Trainer ('label')
# expect, and convert the sentiment strings to class ids.
# NOTE(review): if this CSV is a re-export of the same IMDB reviews used for
# training, these scores are inflated by train/test overlap — confirm its origin.
test_dataset = (
    pd.read_csv('IMDB Dataset.csv')
    .rename(columns={'review': 'text', 'sentiment': 'label'})
    .assign(label=lambda frame: frame['label'].map(SENTIMENT_TO_LABEL))
)

# `map` turns every unknown sentiment into NaN; fail loudly instead of feeding
# NaN labels to the model.
unmapped_rows = test_dataset['label'].isna()
if unmapped_rows.any():
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows in 'IMDB Dataset.csv' have a sentiment "
        f"other than {sorted(SENTIMENT_TO_LABEL)}"
    )
test_dataset['label'] = test_dataset['label'].astype(int)

test_hf = Dataset.from_pandas(test_dataset)
tokenized_test = test_hf.map(tokenize_function, batched=True)

# Run inference once: `predict` returns the raw outputs, the labels and the
# compute_metrics results together, so a separate `evaluate` pass (which would
# also trigger the early-stopping callback again) is not needed.
# metric_key_prefix="eval" keeps the metric names identical to `evaluate`.
predictions = trainer.predict(tokenized_test, metric_key_prefix="eval")
test_results = predictions.metrics
print("Test Results:", test_results)

# Additional metrics
y_pred = predictions.predictions.argmax(axis=1)
y_true = predictions.label_ids

print("Accuracy:", accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))