## 1. Install and Import Libraries

In [1]:
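# Install required libraries (run once; uncomment the line below).
# `%pip` installs into the environment of the running kernel, unlike `!pip`.
# %pip install -q transformers torch datasets accelerate scikit-learn pandas matplotlib seaborn tqdm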

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time

# Reproducibility: fix the PyTorch and NumPy random seeds.
torch.manual_seed(42)
np.random.seed(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

print("\nLibraries imported successfully!")

## 2. Load Dataset

In [None]:
# Read the cleaned, labelled reviews from disk.
print("Loading dataset...")
df = pd.read_csv('../data/cleaned_label.csv')

# Optional: work on a random subset for a faster run (uncomment to enable).
# df = df.sample(n=5000, random_state=42).reset_index(drop=True)

# Quick overview: size, column names, class balance and a preview of the rows.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nSentiment distribution:", df['sentiment_label'].value_counts(), sep="\n")
print(f"\nFirst few rows:", df.head(), sep="\n")

## 3. Initialize BERT Cased Tokenizer

In [None]:
# Checkpoint name shared by the tokenizer and the model cells.
MODEL_NAME = 'bert-base-cased'

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

print(
    f"Tokenizer loaded successfully!",
    f"Vocabulary size: {tokenizer.vocab_size}",
    sep="\n",
)

# Smoke test on a mixed-case sentence so the effect of the cased vocabulary
# on the produced tokens can be inspected.
sample_text = "This MOVIE is ABSOLUTELY fantastic!"
tokens = tokenizer.tokenize(sample_text)
print(
    f"\nSample text: {sample_text}",
    f"Tokens (case preserved): {tokens}",
    f"Token IDs: {tokenizer.convert_tokens_to_ids(tokens)}",
    sep="\n",
)

## 4. Create Dataset Class

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item for classification.

    Args:
        texts: Sequence of review texts (list, NumPy array or pandas Series).
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Hugging Face tokenizer; it is called directly on each text.
        max_length: Every encoded sequence is truncated/padded to this length.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise both inputs as plain lists so that __getitem__ always
        # indexes by position. A pandas Series with a non-default index would
        # otherwise be indexed by label and raise KeyError or return the
        # wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th review.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        # str() guards against non-string cells (e.g. numbers) in the text column.
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated encode_plus(); it accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension; flatten it so
        # the DataLoader can stack items into batches itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("Dataset class created!")

## 5. Prepare Data

In [None]:
# Hold out 20% of the rows, stratified on the label so both splits keep the
# same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    df['review_text'].values,
    df['sentiment_label'].values,
    stratify=df['sentiment_label'].values,
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {len(X_train)}", f"Testing samples: {len(X_test)}", sep="\n")

# Tokenization length and batch size shared by both splits.
MAX_LENGTH = 128
BATCH_SIZE = 16

train_dataset = SentimentDataset(X_train, y_train, tokenizer, max_length=MAX_LENGTH)
test_dataset = SentimentDataset(X_test, y_test, tokenizer, max_length=MAX_LENGTH)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

print(
    f"\nBatch size: {BATCH_SIZE}",
    f"Training batches: {len(train_loader)}",
    f"Testing batches: {len(test_loader)}",
    sep="\n",
)

## 6. Initialize BERT Cased Model

In [None]:
# Load the pretrained checkpoint with a two-class classification head and
# move it to the selected device (Module.to returns the module itself).
print(f"Loading model: {MODEL_NAME}")
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).to(device)

print(f"Model loaded successfully!")
print(
    f"\nModel architecture:",
    f"  - Parameters: {sum(p.numel() for p in model.parameters()):,}",
    f"  - Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}",
    f"  - Case sensitive: YES",
    sep="\n",
)

## 7. Setup Training

In [None]:
# Training parameters
EPOCHS = 1
LEARNING_RATE = 2e-5

# Optimizer: PyTorch's own AdamW. The AdamW class exported by `transformers`
# is deprecated upstream in favour of torch.optim.AdamW.
# weight_decay=0.0 keeps the update rule of the transformers class, whose
# default was no weight decay (torch.optim.AdamW defaults to 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-8,
    weight_decay=0.0,
)

# Scheduler: linear decay of the learning rate over every optimizer step,
# with no warmup steps.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

print(f"Training setup:")
print(f"  - Epochs: {EPOCHS}")
print(f"  - Learning rate: {LEARNING_RATE}")
print(f"  - Total steps: {total_steps}")
print(f"  - Optimizer: AdamW")
print(f"  - Scheduler: Linear with warmup")

## 8. Training Functions

In [None]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Classifier whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object exposing
            ``loss`` and ``logits``.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64
        tensor holding the fraction of processed samples predicted correctly;
        ``mean_loss`` is the unweighted mean of the per-batch losses.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()
    losses = []
    loss_total = 0.0
    correct_predictions = 0
    # Count the samples actually processed instead of trusting
    # len(data_loader.dataset): the two differ when the loader drops the
    # last batch or draws from a sampler.
    samples_seen = 0

    progress_bar = tqdm(data_loader, desc='Training')

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        samples_seen += labels.size(0)

        batch_loss = loss.item()
        losses.append(batch_loss)
        loss_total += batch_loss

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Running mean from a running sum: avoids re-averaging the whole
        # loss history on every step.
        progress_bar.set_postfix({'loss': loss_total / len(losses)})

    if samples_seen == 0:
        raise ValueError("train_epoch received a data_loader that yielded no samples")

    return correct_predictions.double() / samples_seen, np.mean(losses)

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Classifier whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object exposing
            ``loss`` and ``logits``.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, mean_loss, predictions, true_labels)``.
        ``accuracy`` is a 0-dim float64 tensor holding the fraction of
        processed samples predicted correctly, ``mean_loss`` is the unweighted
        mean of the per-batch losses, and ``predictions`` / ``true_labels``
        are flat lists with one entry per processed sample.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    losses = []
    correct_predictions = 0
    # Count the samples actually processed instead of trusting
    # len(data_loader.dataset): the two differ when the loader drops the
    # last batch or draws from a sampler.
    samples_seen = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            samples_seen += labels.size(0)
            losses.append(loss.item())

            # Move to the CPU so the collected values can be handed to
            # NumPy-based metric functions later on.
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    if samples_seen == 0:
        raise ValueError("eval_model received a data_loader that yielded no samples")

    accuracy = correct_predictions.double() / samples_seen
    return accuracy, np.mean(losses), predictions, true_labels

print("Training functions defined!")

## 9. Train Model

In [None]:
# Per-epoch metrics, collected for the plots further down.
history = {key: [] for key in ('train_acc', 'train_loss', 'val_acc', 'val_loss')}

print("="*80, "Starting training BERT Base Cased...", "="*80, sep="\n")

start_time = time.time()

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}", "-" * 80, sep="\n")

    # One optimisation pass over the training batches.
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Train loss: {train_loss:.4f} | Train accuracy: {train_acc:.4f}")

    # The held-out split doubles as the validation set here.
    val_acc, val_loss, _, _ = eval_model(model, test_loader, device)
    print(f"Val loss: {val_loss:.4f} | Val accuracy: {val_acc:.4f}")

    # Accuracies come back as 0-dim tensors; store plain Python floats.
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['train_acc'].append(train_acc.item())
    history['val_acc'].append(val_acc.item())

training_time = time.time() - start_time

print(
    "\n" + "="*80,
    f"Training completed in {training_time:.2f} seconds ({training_time/60:.2f} minutes)",
    "="*80,
    sep="\n",
)

## 10. Evaluate Model

In [None]:
# Score the trained model on the held-out split and keep the per-sample
# predictions for the report and the confusion matrix.
print("\nFinal Evaluation:", "="*80, sep="\n")

test_acc, test_loss, predictions, true_labels = eval_model(model, test_loader, device)

print(
    f"Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)",
    f"Test Loss: {test_loss:.4f}",
    sep="\n",
)

# Per-class precision / recall / F1.
print("\nClassification Report:", "-"*80, sep="\n")
print(classification_report(true_labels, predictions, target_names=['Negative', 'Positive']))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(true_labels, predictions)
print("\nConfusion Matrix:", cm, sep="\n")

## 11. Visualize Results

In [None]:
# Training curves: accuracy on the left panel, loss on the right panel.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Accuracy panel
ax1.plot(history['train_acc'], marker='o', label='Train Accuracy')
ax1.plot(history['val_acc'], marker='o', label='Val Accuracy')
ax1.set(
    xlabel='Epoch',
    ylabel='Accuracy',
    title='BERT Base Cased - Training & Validation Accuracy',
)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Loss panel
ax2.plot(history['train_loss'], marker='o', label='Train Loss')
ax2.plot(history['val_loss'], marker='o', label='Val Loss')
ax2.set(
    xlabel='Epoch',
    ylabel='Loss',
    title='BERT Base Cased - Training & Validation Loss',
)
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Confusion matrix heatmap, annotated with the integer counts.
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=['Negative', 'Positive'],
    yticklabels=['Negative', 'Positive'],
)
plt.title('BERT Base Cased - Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

## 12. Test with Custom Reviews

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single review and report the model's confidence.

    Note: the model is switched to evaluation mode and left in that mode.

    Args:
        text: Review text to classify.
        model: Classifier whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object exposing ``logits``.
        tokenizer: Hugging Face tokenizer; it is called directly on ``text``.
        device: Device the encoded tensors are moved to.
        max_length: The encoded sequence is truncated/padded to this length.

    Returns:
        tuple: ``(prediction, confidence)`` - the index of the most probable
        class and its softmax probability, both as plain Python numbers.
    """
    model.eval()

    # Calling the tokenizer directly is the supported replacement for the
    # deprecated encode_plus(); it accepts the same keyword arguments.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Softmax over the class dimension turns the logits into probabilities.
        probs = torch.nn.functional.softmax(logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
        confidence = probs[0][prediction].item()

    return prediction, confidence

# Hand-written reviews with deliberately mixed casing.
test_reviews = [
    "This movie was ABSOLUTELY FANTASTIC! The acting was superb.",
    "terrible waste of time. Poor acting and BORING plot.",
    "It was Okay, not GREAT but not terrible either.",
    "ONE OF THE BEST films I've ever seen!",
    "Complete GARBAGE. Don't waste your money.",
]

print(
    "Testing custom reviews with BERT Base Cased:",
    "="*80,
    "Note: This model preserves case sensitivity\n",
    sep="\n",
)

for i, review in enumerate(test_reviews, start=1):
    prediction, confidence = predict_sentiment(review, model, tokenizer, device)
    # Class index 1 is reported as positive, anything else as negative.
    if prediction == 1:
        sentiment = "Positive ✓"
    else:
        sentiment = "Negative ✗"
    print(
        f"{i}. Review: {review}",
        f"   Prediction: {sentiment} (Confidence: {confidence:.2%})\n",
        sep="\n",
    )

print("="*80)

## 13. Save Model

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained().
model_save_path = './bert_cased_sentiment'

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to: {model_save_path}")

# Recap of the run.
print(
    "\n" + "="*80,
    "BERT Base Cased Summary:",
    "="*80,
    f"Model: {MODEL_NAME}",
    f"Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)",
    f"Training Time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)",
    f"Parameters: {sum(p.numel() for p in model.parameters()):,}",
    f"Case Sensitive: YES",
    "="*80,
    sep="\n",
)