In [None]:
# Mount Google Drive at /content/drive; the project folder and the saved
# model used later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os

# Root folder on Drive that holds every artifact of this project.
project_path = "/content/drive/MyDrive/Sentiment_Project"

# Create the folder on first use and report which case applied.
if os.path.exists(project_path):
    print("Project folder already exists!")
else:
    os.makedirs(project_path)
    print("Project folder created!")


In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


In [None]:
# Sub-folder of the Drive project directory where BERT will be saved.
bert_save_path = f"{project_path}/bert_model"
print("BERT will be saved at:", bert_save_path)


BERT

In [None]:
# Import and Install the required libraries
!pip install -q transformers datasets scikit-learn

import torch
import numpy as np
import pandas as pd
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import DataCollatorWithPadding
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Load the "imdb" dataset and report the size of the two splits used below
dataset = load_dataset("imdb")

for split_label, split_key in (("Train", "train"), ("Test", "test")):
    print(f"{split_label} size:", len(dataset[split_key]))

In [None]:
# Load the tokenizer for the pretrained checkpoint named by `model_name`.
# `model_name` is reused further down when the classification model is
# loaded, so tokenizer and model come from the same checkpoint.
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: Mapping whose "text" entry holds the raw strings of the
            batch (the form in which ``dataset.map(..., batched=True)``
            passes data to this function).

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        to at most 512 tokens.

    Sequences are intentionally NOT padded here. Padding to
    ``max_length`` would store and process every example at the full
    512 tokens; instead, the ``DataCollatorWithPadding`` handed to the
    Trainer pads each batch only to the length of its longest member.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )

# Apply `tokenize_function` to the dataset in batches. The raw "text"
# column is removed from the result, leaving the tokenizer outputs and the
# remaining original columns.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

In [None]:
# Rename the "label" column to "labels".
# The rename is guarded so this cell can be re-run: after the first run the
# "label" column no longer exists and an unconditional rename would raise.
if "label" in tokenized_datasets["train"].column_names:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Shuffle both splits with a fixed seed so the order is reproducible.
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
test_dataset  = tokenized_datasets["test"].shuffle(seed=42)


In [None]:
# Load the pretrained checkpoint with a two-label classification head

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Move the model to the GPU when CUDA is available, otherwise keep it on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

print(f"Using device: {device}")

In [None]:
# Training configuration for the Trainer, grouped by purpose
training_args = TrainingArguments(
    # Output location and logging
    output_dir="./results",
    report_to="none",
    logging_steps=500,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; when training ends, reload the
    # checkpoint with the highest eval accuracy
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
)

In [None]:
# Metrics callback handed to the Trainer
def compute_metrics(eval_pred):
    """Turn raw model outputs into classification metrics.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 use binary averaging and evaluate to 0
        instead of warning when their denominator is zero.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)

    binary_options = {'average': 'binary', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'precision': precision_score(labels, predicted_classes, **binary_options),
        'recall': recall_score(labels, predicted_classes, **binary_options),
        'f1': f1_score(labels, predicted_classes, **binary_options),
    }

In [None]:
# Collator that pads the examples of each batch to a common length using `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire model, training configuration, data and metrics into one Trainer.
# NOTE(review): `eval_dataset` is the test split (`test_dataset`), and
# `training_args` selects the best checkpoint by `eval_accuracy`. Checkpoint
# selection is therefore made on test data, so the test metrics reported
# later are optimistic. Consider holding out a validation split from the
# training data for this purpose.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
# Train BERT: run the training loop of the `trainer` configured above
print("Starting BERT training...")
trainer.train()

In [None]:
# Evaluate BERT

In [None]:
# Evaluate BERT on the Trainer's eval dataset and print every returned metric
results = trainer.evaluate()

banner = "=" * 50
print("\n" + banner)
print("BERT MODEL TEST RESULTS")
print(banner)
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Comparison table for all three models.
# NOTE(review): the Custom LSTM and AWD-LSTM rows are hard-coded numbers,
# presumably copied from earlier experiments; confirm they are current.
rule = "=" * 60
print("\n" + rule)
print("FINAL MODEL COMPARISON")
print(rule)

# One row per model; the BERT row is filled from the evaluation results.
comparison = pd.DataFrame(
    [
        ['Custom LSTM', 0.8166, 0.8012, 0.8422, 0.8212],
        ['AWD-LSTM (ULMFiT)', 0.9058, 0.8923, 0.9230, 0.9074],
        ['BERT', results['eval_accuracy'], results['eval_precision'],
         results['eval_recall'], results['eval_f1']],
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print(comparison.to_string(index=False))
print(rule)

In [None]:
# Visual comparison: grouped bar chart of every metric for the three models
import matplotlib.pyplot as plt

metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# Read the scores from the `comparison` table built in the previous cell
# instead of repeating the hard-coded numbers, so the chart cannot drift
# out of sync with the table.
scores_by_model = comparison.set_index('Model')[metrics]
custom_scores = scores_by_model.loc['Custom LSTM'].tolist()
awd_scores = scores_by_model.loc['AWD-LSTM (ULMFiT)'].tolist()
bert_scores = scores_by_model.loc['BERT'].tolist()

# One group of three bars per metric, centred on the metric's tick.
x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar(x - width, custom_scores, width, label='Custom LSTM', color='skyblue')
bars2 = ax.bar(x, awd_scores, width, label='AWD-LSTM', color='lightgreen')
bars3 = ax.bar(x + width, bert_scores, width, label='BERT', color='salmon')

ax.set_ylabel('Scores')
ax.set_title('Model Comparison - Sentiment Analysis')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.set_ylim(0, 1)

# Add value labels just above the top of each bar
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(bar.get_x() + bar.get_width()/2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Save the BERT model and its tokenizer to the Drive project folder.
# Uses `bert_save_path` (defined near the top of the notebook, and equal to
# the path that was previously hard-coded here) so the save location is
# configured in a single place.
model.save_pretrained(bert_save_path)
tokenizer.save_pretrained(bert_save_path)
print("BERT model saved!")

In [None]:
# Cell A: Imports & Device Setup
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Cell B: Load Saved BERT Model & Tokenizer
# Both are read from the Drive folder the training section saved them to.
saved_bert_dir = "/content/drive/MyDrive/Sentiment_Project/bert_model"

bert_model = BertForSequenceClassification.from_pretrained(saved_bert_dir)
bert_tokenizer = BertTokenizer.from_pretrained(saved_bert_dir)

# Move the model to the selected device and switch it to evaluation mode.
bert_model.to(device)
bert_model.eval()
# The check-mark emoji was stored as mojibake ("âœ…"); restored to the intended character.
print("✅ BERT model loaded successfully!")