# Sentiment Analysis with BERT on IMDB Dataset

### Mount Google Drive and Install Dependencies
- Mount Google Drive to access dataset files
- Install required libraries (transformers, pandas, torch)

In [1]:
# Colab-only setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# IPython shell escape (not plain Python): install the libraries imported below.
!pip install transformers pandas torch

Mounted at /content/drive
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading

### Import Required Libraries

In [8]:
import transformers
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn import model_selection
from sklearn import metrics
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

### Configuration Settings
- Define max sequence length, batch sizes, and number of epochs
- Set model/file paths (BERT model, dataset path, save path)
- Initialize BERT tokenizer

In [9]:
# --- Tokenization / training hyperparameters ---------------------------------
# MAX_LEN is handed to the tokenizer as max_length (with truncation and
# max_length padding) by the dataset classes in this notebook.
MAX_LEN = 128
TRAINING_BATCH_SIZE = 16
# Backward-compatible alias: the original, misspelled name is still referenced
# by other cells, so it must keep resolving to the same value.
TRAINING_BACH_SIZE = TRAINING_BATCH_SIZE
VALID_BATCH_SIZE = 32
EPOCHS = 3

# --- Model identifier and Google Drive file locations -------------------------
BERT_PATH = 'bert-base-uncased'
TRAINING_FILE = '/content/drive/MyDrive/IMDB Dataset.csv'
MODEL_PATH = '/content/drive/MyDrive/model.bin'

# One tokenizer instance shared by every dataset object in the notebook.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH, do_lower_case=True)


### BERT Model Architecture
- Define custom BERT classification model
- Add dropout layer for regularization
- Add linear output layer for binary classification

In [10]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a single-logit linear head.

    The model returns raw logits (no sigmoid is applied here); callers are
    expected to pair it with a logit-based loss or apply ``torch.sigmoid``
    themselves.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        # Size the head from the loaded encoder's config rather than a
        # hard-coded 768, so a different BERT_PATH with another hidden width
        # still produces a matching linear layer.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids=None):
        """Return one logit per input sequence.

        Args:
            ids: Token id tensor passed positionally to the encoder.
            mask: Attention mask forwarded as ``attention_mask``.
            token_type_ids: Optional segment ids forwarded unchanged.

        Returns:
            Output of the linear head applied to the dropped-out
            ``pooler_output`` of the encoder (last dimension of size 1).
        """
        encoded = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        dropped = self.bert_drop(encoded.pooler_output)
        return self.out(dropped)

### Dataset Preparation Class
- Create custom Dataset class for IMDB reviews
- Handle tokenization and padding
- Return formatted tensors (input_ids, attention_mask, targets)

In [11]:
class BERTDataset(Dataset):
    """Map-style dataset pairing tokenized reviews with float targets.

    Subclasses ``torch.utils.data.Dataset`` so it matches the other dataset
    class in this notebook. Indexing returns a dict of tensors with the keys
    ``ids``, ``mask``, ``token_type_ids`` and ``targets``.
    """

    def __init__(self, review, target):
        """Store the indexable reviews/targets and the shared tokenizer settings."""
        self.review = review
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        """Tokenize review ``item`` and return it with its target as tensors."""
        # truncation=True together with padding='max_length' makes the
        # tokenizer return sequences of exactly ``max_len`` entries, so no
        # manual padding step is needed afterwards.
        inputs = self.tokenizer.encode_plus(
            str(self.review[item]),  # str() guards against non-string cells
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(
                inputs["token_type_ids"], dtype=torch.long
            ),
            # Float targets, as consumed by the BCE-with-logits loss.
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

### Training/Evaluation Functions
- Define binary cross-entropy loss function
- Create training loop (one optimizer step and one scheduler step per batch)
- Create evaluation function that returns predicted probabilities and targets
- Use sigmoid activation for probability outputs

In [12]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed directly on raw logits.

    ``targets`` is viewed as a column (``-1, 1``) before being compared with
    ``outputs``; the default mean reduction of ``BCEWithLogitsLoss`` applies.
    """
    criterion = nn.BCEWithLogitsLoss()
    column_targets = targets.view(-1, 1)
    return criterion(outputs, column_targets)


def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training pass over ``data_loader``.

    For every batch: move the tensors to ``device``, run the model, compute
    the loss with ``loss_fn``, back-propagate, then step the optimizer and
    the scheduler once. Nothing is returned.
    """
    model.train()

    for batch in tqdm(data_loader, total=len(data_loader)):
        input_ids = batch["ids"].to(device, dtype=torch.long)
        segment_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        attention = batch["mask"].to(device, dtype=torch.long)
        labels = batch["targets"].to(device, dtype=torch.float)

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        logits = model(
            ids=input_ids, mask=attention, token_type_ids=segment_ids
        )

        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

def eval_fn(data_loader, model, device):
    """Collect sigmoid outputs and targets over ``data_loader``.

    The model is switched to eval mode and run under ``torch.no_grad()``.

    Returns:
        ``(outputs, targets)`` as Python lists: the sigmoid of the model
        output for each batch, and the corresponding targets, both extended
        batch by batch in loader order.
    """
    model.eval()
    collected_probs = []
    collected_targets = []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            input_ids = batch["ids"].to(device, dtype=torch.long)
            segment_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            attention = batch["mask"].to(device, dtype=torch.long)
            labels = batch["targets"].to(device, dtype=torch.float)

            logits = model(
                ids=input_ids, mask=attention, token_type_ids=segment_ids
            )
            probabilities = torch.sigmoid(logits)

            collected_targets.extend(labels.detach().cpu().numpy().tolist())
            collected_probs.extend(
                probabilities.detach().cpu().numpy().tolist()
            )

    return collected_probs, collected_targets


### Main Training Execution
- Load and split IMDB dataset (90% train, 10% validation)
- Create dataloaders with proper batching
- Set up optimizer with weight decay parameters
- Implement learning rate scheduling
- Train for multiple epochs with model checkpointing

In [13]:
def run():
    """Fine-tune the BERT classifier on the IMDB CSV and save the best model.

    Steps:
      1. Read ``TRAINING_FILE``, map ``"positive"`` to 1 and everything else
         to 0, and make a stratified 90/10 train/validation split.
      2. Build the datasets and loaders (the training loader is shuffled).
      3. Create AdamW with weight decay disabled for bias/LayerNorm
         parameters, plus a linear schedule without warmup.
      4. Train for ``EPOCHS`` epochs; after each one, evaluate on the
         validation split, print the accuracy, and write the state dict to
         ``MODEL_PATH`` whenever the accuracy improves.
    """
    dfx = pd.read_csv(TRAINING_FILE).fillna("none")
    dfx["sentiment"] = (dfx["sentiment"] == "positive").astype(int)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # Fall back to CPU instead of crashing when no GPU is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_dataset = BERTDataset(
        review=df_train.review.values, target=df_train.sentiment.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAINING_BACH_SIZE,
        shuffle=True,  # present batches in a new order every epoch
        num_workers=0,
        pin_memory=device.type == "cuda",  # pinning only helps GPU transfers
    )

    valid_dataset = BERTDataset(
        review=df_valid.review.values, target=df_valid.sentiment.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
    )

    model = BERTBaseUncased()
    model.to(device)

    # Parameters whose names contain one of these substrings get no weight
    # decay; all remaining parameters use a decay of 0.001.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decay_params = []
    no_decay_params = []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_parameters = [
        {"params": decay_params, "weight_decay": 0.001},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    # Count real optimizer steps: the loader also yields the final partial
    # batch, which len(df_train) / batch_size truncated away.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_accuracy = 0
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = eval_fn(valid_data_loader, model, device)
        # Threshold the sigmoid probabilities at 0.5 to get class labels.
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = accuracy

# Start training only when this code runs as the main module, not on import.
if __name__ == "__main__":
    run()


100%|██████████| 2813/2813 [15:30<00:00,  3.02it/s]
100%|██████████| 157/157 [00:34<00:00,  4.49it/s]


Accuracy Score = 0.8862


100%|██████████| 2813/2813 [15:29<00:00,  3.03it/s]
100%|██████████| 157/157 [00:36<00:00,  4.26it/s]


Accuracy Score = 0.8992


100%|██████████| 2813/2813 [15:28<00:00,  3.03it/s]
100%|██████████| 157/157 [00:35<00:00,  4.39it/s]


Accuracy Score = 0.8998


### Prediction Pipeline
- Create prediction dataset class for new texts
- Build prediction function with saved model
- Handle GPU/CPU device compatibility
- Return human-readable sentiment labels

In [14]:
class PredictionDataset(Dataset):
    """Dataset of unlabeled texts, tokenized on access for inference.

    Each item is a dict of ``torch.long`` tensors keyed ``ids``, ``mask``
    and ``token_type_ids``; there is no ``targets`` entry.
    """

    # (output key, tokenizer key) pairs, in the order the item dict is built.
    _FIELDS = (
        ("ids", "input_ids"),
        ("mask", "attention_mask"),
        ("token_type_ids", "token_type_ids"),
    )

    def __init__(self, texts):
        """Keep the indexable ``texts`` plus the shared tokenizer settings."""
        self.texts = texts
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize text ``item`` with truncation and max-length padding."""
        encoded = self.tokenizer.encode_plus(
            str(self.texts[item]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        return {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in self._FIELDS
        }

def predict_sentiment(texts, model_path=MODEL_PATH):
    """Label texts as ``"positive"`` or ``"negative"`` with the saved model.

    Args:
        texts: A sized, integer-indexable collection of texts. A single
            ``str`` is treated as one text rather than as a sequence of
            characters.
        model_path: Path of the state dict to load into ``BERTBaseUncased``.

    Returns:
        A list with one label per input text, in input order. A text is
        ``"positive"`` when its sigmoid probability is at least 0.5. Empty
        input yields an empty list without loading the model.
    """
    if isinstance(texts, str):
        texts = [texts]
    # len() rather than truthiness, so array-like inputs are handled too.
    if len(texts) == 0:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    # NOTE(review): torch.load can unpickle arbitrary objects; only point
    # model_path at checkpoints from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    dataset = PredictionDataset(texts)
    dataloader = DataLoader(dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch["ids"].to(device)
            mask = batch["mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)

            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            probabilities = torch.sigmoid(logits).cpu().numpy()
            predictions.extend(probabilities.flatten().tolist())

    return ["positive" if p >= 0.5 else "negative" for p in predictions]

### Sample Predictions

In [15]:
# A few hand-written reviews used to sanity-check the saved model.
sample_texts = [
    "This movie was absolutely fantastic! The acting was superb.",
    'ooh love it',
    'amazing actors but I hate it',
    'the actors are not good at acting'
]


# Uses predict_sentiment's default model_path (MODEL_PATH).
predictions = predict_sentiment(sample_texts)

# Each text is cut to its first 50 characters; the "..." suffix is printed
# unconditionally, even when nothing was cut.
for text, pred in zip(sample_texts, predictions):
    print(f"Text: {text[:50]}... | Prediction: {pred}")

Text: This movie was absolutely fantastic! The acting wa... | Prediction: positive
Text: ooh love it... | Prediction: positive
Text: amazing actors but I hate it... | Prediction: negative
Text: the actors are not good at acting... | Prediction: negative
