### Making necessary imports

In [1]:
import torch
import torch.optim as optim

### Importing datasets

In [2]:
from datasets import load_dataset

dataset = load_dataset('imdb')

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
import pandas as pd

train_data = pd.DataFrame(dataset['train'])
test_data = pd.DataFrame(dataset['train'])

In [5]:
x_train = train_data['text']
y_train = train_data['label']

from sklearn.model_selection import train_test_split

x_val, x_test, y_val, y_test = train_test_split(test_data['text'], test_data['label'], test_size=0.5)

In [6]:
import numpy as np

x_val = np.array(x_val)
y_val = np.array(y_val)
x_test = np.array(x_test)
y_test = np.array(y_test)

### Getting a Pretrained Model and Tokenizer

In [7]:
import transformers
from transformers import AutoTokenizer, AutoModel

In [8]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="google-bert/bert-base-uncased")
bert_model = AutoModel.from_pretrained(pretrained_model_name_or_path="google-bert/bert-base-uncased")

2025-09-28 16:17:31.982936: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-28 16:17:31.995716: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759056452.011241 1034142 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759056452.015813 1034142 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1759056452.028074 1034142 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

### Understanding the Model

In [9]:
# model

In [10]:
bert_model.pooler.dense.in_features, bert_model.pooler.dense.out_features 

(768, 768)

In [11]:
inputs = tokenizer(
    ["Example sentence 1", "Example sentence 2"],
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)
outputs = bert_model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
dict(outputs)['last_hidden_state'].shape

torch.Size([2, 5, 768])

The shape `torch.Size([2, 5, 768])` corresponds to the `last_hidden_state` output from BERT. Here’s what each dimension means:

- `2`: The batch size (number of input sentences in the batch).
- `5`: The sequence length (number of tokens in each input sentence, including special tokens like `[CLS]` and `[SEP]`).
- `768`: The hidden size (the dimensionality of the BERT embeddings for each token; for `bert-base-uncased`, this is always 768).

So, for each of the 2 input sentences, BERT outputs a sequence of 5 token embeddings, each of size 768.

### Preparing the data

In [12]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, X, y):
        super().__init__()
        
        self.x = [
            tokenizer(
                x,
                padding='max_length',
                truncation=True,
                max_length=200,
                padding_side='right',
                return_tensors='pt',
                return_token_type_ids=False
            ) for x in X
        ] 
        
        self.y = torch.tensor(y, dtype=torch.float32)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [13]:
train_data = CustomDataset(x_train, y_train)
test_data = CustomDataset(x_test, y_test)
# val_data = CustomDataset(x_val, y_val)

In [14]:
len(dict(train_data[2][0])['input_ids'][0])

200

### Building data_loaders

In [15]:
from torch.utils.data import DataLoader

train_data_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_data_loader = DataLoader(test_data, batch_size=32, shuffle=True)

In [16]:
train_data_loader.dataset[0]

({'input_ids': tensor([[  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
           2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
           2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
           2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
           1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
           2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
           6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
           1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
           5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
          14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
           1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
           2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
          25430, 14728,  2

In [17]:
bert_model(input_ids=torch.tensor(
    [[  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
           2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
           2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
           2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
           1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
           2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
           6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
           1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
           5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
          14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
           1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
           2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
          25430, 14728,  2245,  2055,  3056,  2576,  3314,  2107,  2004,  1996,
           5148,  2162,  1998,  2679,  3314,  1999,  1996,  2142,  2163,  1012,
           1999,  2090,  4851,  8801,  1998,  6623,  7939,  4697,  3619,  1997,
           8947,  2055,  2037, 10740,  2006,  4331,  1010,  2016,  2038,  3348,
           2007,  2014,  3689,  3836,  1010, 19846,  1010,  1998,  2496,  2273,
           1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  2054,
           8563,  2033,  2055,  1045,  2572,  8025,  1011,  3756,  2003,  2008,
           2871,  2086,  3283,  1010,  2023,  2001,  2641, 26932,  1012,   102]]), 
        attention_mask=torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]]), return_dict=False)[0].shape

torch.Size([1, 200, 768])

### Setting up HyperParams

In [18]:
BATCH_SIZE = 32
EPOCHS = 20

### Building a Model

In [19]:
import torch.nn as nn

class SentimentModel(nn.Module):
    def __init__(self, bert):
        super(SentimentModel, self).__init__()
        self.bert = bert
        self.model_extrension = nn.Sequential(
            nn.Linear(768, 384),
            nn.Dropout(0.25),
            nn.Linear(384, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask):
        outs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)[0][:, 0]
        output = self.model_extrension(outs)
        return output


In [20]:
for params in list(bert_model.parameters()):
    params.requires_grad = False

In [23]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentModel(bert=bert_model).to(device)
model

SentimentModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [24]:
# Loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

train_acc_plot = []
train_loss_plot = []

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    correct = 0
    total = 0

    for data in train_data_loader:
        input, labels = data
        optimizer.zero_grad()
        input_ids = input['input_ids'].squeeze(1).to(device)
        attention_mask = input['attention_mask'].squeeze(1).to(device)
        labels = labels.to(device).unsqueeze(1)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = (outputs > 0.5).float()
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = train_loss / len(train_data_loader)
    accuracy = correct / total
    train_loss_plot.append(avg_loss)
    train_acc_plot.append(accuracy)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/20, Loss: 0.6930, Accuracy: 0.5001
Epoch 2/20, Loss: 0.6853, Accuracy: 0.5008
Epoch 3/20, Loss: 0.6706, Accuracy: 0.5577
Epoch 4/20, Loss: 0.6516, Accuracy: 0.6580
Epoch 5/20, Loss: 0.6373, Accuracy: 0.7076
Epoch 6/20, Loss: 0.6270, Accuracy: 0.7353
Epoch 7/20, Loss: 0.6201, Accuracy: 0.7511
Epoch 8/20, Loss: 0.6157, Accuracy: 0.7592
Epoch 9/20, Loss: 0.6121, Accuracy: 0.7661
Epoch 10/20, Loss: 0.6095, Accuracy: 0.7706
Epoch 11/20, Loss: 0.6071, Accuracy: 0.7750
Epoch 12/20, Loss: 0.6052, Accuracy: 0.7776
Epoch 13/20, Loss: 0.6032, Accuracy: 0.7823
Epoch 14/20, Loss: 0.6012, Accuracy: 0.7890
Epoch 15/20, Loss: 0.6013, Accuracy: 0.7853
Epoch 16/20, Loss: 0.6003, Accuracy: 0.7881
Epoch 17/20, Loss: 0.5989, Accuracy: 0.7924
Epoch 18/20, Loss: 0.5983, Accuracy: 0.7926
Epoch 19/20, Loss: 0.5981, Accuracy: 0.7924
Epoch 20/20, Loss: 0.5973, Accuracy: 0.7932


In [25]:
model.eval()
val_loss = 0
val_correct = 0
val_total = 0

val_loss_plot = []
val_acc_plot = []

with torch.no_grad():
    for batch in test_data_loader:
        inputs, labels = batch
        input_ids = inputs['input_ids'].squeeze(1).to(device)
        attention_mask = inputs['attention_mask'].squeeze(1).to(device)
        labels = labels.to(device).unsqueeze(1)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        val_loss += loss.item()

        preds = (outputs > 0.5).float()
        val_correct += (preds == labels).sum().item()
        val_total += labels.size(0)
        
        val_loss_plot.append(val_loss / val_total)
        val_acc_plot.append(val_correct / BATCH_SIZE)

avg_val_loss = val_loss / len(test_data_loader)
val_accuracy = val_correct / val_total


print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

Validation Loss: 0.5913, Validation Accuracy: 0.8210


In [26]:
def predict_sentiment(texts):
    model.eval()
    results = []
    with torch.no_grad():
        for text in texts:
            encoded = tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=200,
                return_tensors='pt'
            )
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)
            output = model(input_ids, attention_mask)
            pred = (output > 0.5).float().item()
            results.append(int(pred))
    return results

# Example usage:
custom_texts = ["This movie was awesome", "I did not like this film."]
predictions = predict_sentiment(custom_texts)
print(predictions)

[1, 0]


In [27]:
torch.save(model.state_dict(), "sentiment_model.pth")

In [None]:
model.load_state_dict(torch.load("sentiment_model.pth", map_location=device))
model.eval()

predictions = predict_sentiment(custom_texts)
print(predictions)

[1, 0]
