In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
# Import AdamW from torch.optim instead of transformers
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Check for cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

df = pd.read_csv('/content/spam.csv')

In [2]:
df = pd.read_csv('/content/spam.csv')

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [5]:
df.Category.unique()

array(['ham', 'spam'], dtype=object)

In [6]:
df.Category.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


The data is unbalanced. So let us either increase the spam or reduce ham. Here I will reduce ham emails.

In [7]:
df.Category = df.Category.map({'ham':0,'spam':1})
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df_spam = df[df.Category==1]
df_ham_small = df[df.Category==0].sample(1000) # sample(df_spam.shape[0])
df_spam.shape, df_ham_small.shape

((747, 2), (1000, 2))

In [9]:
df_small = pd.concat([df_spam,df_ham_small])
df_small.value_counts

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_small.Message, df_small.Category, test_size=0.2, random_state=42)


In [11]:
y_train.value_counts(), y_test.value_counts()

(Category
 0    812
 1    585
 Name: count, dtype: int64,
 Category
 0    188
 1    162
 Name: count, dtype: int64)

# Use BERT tokenizer

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text, labels):
  encodings = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors = 'pt')
  return encodings['input_ids'],encodings['attention_mask'], torch.tensor(labels, dtype=torch.float)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
tokenize_function(['Hurry up, the offer ends soon. Click here', 'I like apples and bananas'], [1,0])

(tensor([[  101,  9241,  2039,  1010,  1996,  3749,  4515,  2574,  1012, 11562,
           2182,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [14]:
X_train.head()

Unnamed: 0,Message
3463,Bloomberg -Message center +447797706009 Why wa...
3720,"Thanks for your ringtone order, reference numb..."
3059,You are now unsubscribed all services. Get ton...
437,"Ask g or iouri, I've told the story like ten t..."
3128,Thats cool. i liked your photos. You are very ...


In [15]:
y_train.head()

Unnamed: 0,Category
3463,1
3720,1
3059,1
437,0
3128,0


In [16]:
X_train.values.tolist()[:2]

['Bloomberg -Message center +447797706009 Why wait? Apply for your future http://careers. bloomberg.com',
 'Thanks for your ringtone order, reference number X49. Your mobile will be charged 4.50. Should your tone not arrive please call customer services 09065989182. From: [colour=red]text[/colour]TXTstar']

In [17]:
train_input_ids,train_attention_mask, train_labels = tokenize_function(X_train.values.tolist(), y_train.values.tolist())
val_input_ids,val_attention_mask, val_labels = tokenize_function(X_test.values.tolist(), y_test.values.tolist())

In [18]:
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = torch.utils.data.TensorDataset(val_input_ids, val_attention_mask, val_labels)

# Modelling

In [19]:
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [20]:
bert = BertModel.from_pretrained('bert-base-uncased')
bert.config.hidden_size

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

768

In [25]:
class SentimentClassifier(nn.Module):
  def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    # Call the .parameters() method to get an iterable of parameters
    for param in self.bert.parameters():
      param.requires_grad = False # Freeze all BERT layers since it is trained on a very very very large data set
    # BERT is a pretrained layer. We have to do further fine tuning for spam classification
    self.classifier = nn.Sequential(
        nn.Linear(self.bert.config.hidden_size, 256),
        nn.ReLU(), # Activation function
        nn.Dropout(0.3), # To avoid overfitting
        nn.Linear(256, 1), # Fully Connected Layer
        nn.Sigmoid()
    )

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    sentence_embedding = outputs.last_hidden_state[:,0,:] # 0 here is [CLS] which has the meaning or summary of entire sentence
    return self.classifier(sentence_embedding)

Why the forward method?

When you define a custom model in PyTorch using a class that inherits from nn.Module, you must implement the forward() method. It tells PyTorch how to apply your layers to the inputs.

The forward() method is automatically called when you run model(inputs).

So, without forward(), your model doesn't know how to actually process the inputs.

In [26]:
model = SentimentClassifier()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

What's an Optimizer?

In deep learning, an optimizer adjusts the model's weights to minimize the loss function.

* After computing the loss (how wrong the model is), we calculate gradients of that loss w.r.t the model's parameters using backpropagation.

* The optimizer uses those gradients to update the weights so that the loss becomes smaller in the next iteration.

In [27]:
epochs=1

for epoch in range(epochs):
  model.train()
  total_train_loss = 0
  for batch, (input_ids, attention_mask, labels) in enumerate(train_loader):
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask).squeeze()
    loss = criterion(outputs.squeeze(), labels)
    loss.backward()
    optimizer.step()

    total_train_loss += loss.item()
  avg_train_loss = total_train_loss / len(train_loader)
  print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f}")

Epoch 1/1 | Train Loss: 0.2990


1. model.train() - This sets the model to training mode.

    * Some layers like Dropout and BatchNorm behave differently during training vs. evaluation — so this is important.
2. total_train_loss = 0
    * Used to keep track of cumulative loss over all batches to later compute average loss for the epoch.
3. for batch, (input_ids, attention_mask, labels) in enumerate(train_loader):
* This fetches each batch from your train_loader, which returns:

  * input_ids (tokenized text input)

  * attention_mask (tells BERT which tokens are actual input vs. padding)

  * labels (ground-truth targets: e.g., spam or not spam)

4. optimizer.zero_grad()
  * Clears out gradients from the previous batch so they don’t accumulate.
5. outputs = model(input_ids, attention_mask).squeeze()
  * Model produces predictions for the batch.

  * squeeze() removes extra dimensions (e.g., from [32,1] to [32]), which matches the shape of labels.
6. loss = criterion(outputs.squeeze(), labels)
  * Uses nn.BCELoss() (Binary Cross Entropy Loss) to measure how different the predictions (outputs) are from the true labels (labels).

  * Both should be values between 0 and 1, which is why the model ends in a Sigmoid.
7. loss.backward()
  * Computes gradients of the loss with respect to model parameters using backpropagation.
8. optimizer.step()
  * Uses the computed gradients to update model parameters.


| Step                      | What's Happening                         |
| ------------------------- | ---------------------------------------- |
| `model.train()`           | Prepares model for training              |
| `for batch...`            | Loops over training data                 |
| `.to(device)`             | Moves data to GPU/CPU                    |
| `optimizer.zero_grad()`   | Clears old gradients                     |
| `outputs = model(...)`    | Makes predictions                        |
| `loss = criterion(...)`   | Calculates how wrong the predictions are |
| `loss.backward()`         | Computes gradients                       |
| `optimizer.step()`        | Updates model weights                    |
| `total_train_loss += ...` | Tracks how training is going             |


In [29]:
# Evaluation
model.eval()
total_val_loss = 0
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
  for input_ids, attention_mask, labels in test_loader:
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    outputs = model(input_ids, attention_mask).squeeze()
    loss = criterion(outputs, labels)
    total_val_loss += loss.item()

    # Calculate accuracy
    # For binary classification with Sigmoid output, predictions are 0 or 1 based on a threshold (0.5)
    predicted = (outputs.data > 0.5).long() # Convert boolean tensor to long (0 or 1)

    # The labels tensor might be float, convert it to long for comparison
    correct_predictions += (predicted == labels.long()).sum().item()
    total_predictions += labels.size(0)

avg_val_loss = total_val_loss / len(test_loader)
val_accuracy = correct_predictions / total_predictions

print(f"Validation Loss: {avg_val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

Validation Loss: 0.0904
Validation Accuracy: 0.9686


In [33]:
def predict(model, text, max_length=128):
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  inputs = tokenizer(text, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
  input_ids = inputs['input_ids'].to(device)
  attention_mask = inputs['attention_mask'].to(device)

  model.eval()
  with torch.no_grad():
    outputs = model(input_ids, attention_mask).squeeze()
    # Call .float() to convert the boolean tensor to float
    prediction = (outputs>0.5).float().item()
    return 'spam' if prediction==1 else 'ham'

In [34]:
predict(model, 'This is your last chance to win the lottery. Click here') # Model is sentimentclassifier class

'spam'

In [35]:
predict(model,'I know it is difficult.Hang in there')

'ham'