In [4]:
!pip install transformers -U



In [5]:
import pandas as pd

In [42]:
pip install --upgrade accelerate

Note: you may need to restart the kernel to use updated packages.


In [6]:
data = pd.read_csv(
    r"C:\Users\Sourish\Downloads\spam_mail_classifier.csv",
    on_bad_lines='skip',   # instead of error_bad_lines=False
    engine="python"
)
data.head()


Unnamed: 0,email_text,label
0,Let's catch up sometime next week!,ham
1,Don't forget to submit your project by Friday.,ham
2,Win a free iPhone now!!! Click here.,spam
3,Can you send me the report when it's ready?,ham
4,Meeting has been rescheduled to next Monday.,ham


In [7]:
print(data.columns)

Index(['email_text', 'label'], dtype='object')


In [8]:
data['label'].value_counts()

label
ham     583
spam    417
Name: count, dtype: int64

In [9]:
!pip install scikit-learn



In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
model = model.to('cpu')

In [15]:
!pip install torch

Collecting torch
  Using cached torch-2.6.0-cp313-cp313-win_amd64.whl.metadata (28 kB)
Using cached torch-2.6.0-cp313-cp313-win_amd64.whl (204.1 MB)
Installing collected packages: torch
Successfully installed torch-2.6.0


In [33]:
sample_data = ["I will be late","I am done with my work "]
tokenizer(sample_data, padding=True, truncation=True, max_length=512)

{'input_ids': [[101, 1045, 2097, 2022, 2397, 102, 0, 0], [101, 1045, 2572, 2589, 2007, 2026, 2147, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [34]:
X = list(data["email_text"])
y = list(data["label"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [35]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [36]:
print(X_train_tokenized['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


In [37]:
len(X_train),len(X_val)

(800, 200)

In [38]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
        self.label_map = {"ham": 0, "spam": 1}

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            label = self.labels[idx] 
            encoded_label = self.label_map.get(label, -1)
            item["labels"] = torch.tensor(encoded_label) 
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [39]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [40]:
train_dataset[5]

{'input_ids': tensor([ 101, 2292, 1005, 1055, 4608, 2039, 8811, 2279, 2733,  999,  102,    0,
            0,    0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]),
 'labels': tensor(0)}

In [41]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [43]:
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [44]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=100, training_loss=4.241120535880327e-05, metrics={'train_runtime': 114.1969, 'train_samples_per_second': 7.005, 'train_steps_per_second': 0.876, 'total_flos': 5755554336000.0, 'train_loss': 4.241120535880327e-05, 'epoch': 1.0})

In [45]:
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 3.6621024719352135e-06,
 'eval_accuracy': 1.0,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 3.743,
 'eval_samples_per_second': 53.434,
 'eval_steps_per_second': 6.679,
 'epoch': 1.0}

In [46]:
np.set_printoptions(suppress=True)

In [51]:
text = "Win a free bike now!"
#text = "The meeting is rescheduled"
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to('cpu')
outputs = model(**inputs)
print(outputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)
predictions = predictions.cpu().detach().numpy()
predictions

SequenceClassifierOutput(loss=None, logits=tensor([[-5.7110,  6.8172]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[3.6233e-06, 1.0000e+00]], grad_fn=<SoftmaxBackward0>)


array([[0.00000362, 0.9999964 ]], dtype=float32)

In [48]:
trainer.save_model('CustomModel')