<a href="https://colab.research.google.com/github/IonZhao/BERT_Classification/blob/main/BERT_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Load the dataset

In [None]:
import pandas as pd
# import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('./data/SPAM text message 20170820 - Data.csv')
df['Message'] = df['Message'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


# Step 2:Naive Bayes by hand
## Step 2.1：Training the model

In [None]:
from sklearn.model_selection import train_test_split
import re

train_df, test_df = train_test_split(df, test_size=0.2)
train_df.shape, test_df.shape

# Total number of documents
N = len(train_df)

# P(c_j), each document's category probability
P_ham = train_df['Category'].value_counts()['ham'] / N
P_spam = train_df['Category'].value_counts()['spam'] / N

# P(w|c_j), each word's probability in a category

word_count = {} # dictionary mapping word to count overall

spam_word_count = {} # same, but only for spam

ham_word_count = {} # same, but only for ham


for index, row in train_df.iterrows(): # loop through rows of training dataset

  document = row['Message']
  category = row['Category']
  # tokenization
  tokenized_document = re.split(r"\W+", document)
  for word in tokenized_document:
      word_count[word] = word_count.get(word, 0) + 1
      if category == 'spam':
        spam_word_count[word] = spam_word_count.get(word, 0) + 1
      else:
        ham_word_count[word] = ham_word_count.get(word, 0) + 1

V = len(word_count)
len_spam_vocab = len(spam_word_count)
len_ham_vocab = len(ham_word_count)

# Step 2.2: Predict in test data

In [None]:
# Training is complete

TP, TN, FP, FN = 0, 0, 0, 0
for index, row in test_df.iterrows(): # go through each test document
  document = row['Message']
  true_category = row['Category'] # ground truth

  # Probability of this doc given spam
  p_spam = P_spam
  for word in re.split(r"\W+", document):
    p_spam *= (spam_word_count.get(word, 0) + 1) / (len_spam_vocab + V)

  # Probability of this doc given ham
  p_ham = P_ham
  for word in re.split(r"\W+", document):
    p_ham *= (ham_word_count.get(word, 0) + 1) / (len_ham_vocab + V)

  # Check which probability is higher
  if p_spam > p_ham:
    output_category = 'spam'
  else:
    output_category = 'ham'

  # Calculate confusion matrix
  if output_category == 'spam' and true_category == 'spam':
    TP += 1
  elif output_category == 'ham' and true_category == 'ham':
    TN += 1
  elif output_category == 'spam' and true_category == 'ham':
    FP += 1
  elif output_category == 'ham' and true_category == 'spam':
    FN += 1

# Print results
print(f'TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}')
print(f'Accuracy: {(TP + TN) / (TP + TN + FP + FN)}')
print(f'Precision: {TP / (TP + FP)}')
print(f'Recall: {TP / (TP + FN)}')
print(f'F1: {2 * TP / (2 * TP + FP + FN)}')
print()

TP: 100, TN: 970, FP: 0, FN: 45
Accuracy: 0.9596412556053812
Precision: 1.0
Recall: 0.6896551724137931
F1: 0.8163265306122449



# Step 3: TF-idf By hand
## Step 3.1: Preprocessing the dataframe

In [None]:
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

train_texts, test_texts, train_labels, test_labels = train_test_split(df['Message'], df['Category'], test_size=0.2)

## Step 3.2: Build TF-idf

In [None]:
from sklearn.linear_model import LogisticRegression
from collections import Counter
import numpy as np
from sklearn.metrics import precision_recall_fscore_support


def tokenize(text):
    return re.split(r"\W+", text)

# Calculate TF-IDF
def compute_tf_idf(train_texts, test_texts):
    word_to_idx = {}
    tf_idf_train, tf_idf_test = [], []

    # Build vocabulary
    word_counts = Counter(token for text in train_texts for token in tokenize(text))
    print(word_counts)
    word_to_idx = {word: i for i, word in enumerate(word_counts)}

    # Calculate IDF
    doc_counts = Counter(word for text in train_texts for word in set(tokenize(text)))
    num_docs = len(train_texts)
    idf = {word: np.log((num_docs + 1) / (doc_counts[word] + 1)) + 1 for word in word_to_idx}

    # Compute TF-IDF for each document
    def tf_idf_vector(text):
        tokens = tokenize(text)
        # Calculate term frequency
        term_freq = Counter(tokens)
        tf_idf = [term_freq[word] * idf.get(word, 0) for word in word_to_idx]
        return np.array(tf_idf)

    #Convert texts to vectors
    tf_idf_train = np.array([tf_idf_vector(text) for text in train_texts])
    tf_idf_test = np.array([tf_idf_vector(text) for text in test_texts])
    return tf_idf_train, tf_idf_test

tf_idf_train, tf_idf_test = compute_tf_idf(train_texts, test_texts)

Counter({'': 2679, 'i': 2457, 'you': 1788, 'to': 1782, 'a': 1184, 'the': 1057, 'u': 957, 'and': 797, 'in': 721, 'is': 688, 'me': 633, 'my': 617, 'for': 557, 'it': 557, 'your': 555, 'of': 507, 'call': 482, 'have': 478, 'that': 469, 's': 468, 'on': 439, '2': 423, 'now': 407, 't': 388, 'are': 387, 'can': 383, 'so': 378, 'm': 361, 'but': 356, 'not': 351, 'at': 338, 'we': 327, 'or': 326, 'ur': 323, 'do': 322, 'be': 320, 'get': 318, 'if': 312, 'just': 308, 'with': 306, 'will': 303, 'no': 292, 'gt': 276, 'lt': 272, 'this': 270, '4': 263, 'up': 255, 'how': 244, 'go': 238, 'what': 235, 'from': 234, 'ok': 229, 'out': 224, 'free': 224, 'll': 224, 'all': 224, 'when': 223, 'know': 204, 'good': 200, 'he': 196, 'day': 196, 'then': 195, 'was': 194, 'there': 190, 'like': 190, 'come': 187, 'got': 183, 'am': 182, 'its': 182, 'time': 180, 'love': 180, 'only': 179, 'want': 166, 'text': 157, 'send': 150, 'as': 150, 'about': 144, 'she': 143, 'txt': 142, 'going': 140, 'one': 140, 'ü': 140, 'today': 140, 'by':

## Step 3.3: Logistic regression

In [None]:
# Logistic Regression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
model = LogisticRegression()
model.fit(tf_idf_train, train_labels)
predictions = model.predict(tf_idf_test)

# Metrics
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, predictions, average='binary')
print(f"Tf-idf by hand - Precision: {precision}, Recall: {recall}, F1: {f1}")

Tf-idf by hand - Precision: 1.0, Recall: 0.875, F1: 0.9333333333333333


# Step 4: Naive Bayes and Tf-idf using Scikit-learn
## Step 4.1： Traning and test the model

In [None]:
# Now let's use sklearn to do the same thing
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import TfidfVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB
from sklearn.naive_bayes import MultinomialNB

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_df['Message'])

# Naive Bayes
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)

print(tfidf.get_feature_names_out())
print(X_train.shape)

X_test = vectorizer.transform(test_texts)

nb_model = MultinomialNB()
nb_model.fit(X_train, train_labels)
nb_predictions = nb_model.predict(X_test)

# Metrics
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, nb_predictions, average='binary')
print(f"Scikit-learn Naive Bayes - Precision: {precision}, Recall: {recall}, F1: {f1}")

# Logistic Regression with Tf-idf
model = LogisticRegression()
model.fit(X_train, train_labels)
lr_predictions = model.predict(X_test)

# Metrics
precision_word, recall_word, f1_word, _ = precision_recall_fscore_support(test_labels, lr_predictions, average='binary')
print(f"Scikit-learn Tf-idf - Precision: {precision_word}, Recall: {recall_word}, F1: {f1_word}")


['00' '000' '000pes' ... 'zyada' 'èn' 'ú1']
(4457, 7699)
Scikit-learn Naive Bayes - Precision: 1.0, Recall: 0.6578947368421053, F1: 0.7936507936507936
Scikit-learn Tf-idf - Precision: 1.0, Recall: 0.7631578947368421, F1: 0.8656716417910447


## Step 4.2: Compare diffrent parameters

In [None]:
# Character-level tf-idf
vectorizer_char = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
X_train_char = vectorizer_char.fit_transform(train_texts)
X_test_char = vectorizer_char.transform(test_texts)

print(vectorizer_char.get_feature_names_out())
print(X_train_char.shape)

model_char = LogisticRegression()
model_char.fit(X_train_char, train_labels)
predictions_char = model_char.predict(X_test_char)

# Metrics for character-level
precision_char, recall_char, f1_char, _ = precision_recall_fscore_support(test_labels, predictions_char, average='binary')

# Print results
print(f"Word-level TF-IDF - Precision: {precision_word:.2f}, Recall: {recall_word:.2f}, F1 Score: {f1_word:.2f}")
print(f"Character-level TF-IDF - Precision: {precision_char:.2f}, Recall: {recall_char:.2f}, F1 Score: {f1_char:.2f}")

[' !' ' ! ' ' ! *' ... '鈥┾' '鈥┾?' '鈥┾??']
(4457, 54330)
Word-level TF-IDF - Precision: 1.00, Recall: 0.76, F1 Score: 0.87
Character-level TF-IDF - Precision: 1.00, Recall: 0.82, F1 Score: 0.90


## Step 4.3: Analyzation
We'll compare the performance of two models:
Word-level tf-idf: Default analyzer='word'
Character-level tf-idf: analyzer='char' with n-grams of size 2 to 4

### Explanation:
Word-level TF-IDF:
Extracts features based on complete words.
Works well for normal text but might miss patterns in noisy text.

Character-level TF-IDF:
Extracts features based on overlapping character n-grams.
Captures subword patterns and spelling variations (e.g., "fr3e" and "free").
Useful for SMS spam detection where abbreviations and non-standard spellings are common.

### Result:
The character-level model performs better than the word-level model due to its ability to capture patterns in noisy and creative text.
A potential downside is the increased computational cost due to larger feature space, especially for higher n-gram ranges.

# Step 5: BERT
## Step 5.1: Load BERT model and tokenizer

In [None]:
import torch

from transformers import BertTokenizer, BertForMaskedLM

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('Using GPU ', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('Using CPU')


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# model = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True)
# log_softmax = torch.nn.LogSoftmax(dim=0)
# mask_token = tokenizer.mask_token
# model.to(device)
# model.train()

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=10)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=10)


Using GPU  Tesla T4


In [None]:
import torch.nn as nn
from transformers import BertModel

class CustomBertClassifier(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_labels=2):
        super(CustomBertClassifier, self).__init__()
        # Load the pretrained BERT model
        self.bert = BertModel.from_pretrained(bert_model_name)
        # # Add a dropout layer for regularization
        # self.dropout = nn.Dropout(0.3)
        # Add a linear layer for classification
        # Define the loss function
        self.loss_fn = nn.CrossEntropyLoss()
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        # Pass inputs through BERT
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        # Use the [CLS] token representation
        pooled_output = outputs[1]
        # Apply dropout
        # pooled_output = self.dropout(pooled_output)
        # Pass through the classification layer

        logits = self.classifier(pooled_output)
        if labels is not None:
            # Compute loss
            loss = self.loss_fn(logits, labels)
            return loss, logits
        return logits

## Step 5.2: Create a Dataset Class

In [None]:
import torch

class SpamDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = SpamDataset(train_encodings, train_labels.tolist())
test_dataset = SpamDataset(test_encodings, test_labels.tolist())

print(train_dataset[0])
print(test_dataset[0])

{'input_ids': tensor([  101,  7514,  2000,  2663, 27708,  4882,   999,  2073,  2097,   102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor(1)}
{'input_ids': tensor([ 101, 5490, 5657, 4402, 4402, 4371,  999,  999, 2023,  102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor(0)}


## Step 5.3: Set up KFlod Cross Validation

In [None]:
from sklearn.model_selection import KFold
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import DataLoader, Subset

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
learning_rates = [5e-5, 3e-5, 2e-5]  # Learning rates to test
best_lr = None
lowest_loss = float('inf')
max_epochs = 20
early_stopping_patience = 4

# Function for evaluating the model on a fold
def train_and_evaluate_model(train_idx, val_idx, lr):
    # Create training and validation sets
    train_subset = Subset(train_dataset, train_idx)
    val_subset = Subset(train_dataset, val_idx)

    # DataLoaders
    # train_loader = DataLoader(train_subset, batch_size=10, shuffle=True)
    # val_loader = DataLoader(val_subset, batch_size=10, shuffle=False)

    # Model
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    # Initialize the custom BERT classifier
    custom_model = CustomBertClassifier(bert_model_name='bert-base-uncased', num_labels=2)
    custom_model.to(device)
    # model.to(device)

    # Trainer
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='epoch',
        learning_rate=lr,
        per_device_train_batch_size=10,
        num_train_epochs=max_epochs,
        weight_decay=0.01,
        save_total_limit=1,
        logging_dir='./logs',
        load_best_model_at_end=True,
        save_strategy="epoch",
        metric_for_best_model="eval_loss",
        greater_is_better=False
    )

    trainer = Trainer(
        model=custom_model,
        args=training_args,
        train_dataset=train_subset,
        eval_dataset=val_subset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience)]
    )

    trainer.train()

    eval_metrics = trainer.evaluate()
    return eval_metrics['eval_loss'], model

## Step 5.4: Cross-validation

In [None]:
for lr in learning_rates:
    avg_loss = 0
    for fold, (train_idx, val_idx) in enumerate(kfold.split(train_dataset)):
        print(f"Learning Rate: {lr}, Fold: {fold+1}")
        fold_loss, _ = train_and_evaluate_model(train_idx, val_idx, lr)
        avg_loss += fold_loss

    avg_loss /= kfold.n_splits
    print(f"Learning Rate: {lr}, Average Loss: {avg_loss}")

    if avg_loss < lowest_loss:
        lowest_loss = avg_loss
        best_lr = lr

print(f"Best Learning Rate: {best_lr}")

Learning Rate: 5e-05, Fold: 1


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.139354
2,0.159100,0.172739
3,0.049300,0.124421
4,0.049300,0.175897
5,0.015600,0.136128
6,0.010500,0.239063
7,0.010500,0.214803


Learning Rate: 5e-05, Fold: 2




Epoch,Training Loss,Validation Loss
1,No log,0.130078
2,0.161000,0.129499
3,0.054100,0.169329
4,0.054100,0.15619
5,0.019000,0.139707
6,0.015900,0.160952


Learning Rate: 5e-05, Fold: 3




Epoch,Training Loss,Validation Loss
1,No log,0.173264
2,0.169000,0.269391
3,0.046100,0.225228
4,0.046100,0.261958
5,0.026800,0.251395


Learning Rate: 5e-05, Fold: 4




Epoch,Training Loss,Validation Loss
1,No log,0.181183
2,0.172600,0.114089
3,0.064500,0.19453
4,0.064500,0.246182
5,0.024600,0.1923
6,0.018100,0.2393


Learning Rate: 5e-05, Fold: 5




Epoch,Training Loss,Validation Loss
1,No log,0.18056
2,0.155500,0.207441
3,0.071700,0.201714
4,0.071700,0.181063
5,0.023800,0.239858


Learning Rate: 5e-05, Average Loss: 0.1443665474653244
Learning Rate: 3e-05, Fold: 1




Epoch,Training Loss,Validation Loss
1,No log,0.110664
2,0.169500,0.17455
3,0.052900,0.111071
4,0.052900,0.194717
5,0.013600,0.162352


Learning Rate: 3e-05, Fold: 2




Epoch,Training Loss,Validation Loss
1,No log,0.113672
2,0.167300,0.141799
3,0.049800,0.202904
4,0.049800,0.162461
5,0.015700,0.190105


Learning Rate: 3e-05, Fold: 3




Epoch,Training Loss,Validation Loss
1,No log,0.139502
2,0.169100,0.206871
3,0.048900,0.219216
4,0.048900,0.217422
5,0.025500,0.261367


Learning Rate: 3e-05, Fold: 4




Epoch,Training Loss,Validation Loss
1,No log,0.143726
2,0.166800,0.154486
3,0.046700,0.200564
4,0.046700,0.28747
5,0.015000,0.313298


Learning Rate: 3e-05, Fold: 5




Epoch,Training Loss,Validation Loss
1,No log,0.177481
2,0.152500,0.147781
3,0.055600,0.159763
4,0.055600,0.267278
5,0.025600,0.229626
6,0.010500,0.251149


Learning Rate: 3e-05, Average Loss: 0.1310691863298416
Learning Rate: 2e-05, Fold: 1




Epoch,Training Loss,Validation Loss
1,No log,0.126844
2,0.169500,0.126088
3,0.062100,0.158702
4,0.062100,0.188795
5,0.022600,0.177214


Epoch,Training Loss,Validation Loss
1,No log,0.126844
2,0.169500,0.126088
3,0.062100,0.158702
4,0.062100,0.188795
5,0.022600,0.177214
6,0.013000,0.180429


Learning Rate: 2e-05, Fold: 2




Epoch,Training Loss,Validation Loss
1,No log,0.115462
2,0.175700,0.100645
3,0.059000,0.106369
4,0.059000,0.152345
5,0.019300,0.190566
6,0.006000,0.174177


Learning Rate: 2e-05, Fold: 3




Epoch,Training Loss,Validation Loss
1,No log,0.151412
2,0.172300,0.165876
3,0.055700,0.170982
4,0.055700,0.18436
5,0.028300,0.197507


Learning Rate: 2e-05, Fold: 4




Epoch,Training Loss,Validation Loss
1,No log,0.143867
2,0.154100,0.172098
3,0.052000,0.225676
4,0.052000,0.189524
5,0.017000,0.187307


Learning Rate: 2e-05, Fold: 5




Epoch,Training Loss,Validation Loss
1,No log,0.155259
2,0.157600,0.168323
3,0.048400,0.192928
4,0.048400,0.317404
5,0.017300,0.243989


Learning Rate: 2e-05, Average Loss: 0.13545409440994263
Best Learning Rate: 3e-05


## Step 5.5: Traning on full Traning dataset and Evaluate on Test set

In [None]:
# Train final model
# final_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
final_model = CustomBertClassifier(bert_model_name='bert-base-uncased', num_labels=2)
best_lr = 3e-5
final_model.to(device)
training_args = TrainingArguments(
    output_dir='./final_results',
    evaluation_strategy='epoch',
    learning_rate=best_lr,
    per_device_train_batch_size=10,
    num_train_epochs=max_epochs,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir='./final_logs',
    load_best_model_at_end=True,
    save_strategy="epoch",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    prediction_loss_only=False
)

trainer = Trainer(
    model=final_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience)]
)

trainer.train()
test_metrics = trainer.evaluate()

# Generate predictions on the test set
predictions = trainer.predict(test_dataset)

# Extract logits
logits = predictions.predictions

# Convert logits to predicted labels
predicted_labels = torch.argmax(torch.tensor(logits), axis=1)

# Calculate metrics
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels,  # True labels
    predicted_labels,  # Predicted labels
    average='binary'
)

# Print metrics
print(f"Test Set Results - Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")




Epoch,Training Loss,Validation Loss
1,No log,0.163068
2,0.172000,0.102322
3,0.066300,0.136096
4,0.017200,0.146772
5,0.013900,0.19902
6,0.009400,0.193746


Test Set Results - Precision: 0.95, Recall: 0.90, F1 Score: 0.92
