In [1]:
import torch
# Environment sanity check: report the PyTorch build and whether a CUDA GPU is visible.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Only query the device name when CUDA is actually usable.
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA device name: NVIDIA GeForce RTX 3060


In [5]:
import pandas as pd
import torch
import numpy as np
import time
import datetime
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns

from torch.utils.data import TensorDataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import classification_report

In [None]:
import pandas as pd
import glob
import os
import torch

#Configuration & Hyperparameters
MODEL_NAME = './local-bert-base-uncased'  # local directory the tokenizer and model are loaded from
DATASET_DIRECTORY = './datasets/'         # folder scanned for *.csv files
MAX_LEN = 256                             # token length every example is padded/truncated to
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
ADAM_EPSILON = 1e-8
SEED = 42                                 # fixed seed so repeated runs are comparable
# ----------------------------------------


#Reproducibility
# Seed PyTorch's global random number generator once, before any split,
# shuffling or weight initialisation draws from it.
torch.manual_seed(SEED)


#Device Setup
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")





def load_and_standardize_datasets(path, random_state=None):
    """Load every CSV in ``path`` into one de-duplicated, shuffled DataFrame.

    Each file is read with latin-1 encoding, its column names are stripped and
    lower-cased, and known aliases (``v1``/``category``/``class`` -> ``label``,
    ``v2``/``message``/``content`` -> ``text``) are renamed. Labels are mapped
    to integers (0 = ham, 1 = spam); rows whose label is not recognised or
    whose text is missing are dropped. Files that lack the required columns,
    or that fail to parse, are skipped with a printed message.

    Args:
        path: Directory containing the ``*.csv`` files.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            final shuffle is reproducible. ``None`` keeps the shuffle random.

    Returns:
        A DataFrame with exactly two columns, ``label`` (int) and ``text``,
        with duplicate texts removed and a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``path`` contains no CSV files.
        ValueError: If no file yielded any usable data.
    """
    # Sorted so the concatenation order (and therefore which duplicate is kept
    # and how a seeded shuffle falls out) does not depend on filesystem order.
    all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not all_files:
        # Raise instead of calling exit(): inside a notebook exit() targets the
        # whole session rather than just reporting the problem to the caller.
        raise FileNotFoundError(
            f"No CSV files were found in the directory '{path}'. "
            "Please make sure your CSV files are inside the 'datasets' folder."
        )

    print(f"Found {len(all_files)} dataset files to process...")

    # Keys are lower-case because column names are lower-cased before renaming.
    column_aliases = {
        'v1': 'label', 'v2': 'text',
        'category': 'label', 'message': 'text',
        'class': 'label', 'content': 'text',
    }
    # '0.0' / '1.0' cover numeric label columns that pandas parsed as floats.
    label_map = {
        'ham': 0, 'spam': 1,
        '0': 0, '1': 1,
        '0.0': 0, '1.0': 1,
        'normal': 0,
        'legitimate': 0,
    }

    df_list = []
    for filename in all_files:
        short_name = os.path.basename(filename)
        try:
            df = pd.read_csv(filename, encoding='latin-1')

            #Standardize Column Names
            df = (
                df.rename(columns=lambda c: str(c).strip().lower())
                  .rename(columns=column_aliases)
            )
            # Two source columns may map onto the same name; keep the first.
            df = df.loc[:, ~df.columns.duplicated()]

            if 'label' not in df.columns or 'text' not in df.columns:
                print(f"--> Skipping file: '{short_name}'. Could not find required 'label' and 'text' columns.")
                continue

            # Copy so the assignments below never write into a view of the raw frame.
            df = df[['label', 'text']].copy()

            #Standardize Labels (unrecognised values become NaN and are dropped)
            df['label'] = df['label'].astype(str).str.strip().str.lower().map(label_map)
            df = df.dropna(subset=['label', 'text'])
            df['label'] = df['label'].astype(int)

            df_list.append(df)
            print(f"--> Successfully loaded and processed '{short_name}', adding {len(df)} rows.")

        except Exception as e:
            # Best effort: one unreadable file should not abort the whole load.
            print(f"--> Error processing file '{short_name}': {e}")

    if not df_list:
        raise ValueError("No data could be loaded from any of the files.")

    master_df = pd.concat(df_list, ignore_index=True)
    print(f"\nTotal combined rows: {len(master_df):,}")

    master_df = master_df.drop_duplicates(subset=['text'])
    print(f"Rows after removing duplicate text entries: {len(master_df):,}")

    master_df = master_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    print("Final dataset shuffled and ready for training.")
    print("\n--- Final Class Distribution ---")

    class_counts = master_df['label'].value_counts()

    ham_count = class_counts.get(0, 0)
    spam_count = class_counts.get(1, 0)

    print(f"Ham (0):  {ham_count:,}")
    print(f"Spam (1): {spam_count:,}")
    print("--------------------------------\n")

    return master_df


#Execution: build the combined dataset, then preview its first ten rows
df = load_and_standardize_datasets(DATASET_DIRECTORY)
print("\n--- Standardized DataFrame Head ---", df.head(10), sep="\n")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060
Found 1 dataset files to process...
--> Successfully loaded and processed 'cleaned_spam_dataset.csv', adding 16205 rows.

Total combined rows: 16,205
Rows after removing duplicate text entries: 16,205
Final dataset shuffled and ready for training.

--- Final Class Distribution ---
Ham (0):  14,016
Spam (1): 2,189
--------------------------------


--- Standardized DataFrame Head ---
   label                                               text
0      0         Sorry, I'll call later  &lt;#&gt; mins\r\n
1      0  Thts god's gift for birds as humans hav some n...
2      0       ['K..k...from tomorrow onwards started ah?']
3      0  NO GIFTS!! You trying to get me to throw mysel...
4      0     With my sis lor... We juz watched italian job.
5      0  ['"How are you, my Love ? Are you with your br...
6      0                   ['what is your account number?']
7      0                                Yup ok thanx...\r\n

In [2]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max equals the label.

    ``preds`` is reduced with ``argmax`` along axis 1; ``labels`` is flattened
    before the element-wise comparison.
    """
    guesses = np.argmax(preds, axis=1).ravel()
    truth = labels.ravel()
    return (guesses == truth).sum() / len(truth)


def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [8]:
#Load and Prepare Data
print("\n--- Loading and Preparing Data ---")
df = load_and_standardize_datasets(DATASET_DIRECTORY)


#Extract the text and labels
# Texts are coerced to plain Python strings because the tokenizer only accepts str input.
sentences = df.text.astype(str).tolist()
labels = df.label.values


#Tokenization (Initialize BERT tokenizer)
print(f"\n--- Tokenizing Data ---")
print(f"Loading BERT tokenizer from local path: '{MODEL_NAME}'...")
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)


#Convert all sentences into token IDs and attention masks in one batched call.
# Every row is padded/truncated to MAX_LEN, so both tensors come back with
# shape (number of sentences, MAX_LEN) and no per-sentence torch.cat is needed.
encoded_batch = tokenizer(
    sentences,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']


#Convert labels into a tensor
# Force int64: the width of a plain integer array is platform dependent, and the
# classification loss expects class indices as long tensors.
labels = torch.tensor(labels, dtype=torch.long)


--- Loading and Preparing Data ---
Found 1 dataset files to process...
--> Successfully loaded and processed 'cleaned_spam_dataset.csv', adding 16205 rows.

Total combined rows: 16,205
Rows after removing duplicate text entries: 16,205
Final dataset shuffled and ready for training.

--- Final Class Distribution ---
Ham (0):  14,016
Spam (1): 2,189
--------------------------------


--- Tokenizing Data ---
Loading BERT tokenizer from local path: './local-bert-base-uncased'...


In [None]:
# 3. Create Datasets and DataLoaders
print("\n--- Creating DataLoaders ---")


#Wrap input IDs, attention masks, and labels into a single dataset
dataset = TensorDataset(input_ids, attention_masks, labels)


#Split dataset into training (90%) and validation (10%) sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated, seeded generator makes the split repeatable for a given row
# order, so re-running this cell does not move examples between the sets.
# NOTE: full reproducibility also requires the dataset rows to arrive in the same order.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)


#Show dataset sizes
print(f'[ {train_size:,} ] training samples')
print(f'[ {val_size:,} ] validation samples')


#DataLoader for training set (random sampling for shuffling each epoch)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)


#DataLoader for validation set (sequential sampling, no shuffling)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)


--- Creating DataLoaders ---
[ 14,584 ] training samples
[ 1,621 ] validation samples


In [None]:
#Load Pre-trained Model
print("\n--- Loading Pre-trained Model ---")
print(f"Loading BERT model from local path: '{MODEL_NAME}'...")


#Options for the sequence-classification head: two labels, and no attention
#weights or hidden states in the model outputs.
classifier_options = dict(
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, **classifier_options)


#Move model to the selected device (CPU or GPU)
model.to(device)


#Setup Optimizer and Scheduler
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE, eps=ADAM_EPSILON)

#One scheduler step is taken per training batch, so the schedule spans every batch of every epoch.
total_steps = EPOCHS * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./local-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Loading Pre-trained Model ---
Loading BERT model from local path: './local-bert-base-uncased'...


In [None]:
#Training Loop
print("\n--- Starting Training ---")

#Track total training time
total_t0 = time.time()


#Store training statistics (one dict per epoch: losses, accuracy, timings)
training_stats = []


#Loop through each epoch
for epoch_i in range(0, EPOCHS):
    print(f"\n======== Epoch {epoch_i + 1} / {EPOCHS} ========")
    print('Training...')

    #Track epoch start time and training loss
    t0 = time.time()
    total_train_loss = 0

    #Put model into training mode
    model.train()

    #--- Training Phase ---
    for step, batch in enumerate(train_dataloader):

        #Print progress every 40 batches
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(f'  Batch {step:>5,}  of  {len(train_dataloader):>5,}.    Elapsed: {elapsed}.')

        #Unpack training batch and move tensors to GPU/CPU device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        #Reset gradients before each step
        optimizer.zero_grad()

        #Forward pass: passing labels makes the model return the loss as well
        result = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
            return_dict=True
        )

        #Extract training loss
        loss = result.loss
        total_train_loss += loss.item()

        #Backward pass: compute gradients
        loss.backward()

        #Clip the gradient norm to 1.0 to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        #Update model weights, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

    #Calculate average loss over training batches
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print(f"\n  Average training loss: {avg_train_loss:.2f}")
    print(f"  Training epoch took: {training_time}")

    #Validation Phase
    print("\nRunning Validation...")
    t0 = time.time()

    #Put model into evaluation mode
    model.eval()

    #Track validation metrics.
    #Accuracy is accumulated as correct/total over individual examples rather
    #than as a mean of per-batch accuracies, which would over-weight a short
    #final batch and disagree with a per-sample accuracy over the same data.
    total_eval_loss = 0
    total_correct = 0
    total_examples = 0

    for batch in validation_dataloader:

        #Move validation batch to device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        #Disable gradient calculation during validation for efficiency
        with torch.no_grad():
            result = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
                return_dict=True
            )

        #Extract loss and logits
        loss = result.loss
        logits = result.logits
        total_eval_loss += loss.item()

        #Predicted class = index of the largest logit in each row
        batch_predictions = logits.argmax(dim=1)
        total_correct += (batch_predictions == b_labels).sum().item()
        total_examples += b_labels.size(0)

    #Compute validation accuracy (per example) and average loss (per batch,
    #the same way the training loss above is averaged)
    avg_val_accuracy = total_correct / total_examples
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print(f"  Accuracy: {avg_val_accuracy:.2f}")
    print(f"  Validation Loss: {avg_val_loss:.2f}")
    print(f"  Validation took: {validation_time}")

    #Save stats for this epoch
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. Accuracy.': avg_val_accuracy,
        'Training Time': training_time,
        'Validation Time': validation_time
    })


#  --- Training Finished ---
print("\nTraining complete!")
print(f"Total training took {format_time(time.time()-total_t0)} (h:mm:ss)")


--- Step 5: Starting Training ---

Training...
  Batch    40  of    912.    Elapsed: 0:00:17.
  Batch    80  of    912.    Elapsed: 0:00:32.
  Batch   120  of    912.    Elapsed: 0:00:48.
  Batch   160  of    912.    Elapsed: 0:01:03.
  Batch   200  of    912.    Elapsed: 0:01:19.
  Batch   240  of    912.    Elapsed: 0:01:34.
  Batch   280  of    912.    Elapsed: 0:01:50.
  Batch   320  of    912.    Elapsed: 0:02:06.
  Batch   360  of    912.    Elapsed: 0:02:21.
  Batch   400  of    912.    Elapsed: 0:02:37.
  Batch   440  of    912.    Elapsed: 0:02:52.
  Batch   480  of    912.    Elapsed: 0:03:08.
  Batch   520  of    912.    Elapsed: 0:03:23.
  Batch   560  of    912.    Elapsed: 0:03:39.
  Batch   600  of    912.    Elapsed: 0:03:54.
  Batch   640  of    912.    Elapsed: 0:04:10.
  Batch   680  of    912.    Elapsed: 0:04:25.
  Batch   720  of    912.    Elapsed: 0:04:41.
  Batch   760  of    912.    Elapsed: 0:04:57.
  Batch   800  of    912.    Elapsed: 0:05:12.
  Batch   84

In [17]:
# --- Final Prediction and Accuracy Calculation ---
print("\n--- Evaluating on Validation Set ---")

#Evaluation mode for inference
model.eval()

#Per-batch logits and true labels, collected as NumPy arrays
predictions, true_labels = [], []

#Generate Predictions (no gradients are needed anywhere in this loop)
with torch.no_grad():
    for batch in validation_dataloader:
        #Move the three batch tensors to the GPU/CPU device and unpack them
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        #Forward pass without labels: only the logits are needed here
        result = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            return_dict=True,
        )

        #Move logits and labels to CPU and store them
        logits = result.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.append(logits)
        true_labels.append(label_ids)

print('    DONE.')

#--- Process Predictions and Calculate Metrics ---

#Stack the per-batch arrays into one array each
flat_predictions = np.concatenate(predictions, axis=0)
flat_true_labels = np.concatenate(true_labels, axis=0)

#For each sample, pick the label (0 or 1) with the higher score.
predicted_labels = np.argmax(flat_predictions, axis=1).flatten()

#Overall accuracy = share of samples whose predicted label matches the true one
accuracy = (predicted_labels == flat_true_labels).mean()
print(f"\nTotal Accuracy: {accuracy:.4f}")


#Per-class precision / recall / F1
class_names = ['Ham (Class 0)', 'Spam (Class 1)']
print("\nClassification Report:")
print(classification_report(flat_true_labels, predicted_labels, target_names=class_names))


--- Evaluating on Validation Set ---
    DONE.

Total Accuracy: 0.8260

Classification Report:
                precision    recall  f1-score   support

 Ham (Class 0)       0.87      0.94      0.90      1419
Spam (Class 1)       0.03      0.01      0.02       202

      accuracy                           0.83      1621
     macro avg       0.45      0.48      0.46      1621
  weighted avg       0.77      0.83      0.79      1621



In [None]:
#--- Plotting training statistics ---
#Create a DataFrame from our training statistics, indexed by epoch number
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

#Use seaborn's darkgrid styling with slightly smaller fonts.
#A single call is used because every set_theme call re-applies all theme settings.
sns.set_theme(style='darkgrid', font_scale=0.8)
plt.rcParams["figure.figsize"] = (8, 4)

epoch_ticks = list(range(1, EPOCHS + 1))


#Plot the learning curve for Loss
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(df_stats['Training Loss'], 'b-o', label="Training")
ax_loss.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")
ax_loss.set_title("Training & Validation Loss")
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Loss")
ax_loss.set_xticks(epoch_ticks)
ax_loss.legend()
plt.show()


#Plot the learning curve for Accuracy
#The column name must match the key stored in training_stats ('Valid. Accuracy.').
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(df_stats['Valid. Accuracy.'], 'r-o', label="Validation Accuracy")
ax_acc.set_title("Validation Accuracy per Epoch")
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy")
ax_acc.set_xticks(epoch_ticks)
ax_acc.legend()
plt.show()

NameError: name 'training_stats' is not defined

In [None]:
#--- Saving Final Model ---
output_dir = './model_save/'

#Create the directory if it doesn't exist.
#exist_ok=True replaces the separate exists() check, so there is no gap
#between checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

#Save trained model (weights + config)
print(f"Saving final model to {output_dir}")
model.save_pretrained(output_dir)

#Save tokenizer (vocabulary + config) next to the model so both load from one path
tokenizer.save_pretrained(output_dir)
print("Save complete.")

Saving final model to ./model_save/
Save complete.


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F


# Directory the fine-tuned model and its tokenizer are loaded from.
MODEL_PATH = './model_save/'

# Padding/truncation length for inference inputs.
# IMPORTANT: keep this equal to the MAX_LEN used during training.
MAX_LEN = 256

# Tokenizer/model pairs already loaded by predict_spam, keyed by
# (model path, device type). Re-running this cell empties the cache, which
# forces a fresh load, e.g. after a new model has been saved to MODEL_PATH.
_PREDICTOR_CACHE = {}


def _load_predictor(model_path, device):
    """Return ``(tokenizer, model)`` for ``model_path``, loading from disk only once per device."""
    cache_key = (model_path, device.type)
    if cache_key not in _PREDICTOR_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_path)
        model = BertForSequenceClassification.from_pretrained(model_path)
        model.to(device)
        # Evaluation mode for inference.
        model.eval()
        _PREDICTOR_CACHE[cache_key] = (tokenizer, model)
    return _PREDICTOR_CACHE[cache_key]


def predict_spam(text):
    """Classify ``text`` as ham or spam with the fine-tuned model in ``MODEL_PATH``.

    The tokenizer and model are loaded on the first call and reused afterwards,
    so repeated predictions do not re-read the checkpoint from disk.

    Args:
        text: The message to classify.

    Returns:
        A dict with ``'prediction'`` (``'Ham'`` or ``'Spam'``) and
        ``'confidence'``, a dict holding the ``'ham'`` and ``'spam'``
        probabilities formatted as percentage strings; or ``None`` if the
        model or tokenizer could not be loaded (the error is printed).
    """
    # --- Device Setup ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #Load Tokenizer and Model (cached after the first successful load)
    try:
        tokenizer, model = _load_predictor(MODEL_PATH, device)
    except Exception as e:
        print(f"Error loading model or tokenizer: {e}")
        return None

    # --- Preprocess the Input Text ---
    # Same padding/truncation settings as used when the training data was tokenized.
    encoded_text = tokenizer(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',  # Return PyTorch tensors
    )

    #Move tensors to the same device as the model
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    # --- Make a Prediction ---
    #Use torch.no_grad() to disable gradient calculations for inference
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)

    logits = outputs.logits

    # --- Process the Output ---
    # Softmax turns the two logits into probabilities; [0] selects the single input row.
    probabilities = F.softmax(logits, dim=1).cpu().numpy()[0]
    prediction_index = torch.argmax(logits, dim=1).item()
    labels = ['Ham', 'Spam']  # index 0 = ham, index 1 = spam
    prediction = labels[prediction_index]

    #Return a dictionary with the results
    return {
        'prediction': prediction,
        'confidence': {
            'ham': f"{probabilities[0]*100:.2f}%",
            'spam': f"{probabilities[1]*100:.2f}%"
        }
    }


#Sample message to classify
test_sentence_spam = "Hey everyone! Check out this amazing deal! https://ssww.blog.ss"

#Run the classifier on it
result = predict_spam(test_sentence_spam)

#Print the results in a clean format (nothing is printed when prediction failed)
if result:
    divider = "-" * 30
    confidence = result['confidence']
    report_lines = [
        f"Analyzing text: \"{test_sentence_spam}\"",
        divider,
        f"Prediction: {result['prediction']}",
        "Confidence: ",
        f"  - Ham:  {confidence['ham']}",
        f"  - Spam: {confidence['spam']}",
        divider,
    ]
    print("\n".join(report_lines))

Analyzing text: "Hey everyone! Check out this amazing deal! https://ssww.blog.ss"
------------------------------
Prediction: Spam
Confidence: 
  - Ham:  0.03%
  - Spam: 99.97%
------------------------------
