In [5]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from transformers import DistilBertTokenizer,DistilBertConfig, DistilBertForSequenceClassification, Trainer, TrainingArguments,EarlyStoppingCallback
from torch.utils.data import Dataset
from torch import nn

In [6]:
# Configurations
MODEL_NAME = 'distilbert-base-uncased' # Pre-trained model
MAX_LEN = 128  # Max sequence length
BATCH_SIZE = 32
EPOCHS = 10

# Defining Tensors for later use
# Assign the text and the labels
class SpamDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
# Fetches items for the model from the dataset
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
# Calculates the length of the dataset
    def __len__(self):
        return len(self.labels)
# Metrics Calculation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
# Check for GPU to speed up calculations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load and Prepare Data
print("Loading dataset...")
try:
    df = pd.read_csv("03_Cleaned_Updated_Dataset.csv")
except FileNotFoundError:
    print("Error: Dataset not found. Please ensure the file is in the same directory.")
    exit()

text_col = "cleaned_text"
label_col = "label_encoded"

# Ensure data integrity
if text_col not in df.columns or label_col not in df.columns:
    print(f"Error: Columns '{text_col}' or '{label_col}' not found in dataset.")
    print(f"Available columns: {df.columns.tolist()}")
    exit()

# Fill missing values and convert to lists
df[text_col] = df[text_col].astype(str).fillna("")
X = df[text_col].tolist()
y = df[label_col].tolist()

# Split dataset 20% test, 80% train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Further split training data into training and validation sets (10% of training data for validation)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=42, stratify=y_train
)

print(f"Train size: {len(X_train)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

# Tokenization
print("Tokenizing data...")
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME) # Use the pre-trained tokenizer

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=MAX_LEN)
val_encodings   = tokenizer(X_val, truncation=True, padding=True, max_length=MAX_LEN)
test_encodings  = tokenizer(X_test, truncation=True, padding=True, max_length=MAX_LEN)

# Create Torch Datasets
train_dataset = SpamDataset(train_encodings, y_train)
val_dataset   = SpamDataset(val_encodings, y_val)
test_dataset  = SpamDataset(test_encodings, y_test)

# Modify Dropout rate
# Correct config for classification
config = DistilBertConfig.from_pretrained(
    MODEL_NAME,
    num_labels=2, # Binary classification
    hidden_dropout_prob=0.3,
    seq_classif_dropout=0.3,
    id2label={0: "Ham", 1: "Spam"}, # Output labels
    label2id={"Ham": 0, "Spam": 1} # Input labels
)

# Load pretrained DistilBERT with correct config
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config=config
).to(device)

Using device: cuda
Loading dataset...
Train size: 14072, Val size: 2011, Test size: 4021
Tokenizing data...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',      
    num_train_epochs=EPOCHS,             # Set high (e.g., 10), let Early Stopping handle the rest
    per_device_train_batch_size=BATCH_SIZE,  
    per_device_eval_batch_size=BATCH_SIZE*2, 
    warmup_steps=300,                
    weight_decay=0.05,               
    logging_dir='./logs',            
    logging_strategy="epoch",
    eval_strategy="epoch",           # Evaluate every epoch
    save_strategy="epoch",           # Save checkpoint every epoch
    save_total_limit=2,
    load_best_model_at_end=True,     # Load the best model when finished
    metric_for_best_model="eval_loss", # Evaluate using validation loss
    greater_is_better=False,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    report_to="none",
)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Weights
class_counts = df[label_col].value_counts().sort_index().values
num_samples = sum(class_counts)
class_weights = num_samples / (len(class_counts) * class_counts)

#Create tensor of class weights
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print(f"Computed Class Weights: {class_weights}")

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Extract labels
        labels = inputs.get("labels")
        
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Compute custom loss with weights
        loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer( 
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)
print("Starting Training...")
trainer.train()

# Evaluate on Validation Set
print("\nEvaluating on Test Set...")
metrics_val = trainer.evaluate()
print("Validation Set Metrics:", metrics_val)

# Evaluate on Test Set
metrics_test = trainer.evaluate(test_dataset)
print("Test Set Metrics:", metrics_test)

# Generate Full Classification Report
print("\nGenerating detailed report...")
predictions = trainer.predict(test_dataset)
y_preds = np.argmax(predictions.predictions, axis=-1)

# ADDED: Confusion Matrix Printout
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_preds))

print("\nClassification Report:")
print(classification_report(y_test, y_preds, target_names=['Ham', 'Spam']))

trainer.save_model("exported_model")
tokenizer.save_pretrained("exported_model")

Computed Class Weights: [0.70337975 1.72922759]
Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3272,0.198404,0.95276,0.914645,0.956767,0.876076
2,0.1119,0.138208,0.957235,0.927487,0.909091,0.946644
3,0.0542,0.179248,0.963202,0.936097,0.939341,0.932874
4,0.0285,0.274753,0.966683,0.940125,0.977695,0.905336
5,0.0117,0.240421,0.967181,0.942907,0.947826,0.938038



Evaluating on Test Set...


Validation Set Metrics: {'eval_loss': 0.1382080316543579, 'eval_accuracy': 0.9572352063649925, 'eval_f1': 0.927487352445194, 'eval_precision': 0.9090909090909091, 'eval_recall': 0.9466437177280551, 'eval_runtime': 6.6752, 'eval_samples_per_second': 301.263, 'eval_steps_per_second': 4.794, 'epoch': 5.0}
Test Set Metrics: {'eval_loss': 0.08853397518396378, 'eval_accuracy': 0.9726436209898035, 'eval_f1': 0.9537815126050421, 'eval_precision': 0.932621199671323, 'eval_recall': 0.9759243336199485, 'eval_runtime': 13.3621, 'eval_samples_per_second': 300.927, 'eval_steps_per_second': 4.715, 'epoch': 5.0}

Generating detailed report...

Confusion Matrix:
[[2776   82]
 [  28 1135]]

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.97      0.98      2858
        Spam       0.93      0.98      0.95      1163

    accuracy                           0.97      4021
   macro avg       0.96      0.97      0.97      4021
weighted avg       0.97

('exported_model\\tokenizer_config.json',
 'exported_model\\special_tokens_map.json',
 'exported_model\\vocab.txt',
 'exported_model\\added_tokens.json')