In [2]:
import pandas as pd
import torch
import os
import matplotlib
# Force non-interactive backend to avoid Tcl/Tk errors on Windows/Servers
matplotlib.use('Agg') 
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from torch.utils.data import Dataset

# ==========================================
# CONFIGURATION
# ==========================================
# Path to your BEST model (DistilBERT)
MODEL_PATH = 'models/distilbert-spam' 
TEST_DATA_PATH = 'data/test_processed.csv'
REPORT_DIR = 'reports/'

class SimpleDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings['input_ids'])

def analyze_errors():
    print("üïµÔ∏è Starting Deep Error Analysis on Best Model (DistilBERT)...")
    os.makedirs(REPORT_DIR, exist_ok=True)
    
    # 1. Load Model & Tokenizer
    # ------------------------------------------------
    if not os.path.exists(MODEL_PATH):
        print(f"‚ùå Error: Model not found at {MODEL_PATH}.")
        print("   Make sure you unzipped the best model from the training step.")
        return

    try:
        print(f"   Loading model & tokenizer from {MODEL_PATH}...")
        tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
        model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
    except Exception as e:
        print(f"‚ùå Failed to load model: {e}")
        return

    # 2. Load Test Data
    # ------------------------------------------------
    print("   Loading test data...")
    if not os.path.exists(TEST_DATA_PATH):
        print(f"‚ùå Error: Test data not found at {TEST_DATA_PATH}.")
        return

    df = pd.read_csv(TEST_DATA_PATH)
    
    # Ensure text is string and handle missing values
    text_col = 'text' if 'text' in df.columns else 'clean_text'
    texts = df[text_col].fillna("").astype(str).tolist()
    
    # Encode labels (Ham=0, Spam=1)
    le = LabelEncoder()
    true_labels = le.fit_transform(df['label']) 
    class_names = le.classes_ # Should be ['ham', 'spam']

    # 3. Generate Predictions
    # ------------------------------------------------
    print("   Running predictions (this might take a moment)...")
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
    dataset = SimpleDataset(encodings)
    
    trainer = Trainer(model=model)
    preds_output = trainer.predict(dataset)
    
    # Get predicted classes and confidence scores
    pred_labels = preds_output.predictions.argmax(-1)
    probs = torch.nn.functional.softmax(torch.tensor(preds_output.predictions), dim=-1)
    confidence = probs.max(dim=1).values.numpy()

    # Add columns to dataframe for analysis
    df['true_label'] = true_labels
    df['pred_label'] = pred_labels
    df['confidence'] = confidence

    # 4. Confusion Matrix Heatmap
    # ------------------------------------------------
    print("üìä Generating Confusion Matrix...")
    cm = confusion_matrix(true_labels, pred_labels)
    
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix: DistilBERT')
    
    cm_path = os.path.join(REPORT_DIR, 'confusion_matrix.png')
    plt.savefig(cm_path)
    plt.close()
    print(f"   ‚úÖ Saved heatmap to: {cm_path}")

    # 5. Extract False Positives (Safe -> Marked as Spam)
    # ------------------------------------------------
    # High Precision is key, so we need to analyze these deeply
    fps = df[(df['true_label'] == 0) & (df['pred_label'] == 1)].sort_values(by='confidence', ascending=False)
    
    print(f"\nüö® FALSE POSITIVES (Safe emails classified as Spam): {len(fps)}")
    print("   Why this happens: Aggressive keywords ('free', 'money'), weird formatting, or sarcasm.")
    print("-" * 60)
    for i, row in fps.head(5).iterrows():
        print(f"üîπ Confidence: {row['confidence']:.2f}")
        print(f"   Text: \"{row[text_col]}\"")
        print("-" * 60)

    # 6. Extract False Negatives (Spam -> Marked as Ham)
    # ------------------------------------------------
    # These are spam emails that sneaked through
    fns = df[(df['true_label'] == 1) & (df['pred_label'] == 0)].sort_values(by='confidence', ascending=False)
    
    print(f"\nüïµÔ∏è FALSE NEGATIVES (Spam emails classified as Safe): {len(fns)}")
    print("   Why this happens: Short text, lack of keywords, or 'conversational' spam.")
    print("-" * 60)
    for i, row in fns.head(5).iterrows():
        print(f"üî∏ Confidence: {row['confidence']:.2f}")
        print(f"   Text: \"{row[text_col]}\"")
        print("-" * 60)

    # 7. Model Status Check
    # ------------------------------------------------
    print("\nüíæ Model & Vectorizer Status")
    # For Transformers, the 'Vectorizer' is the 'Tokenizer'
    if os.path.exists(os.path.join(MODEL_PATH, 'config.json')) and \
       os.path.exists(os.path.join(MODEL_PATH, 'vocab.txt')):
        print("   ‚úÖ Best Model (DistilBERT) and Tokenizer are already saved in:")
        print(f"      {MODEL_PATH}")
        print("   ready for the Application Phase.")
    else:
        print("   ‚ö†Ô∏è Warning: Model files seem incomplete. Please re-run training.")

if __name__ == "__main__":
    analyze_errors()

üïµÔ∏è Starting Deep Error Analysis on Best Model (DistilBERT)...
   Loading model & tokenizer from models/distilbert-spam...
   Loading test data...
   Running predictions (this might take a moment)...




üìä Generating Confusion Matrix...
   ‚úÖ Saved heatmap to: reports/confusion_matrix.png

üö® FALSE POSITIVES (Safe emails classified as Spam): 1
   Why this happens: Aggressive keywords ('free', 'money'), weird formatting, or sarcasm.
------------------------------------------------------------
üîπ Confidence: 1.00
   Text: "MY NO. IN LUTON 0125698789 RING ME IF UR AROUND! H*"
------------------------------------------------------------

üïµÔ∏è FALSE NEGATIVES (Spam emails classified as Safe): 10
   Why this happens: Short text, lack of keywords, or 'conversational' spam.
------------------------------------------------------------
üî∏ Confidence: 1.00
   Text: "RCT' THNQ Adrian for U text. Rgds Vatian"
------------------------------------------------------------
üî∏ Confidence: 1.00
   Text: "ROMCAPspam Everyone around should be responding well to your presence since you are so warm and outgoing. You are bringing in a real breath of sunshine."
----------------------------------

code for kaggle running in gpu t4 x2

In [None]:
import pandas as pd
import numpy as np
import torch
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch import nn

# ==========================================
# CONFIGURATION
# ==========================================
# Detect if running on Kaggle to set paths automatically
if os.path.exists('/kaggle/input'):
    # Try to find the file automatically in input directory
    found_train = glob.glob('/kaggle/input/email-processed/train_processed.csv', recursive=True)
    found_test = glob.glob('/kaggle/input/email-processed/test_processed.csv', recursive=True)
    
    if found_train and found_test:
        TRAIN_PATH = found_train[0]
        TEST_PATH = found_test[0]
        print(f"‚úÖ Auto-detected Kaggle paths:\n  Train: {TRAIN_PATH}\n  Test: {TEST_PATH}")
    else:
        # Fallback if auto-detection fails
        DATA_DIR = '/kaggle/input/email-processed' 
        TRAIN_PATH = os.path.join(DATA_DIR, 'train_processed.csv')
        TEST_PATH = os.path.join(DATA_DIR, 'test_processed.csv')
else:
    # Local paths
    TRAIN_PATH = 'data/train_processed.csv'
    TEST_PATH = 'data/test_processed.csv'

MODEL_DIR = './results/distilbert-spam' if os.path.exists('/kaggle/working') else 'models/distilbert-spam'

# Hyperparameters
MAX_LEN = 128     
BATCH_SIZE = 16   # Increased to 16 for T4 x2 (Effective batch size will be higher)
EPOCHS = 3        
LEARNING_RATE = 2e-5 

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üñ•Ô∏è Using device: {device} (Count: {torch.cuda.device_count()})")

class SpamDataset(torch.utils.data.Dataset):
    """Custom PyTorch Dataset for loading emails."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Labels are already integers here thanks to LabelEncoder
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def compute_metrics(pred):
    """Callback to calculate Precision/Recall during training."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

class CustomTrainer(Trainer):
    """
    Custom Trainer to handle Class Imbalance (Spam < Ham).
    We inject a weighted Loss Function directly into the training loop.
    """
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Calculate Class Weights: Higher weight for Spam (Index 1)
        # Weight 6.0 roughly balances a 13% spam / 87% ham ratio
        # FIX: Use labels.device instead of model.device. 
        # In multi-GPU (DataParallel), 'model' is wrapped and doesn't have .device, but inputs are guaranteed to be on the correct GPU.
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 6.0]).to(labels.device))
        
        # Hardcode num_labels=2 to avoid accessing config from wrapped model
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

def train_distilbert():
    # 1. Load Data
    # ------------------------------------------------
    print("üöÄ Loading data for DistilBERT...")
    if not os.path.exists(TRAIN_PATH):
        print(f"‚ùå Error: {TRAIN_PATH} not found.")
        print("   If on Kaggle, please upload 'train_processed.csv' and 'test_processed.csv' as a Dataset.")
        return

    train_df = pd.read_csv(TRAIN_PATH)
    test_df = pd.read_csv(TEST_PATH)
    
    text_col = 'text' if 'text' in train_df.columns else 'clean_text'
    print(f"   Using column: '{text_col}' for training.")
    
    # Handle NaNs just in case
    train_texts = train_df[text_col].fillna("").astype(str).tolist()
    train_labels_raw = train_df['label'].tolist()
    
    test_texts = test_df[text_col].fillna("").astype(str).tolist()
    test_labels_raw = test_df['label'].tolist()

    # --- CRITICAL FIX: Label Encoding (String -> Int) ---
    print("üîÑ Encoding labels (ham=0, spam=1)...")
    le = LabelEncoder()
    train_labels = le.fit_transform(train_labels_raw)
    test_labels = le.transform(test_labels_raw) # Use transform to ensure consistency
    
    # Print mapping to verify
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"   Class Mapping: {mapping}")
    
    if len(mapping) != 2:
        print("‚ö†Ô∏è Warning: Detected more than 2 classes. Ensure dataset only has 'spam' and 'ham'.")

    # Split Train into Train/Validation (90/10)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=0.1, random_state=42, stratify=train_labels
    )

    # 2. Tokenization
    # ------------------------------------------------
    print("üìö Tokenizing data...")
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LEN)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LEN)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_LEN)

    # Create Datasets
    train_dataset = SpamDataset(train_encodings, train_labels)
    val_dataset = SpamDataset(val_encodings, val_labels)
    test_dataset = SpamDataset(test_encodings, test_labels)

    # 3. Initialize Model
    # ------------------------------------------------
    print("üß† Initializing DistilBERT model...")
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

    # 4. Training Arguments
    # ------------------------------------------------
    training_args = TrainingArguments(
        output_dir='./results',          
        num_train_epochs=EPOCHS,         
        per_device_train_batch_size=BATCH_SIZE,  
        per_device_eval_batch_size=BATCH_SIZE*2,
        learning_rate=LEARNING_RATE,     
        warmup_steps=100,                
        weight_decay=0.01,               
        logging_dir='./logs',            
        logging_steps=50,
        eval_strategy="epoch",           
        save_strategy="epoch",           
        load_best_model_at_end=True,     
        report_to="none",
        fp16=True,                       # ENABLE Mixed Precision for T4 GPUs (Faster)
        dataloader_num_workers=2         # Speed up data loading
    )

    # 5. Train
    # ------------------------------------------------
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    print(f"\nüèãÔ∏è Starting Training on {torch.cuda.device_count()} GPU(s)...")
    trainer.train()

    # 6. Final Evaluation
    # ------------------------------------------------
    print("\nüß™ Evaluating on Test Set...")
    results = trainer.evaluate(test_dataset)
    
    print("\nüèÜ DistilBERT Results:")
    print("-" * 30)
    print(f"Accuracy:  {results['eval_accuracy']:.4f}")
    print(f"Precision: {results['eval_precision']:.4f}")
    print(f"Recall:    {results['eval_recall']:.4f}")
    print(f"F1-Score:  {results['eval_f1']:.4f}")
    print("-" * 30)

    # 7. Save Model
    # ------------------------------------------------
    print(f"üíæ Saving model to {MODEL_DIR}...")
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    
    # Zip for easy download from Kaggle
    if os.path.exists('/kaggle/working'):
        import shutil
        shutil.make_archive('distilbert_spam_model', 'zip', MODEL_DIR)
        print("üì¶ Model zipped to distilbert_spam_model.zip for download.")

if __name__ == "__main__":
    train_distilbert()

üñ•Ô∏è Using device: cpu
üöÄ Loading data for DistilBERT...
   Using column: 'text' for training.
Tokenizing data...
Initializing DistilBERT model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
print("hello")

hello


```
üöÄ Loading processed data for benchmarking...   
Training samples: 4457 | Test samples: 1115üß† 
Training Naive Bayes...üß† 
Training SVM (Linear)...
üß† Training Random Forest...
üèÜ Model Benchmarking Results:------------------------------------------------------------               
Accuracy  Precision  Recall  F1-ScoreModel                                               
Naive Bayes      0.8511     0.4716  0.9463    0.6295
SVM (Linear)     0.9865     0.9786  0.9195    0.9481
Random Forest    0.9821     0.9924  0.8725    0.9286
------------------------------------------------------------
üí° Best Model for Precision (Avoiding False Positives): 
Random Forest   Precision Score: 0.9924  
 Saved Random Forest to models/
 üìä Generating ROC Curves...
 ‚ö†Ô∏è Model SVM (Linear) does not support probability prediction. Skipping ROC.   
 üìä ROC Curve saved to: data/roc_curve_comparison.png
```

```
‚úÖ Auto-detected Kaggle paths:
  Train: /kaggle/input/email-processed/train_processed.csv
  Test: /kaggle/input/email-processed/test_processed.csv
üñ•Ô∏è Using device: cuda (Count: 2)
üöÄ Loading data for DistilBERT...
   Using column: 'text' for training.
üîÑ Encoding labels (ham=0, spam=1)...
   Class Mapping: {'ham': 0, 'spam': 1}
üìö Tokenizing data...
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
üß† Initializing DistilBERT model...

üèãÔ∏è Starting Training on 2 GPU(s)...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/_functions.py:70: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
  warnings.warn(
 [378/378 01:34, Epoch 3/3]
Epoch	Training Loss	Validation Loss	Accuracy	Precision	Recall	F1
1	0.132900	0.320092	0.984305	0.981818	0.900000	0.939130
2	0.166900	0.558723	0.982063	0.964286	0.900000	0.931034
3	0.086500	0.640158	0.982063	0.964286	0.900000	0.931034
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/_functions.py:70: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
  warnings.warn(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/_functions.py:70: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
  warnings.warn(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

üß™ Evaluating on Test Set...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/_functions.py:70: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
  warnings.warn(
 [18/18 00:02]

üèÜ DistilBERT Results:
------------------------------
Accuracy:  0.9901
Precision: 0.9929
Recall:    0.9329
F1-Score:  0.9619
------------------------------
üíæ Saving model to ./results/distilbert-spam...
üì¶ Model zipped to distilbert_spam_model.zip for download.
```