# Email Spam Detection Model - Troubleshooting

This notebook diagnoses and works around the stratification error raised by scikit-learn's `train_test_split`, then trains, tunes, evaluates, and saves an email spam classifier.

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
import os
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve
from sklearn.preprocessing import FunctionTransformer

# Report whether the primary dataset is on disk; if it is missing, look for
# the older export as a fallback and report that instead.
dataset_path = "emails.csv"
if not os.path.exists(dataset_path):
    print(f"Dataset file does not exist: {dataset_path}")
    # Fall back to the alternative file name
    alt_path = "emails_old.csv"
    if os.path.exists(alt_path):
        print(f"Alternative dataset file exists: {alt_path}")
        print(f"File size: {os.path.getsize(alt_path)} bytes")
else:
    print(f"Dataset file exists: {dataset_path}")
    print(f"File size: {os.path.getsize(dataset_path)} bytes")

Dataset file exists: emails.csv
File size: 8989491 bytes


In [7]:
# Load the data from the same path that the existence check reported on,
# rather than repeating the file name as a second literal.
data = pd.read_csv(dataset_path, encoding='ISO-8859-1')
print(f"Original data shape: {data.shape}")

# Basic data exploration
print("\nValue counts of spam labels:")
print(data['spam'].value_counts())
print(f"\nPercentage of spam: {data['spam'].mean()*100:.2f}%")

# Check for and handle duplicates
duplicates = data.duplicated().sum()
print(f"\nNumber of duplicates: {duplicates}")
if duplicates > 0:
    # Rebind instead of mutating in place: same result, but the cell stays
    # easy to reason about when it is re-run.
    data = data.drop_duplicates()
    print(f"After removing duplicates: {data.shape}")

# Check for and handle missing values
print("\nMissing values:")
print(data.isnull().sum())
data = data.dropna(subset=['text'])  # Drop rows with missing text
print(f"After removing missing values: {data.shape}")

# Visualize class distribution; the figure is written to disk and then
# closed, so nothing is rendered inline.
plt.figure(figsize=(8, 5))
sns.countplot(x=data['spam'])
plt.title('Spam vs Non-Spam Distribution')
plt.savefig('class_distribution.png')
plt.close()

Original data shape: (6219, 2)

Value counts of spam labels:
spam
0    4443
1    1776
Name: count, dtype: int64

Percentage of spam: 28.56%

Number of duplicates: 49
After removing duplicates: (6170, 2)

Missing values:
text    0
spam    0
dtype: int64
After removing missing values: (6170, 2)


In [8]:
# Define better text preprocessing
def enhanced_text_cleaning(text):
    """
    Normalise an email body while keeping spam-relevant signals as tokens.

    The text is lower-cased and then run through an ordered list of regex
    substitutions. URLs, currency symbols, email addresses, phone numbers,
    digit runs and repeated '!' / '?' are replaced by placeholder words;
    everything else that is not a-z, 0-9 or whitespace becomes a space.

    Returns the cleaned string, or "" when `text` is not a str.
    """
    if not isinstance(text, str):
        return ""

    # Order matters: specific patterns (URLs, emails, phone numbers) must be
    # consumed before the generic digit and punctuation rules see them.
    substitutions = (
        (r'<.*?>', ' '),                                        # HTML tags
        (r'https?://\S+|www\.\S+', ' urlplaceholder '),         # URLs
        (r'[$€£¥]', ' moneysymbol '),                           # currency symbols
        (r'\S+@\S+', ' emailaddr '),                            # email addresses
        (r'\b(?:\d{3}[-.]?){2}\d{4}\b', ' phonenumber '),       # phone numbers
        (r'\d+', ' numbr '),                                    # remaining numbers
        (r'!!+', ' multiexclaim '),                             # repeated '!'
        (r'\?\?+', ' multiquestion '),                          # repeated '?'
        (r'[^a-z0-9\s]', ' '),                                  # leftover symbols
        (r'\s+', ' '),                                          # collapse whitespace
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

In [9]:
# Clean every email body and pull the labels out as a NumPy array
X = data['text'].map(enhanced_text_cleaning)
y = data['spam'].to_numpy()

# Feature engineering: Add additional features
def extract_text_features(df):
    """Extract simple hand-crafted features from a column of text.

    Parameters
    ----------
    df : pd.Series or pd.DataFrame
        A Series of texts, or a DataFrame whose first column holds the text.

    Returns
    -------
    pd.DataFrame
        One row per input entry (same index as the input) with the columns
        text_length, exclamation_count, question_count, uppercase_ratio,
        has_url, has_money and has_email.

    Entries that are not strings (None, NaN, numbers) are treated as empty
    text, so every feature is 0 for them.
    """
    if isinstance(df, pd.Series):
        series = df
    else:
        series = df.iloc[:, 0]  # Assuming the text is in the first column

    # Normalise once so each feature below can assume it receives a str.
    # The previous per-feature guards on the has_* flags were ineffective:
    # `1 if tok in x else 0 if isinstance(x, str) else 0` evaluates
    # `tok in x` first and raised TypeError for non-string entries.
    texts = series.apply(lambda value: value if isinstance(value, str) else "")

    features = pd.DataFrame()

    # Text length
    features['text_length'] = texts.apply(len)

    # Count of specific patterns
    features['exclamation_count'] = texts.apply(lambda t: t.count('!'))
    features['question_count'] = texts.apply(lambda t: t.count('?'))
    # max(..., 1) avoids dividing by zero for empty text
    features['uppercase_ratio'] = texts.apply(
        lambda t: sum(1 for c in t if c.isupper()) / max(len(t), 1)
    )

    # Presence of the placeholder tokens produced by the text preprocessing
    token_flags = (
        ('has_url', 'urlplaceholder'),
        ('has_money', 'moneysymbol'),
        ('has_email', 'emailaddr'),
    )
    for column, token in token_flags:
        # Bind `token` as a default so each lambda keeps its own value
        features[column] = texts.apply(lambda t, token=token: 1 if token in t else 0)

    return features

# Create text features.
# Length, punctuation and upper-case statistics are only meaningful on the
# raw text (cleaning lower-cases it and strips punctuation), whereas the
# placeholder tokens behind the has_* flags only exist in the cleaned text X.
# Computing everything from the raw text left the has_* flags at 0, so each
# group is taken from the text that actually carries the signal.
raw_text_features = extract_text_features(data['text'])
cleaned_text_features = extract_text_features(X)
token_flag_columns = ['has_url', 'has_money', 'has_email']
text_features = raw_text_features.copy()
# X shares its index with data['text'], so this assignment aligns row by row
text_features[token_flag_columns] = cleaned_text_features[token_flag_columns]
 

In [10]:
# Stratification is attempted only when the rarest class that actually occurs
# has at least two samples.
class_counts = np.bincount(y)
min_class_count = np.min(class_counts[class_counts > 0])
print(f"\nMinimum class count: {min_class_count}")

# A single split call: pass the labels as `stratify` when possible,
# otherwise None (a plain shuffled split).
can_stratify = min_class_count >= 2
X_train, X_test, y_train, y_test, features_train, features_test = train_test_split(
    X, y, text_features,
    test_size=0.2,
    stratify=y if can_stratify else None,
    random_state=42,
)
if can_stratify:
    print("Used stratified sampling")
else:
    print("Used regular sampling (stratification not possible)")

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Class distribution in each partition
for heading, labels in (
    ("\nTraining set class distribution:", y_train),
    ("Test set class distribution:", y_test),
):
    print(heading)
    print(np.bincount(labels))

# Candidate classifiers to compare, keyed by the display name used in reports.
# The random forest is seeded with the same value as the train/test split so
# that re-running the notebook reproduces its scores; without a seed its
# results vary from run to run.
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0),
    'Linear SVM': LinearSVC(C=1.0, dual=False),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
}


Minimum class count: 1765
Used stratified sampling

Training data shape: (4936,)
Testing data shape: (1234,)

Training set class distribution:
[3524 1412]
Test set class distribution:
[881 353]


In [11]:
# Function to evaluate model and return metrics
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Predicts on `X_test` once and returns a dict with three entries:
    'accuracy' (accuracy_score), 'report' (classification_report as a
    nested dict) and 'confusion_matrix' (confusion_matrix).
    """
    predicted = model.predict(X_test)
    return {
        'accuracy': accuracy_score(y_test, predicted),
        'report': classification_report(y_test, predicted, output_dict=True),
        'confusion_matrix': confusion_matrix(y_test, predicted),
    }


In [12]:
# TF-IDF approach: fit one pipeline per candidate classifier on the training
# split and print its accuracy plus the metrics for the spam class ('1').
print("\n--- TF-IDF Approach ---")
for name, classifier in classifiers.items():
    print(f"\nTraining {name}...")

    # Same vectorizer settings for every candidate so only the classifier varies
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        max_features=10000,
        min_df=2,
    )
    tfidf_pipeline = Pipeline([('tfidf', vectorizer), ('classifier', classifier)])

    # Train and evaluate
    tfidf_pipeline.fit(X_train, y_train)
    metrics = evaluate_model(tfidf_pipeline, X_test, y_test)

    # Display results
    spam_scores = metrics['report']['1']
    print(f"{name} Accuracy: {metrics['accuracy']:.4f}")
    for heading, key in (('Precision', 'precision'), ('Recall', 'recall'), ('F1-Score', 'f1-score')):
        print(f"{heading} (Spam): {spam_scores[key]:.4f}")


--- TF-IDF Approach ---

Training Multinomial Naive Bayes...
Multinomial Naive Bayes Accuracy: 0.9789
Precision (Spam): 0.9767
Recall (Spam): 0.9490
F1-Score (Spam): 0.9626

Training Logistic Regression...
Multinomial Naive Bayes Accuracy: 0.9789
Precision (Spam): 0.9767
Recall (Spam): 0.9490
F1-Score (Spam): 0.9626

Training Logistic Regression...
Logistic Regression Accuracy: 0.9814
Precision (Spam): 0.9769
Recall (Spam): 0.9575
F1-Score (Spam): 0.9671

Training Linear SVM...
Logistic Regression Accuracy: 0.9814
Precision (Spam): 0.9769
Recall (Spam): 0.9575
F1-Score (Spam): 0.9671

Training Linear SVM...
Linear SVM Accuracy: 0.9862
Precision (Spam): 0.9746
Recall (Spam): 0.9773
F1-Score (Spam): 0.9760

Training Random Forest...
Linear SVM Accuracy: 0.9862
Precision (Spam): 0.9746
Recall (Spam): 0.9773
F1-Score (Spam): 0.9760

Training Random Forest...
Random Forest Accuracy: 0.9182
Precision (Spam): 0.9809
Recall (Spam): 0.7280
F1-Score (Spam): 0.8358
Random Forest Accuracy: 0.9182

In [13]:
# Pipeline carried forward to hyperparameter tuning: TF-IDF + Logistic Regression.
# In the recorded comparison run Linear SVM scored slightly higher than
# Logistic Regression before tuning, so this is a deliberate choice rather
# than "the best performer".
# NOTE(review): presumably Logistic Regression is kept because it provides
# predict_proba, which predict_spam uses to report a spam probability - confirm.
best_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        max_features=10000,
        min_df=2
    )),
    ('classifier', LogisticRegression(max_iter=1000, C=1.0))
])


In [14]:
# Hyperparameter tuning: 5-fold grid search over the TF-IDF n-gram range,
# vocabulary size and the classifier's C, scored with the 'f1' scorer.
print("\n--- Hyperparameter Tuning ---")
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__max_features=[5000, 10000],
    classifier__C=[0.1, 1.0, 10.0],
)

grid_search = GridSearchCV(
    estimator=best_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
# Keep the best estimator found by the search for the remaining cells
best_model = grid_search.best_estimator_


--- Hyperparameter Tuning ---
Best parameters: {'classifier__C': 10.0, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
Best parameters: {'classifier__C': 10.0, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}


In [15]:
# Final evaluation of the tuned model on the held-out test split
print("\n--- Final Model Evaluation ---")
final_metrics = evaluate_model(best_model, X_test, y_test)

print(f"Final Accuracy: {final_metrics['accuracy']:.4f}")
print("\nClassification Report:")
# The report also holds aggregate rows; only the two class rows are printed
class_names = {'0': 'Not Spam', '1': 'Spam'}
for label, metrics in final_metrics['report'].items():
    if label not in class_names:
        continue
    class_name = class_names[label]
    print(f"{class_name}:")
    for heading, key in (('Precision', 'precision'), ('Recall', 'recall'), ('F1-Score', 'f1-score')):
        print(f"  {heading}: {metrics[key]:.4f}")


--- Final Model Evaluation ---
Final Accuracy: 0.9870

Classification Report:
Not Spam:
  Precision: 0.9898
  Recall: 0.9921
  F1-Score: 0.9909
Spam:
  Precision: 0.9801
  Recall: 0.9745
  F1-Score: 0.9773
Final Accuracy: 0.9870

Classification Report:
Not Spam:
  Precision: 0.9898
  Recall: 0.9921
  F1-Score: 0.9909
Spam:
  Precision: 0.9801
  Recall: 0.9745
  F1-Score: 0.9773


In [16]:
# Confusion matrix heatmap (rows: actual class, columns: predicted class),
# written to confusion_matrix.png and then closed.
cm = final_metrics['confusion_matrix']
class_labels = ['Not Spam', 'Spam']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.close()

In [17]:
# Example prediction function
def predict_spam(model, texts):
    """Classify one or more texts with a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator with a `predict` method. If it also has
        `predict_proba`, the second probability column is reported.
    texts : str or iterable of str
        A single text or any iterable of texts (list, Series, generator).

    Returns
    -------
    list of dict
        One dict per text with the keys 'text' and 'prediction'
        ('Spam' / 'Not Spam'), plus 'spam_probability' when the model
        supports `predict_proba`. Empty input yields an empty list.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be treated as a sequence of characters
        texts = [texts]
    else:
        # The texts are traversed several times below; materialise them so
        # one-shot iterables such as generators are not exhausted early.
        texts = list(texts)

    if not texts:
        return []

    predictions = model.predict(texts)
    probabilities = None

    # Get probabilities if the model supports predict_proba.
    # Column 1 is taken as the spam class; this assumes the model's classes
    # are ordered [0, 1].
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(texts)[:, 1]

    results = []
    for i, text in enumerate(texts):
        result = {
            'text': text,
            'prediction': 'Spam' if predictions[i] == 1 else 'Not Spam'
        }
        if probabilities is not None:
            result['spam_probability'] = probabilities[i]
        results.append(result)

    return results

In [18]:
# Hand-written sample emails used to demonstrate the prediction helper
test_emails = [
    'Hey i am Elon Musk. Get a brand new car from Tesla',
    'Hi Mary, just checking in about our meeting tomorrow at 2pm. See you then!',
    'CONGRATULATIONS! You have won a FREE iPhone! Click here to claim now!!!',
    'The quarterly report is attached. Please review before the board meeting.',
]

# Distinct label values in y together with how often each occurs
label_distribution = np.unique(y, return_counts=True)
print("Target variable distribution:")
print(label_distribution)


Target variable distribution:
(array([0, 1]), array([4405, 1765]))


In [19]:

print("\n--- Example Predictions ---")
predictions = predict_spam(best_model, test_emails)
for pred in predictions:
    email_text = pred['text']
    # Show at most the first 50 characters, marking truncation with an ellipsis
    if len(email_text) > 50:
        print(f"Text: {email_text[:50]}...")
    else:
        print(f"Text: {email_text}")
    print(f"Prediction: {pred['prediction']}")
    if 'spam_probability' in pred:
        print(f"Spam probability: {pred['spam_probability']:.4f}")
    print()


--- Example Predictions ---
Text: Hey i am Elon Musk. Get a brand new car from Tesla
Prediction: Spam
Spam probability: 0.7105

Text: Hi Mary, just checking in about our meeting tomorr...
Prediction: Not Spam
Spam probability: 0.0781

Text: CONGRATULATIONS! You have won a FREE iPhone! Click...
Prediction: Spam
Spam probability: 0.9818

Text: The quarterly report is attached. Please review be...
Prediction: Not Spam
Spam probability: 0.0234



In [20]:
# Save the tuned model to disk with pickle.
# The file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails; previously the handle was never closed.
# NOTE: only load this file from a trusted location, since unpickling can
# execute arbitrary code.
model_filename = "spam_detection_model.pkl"
with open(model_filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)
print(f"Model saved as {model_filename}")

Model saved as spam_detection_model.pkl
