Mount Google Drive:

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Data Collection - Loading the Data

In [2]:
import os

# Define the paths
base_dir = '/content/drive/MyDrive/Spam_Email_Detection_2/archive'

# List files in each folder
easy_ham_dir = os.listdir(os.path.join(base_dir, 'easy_ham'))
hard_ham_dir = os.listdir(os.path.join(base_dir, 'hard_ham'))
spam_dir = os.listdir(os.path.join(base_dir, 'spam_2'))

# Display contents of each folder
print(f"Easy Ham: {len(easy_ham_dir)} files")
print(f"Hard Ham: {len(hard_ham_dir)} files")
print(f"Spam: {len(spam_dir)} files")


Easy Ham: 2 files
Hard Ham: 2 files
Spam: 2 files


In [3]:
# Define the correct paths
base_dir = '/content/drive/MyDrive/Spam_Email_Detection_2/archive'
easy_ham_path = os.path.join(base_dir, 'easy_ham', 'easy_ham')
hard_ham_path = os.path.join(base_dir, 'hard_ham', 'hard_ham')
spam_path = os.path.join(base_dir, 'spam_2', 'spam_2')

# List files in each folder again
easy_ham_dir = os.listdir(easy_ham_path)
hard_ham_dir = os.listdir(hard_ham_path)
spam_dir = os.listdir(spam_path)

# Display contents of each folder
print(f"Easy Ham: {len(easy_ham_dir)} files")
print(f"Hard Ham: {len(hard_ham_dir)} files")
print(f"Spam: {len(spam_dir)} files")


Easy Ham: 2551 files
Hard Ham: 250 files
Spam: 1397 files


Data Preprocessing

Read the email content from all the files in each category (easy ham, hard ham, and spam).

Clean the text by removing any HTML tags and unnecessary characters.

Tokenize the text for further analysis.




In [4]:
import re

# Function to read and clean email files
def read_and_clean_email(file_path):
    with open(file_path, 'r', encoding='latin-1') as file:
        email_content = file.read()
        # Remove HTML tags
        email_content = re.sub(r'<.*?>', '', email_content)
        # Remove non-alphabetic characters
        email_content = re.sub(r'[^a-zA-Z\s]', '', email_content)
        return email_content

# Load and clean all emails in each category
easy_ham_emails = [read_and_clean_email(os.path.join(easy_ham_path, file)) for file in easy_ham_dir[:5]]  # Load first 5 emails for testing
hard_ham_emails = [read_and_clean_email(os.path.join(hard_ham_path, file)) for file in hard_ham_dir[:5]]
spam_emails = [read_and_clean_email(os.path.join(spam_path, file)) for file in spam_dir[:5]]

# Display sample of cleaned emails
print("Sample Easy Ham Email:\n", easy_ham_emails[0][:500])
print("\nSample Hard Ham Email:\n", hard_ham_emails[0][:500])
print("\nSample Spam Email:\n", spam_emails[0][:500])


Sample Easy Ham Email:
 From rssfeedsjmasonorg  Tue Oct    
ReturnPath 
DeliveredTo yyyylocalhostexamplecom
Received from localhost jalapeno 
	by jmasonorg Postfix with ESMTP id BEF
	for  Tue   Oct    IST
Received from jalapeno 
	by localhost with IMAP fetchmail
	for jmlocalhost singledrop Tue  Oct    IST
Received from dogmaslashnullorg localhost  by
    dogmaslashnullorg  with ESMTP id gK for
     Tue  Oct   
MessageId 
To yyyyexamplecom
From guardian 
Subject Factories go flat while we dither over the euro
Date Tue  

Sample Hard Ham Email:
 ReturnPath 
Received from abvsfoacmtaCNETCOM abvsfoacmtacnetcom 
	by dogmaslashnullorg  with ESMTP id gALeUJ
	for  Wed  Jul   
Received from abvsfoacagent  by abvsfoacmtaCNETCOM PowerMTATM v Wed  Jul    envelopefrom 
MessageID 
Date Wed  Jul    PDT
From CNET Newscom Investor 
To qqqqqqqqqqzdnetexamplecom
Subject NEWSCOM INVESTOR Tech stocks drop again on Qwest criminal probe news
MimeVersion 
ContentType texthtml charsetISO
ContentTransferEncodin

Feature Engineering

We will use TF-IDF (Term Frequency-Inverse Document Frequency) to transform the email text into numerical features.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine all emails for TF-IDF
all_emails = easy_ham_emails + hard_ham_emails + spam_emails
labels = [0] * len(easy_ham_emails) + [0] * len(hard_ham_emails) + [1] * len(spam_emails)  # 0 = ham, 1 = spam

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)  # Limiting features to 3000 for efficiency

# Fit and transform the email text to TF-IDF features
X = vectorizer.fit_transform(all_emails)

# Display the shape of the feature matrix
print(f"Feature matrix shape: {X.shape}")


Feature matrix shape: (15, 2881)


Model Building

Now, we can move on to training classifiers for spam detection. A good starting point is the Naive Bayes classifier, as it is effective for text classification tasks like spam detection.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Initialize the Naive Bayes classifier
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[2 0]
 [1 0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


the model did not perform well on the test set, as indicated by the confusion matrix and classification report.

Classification Report Interpretation:

Precision for class 1 (spam) is 0.00 because there were no true positive predictions.

Recall for class 1 (spam) is also 0.00, indicating that the model did not capture any spam emails.

The model has a moderate accuracy of 67%, but this is misleading due to the imbalanced nature of the dataset and the fact that it did not predict any spam emails.

Possible Reasons for Poor Performance:

Imbalance in Classes: The dataset may have an imbalance between the classes (ham vs. spam). With only one spam sample in the test set, the model couldn't learn to recognize spam effectively.

Feature Selection: TF-IDF might not capture all relevant features effectively with the small number of samples.

Next step:
Adjust the Train-Test Split: we will try using a different split to ensure the test set contains more diverse samples of both ham and spam.


Try Other Models: You can also try other classifiers, such as Support Vector Machines (SVM) or Logistic Regression, to see if they perform better with the current dataset.


In [7]:
# Split the data into training and testing sets (80% train, 20% test) with stratification
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42, stratify=labels)

# Train the model again and evaluate
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model again
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[2 0]
 [1 0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Adjust the train-test split with stratification
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test) with stratification
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42, stratify=labels)

# Initialize the Naive Bayes classifier
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[2 0]
 [1 0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Support Vector Machine (SVM)

In [10]:
from sklearn.svm import SVC

# Initialize the SVM classifier
svm_model = SVC(kernel='linear')  # Linear kernel is often a good choice for text data

# Train the SVM model
svm_model.fit(X_train, y_train)

# Make predictions on the test set
svm_y_pred = svm_model.predict(X_test)

# Evaluate the SVM model
print("SVM Confusion Matrix:")
print(confusion_matrix(y_test, svm_y_pred))
print("\nSVM Classification Report:")
print(classification_report(y_test, svm_y_pred))


SVM Confusion Matrix:
[[2 0]
 [0 1]]

SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



still poor

Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression classifier
log_reg_model = LogisticRegression(max_iter=1000)

# Train the Logistic Regression model
log_reg_model.fit(X_train, y_train)

# Make predictions on the test set
log_reg_y_pred = log_reg_model.predict(X_test)

# Evaluate the Logistic Regression model
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, log_reg_y_pred))
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, log_reg_y_pred))


Logistic Regression Confusion Matrix:
[[2 0]
 [1 0]]

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


We'll focus on handling the class imbalance and adjusting the train-test split. As the confusion metrics are still poor.


Step 1: Use Stratified Sampling for Train-Test Split
This ensures each class is proportionally represented in both training and testing datasets.

Step 2: Handle Class Imbalance
We can use resampling techniques to balance the dataset.

We'll use the imblearn library for this purpose.

In [12]:
!pip install imbalanced-learn




In [14]:
# Import necessary libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

# Define the paths to email datasets
easy_ham_path = '/content/drive/MyDrive/Spam_Email_Detection_2/archive/easy_ham/easy_ham'
hard_ham_path = '/content/drive/MyDrive/Spam_Email_Detection_2/archive/hard_ham/hard_ham'
spam_path = '/content/drive/MyDrive/Spam_Email_Detection_2/archive/spam_2/spam_2'

# Function to load emails from a directory
def load_emails_from_folder(folder, label):
    emails = []
    for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), 'r', errors='ignore') as file:
            emails.append((file.read(), label))
    return emails

# Load the datasets
easy_ham_emails = load_emails_from_folder(easy_ham_path, 'easy_ham')
hard_ham_emails = load_emails_from_folder(hard_ham_path, 'hard_ham')
spam_emails = load_emails_from_folder(spam_path, 'spam')

# Combine all emails into a single DataFrame
all_emails = easy_ham_emails + hard_ham_emails + spam_emails
data = pd.DataFrame(all_emails, columns=['text', 'label'])

# Prepare your features (X) and labels (y)
X = data['text']
y = data['label']

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets (80% train, 20% test) with stratification
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance using oversampling for minority classes
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Initialize the Naive Bayes classifier
model = MultinomialNB()

# Train the model on the resampled data
model.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[507   2   1]
 [  1  47   2]
 [  8  10 262]]

Classification Report:
              precision    recall  f1-score   support

    easy_ham       0.98      0.99      0.99       510
    hard_ham       0.80      0.94      0.86        50
        spam       0.99      0.94      0.96       280

    accuracy                           0.97       840
   macro avg       0.92      0.96      0.94       840
weighted avg       0.97      0.97      0.97       840



Finally! Phew!

High Overall Accuracy: The model achieved an overall accuracy of 97%, which indicates that it is performing very well in classifying the emails.


Precision, Recall, and F1-Score:


Easy Ham:

Precision: 0.98 – Very few false positives, meaning most emails predicted as easy ham are indeed easy ham.

Recall: 0.99 – Very few easy ham emails are misclassified as spam or hard ham.

F1-Score: 0.99 – A good balance between precision and recall.

Hard Ham:
Precision: 0.80 – There are some false positives (2 emails predicted as hard ham that are not).

Recall: 0.94 – The model identifies 94% of actual hard ham emails correctly, with only 1 false negative.

F1-Score: 0.86 – Reasonable, but there's room for improvement.

Spam:
Precision: 0.99 – Very few false positives, indicating strong performance.

Recall: 0.94 – Good at identifying spam, though there are some misclassifications.

F1-Score: 0.96 – Overall very strong.
Macro and Weighted Averages:

The macro average reflects the average of precision, recall, and F1-score across classes, treating all classes equally.

The weighted average accounts for the support (number of true instances) of each class, giving a more accurate measure of performance when classes are imbalanced.

Experiment with Different Classifiers
We will like to experiment with different Classifiers

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Define the classifiers to try
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Dictionary to store the results
results = {}

# Train and evaluate each classifier
for name, clf in classifiers.items():
    # Create a pipeline with TF-IDF and the classifier
    pipeline = make_pipeline(TfidfVectorizer(), clf)

    # Split the data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'],
                                                        test_size=0.2, random_state=42, stratify=data['label'])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Store results
    results[name] = {
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred, output_dict=True)
    }

# Print the results
for name, result in results.items():
    print(f"Classifier: {name}")
    print("Confusion Matrix:")
    print(result['confusion_matrix'])
    print("\nClassification Report:")
    print(result['classification_report'])
    print("="*50)


Classifier: Random Forest
Confusion Matrix:
[[506   1   3]
 [  2  39   9]
 [  2   0 278]]

Classification Report:
{'easy_ham': {'precision': 0.9921568627450981, 'recall': 0.9921568627450981, 'f1-score': 0.9921568627450981, 'support': 510.0}, 'hard_ham': {'precision': 0.975, 'recall': 0.78, 'f1-score': 0.8666666666666667, 'support': 50.0}, 'spam': {'precision': 0.9586206896551724, 'recall': 0.9928571428571429, 'f1-score': 0.9754385964912281, 'support': 280.0}, 'accuracy': 0.9797619047619047, 'macro avg': {'precision': 0.9752591841334235, 'recall': 0.9216713352007471, 'f1-score': 0.9447540419676642, 'support': 840.0}, 'weighted avg': {'precision': 0.9799568965517241, 'recall': 0.9797619047619047, 'f1-score': 0.9791144527986634, 'support': 840.0}}
Classifier: SVM
Confusion Matrix:
[[509   1   0]
 [  3  38   9]
 [  2   0 278]]

Classification Report:
{'easy_ham': {'precision': 0.9902723735408561, 'recall': 0.9980392156862745, 'f1-score': 0.994140625, 'support': 510.0}, 'hard_ham': {'precis

Both classifiers—Random Forest and SVM—performed well, with SVM achieving slightly better accuracy and precision metrics overall. Here’s a quick comparison of their performance:


Random Forest Results:

Accuracy: 97.98%

Precision (Macro Average): 97.53%

Recall (Macro Average): 92.17%

F1-Score (Macro Average): 94.48%

SVM Results:

Accuracy: 98.21%

Precision (Macro Average): 97.78%

Recall (Macro Average): 91.70%

F1-Score (Macro Average): 94.29%

Key Takeaways:

Random Forest had a slight edge in classifying easy ham emails but struggled a bit more with hard ham emails.

SVM performed consistently across all categories, achieving high precision and recall.

Since SVM performed slightly better, we will choose it as the primary model for our spam detection.

We will be saving the work now

In [19]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the classifiers to try
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Dictionary to store the results
results = {}

# Train and evaluate each classifier
for name, clf in classifiers.items():
    # Create a pipeline with TF-IDF and the classifier
    pipeline = make_pipeline(TfidfVectorizer(), clf)

    # Split the data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'],
                                                        test_size=0.2, random_state=42, stratify=data['label'])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Store results
    results[name] = {
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred, output_dict=True)
    }

    # Save the trained model
    joblib.dump(pipeline, f'{name.lower().replace(" ", "_")}_model.pkl')  # Save pipeline

# Print the results
for name, result in results.items():
    print(f"Classifier: {name}")
    print("Confusion Matrix:")
    print(result['confusion_matrix'])
    print("\nClassification Report:")
    print(result['classification_report'])
    print("="*50)


Classifier: Random Forest
Confusion Matrix:
[[506   1   3]
 [  2  39   9]
 [  2   0 278]]

Classification Report:
{'easy_ham': {'precision': 0.9921568627450981, 'recall': 0.9921568627450981, 'f1-score': 0.9921568627450981, 'support': 510.0}, 'hard_ham': {'precision': 0.975, 'recall': 0.78, 'f1-score': 0.8666666666666667, 'support': 50.0}, 'spam': {'precision': 0.9586206896551724, 'recall': 0.9928571428571429, 'f1-score': 0.9754385964912281, 'support': 280.0}, 'accuracy': 0.9797619047619047, 'macro avg': {'precision': 0.9752591841334235, 'recall': 0.9216713352007471, 'f1-score': 0.9447540419676642, 'support': 840.0}, 'weighted avg': {'precision': 0.9799568965517241, 'recall': 0.9797619047619047, 'f1-score': 0.9791144527986634, 'support': 840.0}}
Classifier: SVM
Confusion Matrix:
[[509   1   0]
 [  3  38   9]
 [  2   0 278]]

Classification Report:
{'easy_ham': {'precision': 0.9902723735408561, 'recall': 0.9980392156862745, 'f1-score': 0.994140625, 'support': 510.0}, 'hard_ham': {'precis