In [1]:
# # Email Spam Detector

# ## Introduction
# 
# This notebook builds and compares two machine learning models - Logistic Regression and Random Forest - for detecting spam emails. The models are trained on a dataset of email features and labels indicating whether each email is spam or not.

# ## Dataset Loading and Preprocessing

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
# Read the spam dataset straight from its hosted CSV into a DataFrame
SPAM_DATA_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m13/challenge/spam-data.csv'
data = pd.read_csv(SPAM_DATA_URL)




In [4]:
# Explore the data: first rows, column dtypes / non-null counts, summary statistics
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
data.info()
print(data.describe())



   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  char_freq_;  char_freq_(  \
0             0.00            0.00  ...         0.00        0.000   
1 

In [5]:
# Build the feature matrix X and the target vector y.
# The feature list is spelled out explicitly (word frequencies, character
# frequencies, then capital-run statistics) so the column selection and order
# do not depend on the layout of the CSV.
feature_columns = [
    # word frequency features
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    # character frequency features
    'char_freq_;', 'char_freq_(', 'char_freq_[',
    'char_freq_!', 'char_freq_$', 'char_freq_#',
    # capital-letter run statistics
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
]

# Features: every listed column; target: the 'spam' label column
X = data.loc[:, feature_columns]
y = data.loc[:, 'spam']



In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [8]:
# Show how many spam (1) / non-spam (0) labels ended up in each split
print("Labels counts in y_train:", y_train.value_counts(), sep="\n")
print("Labels counts in y_test:", y_test.value_counts(), sep="\n")





Labels counts in y_train:
spam
0    2257
1    1423
Name: count, dtype: int64
Labels counts in y_test:
spam
0    531
1    390
Name: count, dtype: int64


In [9]:
# Standardize the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits, so nothing from the
# test set influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [10]:
# ## Model Training and Evaluation

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [11]:
# Logistic Regression: fit on the scaled training features, then score the
# predictions for the scaled test features against the true test labels.
logreg = LogisticRegression(random_state=1).fit(X_train_scaled, y_train)
logreg_pred = logreg.predict(X_test_scaled)
logreg_acc = accuracy_score(y_true=y_test, y_pred=logreg_pred)
print(f'Logistic Regression Accuracy: {logreg_acc:.3f}')



Logistic Regression Accuracy: 0.920


In [12]:
# Random Forest: same procedure as the logistic regression cell — fit on the
# scaled training features, predict the scaled test features, report accuracy.
rf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
print(f'Random Forest Accuracy: {rf_acc:.3f}')



Random Forest Accuracy: 0.957


In [13]:
# Compare model results by test-set accuracy.
# A tie is reported explicitly rather than being credited to Random Forest.
if logreg_acc > rf_acc:
    print('Logistic Regression performed better')
elif rf_acc > logreg_acc:
    print('Random Forest performed better')
else:
    print('Both models performed equally well')



Random Forest performed better


In [14]:
# For each model, print a heading, the confusion matrix (rows: true label,
# columns: predicted label) and the per-class precision/recall/F1 report.
print(
    "Logistic Regression:",
    confusion_matrix(y_test, logreg_pred),
    classification_report(y_test, logreg_pred),
    sep="\n",
)

print(
    "Random Forest:",
    confusion_matrix(y_test, rf_pred),
    classification_report(y_test, rf_pred),
    sep="\n",
)



Logistic Regression:
[[506  25]
 [ 49 341]]
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       531
           1       0.93      0.87      0.90       390

    accuracy                           0.92       921
   macro avg       0.92      0.91      0.92       921
weighted avg       0.92      0.92      0.92       921

Random Forest:
[[520  11]
 [ 29 361]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       531
           1       0.97      0.93      0.95       390

    accuracy                           0.96       921
   macro avg       0.96      0.95      0.96       921
weighted avg       0.96      0.96      0.96       921



In [15]:
# ## Model Saving

import joblib

# Persist both fitted models together with the fitted scaler.
# Both models were trained on StandardScaler-transformed features, so any new
# data must pass through the same fitted scaler before predict(); saving the
# models alone would leave them unusable on raw feature values.
joblib.dump(logreg, 'logistic_regression.pkl')
joblib.dump(rf, 'random_forest.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Models saved successfully!")

# ## Conclusion
# 
# In this project, we built and compared Logistic Regression and Random Forest models for spam email detection. The Random Forest model achieved a higher accuracy of 95.7% compared to 92.0% for Logistic Regression on the test set.
# 
# Potential future enhancements could include experimenting with additional features, tuning model hyperparameters, or exploring other machine learning algorithms for this task.

Models saved successfully!
