# Spam Email Classification

### Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Understanding The Dataset

In [2]:
import pandas as pd

df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [4]:
df.shape

(5172, 3002)

In [5]:
df.dtypes

Email No.     object
the            int64
to             int64
ect            int64
and            int64
               ...  
military       int64
allowing       int64
ff             int64
dry            int64
Prediction     int64
Length: 3002, dtype: object

In [6]:
df.describe()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,2.024362,10.600155,...,0.005027,0.012568,0.010634,0.098028,0.004254,0.006574,0.00406,0.914733,0.006961,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,6.967878,19.281892,...,0.105788,0.199682,0.116693,0.569532,0.096252,0.138908,0.072145,2.780203,0.098086,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,167.0,223.0,...,4.0,7.0,2.0,12.0,3.0,4.0,3.0,114.0,4.0,1.0


In [7]:
unique_values = df['Prediction'].unique()
print("Unique values in 'Prediction Label':", unique_values)

Unique values in 'Prediction Label': [0 1]


# 1. Data Preparation

In [8]:
X = df.drop(columns=['Prediction'])
y = df['Prediction']

### Handling Missing Values

Cheking if there any missing values in any column of the dataset.

In [9]:
columns_with_null = df.columns[df.isnull().any()]
if len(columns_with_null) > 0:
    print("Columns with null values:")
    print(columns_with_null)
else:
    print("No columns have null values.")

No columns have null values.


As seen above, there are no null values in our dataset.

### Encoding Categorical Columns

First, checking the datatypes of each column

In [10]:
df.dtypes

Email No.     object
the            int64
to             int64
ect            int64
and            int64
               ...  
military       int64
allowing       int64
ff             int64
dry            int64
Prediction     int64
Length: 3002, dtype: object

Encoding Categorical Columns using Label Encoder

In [11]:
le = LabelEncoder()

for col in X:
    if X[col].dtypes == 'object':
        X[col] = le.fit_transform(X[col])

### Scaling Numerical Features

In [12]:
scaler = StandardScaler()

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

X.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,-0.565449,-0.649083,-0.293895,-0.508752,-0.667663,-0.421725,-0.611169,-0.571751,-0.290556,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,-0.329048,-0.070971
1,1111,0.115757,0.714508,1.337337,0.483741,0.614369,-0.100659,0.530831,-0.339949,3.584743,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,0.030672,-0.070971
2,2222,-0.565449,-0.649083,-0.293895,-0.508752,-0.667663,-0.421725,-0.542649,-0.571751,-0.290556,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,-0.329048,-0.070971
3,3333,-0.565449,-0.124625,1.19549,-0.508752,0.400697,-0.261192,-0.051589,-0.108147,1.14474,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,-0.329048,-0.070971
4,4444,0.030606,-0.019733,0.840875,-0.343336,0.400697,-0.100659,0.016931,-0.571751,1.00121,...,-0.02919,-0.047525,-0.062944,-0.091138,-0.172137,-0.044197,-0.04733,-0.056285,0.030672,-0.070971


### Saving the Cleaned Dataset in a New CSV File

In [14]:
df = pd.concat([X, y], axis=1)

df.to_csv('spam_dataset.csv', index=False)

# 2. Model Selection

By exploring given classification algorithms, i analyzed that:

-   Logistic regression is a linear model used for binary classification, predicting the probability of an event occurring. They are useful when the relationship between features and the probability of a certain outcome is linear or when interpreting model coefficients is important, such as in medical or social sciences.
    
-  SVM is a supervised learning model that finds the optimal hyperplane to separate classes in high-dimensional space. They are effective for binary classification tasks and scenarios with complex, non-linear decision boundaries, such as email spam classification with high-dimensional feature spaces.
    
-  Decision Trees: A decision tree is a tree-like model where nodes represent decisions based on features, leading to outcomes at leaf nodes. They are useful for classification, especially when interpreting and visualizing decision-making processes is valuable.

**For this problem, I have selected SVM over Decision Trees or Logistic Regression for email spam classification because of SVM's effectiveness in handling high-dimensional data, its ability to capture non-linear relationships through kernel methods, and its robust performance in scenarios where the decision boundary between spam and non-spam emails may be complex and non-linear.**

# 3. Model Training

In [15]:
df = pd.read_csv('spam_dataset.csv')
X = df.drop(columns=['Prediction'])
y = df['Prediction']

### Splitting the dataset into training and testing sets. (Performing 80/20 Split)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
svm_model = SVC(kernel='linear', C=1.0, random_state=42)

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

# 4. Model Evaluation

In [18]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(classification_report(y_test, y_pred))

Accuracy: 0.93
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       739
           1       0.87      0.89      0.88       296

    accuracy                           0.93      1035
   macro avg       0.91      0.92      0.92      1035
weighted avg       0.93      0.93      0.93      1035



# Performing Cross-Validation

In [19]:
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=5)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.9384058  0.94323671 0.92382104 0.92261185 0.92261185]
Mean cross-validation score: 0.9301374504202957


As we can see, the cross-validation score is almost the same as the accuracy score, which typically indicates that our model is performing consistently across different folds of the cross-validation process.

# Experimenting with Ensemble Methods (Random Forest & Gradient Boosting)

### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(classification_report(y_test, y_pred))

Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       739
           1       0.96      0.96      0.96       296

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



### Gradient Boosting

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

gb_model.fit(X_train, y_train)

y_pred = gb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(classification_report(y_test, y_pred))

Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       739
           1       0.94      0.96      0.95       296

    accuracy                           0.97      1035
   macro avg       0.96      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035



As seen above, in this task, ensemble methods like Random Forest and Gradient Boosting outperformed SVM in terms of predictive accuracy and generalization ability.

# Tuning Hyperparameters to Optimize Model's Performance

### Grid Search

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [20, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.2f}")

print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best cross-validation score: 0.9700240669669196
Accuracy on test set: 0.98
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       739
           1       0.95      0.96      0.96       296

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



### Random Search

In [23]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': np.arange(50, 101, 50),
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': np.arange(2, 11, 2),
    'min_samples_leaf': np.arange(1, 5)
}

rf_model = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=100, 
                                   cv=5, scoring='accuracy', random_state=42, verbose=1)

random_search.fit(X_train, y_train)

print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

best_rf_model = random_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.2f}")

print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found: {'n_estimators': 100, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_depth': 40}
Best cross-validation score: 0.9692988451360778
Accuracy on test set: 0.97
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       739
           1       0.96      0.95      0.95       296

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035

