In [115]:
# This is a spam email classifier project, by using Naive Bayes classifier

# Import Libaries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm

## Importing Data

In [116]:
df = pd.read_csv ('spam_or_not_spam.csv')

In [117]:
# df
df.rename(columns={'email':'Email_Subject'}, inplace = True)
df #Label = 0 = Normal Email, Label = 1 = Spam Email

Unnamed: 0,Email_Subject,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


In [118]:
df.info()
# There is a "null subject" in email subject
# Label is in Integer type, do not need to further convert
df['Email_Subject'].fillna('', inplace = True) # filled 'Null' with ''(blank)
df['label'].value_counts()

# There are 2500 Normal email and 500 Spam email

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Email_Subject  2999 non-null   object
 1   label          3000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB


0    2500
1     500
Name: label, dtype: int64

## Train-test Split and tokenizer model 

In [119]:
# Import the model for splitting training set & testing set
X, y = df['Email_Subject'], df['label'] 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  random_state=1) 
# Using 8:2 portion for Train:Test

In [120]:
# Import the model for counting the terms frequency

from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words='english', lowercase = True)

# Count Vec automatically keep the data in Sparse Martrix to save memory

training_data = count_vector.fit_transform (X_train) 
testing_data = count_vector.transform (X_test)

In [121]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

# MultinomialNB is for frequency count features
# BernoulliNB is for True/False features
# GaussianNB is for continuous features

In [122]:
%%time
model = MultinomialNB()
model.fit(training_data.toarray(),y_train)
pred = model.predict(testing_data.toarray()) 
# prob = model.predict_proba(testing_data.toarray()) #predict the probability that it could be a spam
# accuracy_score on test set
accuracy = accuracy_score(y_test, pred)  # TEST SET accuracy of MultinomialNB
print(f'Accuracy score: {accuracy}')

# precision_score, recall_score, f1_score on test set
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)

f1_scores['MultinomialNB'] = round(f1,3)
accuracy_scores['MultinomialNB'] = round(accuracy,3)

print(f'precision score: {precision}')
print(f'recall score: {recall}')
print(f'f1 score: {f1}')

Accuracy score: 0.9883333333333333
precision score: 0.99
recall score: 0.9428571428571428
f1 score: 0.9658536585365853
Wall time: 1.76 s


In [123]:
from sklearn import metrics
print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       495
           1       0.99      0.94      0.97       105

    accuracy                           0.99       600
   macro avg       0.99      0.97      0.98       600
weighted avg       0.99      0.99      0.99       600



In [124]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test, pred)
print(confusion)

# True negative, false positive, false negative, true positive for binary classification
tn, fp, fn, tp = confusion.ravel()
print(f'TN: {tn}')
print(f'TP: {tp}')
print(f'FP: {fp}')
print(f'FN: {fn}')

[[494   1]
 [  6  99]]
TN: 494
TP: 99
FP: 1
FN: 6


## Test on other models

In [125]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV, cross_validate, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression

In [126]:
%%time
# Using GridSearch CV for tunning parameters
# for LinearSVC
params = [{'C':[0.1,0.5,1,5], 'max_iter':[75000,100000]}]

clf = LinearSVC()
search_cv = GridSearchCV(clf,params,cv=5, scoring='accuracy',return_train_score=True)
search_cv.fit(training_data.toarray(),y_train)
search_cv.best_params_

# Wall time: 53 s
# {'C': 0.1, 'max_iter': 75000}

Wall time: 50 s


{'C': 0.1, 'max_iter': 75000}

In [127]:
%%time
# for Random Forest
params = [{'n_estimators':[400,500,600],
          'max_depth':[30,35,40]}]

clf = RandomForestClassifier()
search_cv = GridSearchCV(clf,params,cv=5, scoring='accuracy',return_train_score=True)
search_cv.fit(training_data,y_train)
search_cv.best_params_

# Wall time: 4min 29s
# Result: {'max_depth': 40, 'n_estimators': 500} 

Wall time: 4min 21s


{'max_depth': 40, 'n_estimators': 400}

In [128]:
# Randomized CV search first
params = [{'C':[0.1,0.5,1,5],
           'solver':['lbfgs', 'liblinear'],
          'max_iter':[5000,7000,10000]
          }]
clf = LogisticRegression()
random_cv = RandomizedSearchCV(clf,params,cv=5, n_iter=24,scoring='accuracy',return_train_score=True)
random_cv.fit(training_data,y_train)
random_cv.best_params_
#{'solver': 'lbfgs', 'max_iter': 5000, 'C': 5}

{'solver': 'lbfgs', 'max_iter': 5000, 'C': 5}

In [129]:
%%time
# for SVC
params = [{'C':[0.5,1.0,1.5],
           'kernel':['linear','poly','rbf','sigmoid']}]

clf = SVC()
search_cv = GridSearchCV(clf,params,cv=5, scoring='accuracy',return_train_score=True)
search_cv.fit(training_data,y_train)
search_cv.best_params_

Wall time: 1min 35s


{'C': 0.5, 'kernel': 'linear'}

In [130]:
%%time
# for Logistic Regression
params = [{'C':[1,5],
           'solver':['lbfgs','liblinear'],
          'max_iter':[5000,7000]
          }]

clf = LogisticRegression()
search_cv = GridSearchCV(clf,params,cv=5, scoring='accuracy',return_train_score=True)
search_cv.fit(training_data,y_train)
search_cv.best_params_

Wall time: 7.88 s


{'C': 5, 'max_iter': 5000, 'solver': 'lbfgs'}

## Running Models

In [131]:
f1_scores = {}
accuracy_scores = {}
running_time = {}

In [132]:
%%time 

model = LinearSVC(C=0.1,max_iter=75000)
model.fit(training_data.toarray(),y_train)
pred = model.predict(testing_data.toarray()) 
# Accuracy score
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)

# saving accuracy score, f1 score and time
f1_scores['LinearSVC'] = round(f1,3)
accuracy_scores['LinearSVC'] = round(accuracy,3)

print('Accuracy score:' + str(accuracy))
print('precision score:' + str(precision))
print(f'recall score: {recall}')
print(f'f1 score: {f1}')

Accuracy score:0.9883333333333333
precision score:0.99
recall score: 0.9428571428571428
f1 score: 0.9658536585365853
Wall time: 955 ms


In [133]:
%%time
model = RandomForestClassifier(n_estimators=600, max_depth=40)
model.fit(training_data.toarray(),y_train)
pred = model.predict(testing_data.toarray()) 

# Accuracy score
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)

f1_scores['Random_Forest'] = round(f1,3)
accuracy_scores['Random_Forest'] = round(accuracy,3)

print(f'Accuracy score: {accuracy}')
print(f'precision score: {precision}')
print(f'recall score: {recall}')
print(f'f1 score: {f1}')

Accuracy score: 0.965
precision score: 0.9772727272727273
recall score: 0.819047619047619
f1 score: 0.8911917098445595
Wall time: 1min 3s


In [134]:
%%time
model = LogisticRegression(C=5, max_iter=5000, solver='lbfgs')
model.fit(training_data.toarray(),y_train)
pred = model.predict(testing_data.toarray()) 

# Accuracy score
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)

f1_scores['Logistic_Regression'] = round(f1,3)
accuracy_scores['Logistic_Regression'] = round(accuracy,3)

print(f'Accuracy score: {accuracy}')
print(f'precision score: {precision}')
print(f'recall score: {recall}')
print(f'f1 score: {f1}')

Accuracy score: 0.9866666666666667
precision score: 0.98989898989899
recall score: 0.9333333333333333
f1 score: 0.9607843137254903
Wall time: 8.92 s


In [135]:
%%time
model = SVC(C=0.5, kernel='linear')
model.fit(training_data.toarray(),y_train)
pred = model.predict(testing_data.toarray()) 
# Accuracy score
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)

f1_scores['SVC'] = round(f1,3)
accuracy_scores['SVC'] = round(accuracy,3)

print(f'Accuracy score: {accuracy}')
print(f'precision score: {precision}')
print(f'recall score: {recall}')
print(f'f1 score: {f1}')

Accuracy score: 0.985
precision score: 0.9897959183673469
recall score: 0.9238095238095239
f1 score: 0.9556650246305419
Wall time: 20.2 s


In [136]:
print(f1_scores)
print(accuracy_scores)

{'LinearSVC': 0.966, 'Random_Forest': 0.891, 'Logistic_Regression': 0.961, 'SVC': 0.956}
{'LinearSVC': 0.988, 'Random_Forest': 0.965, 'Logistic_Regression': 0.987, 'SVC': 0.985}


In [137]:
# turning the scores to df

df_f1_scores = pd.DataFrame(list(f1_scores.items()))
df_f1_scores.rename(columns = {0:'Models', 1:'F1 scores'}, inplace=True)

df_accuracy = pd.DataFrame(list(accuracy_scores.items()))
df_accuracy.rename(columns = {0:'Model', 1:'Accuracy scores'}, inplace=True)

#joining both df

df_model_performance = pd.concat([df_f1_scores,df_accuracy],axis=1,join='inner')
df_model_performance.drop(['Model'],axis=1,inplace=True)
print(df_model_performance)

                Models  F1 scores  Accuracy scores
0            LinearSVC      0.966            0.988
1        Random_Forest      0.891            0.965
2  Logistic_Regression      0.961            0.987
3                  SVC      0.956            0.985


In [138]:
# plotting the graph based on the f1 and accuracy score
import plotly.graph_objects as go

fig = go.Figure(data=[go.Bar(name='F1 socre',x=df_model_performance['Models'], 
                             y=df_model_performance['F1 scores'], text=df_model_performance['F1 scores']),
                       go.Bar(name='Accuracy',x=df_model_performance['Models'], 
                          y=df_model_performance['Accuracy scores'], text=df_model_performance['Accuracy scores'])])


fig.update_layout(barmode='group')
fig.show()