In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Load the augmented e-mail event table (path is relative to the notebook's
# working directory) and preview the first rows.
events = pd.read_csv(filepath_or_buffer="events_augmented.csv")
events.head()


Unnamed: 0.1,Unnamed: 0,index,id,date,attribute_count,Tag,header,from,feedback_time,host,from_count,host_count,host_count_ln,subject
0,0,19625,287833,2022-04-01,28,0,Delivered-To: UNDISCLOSEDFORPRIVACY@PRIVACYDOM...,earleenlpwoodman2593@gmail.com|earleenlpwoodma...,2022-03-31T23:21:12.000000+0000|2022-03-31T23:...,www1.nyc.gov|www1.nyc.gov,2,2,0.693147,Welcome to Your Survey-6668
1,1,19626,287834,2022-04-01,79,0,Delivered-To: UNDISCLOSEDFORPRIVACY@PRIVACYDOM...,quickbooks@notification.intuit.com|quickbooks@...,2022-04-01T00:26:28.000000+0000|2022-04-01T00:...,links.notification.intuit.com|links.notificati...,2,12,2.484907,Invoice 30148805 from Norton Billing
2,2,19627,287835,2022-04-01,89,1,Received: from 10.194.205.88\r\n by atlas109.s...,fortune@msg.fortune.com|fortune@msg.fortune.com,2022-04-01T00:58:47.000000+0000|2022-04-01T00:...,fortune.com|fortune.com|fortune.com|fortune.co...,2,16,2.772589,CBD - The Hottest Thing In 2022 - New Year New...
3,3,19628,287836,2022-04-01,159,1,Received: from 10.213.249.35\r\n by atlas106.s...,miltongrovesc.yahoo.com@send.mailchimpapp.com|...,2022-04-01T00:55:55.000000+0000|2022-04-01T00:...,mailchi.mp|miltongrovesc.us15.list-manage.com|...,2,30,3.401197,Happy Dietday!
4,4,19629,287837,2022-04-01,62,0,Received: from 10.194.205.88\r\n by atlas103.s...,pedidos@riooffsite.com.br|pedidos@riooffsite.c...,2022-04-01T00:56:38.000000+0000|2022-04-01T00:...,u10225017.ct.sendgrid.net|usaa.com|u10225017.c...,2,10,2.302585,Verify mobile number on your profile


In [3]:
# Placeholder: creates the column filled with empty strings. Every value is
# overwritten by the apply() call later in this cell.
events["numbers_in_subject"] = ''

def count_numbers(row):
    """Count the digit characters in the row's subject.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a "subject" key.

    Returns:
        int: Number of characters of ``str(row["subject"])`` for which
        ``str.isdigit()`` is true.
    """
    subject_text = str(row["subject"])
    return len([ch for ch in subject_text if ch.isdigit()])

# Run count_numbers once per row (axis=1) and store the result as a feature column.
events["numbers_in_subject"] = events.apply(count_numbers, axis=1)

events.head()

Unnamed: 0.1,Unnamed: 0,index,id,date,attribute_count,Tag,header,from,feedback_time,host,from_count,host_count,host_count_ln,subject,numbers_in_subject
0,0,19625,287833,2022-04-01,28,0,Delivered-To: UNDISCLOSEDFORPRIVACY@PRIVACYDOM...,earleenlpwoodman2593@gmail.com|earleenlpwoodma...,2022-03-31T23:21:12.000000+0000|2022-03-31T23:...,www1.nyc.gov|www1.nyc.gov,2,2,0.693147,Welcome to Your Survey-6668,4
1,1,19626,287834,2022-04-01,79,0,Delivered-To: UNDISCLOSEDFORPRIVACY@PRIVACYDOM...,quickbooks@notification.intuit.com|quickbooks@...,2022-04-01T00:26:28.000000+0000|2022-04-01T00:...,links.notification.intuit.com|links.notificati...,2,12,2.484907,Invoice 30148805 from Norton Billing,8
2,2,19627,287835,2022-04-01,89,1,Received: from 10.194.205.88\r\n by atlas109.s...,fortune@msg.fortune.com|fortune@msg.fortune.com,2022-04-01T00:58:47.000000+0000|2022-04-01T00:...,fortune.com|fortune.com|fortune.com|fortune.co...,2,16,2.772589,CBD - The Hottest Thing In 2022 - New Year New...,4
3,3,19628,287836,2022-04-01,159,1,Received: from 10.213.249.35\r\n by atlas106.s...,miltongrovesc.yahoo.com@send.mailchimpapp.com|...,2022-04-01T00:55:55.000000+0000|2022-04-01T00:...,mailchi.mp|miltongrovesc.us15.list-manage.com|...,2,30,3.401197,Happy Dietday!,0
4,4,19629,287837,2022-04-01,62,0,Received: from 10.194.205.88\r\n by atlas103.s...,pedidos@riooffsite.com.br|pedidos@riooffsite.c...,2022-04-01T00:56:38.000000+0000|2022-04-01T00:...,u10225017.ct.sendgrid.net|usaa.com|u10225017.c...,2,10,2.302585,Verify mobile number on your profile,0


In [4]:
# Placeholder: creates the column filled with empty strings. Every value is
# overwritten by the apply() call later in this cell.
events["subject_size"] = ''

def subject_size(row):
    """Return the length of the row's subject in characters.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a "subject" key.

    Returns:
        int: ``len(str(row["subject"]))``, or 0 when the subject is missing.
    """
    subject = row["subject"]
    # A missing subject (NaN/None/NA) must count as empty; stringifying it
    # would otherwise report the length of the text "nan" (3).
    if pd.isna(subject):
        return 0
    return len(str(subject))

# Run subject_size once per row (axis=1) and store the result as a feature column.
events["subject_size"] = events.apply(subject_size, axis=1)

events.head()

Unnamed: 0.1,Unnamed: 0,index,id,date,attribute_count,Tag,header,from,feedback_time,host,from_count,host_count,host_count_ln,subject,numbers_in_subject,subject_size
0,0,19625,287833,2022-04-01,28,0,Delivered-To: UNDISCLOSEDFORPRIVACY@PRIVACYDOM...,earleenlpwoodman2593@gmail.com|earleenlpwoodma...,2022-03-31T23:21:12.000000+0000|2022-03-31T23:...,www1.nyc.gov|www1.nyc.gov,2,2,0.693147,Welcome to Your Survey-6668,4,27
1,1,19626,287834,2022-04-01,79,0,Delivered-To: UNDISCLOSEDFORPRIVACY@PRIVACYDOM...,quickbooks@notification.intuit.com|quickbooks@...,2022-04-01T00:26:28.000000+0000|2022-04-01T00:...,links.notification.intuit.com|links.notificati...,2,12,2.484907,Invoice 30148805 from Norton Billing,8,36
2,2,19627,287835,2022-04-01,89,1,Received: from 10.194.205.88\r\n by atlas109.s...,fortune@msg.fortune.com|fortune@msg.fortune.com,2022-04-01T00:58:47.000000+0000|2022-04-01T00:...,fortune.com|fortune.com|fortune.com|fortune.co...,2,16,2.772589,CBD - The Hottest Thing In 2022 - New Year New...,4,68
3,3,19628,287836,2022-04-01,159,1,Received: from 10.213.249.35\r\n by atlas106.s...,miltongrovesc.yahoo.com@send.mailchimpapp.com|...,2022-04-01T00:55:55.000000+0000|2022-04-01T00:...,mailchi.mp|miltongrovesc.us15.list-manage.com|...,2,30,3.401197,Happy Dietday!,0,15
4,4,19629,287837,2022-04-01,62,0,Received: from 10.194.205.88\r\n by atlas103.s...,pedidos@riooffsite.com.br|pedidos@riooffsite.c...,2022-04-01T00:56:38.000000+0000|2022-04-01T00:...,u10225017.ct.sendgrid.net|usaa.com|u10225017.c...,2,10,2.302585,Verify mobile number on your profile,0,36


In [5]:
# Placeholder: creates the column filled with empty strings. Every value is
# overwritten by the apply() call later in this cell.
events["subject_special_chars"] = ''

def count_special_chars(row):
    """Count the "special" characters in the row's subject.

    A character is special when it is not a digit, not a letter and not
    whitespace (per ``str.isdigit``, ``str.isalpha``, ``str.isspace``).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a "subject" key.

    Returns:
        int: Number of special characters in ``str(row["subject"])``.
    """
    special_total = 0
    for ch in str(row["subject"]):
        if ch.isdigit() or ch.isalpha() or ch.isspace():
            continue  # ordinary character, not counted
        special_total += 1
    return special_total

# Run count_special_chars once per row (axis=1) and store the result as a feature column.
events["subject_special_chars"] = events.apply(count_special_chars, axis=1)

events.head()

Unnamed: 0.1,Unnamed: 0,index,id,date,attribute_count,Tag,header,from,feedback_time,host,from_count,host_count,host_count_ln,subject,numbers_in_subject,subject_size,subject_special_chars
0,0,19625,287833,2022-04-01,28,0,Delivered-To: UNDISCLOSEDFORPRIVACY@PRIVACYDOM...,earleenlpwoodman2593@gmail.com|earleenlpwoodma...,2022-03-31T23:21:12.000000+0000|2022-03-31T23:...,www1.nyc.gov|www1.nyc.gov,2,2,0.693147,Welcome to Your Survey-6668,4,27,1
1,1,19626,287834,2022-04-01,79,0,Delivered-To: UNDISCLOSEDFORPRIVACY@PRIVACYDOM...,quickbooks@notification.intuit.com|quickbooks@...,2022-04-01T00:26:28.000000+0000|2022-04-01T00:...,links.notification.intuit.com|links.notificati...,2,12,2.484907,Invoice 30148805 from Norton Billing,8,36,0
2,2,19627,287835,2022-04-01,89,1,Received: from 10.194.205.88\r\n by atlas109.s...,fortune@msg.fortune.com|fortune@msg.fortune.com,2022-04-01T00:58:47.000000+0000|2022-04-01T00:...,fortune.com|fortune.com|fortune.com|fortune.co...,2,16,2.772589,CBD - The Hottest Thing In 2022 - New Year New...,4,68,3
3,3,19628,287836,2022-04-01,159,1,Received: from 10.213.249.35\r\n by atlas106.s...,miltongrovesc.yahoo.com@send.mailchimpapp.com|...,2022-04-01T00:55:55.000000+0000|2022-04-01T00:...,mailchi.mp|miltongrovesc.us15.list-manage.com|...,2,30,3.401197,Happy Dietday!,0,15,1
4,4,19629,287837,2022-04-01,62,0,Received: from 10.194.205.88\r\n by atlas103.s...,pedidos@riooffsite.com.br|pedidos@riooffsite.c...,2022-04-01T00:56:38.000000+0000|2022-04-01T00:...,u10225017.ct.sendgrid.net|usaa.com|u10225017.c...,2,10,2.302585,Verify mobile number on your profile,0,36,0


In [6]:
# Separate the feature matrix (four numeric columns) from the label column.
X = events[["host_count", "numbers_in_subject", "subject_size", "subject_special_chars"]]
y = events["Tag"]

# Mean-normalise every feature: subtract the column mean, divide by the column range.
# NOTE(review): mean/min/max are computed over all rows, i.e. before the
# train/test split performed later, so held-out rows influence the scaling.
X = (X - X.mean()) / (X.max() - X.min())

print(X.shape)
print(y.shape)
X.head()

(39323, 4)
(39323,)


Unnamed: 0,host_count,numbers_in_subject,subject_size,subject_special_chars
0,-0.009692,-0.000111,-0.004404,-0.009567
1,0.004178,0.004851,-0.003022,-0.010602
2,0.009725,-0.000111,0.001891,-0.007497
3,0.029143,-0.005074,-0.006247,-0.009567
4,0.001404,-0.005074,-0.003022,-0.010602


In [7]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ between the two partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=33,
)
print(X_train.shape, y_train.shape)

(26346, 4) (26346,)


In [8]:
# Create a linear classification model trained with stochastic gradient descent.

from sklearn.linear_model import SGDClassifier

# Pin the seed (same value as the train/test split): SGD shuffles the training
# data between epochs, so without random_state the fitted model and every
# accuracy derived from it change from run to run.
clf = SGDClassifier(random_state=33)
clf.fit(X_train, y_train)

SGDClassifier()

In [9]:
# Evaluate on the training set: accuracy of the fitted linear model on the
# same rows it was trained on.
from sklearn import metrics

y_train_pred = clf.predict(X_train)
print(metrics.accuracy_score(y_true=y_train, y_pred=y_train_pred))

0.7595080847187429


In [10]:
# Evaluate on the held-out test set: accuracy of the linear model on unseen rows.
# Uses the accuracy_score imported in the first cell, like the later model cells.
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7660476227171149


In [11]:
# Support vector machine classifier (default SVC settings), scored on the test split.
from sklearn import svm

SVM_clf = svm.SVC()
SVM_clf.fit(X_train, y_train)
SVM_prediction = SVM_clf.predict(X_test)
SVM_accuracy = accuracy_score(y_test, SVM_prediction) * 100

# Each print passes empty first/last items so the separator is also emitted
# before and after the payload, framing it with rule lines.
print(
    '',
    'Classification Report',
    classification_report(y_test, SVM_prediction),
    '',
    sep='\n' + ('=' * 55) + '\n',
)
print(
    '',
    "Confusion Matrix: ",
    confusion_matrix(y_test, SVM_prediction),
    '',
    sep='\n' + ('-' * 20) + '\n',
)
print(
    '',
    f"Accuracy: {SVM_accuracy}",
    '',
    sep='\n' + ('*' * 50) + '\n',
)


Classification Report
              precision    recall  f1-score   support

           0       0.77      1.00      0.87      9941
           1       0.82      0.04      0.08      3036

    accuracy                           0.77     12977
   macro avg       0.79      0.52      0.47     12977
weighted avg       0.78      0.77      0.68     12977



--------------------
Confusion Matrix: 
--------------------
[[9914   27]
 [2916  120]]
--------------------


**************************************************
Accuracy: 77.32141481081915
**************************************************



In [12]:
# k-nearest-neighbours classifier using only the single closest training row
# (n_neighbors=1), scored on the test split.
from sklearn import neighbors

knn_clf = neighbors.KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
knn_prediction = knn_clf.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_prediction) * 100

# Empty first/last items make print emit the separator before and after the payload.
print(
    '',
    'Classification Report',
    classification_report(y_test, knn_prediction),
    '',
    sep='\n' + ('=' * 55) + '\n',
)
print(
    '',
    "Confusion Matrix: ",
    confusion_matrix(y_test, knn_prediction),
    '',
    sep='\n' + ('-' * 20) + '\n',
)
print(
    '',
    f"Accuracy: {knn_accuracy}",
    '',
    sep='\n' + ('*' * 50) + '\n',
)


Classification Report
              precision    recall  f1-score   support

           0       0.84      0.66      0.74      9941
           1       0.34      0.58      0.43      3036

    accuracy                           0.64     12977
   macro avg       0.59      0.62      0.58     12977
weighted avg       0.72      0.64      0.67     12977



--------------------
Confusion Matrix: 
--------------------
[[6555 3386]
 [1284 1752]]
--------------------


**************************************************
Accuracy: 64.01325421900285
**************************************************



In [13]:
# Logistic regression with default settings, scored on the test split.
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression().fit(X_train, y_train)
LR_prediction = LR.predict(X_test)
LR_accuracy = accuracy_score(y_test, LR_prediction) * 100

# Empty first/last items make print emit the separator before and after the payload.
print(
    '',
    'Classification Report',
    classification_report(y_test, LR_prediction),
    '',
    sep='\n' + ('=' * 55) + '\n',
)
print(
    '',
    "Confusion Matrix: ",
    confusion_matrix(y_test, LR_prediction),
    '',
    sep='\n' + ('-' * 20) + '\n',
)
print(
    '',
    f"Accuracy: {LR_accuracy}",
    '',
    sep='\n' + ('*' * 50) + '\n',
)


Classification Report
              precision    recall  f1-score   support

           0       0.77      1.00      0.87      9941
           1       0.18      0.00      0.00      3036

    accuracy                           0.77     12977
   macro avg       0.47      0.50      0.43     12977
weighted avg       0.63      0.77      0.66     12977



--------------------
Confusion Matrix: 
--------------------
[[9932    9]
 [3034    2]]
--------------------


**************************************************
Accuracy: 76.5508206827464
**************************************************



In [14]:
# Decision tree classifier, scored on the test split.
from sklearn import tree

# Pin the seed (same value as the train/test split): the tree builder permutes
# candidate features at every split, so without random_state ties can be broken
# differently and the reported metrics vary between runs.
DT = tree.DecisionTreeClassifier(random_state=33)
DT.fit(X_train, y_train)
DT_prediction = DT.predict(X_test)
DT_accuracy = 100.0 * accuracy_score(y_test, DT_prediction)

# Empty first/last items make print emit the separator before and after the payload.
print('', 'Classification Report', classification_report(y_test, DT_prediction), '', sep='\n' + (55 * '=') + '\n')
print('', "Confusion Matrix: ", confusion_matrix(y_test, DT_prediction), '', sep='\n' + (20 * '-') + '\n')
print('', f"Accuracy: {DT_accuracy}", '', sep='\n' + (50 * '*') + '\n')


Classification Report
              precision    recall  f1-score   support

           0       0.82      0.89      0.86      9941
           1       0.51      0.35      0.42      3036

    accuracy                           0.77     12977
   macro avg       0.66      0.62      0.64     12977
weighted avg       0.75      0.77      0.75     12977



--------------------
Confusion Matrix: 
--------------------
[[8894 1047]
 [1966 1070]]
--------------------


**************************************************
Accuracy: 76.78199892116821
**************************************************



In [15]:
# Gaussian naive Bayes classifier (the GB_ prefix refers to this model),
# scored on the test split.
from sklearn.naive_bayes import GaussianNB

GB_clf = GaussianNB().fit(X_train, y_train)
GB_prediction = GB_clf.predict(X_test)
GB_accuracy = accuracy_score(y_test, GB_prediction) * 100.0

# Empty first/last items make print emit the separator before and after the payload.
print(
    '',
    'Classification Report',
    classification_report(y_test, GB_prediction),
    '',
    sep='\n' + ('=' * 55) + '\n',
)
print(
    '',
    "Confusion Matrix: ",
    confusion_matrix(y_test, GB_prediction),
    '',
    sep='\n' + ('-' * 20) + '\n',
)
print(
    '',
    f"Accuracy: {GB_accuracy}",
    '',
    sep='\n' + ('*' * 50) + '\n',
)


Classification Report
              precision    recall  f1-score   support

           0       0.78      0.92      0.84      9941
           1       0.33      0.12      0.18      3036

    accuracy                           0.74     12977
   macro avg       0.55      0.52      0.51     12977
weighted avg       0.67      0.74      0.69     12977



--------------------
Confusion Matrix: 
--------------------
[[9169  772]
 [2661  375]]
--------------------


**************************************************
Accuracy: 73.5455035832627
**************************************************

