In [1]:
# IMPORT LIBRARIES
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# LOADING DATA
df = pd.read_csv("emails.csv")

In [3]:
# PRINTS NUMBER OF ROWS AND COLUMNS PRESENT IN THE DATASET
print('Rows,','Columns')
df.shape

Rows, Columns


(5172, 3002)

### Data Pre-Processing

In [4]:
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# CHECKING ALL THE FEATURES PRESENT
df.columns

Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)

In [6]:
# DROPPING NON-ESSENTIAL FEATURES
df.drop(['hou','connevey','jay','lay','allowing','ff','dry'],axis=1,inplace=True)

In [7]:
# CHECK FOR DUPLICATE VALUES AND REMOVE THEM
df.drop_duplicates(inplace = True)

In [8]:
# CHECK FOR NUMBER OF MISSING DATA OR NULL VALUES FOR EACH FEATURE
df.isnull().sum()

Email No.         0
the               0
to                0
ect               0
and               0
for               0
of                0
a                 0
you               0
in                0
on                0
is                0
this              0
enron             0
i                 0
be                0
that              0
will              0
have              0
with              0
your              0
at                0
we                0
s                 0
are               0
it                0
by                0
com               0
as                0
from              0
                 ..
matters           0
batch             0
continuing        0
winning           0
symbol            0
offsystem         0
decisions         0
produced          0
ended             0
greatest          0
degree            0
solmonson         0
imbalances        0
fall              0
fear              0
hate              0
fight             0
reallocated       0
debt              0


In [9]:
# IF NULL VALUES ARE PRESENT AND HAD TO BE REMOVED
'''
from sklearn.preprocessing import Imputer
im=Imputer(missing_values=np.nan,strategy='most_frequent',axis=0)
data['column_name']=im.fit_transform(data[['column_name']].values)
'''
# NO NULL VALUES ARE PRESENT

"\nfrom sklearn.preprocessing import Imputer\nim=Imputer(missing_values=np.nan,strategy='most_frequent',axis=0)\ndata['column_name']=im.fit_transform(data[['column_name']].values)\n"

In [10]:
# VIEW BASIC STATISTICAL DETAILS
df.describe()

Unnamed: 0,the,to,ect,and,for,of,a,you,in,on,...,australia,plain,prompt,remains,ifhsc,enhancements,valued,infrastructure,military,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,10.600155,10.935808,...,0.006961,0.03751,0.008894,0.00406,0.00522,0.005607,0.010634,0.004254,0.006574,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,19.281892,17.999402,...,0.162025,0.239546,0.099885,0.066569,0.084428,0.192108,0.116693,0.096252,0.138908,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,12.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,223.0,302.0,...,9.0,3.0,2.0,2.0,2.0,10.0,2.0,3.0,4.0,1.0


In [11]:
df.corr() 

Unnamed: 0,the,to,ect,and,for,of,a,you,in,on,...,australia,plain,prompt,remains,ifhsc,enhancements,valued,infrastructure,military,Prediction
the,1.000000,0.852715,0.337249,0.841200,0.784112,0.796397,0.784451,0.471392,0.845670,0.731607,...,0.113811,0.107622,0.099324,0.116635,-0.001618,0.018378,0.225586,0.101768,0.129466,-0.004421
to,0.852715,1.000000,0.375480,0.825474,0.781971,0.752722,0.896466,0.508513,0.881759,0.839835,...,0.158259,0.162019,0.120079,0.105437,-0.004343,0.020223,0.232847,0.093322,0.091639,0.055277
ect,0.337249,0.375480,1.000000,0.272863,0.369777,0.178028,0.400009,0.155783,0.298387,0.505063,...,0.011581,0.027257,0.020236,0.044289,0.011714,0.018263,0.046080,0.004393,-0.007690,-0.120782
and,0.841200,0.825474,0.272863,1.000000,0.751287,0.809665,0.815196,0.476764,0.874276,0.751678,...,0.126400,0.140115,0.071578,0.160684,-0.012139,0.020448,0.272963,0.151980,0.084147,0.114364
for,0.784112,0.781971,0.369777,0.751287,1.000000,0.681457,0.744098,0.495852,0.762659,0.715465,...,0.101367,0.157442,0.083252,0.095820,-0.018776,0.005459,0.236213,0.134469,0.067151,-0.003101
of,0.796397,0.752722,0.178028,0.809665,0.681457,1.000000,0.715502,0.420209,0.852972,0.671119,...,0.147987,0.093737,0.055989,0.134686,-0.007695,0.016937,0.332653,0.132294,0.073004,0.197234
a,0.784451,0.896466,0.400009,0.815196,0.744098,0.715502,1.000000,0.442994,0.890047,0.895453,...,0.134088,0.164934,0.084988,0.103005,-0.001359,0.015943,0.211254,0.114222,0.111685,0.107776
you,0.471392,0.508513,0.155783,0.476764,0.495852,0.420209,0.442994,1.000000,0.435958,0.400882,...,0.092732,0.129950,0.112428,0.032456,-0.025800,0.014809,0.071191,0.062744,0.006498,0.130293
in,0.845670,0.881759,0.298387,0.874276,0.762659,0.852972,0.890047,0.435958,1.000000,0.825691,...,0.168146,0.152635,0.068017,0.125411,-0.007271,0.011569,0.325050,0.130958,0.120620,0.154055
on,0.731607,0.839835,0.505063,0.751678,0.715465,0.671119,0.895453,0.400882,0.825691,1.000000,...,0.135892,0.174673,0.087445,0.116424,0.002766,0.011401,0.199566,0.098275,0.063902,0.056968


### TRAIN & TESTING

In [12]:
X = df.iloc[:,1:2995]
X

Unnamed: 0,the,to,ect,and,for,of,a,you,in,on,...,australia,plain,prompt,remains,ifhsc,enhancements,valued,infrastructure,military,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,18,21,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,8,0,4,2,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,1,5,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,3,12,...,0,0,0,0,0,0,0,0,0,0
5,4,5,1,4,2,3,45,1,16,12,...,0,0,0,0,0,0,0,0,0,1
6,5,3,1,3,2,1,37,0,9,4,...,0,0,0,0,0,0,0,0,0,0
7,0,2,2,3,1,2,21,6,2,6,...,0,0,0,0,0,0,0,0,0,1
8,2,2,3,0,0,1,18,0,3,3,...,0,0,0,0,0,0,0,0,0,0
9,4,4,35,0,1,0,49,1,9,4,...,0,0,0,0,0,0,0,0,0,0


In [13]:
Y = df.iloc[:,-1].values
Y

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.25,random_state=0)

### K-NEAREST NEIGHBOUR

In [15]:
knn=KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
print("Accuracy score :")
print(accuracy_score(y_test,y_pred))

Accuracy score :
0.8723897911832946


### NAIVE BAYES

In [16]:
mnb = MultinomialNB(alpha=1.9)
mnb.fit(X_train,y_train)
y_pred1 = mnb.predict(X_test)
print("Accuracy Score for Naive Bayes : ", accuracy_score(y_pred1,y_test))

Accuracy Score for Naive Bayes :  0.9605568445475638


### SUPPORT VECTOR MACHINE

In [17]:
svc = SVC(C=1.0,kernel='rbf',gamma='auto')         
svc.fit(X_train,y_train)
y_pred2 = svc.predict(X_test)
print("Accuracy Score for SVC : ", accuracy_score(y_pred2,y_test))

Accuracy Score for SVC :  0.9079659706109822


### DECISION TREE

In [18]:
dt=DecisionTreeClassifier(criterion='gini')
dt.fit(X_train,y_train)
y_pred3=dt.predict(X_test)
print("Accuracy Score :")
print(accuracy_score(y_pred3,y_test))

Accuracy Score :
1.0


### RANDOM FOREST

In [20]:
rfc = RandomForestClassifier(n_estimators=100,criterion='gini')
rfc.fit(X_train,y_train)
y_pred4 = rfc.predict(X_test)
print("Accuracy Score of Random Forest Classifier : ", accuracy_score(y_pred4,y_test))

Accuracy Score of Random Forest Classifier :  0.9976798143851509
