In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip drive/My\ Drive/archive.zip > /dev/null #-d drive/My\ Drive/enron

In [8]:
import numpy as np 
import pandas as pd
from sklearn.datasets import load_files
from sklearn import metrics
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [6]:
X, y = [], []
email = load_files("enron1")
X = np.append(X, email.data)
y = np.append(y, email.target)   

In [9]:
df_all = pd.DataFrame(columns=['text', 'target'])
df_all['text'] = [x for x in X]
df_all['target'] = [t for t in y]

In [10]:
df_X = df_all.drop(['target'], axis=1)
df_y = df_all['target']

In [11]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [13]:
corpus = []
for i in range(0, len(df_X)):
    review = re.sub(r'\\r\\n', ' ', str(df_X['text'][i]))    # Remove special symbols

    review = re.sub('[^a-zA-Z]', ' ', review)    # Remove all symbols except letters

    review = re.sub(r'\s+', ' ', review)           # Replacing all gaps with spaces              

    review = re.sub(r'^b\s+', '', review)           # Remove 'b' in the beginning of each text

    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,  random_state=9, test_size=0.2)

In [16]:
# MultinomialNB Model

from sklearn.naive_bayes import MultinomialNB

#train
model = MultinomialNB()
model.fit(X_train, y_train)

#predict
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_m = confusion_matrix(y_test, y_pred)

print("MultinomialNB")
print("Accuracy: ",accuracy*100)
print("Precision: ", precision*100)
print("Recall: ",recall*100)
print("Confusion Matrix: ")
print(conf_m)

MultinomialNB
Accuracy:  97.77777777777777
Precision:  96.05263157894737
Recall:  96.36963696369637
Confusion Matrix: 
[[720  12]
 [ 11 292]]


In [17]:
# KNN Model

from sklearn.neighbors import KNeighborsClassifier

#train model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

#predict
y_pred = knn.predict(X_test)

#Performance Measures

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_m = confusion_matrix(y_test, y_pred)

print("KNN")
print("Accuracy: ",accuracy*100)
print("Precision: ", precision*100)
print("Recall: ",recall*100)
print("Confusion Matrix: ")
print(conf_m)

KNN
Accuracy:  85.70048309178743
Precision:  68.76513317191282
Recall:  93.72937293729373
Confusion Matrix: 
[[603 129]
 [ 19 284]]


In [18]:
# Logistic Regression Model

from sklearn.linear_model import LogisticRegression

#train
Logression=LogisticRegression()
Logression.fit(X_train, y_train)

#predict
pred = Logression.predict(X_test)

#Performance Measures

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_m = confusion_matrix(y_test, y_pred)

print("Logistic Regression")
print("Accuracy: ",accuracy*100)
print("Precision: ", precision*100)
print("Recall: ",recall*100)
print("Confusion Matrix: ")
print(conf_m)



Logistic Regression
Accuracy:  85.70048309178743
Precision:  68.76513317191282
Recall:  93.72937293729373
Confusion Matrix: 
[[603 129]
 [ 19 284]]


In [19]:
# RandomForestClassifier Model

from sklearn.ensemble import RandomForestClassifier

# train
rfc=RandomForestClassifier(n_estimators=100)
rfc.fit(X_train,y_train)
#predict
y_pred=rfc.predict(X_test)

#Performance Measures

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_m = confusion_matrix(y_test, y_pred)

print("RandomForestClassifier")
print("Accuracy: ",accuracy*100)
print("Precision: ", precision*100)
print("Recall: ",recall*100)
print("Confusion Matrix: ")
print(conf_m)




RandomForestClassifier
Accuracy:  97.97101449275362
Precision:  97.63513513513513
Recall:  95.37953795379538
Confusion Matrix: 
[[725   7]
 [ 14 289]]


In [20]:
# SVC (C-Support Vector Classification) Model

from sklearn.svm import SVC

#train using kernel='linear'
clf2 = SVC(kernel='linear')
clf2.fit(X_train,y_train)

#predict
y_pred = clf2.predict(X_test)

#Performance Measures

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_m = confusion_matrix(y_test, y_pred)

print("SVC (C-Support Vector Classification)")
print("Accuracy: ",accuracy*100)
print("Precision: ", precision*100)
print("Recall: ",recall*100)
print("Confusion Matrix: ")
print(conf_m)

SVC (C-Support Vector Classification)
Accuracy:  97.10144927536231
Precision:  95.04950495049505
Recall:  95.04950495049505
Confusion Matrix: 
[[717  15]
 [ 15 288]]


In [21]:
# Gaussian Naive Bayes Model

from sklearn.naive_bayes import GaussianNB

#train
gnb = GaussianNB()
gnb.fit(X_train, y_train)
  
#predict 
y_pred = gnb.predict(X_test)
  

#Performance Measures

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_m = confusion_matrix(y_test, y_pred)

print("Gaussian Naive Bayes")
print("Accuracy: ",accuracy*100)
print("Precision: ", precision*100)
print("Recall: ",recall*100)
print("Confusion Matrix: ")
print(conf_m)

Gaussian Naive Bayes
Accuracy:  93.81642512077295
Precision:  92.52669039145907
Recall:  85.8085808580858
Confusion Matrix: 
[[711  21]
 [ 43 260]]
