In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from nltk.stem.porter import PorterStemmer
import re

In [3]:
df = pd.read_csv('spam_ham_dataset.csv', encoding='latin-1')
df = df.drop(['Unnamed: 0'],axis=1)

In [4]:
df.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [5]:
# Replace ham with 0 and spam with 1
df = df.replace(['ham','spam'],[0, 1])

In [6]:
df.head()

Unnamed: 0,label,text,label_num
0,0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,1,"Subject: photoshop , windows , office . cheap ...",1
4,0,Subject: re : indian springs\r\nthis deal is t...,0


In [7]:
df['Count']=0
for i in np.arange(0,len(df.text)):
    df.loc[i,'Count'] = len(df.loc[i,'text'])

In [8]:
df.head()

Unnamed: 0,label,text,label_num,Count
0,0,Subject: enron methanol ; meter # : 988291\r\n...,0,327
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,97
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,2524
3,1,"Subject: photoshop , windows , office . cheap ...",1,414
4,0,Subject: re : indian springs\r\nthis deal is t...,0,336


In [9]:
# Total ham(0) and spam(1) messages
df['label'].value_counts()

0    3672
1    1499
Name: label, dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5171 non-null   int64 
 1   text       5171 non-null   object
 2   label_num  5171 non-null   int64 
 3   Count      5171 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 161.7+ KB


In [11]:
corpus = []
ps = PorterStemmer()

In [12]:
# Original Messages

print (df['text'][0])
print (df['text'][1])

Subject: enron methanol ; meter # : 988291
this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary
flow data provided by daren } .
please override pop ' s daily volume { presently zero } to reflect daily
activity you can obtain from gas control .
this change is needed asap for economics purposes .
Subject: hpl nom for january 9 , 2001
( see attached file : hplnol 09 . xls )
- hplnol 09 . xls


In [13]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self signed certificate in certificate chain
[nltk_data]     (_ssl.c:1129)>


In [14]:
for i in range(0, 5572):

    # Applying Regular Expression

    '''
    Replace email addresses with 'emailaddr'
    Replace URLs with 'httpaddr'
    Replace money symbols with 'moneysymb'
    Replace phone numbers with 'phonenumbr'
    Replace numbers with 'numbr'
    '''
    msg = df['text'][i]
    msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', df['text'][i])
    msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', df['text'][i])
    msg = re.sub('£|\$', 'moneysymb', df['text'][i])
    msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', df['text'][i])
    msg = re.sub('\d+(\.\d+)?', 'numbr', df['text'][i])

    ''' Remove all punctuations '''
    msg = re.sub('[^\w\d\s]', ' ', df['text'][i])

    if i<2:
        print("\t\t\t\t MESSAGE ", i)

    if i<2:
        print("\n After Regular Expression - Message ", i, " : ", msg)

    # Each word to lower case
    msg = msg.lower()
    if i<2:
        print("\n Lower case Message ", i, " : ", msg)

    # Splitting words to Tokenize
    msg = msg.split()
    if i<2:
        print("\n After Splitting - Message ", i, " : ", msg)

    # Stemming with PorterStemmer handling Stop Words
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    if i<2:
        print("\n After Stemming - Message ", i, " : ", msg)

    # preparing Messages with Remaining Tokens
    msg = ' '.join(msg)
    if i<2:
        print("\n Final Prepared - Message ", i, " : ", msg, "\n\n")

    # Preparing WordVector Corpus
    corpus.append(msg)

				 MESSAGE  0

 After Regular Expression - Message  0  :  Subject  enron methanol   meter     988291
this is a follow up to the note i gave you on monday   4   3   00   preliminary
flow data provided by daren    
please override pop   s daily volume   presently zero   to reflect daily
activity you can obtain from gas control  
this change is needed asap for economics purposes  

 Lower case Message  0  :  subject  enron methanol   meter     988291
this is a follow up to the note i gave you on monday   4   3   00   preliminary
flow data provided by daren    
please override pop   s daily volume   presently zero   to reflect daily
activity you can obtain from gas control  
this change is needed asap for economics purposes  

 After Splitting - Message  0  :  ['subject', 'enron', 'methanol', 'meter', '988291', 'this', 'is', 'a', 'follow', 'up', 'to', 'the', 'note', 'i', 'gave', 'you', 'on', 'monday', '4', '3', '00', 'preliminary', 'flow', 'data', 'provided', 'by', 'daren', 'pl

KeyError: 5171

In [15]:
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()

In [16]:
y = df['label']
print (y.value_counts())

print(y[0])
print(y[1])

0    3672
1    1499
Name: label, dtype: int64
0
0


In [17]:
le = LabelEncoder()
y = le.fit_transform(y)

print(y[0])
print(y[1])

0
0


In [18]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size= 0.20, random_state = 0)

In [19]:
bayes_classifier = GaussianNB()
bayes_classifier.fit(xtrain, ytrain)

GaussianNB()

In [20]:
# Predicting
y_pred = bayes_classifier.predict(xtest)

In [21]:
# Evaluating
cm = confusion_matrix(ytest, y_pred)

In [22]:
cm

array([[718,  14],
       [ 34, 269]], dtype=int64)

In [23]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, bayes_classifier.predict(xtest)))
print (classification_report(ytest, bayes_classifier.predict(xtest)))

Accuracy : 0.95362 


              precision    recall  f1-score   support

           0       0.95      0.98      0.97       732
           1       0.95      0.89      0.92       303

    accuracy                           0.95      1035
   macro avg       0.95      0.93      0.94      1035
weighted avg       0.95      0.95      0.95      1035



In [24]:
# Evaluate the model
accuracy = accuracy_score(ytest, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9536231884057971


In [25]:
dt = DecisionTreeClassifier(random_state=50)
dt.fit(xtrain, ytrain)

DecisionTreeClassifier(random_state=50)

In [26]:
# Predicting
y_pred_dt = dt.predict(xtest)

In [27]:
# Evaluating
cm = confusion_matrix(ytest, y_pred_dt)

print(cm)

[[704  28]
 [ 33 270]]


In [28]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, dt.predict(xtest)))
print (classification_report(ytest, dt.predict(xtest)))

Accuracy : 0.94106 


              precision    recall  f1-score   support

           0       0.96      0.96      0.96       732
           1       0.91      0.89      0.90       303

    accuracy                           0.94      1035
   macro avg       0.93      0.93      0.93      1035
weighted avg       0.94      0.94      0.94      1035



In [29]:
# Evaluate the model
accuracy = accuracy_score(ytest, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9536231884057971


In [30]:
import joblib
# Save the model
joblib.dump(dt,'spam_model.pkl')
joblib.dump(cv, 'vectorizer.joblib')

['vectorizer.joblib']