In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk 
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import string

In [2]:
# Fetch the NLTK stop-word corpus; the preprocessing cell further down reads
# the English list from it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maxme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the spam/ham email dataset from the working directory.
# The preview shows a `text` column with the raw email, a string `label`
# (ham/spam) and a numeric `label_num` (0 for ham, 1 for spam in the rows shown).
df=pd.read_csv('spam_ham_dataset.csv')
# Bare expression: renders a (truncated) preview of the frame.
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [4]:
# Create a 10x6 inch matplotlib figure.
# NOTE(review): nothing is plotted on it in this notebook, so it renders as an
# empty figure ("0 Axes") - consider removing this cell or adding the plot.
plt.figure(figsize=(10,6))

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [5]:
# Flatten Windows line breaks into single spaces so each email is one line.
# regex=False makes this a literal substring replacement.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)


In [6]:
# Spot-check one email (positional row 2) to confirm the "\r\n" sequences are gone.
df.text.iloc[2]

"Subject: neon retreat ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time ! i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute . on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about . i think we all agree that it ' s important for us to get together and have some time to recharge our batteries before we get to far into the spring semester , but it can be a lot of trouble and difficult for us to get away without kids , etc . so , brad came up with a pote

In [7]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [8]:
# Build the cleaned corpus: one normalised string per email.
# Steps per email: lower-case -> strip punctuation -> split on whitespace ->
# drop English stop words -> Porter-stem -> re-join with single spaces.
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))
# The punctuation-removal table is the same for every email, so build it once.
punctuation_table = str.maketrans('', '', string.punctuation)

corpus = []
for message in df['text']:
    tokens = message.lower().translate(punctuation_table).split()
    text = ' '.join(
        stemmer.stem(word) for word in tokens if word not in stopwords_set
    )
    corpus.append(text)

In [9]:
# Raw text of the first email, for comparison with its preprocessed form.
df['text'].iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [10]:
# The same email after lower-casing, punctuation removal, stop-word
# filtering and stemming.
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [11]:
# Turn the cleaned corpus into bag-of-words count features.
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(corpus)
# Densify the count matrix before handing it to the classifiers.
X = word_counts.toarray()
y = df['label_num']

# Hold out 20% of the emails for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Train four classifiers on the same bag-of-words training split so their
# test accuracy can be compared in the next cell.
# The random forest is stochastic: seed it (same seed as the train/test split)
# so the reported score and the pickled model are reproducible across runs.
clf=RandomForestClassifier(n_jobs=-1,random_state=42)
mb=MultinomialNB()
gb=GaussianNB()
bb=BernoulliNB()
clf.fit(X_train,y_train)
mb.fit(X_train,y_train)
gb.fit(X_train,y_train)
bb.fit(X_train,y_train)

In [13]:
# Mean accuracy of each fitted model on the held-out test split.
print("Random Forest classifier: ",clf.score(X_test,y_test))
# `mb` is MultinomialNB - label it as such (it was mislabelled "Gaussian").
print("Multinomial Naive Bayes: ",mb.score(X_test,y_test))
print("Gaussian Bayes: ",gb.score(X_test,y_test))
print("Bernoulli Bayes: ",bb.score(X_test,y_test))

Random Forest classifier:  0.9797101449275363
Multinomial Gaussian Bayes:  0.9748792270531401
Gaussian Bayes:  0.9497584541062802
Bernoulli Bayes:  0.8560386473429952


In [14]:
# Pick one email (positional row 10) for a manual end-to-end prediction check.
email_to_classify = df['text'].iloc[10]

In [15]:
# Display the email selected for the manual prediction check.
email_to_classify

"Subject: vocable % rnd - word asceticism vcsc - brand new stock for your attention vocalscape inc - the stock symbol is : vcsc vcsc will be our top stock pick for the month of april - stock expected to bounce to 12 cents level the stock hit its all time low and will bounce back stock is going to explode in next 5 days - watch it soar watch the stock go crazy this and next week . breaking news - vocalscape inc . announces agreement to resell mix network services current price : $ 0 . 025 we expect projected speculative price in next 5 days : $ 0 . 12 we expect projected speculative price in next 15 days : $ 0 . 15 vocalscape networks inc . is building a company that ' s revolutionizing the telecommunications industry with the most affordable phone systems , hardware , online software , and rates in canada and the us . vocalscape , a company with global reach , is receiving international attention for the development of voice over ip ( voip ) application solutions , including the award 

In [16]:
# Preprocess the selected email with the same steps used to build `corpus`:
# lower-case, strip punctuation, split into tokens, drop stop words, stem.
email_tokens=email_to_classify.lower().translate(str.maketrans('','',string.punctuation)).split()
# Iterate over THIS email's tokens. Iterating over the leftover `text` variable
# from the corpus loop would stem single characters of a different email.
email_text=[stemmer.stem(word) for word in email_tokens if word not in stopwords_set]
email_text=' '.join(email_text)
email_corpus=[email_text]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# feature columns match the ones the models were trained on.
X_email=vectorizer.transform(email_corpus).toarray()

In [17]:
# Predict the label of the single preprocessed email with the random forest.
clf.predict(X_email)

array([1], dtype=int64)

In [18]:
# Ground-truth numeric label of the same email (positional row 10), to compare
# against the prediction above.
df.label_num.iloc[10]

1

In [19]:
import pickle

# Persist the trained random forest so it can be reloaded without retraining.
# NOTE(review): only `clf` is saved. Scoring new text also needs the fitted
# `vectorizer` (and the same preprocessing), which is not persisted here.
# SECURITY: only unpickle this file from a trusted source - pickle.load can
# execute arbitrary code.
filename = 'randomforrest_spam_model.sav'
# Context manager guarantees the file is flushed and closed, even on error.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)