In [291]:
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [292]:
nltk.download('stopwords')
stopwords_set=set(stopwords.words('english'))
stopwords_set

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JAISON\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [293]:
stemmer=PorterStemmer()

In [294]:
def transform_text(text):
    e_text=text.lower().translate(str.maketrans('','',string.punctuation)).split()
    e_text=[stemmer.stem(word) for word in e_text if word not in stopwords_set]
    e_text=' '.join(e_text)
    return e_text

In [295]:
df=pd.read_csv('spam_ham_dataset.csv')

In [296]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [297]:
df['text']=df['text'].apply(lambda x:x.replace('\r\n',' '))

In [298]:
df['text']

0       Subject: enron methanol ; meter # : 988291 thi...
1       Subject: hpl nom for january 9 , 2001 ( see at...
2       Subject: neon retreat ho ho ho , we ' re aroun...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs this deal is to b...
                              ...                        
5166    Subject: put the 10 on the ft the transport vo...
5167    Subject: 3 / 4 / 2000 and following noms hpl c...
5168    Subject: calpine daily gas nomination > > juli...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert dear v...
Name: text, Length: 5171, dtype: object

In [299]:
df.text.iloc[2]

"Subject: neon retreat ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time ! i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute . on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about . i think we all agree that it ' s important for us to get together and have some time to recharge our batteries before we get to far into the spring semester , but it can be a lot of trouble and difficult for us to get away without kids , etc . so , brad came up with a pote

In [300]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [301]:
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [302]:
df.duplicated().sum()

0

In [303]:
df = df.drop_duplicates(keep='first')

In [304]:
corpus=[]

In [305]:
len(df)

5171

In [306]:
for i in range(len(df)):
    text=transform_text(df['text'].iloc[i])
    corpus.append(text)

In [307]:
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [308]:
df.text.iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [309]:
vectorizer=CountVectorizer()

X=vectorizer.fit_transform(corpus).toarray()
y=df.label_num
X


array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [310]:
y

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64

In [311]:
X[0]

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [312]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [313]:
X_train.shape

(4136, 42637)

In [314]:
X_test.shape

(1035, 42637)

In [315]:
y_train.shape

(4136,)

In [316]:
y_test.shape

(1035,)

In [317]:
clf=RandomForestClassifier(n_jobs=-1)
clf.fit(X_train, y_train)

In [318]:
clf.score(X_test, y_test)

0.9777777777777777

In [319]:
email_to_classify=df.text.values[3]
email_text=transform_text(email_to_classify)
email_corpus=[email_text]
X_email=vectorizer.transform(email_corpus)
flag=clf.predict(X_email)
print(flag)
if flag==1:
    print("The email is spam")
else:
    print("The email is ham")

[1]
The email is spam


In [320]:
df2=df
df2

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291 thi...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001 ( see at...",0
2,3624,ham,"Subject: neon retreat ho ho ho , we ' re aroun...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs this deal is to b...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft the transport vo...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms hpl c...,0
5168,2933,ham,Subject: calpine daily gas nomination > > juli...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [321]:
from tqdm import tqdm

for i in tqdm(range(len(df))):
    email_to_classify=df.text.values[i]
    email_text=transform_text(email_to_classify)
    email_corpus=[email_text]
    X_email=vectorizer.transform(email_corpus)
    flag=clf.predict(X_email)
    df2.at[i, 'predicted_flag'] = flag[0]

100%|██████████████████████████████████████████████████████████████████████████████| 5171/5171 [04:18<00:00, 20.01it/s]


In [322]:
df2

Unnamed: 0.1,Unnamed: 0,label,text,label_num,predicted_flag
0,605,ham,Subject: enron methanol ; meter # : 988291 thi...,0,0.0
1,2349,ham,"Subject: hpl nom for january 9 , 2001 ( see at...",0,0.0
2,3624,ham,"Subject: neon retreat ho ho ho , we ' re aroun...",0,0.0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,1.0
4,2030,ham,Subject: re : indian springs this deal is to b...,0,0.0
...,...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft the transport vo...,0,0.0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms hpl c...,0,0.0
5168,2933,ham,Subject: calpine daily gas nomination > > juli...,0,0.0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0,0.0


In [324]:
a=0
for i in range(len(df)):
    if df2.label_num.iloc[i]!=df2.predicted_flag.iloc[i]:
        a=a+1
a

23