In [1]:
import pandas as pd
import warnings as wr
wr.filterwarnings('ignore')

In [2]:
df = pd.read_csv('emails.csv')

In [3]:
df.head(3)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1


In [4]:
df.spam.value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [5]:
df.drop_duplicates(inplace = True)

In [7]:
df.shape

(5695, 2)

In [8]:
df['text'][0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

In [9]:
import nltk
import string
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mdsam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [10]:
from nltk.corpus import stopwords, words

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
stopwords.words('English')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Text Processing

In [16]:
def clean_data(text): 
    remove_punc = [char for char in text if char not in string.punctuation]
    clean_words = ''.join(remove_punc)
    
    remove_stopwords = [word for word in clean_words.split() if word.lower() not in stopwords.words('english')]
    return remove_stopwords

In [17]:
df['text'].head(1).apply(clean_data)

0    [Subject, naturally, irresistible, corporate, ...
Name: text, dtype: object

In [18]:
df['text'].apply(clean_data)

0       [Subject, naturally, irresistible, corporate, ...
1       [Subject, stock, trading, gunslinger, fanny, m...
2       [Subject, unbelievable, new, homes, made, easy...
3       [Subject, 4, color, printing, special, request...
4       [Subject, money, get, software, cds, software,...
                              ...                        
5723    [Subject, research, development, charges, gpg,...
5724    [Subject, receipts, visit, jim, thanks, invita...
5725    [Subject, enron, case, study, update, wow, day...
5726    [Subject, interest, david, please, call, shirl...
5727    [Subject, news, aurora, 5, 2, update, aurora, ...
Name: text, Length: 5695, dtype: object

# Text Extraction

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tf = TfidfVectorizer(analyzer = clean_data).fit_transform(df['text'])

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
xtrain, xtest, ytrain, ytest = train_test_split (tf, df['spam'], test_size=.30)

In [26]:
xtrain

<3986x37229 sparse matrix of type '<class 'numpy.float64'>'
	with 397547 stored elements in Compressed Sparse Row format>

In [27]:
xtrain.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
ytrain.shape

(3986,)

In [31]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [32]:
ber = BernoulliNB()

In [33]:
ber.fit(xtrain, ytrain)

BernoulliNB()

In [34]:
ber.score(xtrain, ytrain)

0.9897139989964877

In [35]:
mul = MultinomialNB()

In [36]:
mul.fit(xtrain, ytrain)

MultinomialNB()

In [38]:
mul.score(xtrain, ytrain)

0.9234821876567988