In [1]:
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

### Inspecting dataset

In [2]:
df = pd.read_csv('spam_or_not_spam.csv')
df.sample(15)

Unnamed: 0,email,label
349,r robert harley harley argote ch writes r dep...,0
356,at NUMBER NUMBER am NUMBER on NUMBER NUMBER NU...,0
714,in a message dated NUMBER NUMBER NUMBER NUMBE...,0
2971,get access to the largest free adult site on t...,1
2814,senior advocate of nigeria barr williams falan...,1
2399,url URL date not supplied berkeley s impact th...,0
1788,use perl daily headline mailer new perl monger...,0
666,well it looks like sun are going ahead with th...,0
13,from chris garrigues cwg exmh deepeddy com da...,0
1886,url URL date not supplied i ve put up a page c...,0


In [3]:
df.label.value_counts()

0    2500
1     500
Name: label, dtype: int64

In [4]:
df.isnull().any()

email     True
label    False
dtype: bool

In [5]:
df.dropna(inplace=True)

In [6]:
df.label.value_counts()

0    2500
1     499
Name: label, dtype: int64

### creating a vocab

In [7]:
vocab= []
for i in df.email:
    vocab = vocab + i.split(" ")

In [8]:
vocab[10]

'from'

In [9]:
## stemming

ps = PorterStemmer()

for i,word in enumerate(vocab):
    vocab[i] = ps.stem(word)

In [10]:
# make it unique (no duplicates)

vocab = list( dict.fromkeys(vocab) )

In [11]:
vocab

['',
 'date',
 'wed',
 'number',
 'aug',
 'from',
 'chri',
 'garrigu',
 'cwg',
 'numberfanumberd',
 'deepeddi',
 'com',
 'messag',
 'id',
 'tmda',
 'vircio',
 'i',
 'can',
 't',
 'reproduc',
 'thi',
 'error',
 'for',
 'me',
 'it',
 'is',
 'veri',
 'repeat',
 'like',
 'everi',
 'time',
 'without',
 'fail',
 'the',
 'debug',
 'log',
 'of',
 'pick',
 'happen',
 'pick_it',
 'exec',
 'inbox',
 'list',
 'lbrace',
 'subject',
 'ftp',
 'rbrace',
 'sequenc',
 'mercuri',
 'ftoc_pickmsg',
 'hit',
 'mark',
 'tkerror',
 'syntax',
 'in',
 'express',
 'int',
 'note',
 'if',
 'run',
 'command',
 'by',
 'hand',
 'delta',
 'that',
 's',
 'where',
 'come',
 'obvious',
 'version',
 'nmh',
 'm',
 'use',
 'compil',
 'on',
 'url',
 'at',
 'sun',
 'mar',
 'ict',
 'and',
 'relev',
 'part',
 'my',
 'mh_profil',
 'mhparam',
 'seq',
 'sel',
 'sinc',
 'work',
 'actual',
 'both',
 'them',
 'one',
 'explicit',
 'line',
 'search',
 'popup',
 'do',
 'get',
 'creat',
 'kre',
 'ps',
 'still',
 'code',
 'form',
 'a',
 'd

In [12]:
len(vocab)

25875

### Converting email to vector

In [13]:
def email_to_vector(email):
    vector = np.zeros(len(vocab))
    for word in email.split(" "):
        stemmed_word = ps.stem(word)
        if stemmed_word in vocab:
            vector[vocab.index(stemmed_word)] += 1
    return vector

In [14]:
vector = email_to_vector(df.email[0])

### initializing training set

In [15]:
y = df.label

In [16]:
# converting all eamils into vectors
X = np.array([email_to_vector(email) for email in df.email])

In [17]:
X.shape

(2999, 25875)

In [18]:
y.shape

(2999,)

In [19]:
X

array([[2., 2., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [2., 0., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 1., 1., 1.]])

### classifier (most fun part)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
model = LogisticRegression()

In [21]:
cross_val_score(model,X,y,cv=3,scoring='accuracy')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.972     , 0.974     , 0.98798799])