In [1]:
import pandas as pd
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv('./dataset.csv')

In [3]:
df.head()

Unnamed: 0,text_type,text
0,spam,naturally irresistible your corporate identity...
1,spam,the stock trading gunslinger fanny is merrill ...
2,spam,unbelievable new homes made easy im wanting to...
3,spam,4 color printing special request additional in...
4,spam,do not have money get software cds from here s...


In [4]:
df.text_type.value_counts()

text_type
ham     14337
spam     6011
Name: count, dtype: int64

Do consider data is imbalanced

In [5]:
df.text.isna().sum()

0

##### Encoding the text_type

In [6]:
encoding = {'spam':1, 'ham': 0}
df['Type'] = df['text_type'].map(encoding)
df['Type']

0        1
1        1
2        1
3        1
4        1
        ..
20343    0
20344    0
20345    0
20346    0
20347    0
Name: Type, Length: 20348, dtype: int64

In [7]:
df.head()

Unnamed: 0,text_type,text,Type
0,spam,naturally irresistible your corporate identity...,1
1,spam,the stock trading gunslinger fanny is merrill ...,1
2,spam,unbelievable new homes made easy im wanting to...,1
3,spam,4 color printing special request additional in...,1
4,spam,do not have money get software cds from here s...,1


In [8]:
df.Type.value_counts()

Type
0    14337
1     6011
Name: count, dtype: int64

In [9]:
df.drop('text_type', axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,text,Type
0,naturally irresistible your corporate identity...,1
1,the stock trading gunslinger fanny is merrill ...,1
2,unbelievable new homes made easy im wanting to...,1
3,4 color printing special request additional in...,1
4,do not have money get software cds from here s...,1


##### handling the text

In [11]:
sampledata = 6011
df_ham = df[df['Type']==0].sample(sampledata, random_state=50, replace=True)
df_spam = df[df['Type']==1].sample(sampledata, random_state=50, replace=True)

In [12]:
df_ham

Unnamed: 0,text,Type
19996,stop aye,0
6964,robin lynn frank rlfrank paradigm omega com wr...,0
12376,buy one egg for me daplease,0
7617,url url date 1635465 1635465 1635465t1635465 1...,0
12471,boo what time u get out u were supposed to tak...,0
...,...,...
12226,hi di is yijue were meeting at 7 pm at esaplan...,0
6243,heh ten years ago saying the exact same words ...,0
11476,guess he wants alone time we could just show u...,0
7227,original message from crispin cowan mailto cri...,0


In [13]:
df_spam

Unnamed: 0,text,Type
19553,i never believe it is until i tried it to star...,1
19339,𝑪𝒐𝒏𝒈𝒓𝒂𝒕𝒖𝒍𝒂𝒕𝒊𝒐𝒏𝒔 𝒕𝒐 𝒎𝒆 𝒐𝒏 𝒎𝒚 𝒔𝒖𝒄𝒄𝒆𝒔𝒔𝒇𝒖𝒍 𝒘𝒊𝒕𝒉𝒅𝒓𝒂...,1
9660,hungry gay guys feeling hungry and up 4 it now...,1
10332,you have won a guaranteed £200 award or even £...,1
132,hey we owe you some money dear homeowner we se...,1
...,...,...
307,localized software all languages available hel...,1
14897,keithflores stop depending on monthly payments...,1
910,how are ya hey how ya been long time no see,1
9955,ur cashbalance is currently 500 pounds to maxi...,1


In [14]:
df_balance=pd.concat([df_spam,df_ham],ignore_index=True)

In [15]:
df_balance

Unnamed: 0,text,Type
0,i never believe it is until i tried it to star...,1
1,𝑪𝒐𝒏𝒈𝒓𝒂𝒕𝒖𝒍𝒂𝒕𝒊𝒐𝒏𝒔 𝒕𝒐 𝒎𝒆 𝒐𝒏 𝒎𝒚 𝒔𝒖𝒄𝒄𝒆𝒔𝒔𝒇𝒖𝒍 𝒘𝒊𝒕𝒉𝒅𝒓𝒂...,1
2,hungry gay guys feeling hungry and up 4 it now...,1
3,you have won a guaranteed £200 award or even £...,1
4,hey we owe you some money dear homeowner we se...,1
...,...,...
12017,hi di is yijue were meeting at 7 pm at esaplan...,0
12018,heh ten years ago saying the exact same words ...,0
12019,guess he wants alone time we could just show u...,0
12020,original message from crispin cowan mailto cri...,0


In [16]:
def tokens(data):
    token = re.findall('[\w]+', data)
    return token

In [17]:
df_balance.text = df_balance.text.apply(tokens)

In [18]:
df_balance.head()

Unnamed: 0,text,Type
0,"[i, never, believe, it, is, until, i, tried, i...",1
1,"[𝑪𝒐𝒏𝒈𝒓𝒂𝒕𝒖𝒍𝒂𝒕𝒊𝒐𝒏𝒔, 𝒕𝒐, 𝒎𝒆, 𝒐𝒏, 𝒎𝒚, 𝒔𝒖𝒄𝒄𝒆𝒔𝒔𝒇𝒖𝒍, ...",1
2,"[hungry, gay, guys, feeling, hungry, and, up, ...",1
3,"[you, have, won, a, guaranteed, 200, award, or...",1
4,"[hey, we, owe, you, some, money, dear, homeown...",1


In [19]:
spam = df_balance.text[df.Type == 1]

In [20]:
spam.shape

(2203,)

In [21]:
spam

0        [i, never, believe, it, is, until, i, tried, i...
1        [𝑪𝒐𝒏𝒈𝒓𝒂𝒕𝒖𝒍𝒂𝒕𝒊𝒐𝒏𝒔, 𝒕𝒐, 𝒎𝒆, 𝒐𝒏, 𝒎𝒚, 𝒔𝒖𝒄𝒄𝒆𝒔𝒔𝒇𝒖𝒍, ...
2        [hungry, gay, guys, feeling, hungry, and, up, ...
3        [you, have, won, a, guaranteed, 200, award, or...
4        [hey, we, owe, you, some, money, dear, homeown...
                               ...                        
11981      [da, is, good, good, playerwhy, he, is, unsold]
11989    [smile, in, pleasure, smile, in, pain, smile, ...
11995    [position, dear, dr, kaminski, my, name, is, j...
12007    [presentation, integrating, market, risk, and,...
12016        [ill, hand, her, my, phone, to, chat, wit, u]
Name: text, Length: 2203, dtype: object

In [22]:
spam_list = [j for i in spam for j in i]

In [23]:
spam_list

['i',
 'never',
 'believe',
 'it',
 'is',
 'until',
 'i',
 'tried',
 'it',
 'to',
 'start',
 'i',
 'deposited',
 '500',
 'for',
 'a',
 'test',
 'in',
 '24',
 'hours',
 'i',
 'got',
 'a',
 'return',
 'of',
 '1000',
 'i',
 'am',
 'so',
 'happy',
 'to',
 'have',
 'invest',
 'with',
 'julianfxtrade',
 'if',
 'you',
 'want',
 'to',
 'trade',
 'contact',
 'him',
 '𝑪𝒐𝒏𝒈𝒓𝒂𝒕𝒖𝒍𝒂𝒕𝒊𝒐𝒏𝒔',
 '𝒕𝒐',
 '𝒎𝒆',
 '𝒐𝒏',
 '𝒎𝒚',
 '𝒔𝒖𝒄𝒄𝒆𝒔𝒔𝒇𝒖𝒍',
 '𝒘𝒊𝒕𝒉𝒅𝒓𝒂𝒘𝒂𝒍',
 '𝒕𝒉𝒂𝒕',
 '𝒊',
 '𝒈𝒐𝒕',
 '𝒘𝒊𝒕𝒉𝒐𝒖𝒕',
 '𝒃𝒆𝒊𝒏𝒈',
 '𝒂𝒔𝒌𝒆𝒅',
 '𝒕𝒐',
 '𝒑𝒂𝒚',
 '𝒂𝒏𝒚',
 '𝒘𝒊𝒕𝒉𝒅𝒓𝒂𝒘𝒂𝒍',
 '𝒄𝒉𝒂𝒓𝒈𝒆𝒔',
 '𝒂𝒏𝒅',
 '𝒏𝒐',
 '𝒉𝒊𝒅𝒅𝒆𝒏',
 '𝒄𝒉𝒂𝒓𝒈𝒆𝒔',
 '𝒂𝒕𝒕𝒂𝒄𝒉ed',
 'hungry',
 'gay',
 'guys',
 'feeling',
 'hungry',
 'and',
 'up',
 '4',
 'it',
 'now',
 'call',
 '08718730555',
 'just',
 '10pmin',
 'to',
 'stop',
 'texts',
 'call',
 '08712460324',
 '10pmin',
 'you',
 'have',
 'won',
 'a',
 'guaranteed',
 '200',
 'award',
 'or',
 'even',
 '1000',
 'cashto',
 'claim',
 'ur',
 'award',
 'call',
 'free',
 'on',
 '08000407165',
 '18',
 '2',
 'stop',
 'getstop',
 'on',
 '88

In [24]:
stop_list = list(stopwords.words('english'))
def stopwords(data):
    words = [w for w in data if w not in stop_list]
    return words

In [25]:
df_balance.text = df_balance.text.apply(stopwords)

In [26]:
df.text

0        naturally irresistible your corporate identity...
1        the stock trading gunslinger fanny is merrill ...
2        unbelievable new homes made easy im wanting to...
3        4 color printing special request additional in...
4        do not have money get software cds from here s...
                               ...                        
20343                                                 /ban
20344                                                 /ban
20345                                                 /ban
20346                                            Kaisi hii
20347                                              Shock q
Name: text, Length: 20348, dtype: object

In [27]:
def Lower(data):
    new_list = []
    for i in data:
            word = i.lower()
            new_list.append(word)
    return new_list

In [28]:
df_balance.text = df_balance.text.apply(Lower)

In [29]:
df_balance.text

0        [never, believe, tried, start, deposited, 500,...
1        [𝑪𝒐𝒏𝒈𝒓𝒂𝒕𝒖𝒍𝒂𝒕𝒊𝒐𝒏𝒔, 𝒕𝒐, 𝒎𝒆, 𝒐𝒏, 𝒎𝒚, 𝒔𝒖𝒄𝒄𝒆𝒔𝒔𝒇𝒖𝒍, ...
2        [hungry, gay, guys, feeling, hungry, 4, call, ...
3        [guaranteed, 200, award, even, 1000, cashto, c...
4        [hey, owe, money, dear, homeowner, sent, email...
                               ...                        
12017    [hi, di, yijue, meeting, 7, pm, esaplanade, to...
12018    [heh, ten, years, ago, saying, exact, words, d...
12019      [guess, wants, alone, time, could, show, watch]
12020    [original, message, crispin, cowan, mailto, cr...
12021    [research, org, please, let, know, corrections...
Name: text, Length: 12022, dtype: object

In [30]:
# stemming
stemmer = PorterStemmer()
def stemm(data):
    stemmed = [stemmer.stem(word) for word in data]
    return stemmed
    

In [31]:
df_balance.text = df_balance.text.apply(stemm)

In [32]:
df_balance.text

0        [never, believ, tri, start, deposit, 500, test...
1        [𝑪𝒐𝒏𝒈𝒓𝒂𝒕𝒖𝒍𝒂𝒕𝒊𝒐𝒏𝒔, 𝒕𝒐, 𝒎𝒆, 𝒐𝒏, 𝒎𝒚, 𝒔𝒖𝒄𝒄𝒆𝒔𝒔𝒇𝒖𝒍, ...
2        [hungri, gay, guy, feel, hungri, 4, call, 0871...
3        [guarante, 200, award, even, 1000, cashto, cla...
4        [hey, owe, money, dear, homeown, sent, email, ...
                               ...                        
12017      [hi, di, yiju, meet, 7, pm, esaplanad, tonight]
12018    [heh, ten, year, ago, say, exact, word, defini...
12019        [guess, want, alon, time, could, show, watch]
12020    [origin, messag, crispin, cowan, mailto, crisp...
12021    [research, org, pleas, let, know, correct, sti...
Name: text, Length: 12022, dtype: object

In [33]:
def join(data):
    joined = ' '.join(data)
    return joined

In [34]:
df_balance.text = df_balance.text.apply(join)

In [35]:
df_balance

Unnamed: 0,text,Type
0,never believ tri start deposit 500 test 24 hou...,1
1,𝑪𝒐𝒏𝒈𝒓𝒂𝒕𝒖𝒍𝒂𝒕𝒊𝒐𝒏𝒔 𝒕𝒐 𝒎𝒆 𝒐𝒏 𝒎𝒚 𝒔𝒖𝒄𝒄𝒆𝒔𝒔𝒇𝒖𝒍 𝒘𝒊𝒕𝒉𝒅𝒓𝒂...,1
2,hungri gay guy feel hungri 4 call 08718730555 ...,1
3,guarante 200 award even 1000 cashto claim ur a...,1
4,hey owe money dear homeown sent email ago qual...,1
...,...,...
12017,hi di yiju meet 7 pm esaplanad tonight,0
12018,heh ten year ago say exact word definit parrot...,0
12019,guess want alon time could show watch,0
12020,origin messag crispin cowan mailto crispin wir...,0


##### Training and testing data

In [36]:
vec = TfidfVectorizer()

In [37]:
a = vec.fit(df_balance['text'])


In [42]:
vec.fit_transform(df_balance['text']).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
b = a.transform(df_balance['text']).toarray()

In [None]:
b

In [None]:
b.shape

In [None]:
# Training and splitting
X = df_balance.drop('Type', axis=1)
y = df_balance['Type']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(b, y, test_size=0.2, random_state=78)

In [None]:
X_train.shape

##### Model selection

In [None]:
model = GaussianNB()

In [None]:
model.fit(X_train, y_train)

In [None]:
test_prediction = model.predict(X_test)

In [None]:
print(classification_report(y_test, test_prediction))

In [None]:
accuracy_score(test_prediction, y_test)

In [None]:
model2 = MultinomialNB()

In [None]:
model2.fit(X_train, y_train)

In [None]:
test_prediction2 = model2.predict(X_test)

In [None]:
print(classification_report(y_test,test_prediction2))

In [None]:
accuracy_score(test_prediction2, y_test)

In [None]:
input_data = "Thank you for paying last month’s bill. We’re rewarding our very best customers with a gift for their loyalty. Click here!"

In [None]:
input_data = tokens(input_data)

In [None]:
input_data

In [None]:
input_data=Lower(input_data)


In [None]:
input_data = stopwords(input_data)
input_data

In [None]:
input_data = stemm(input_data)

In [None]:
input_data

In [None]:
input_data= join(input_data)

In [None]:
input_data

In [None]:
input_val = vec.transform([input_data]).toarray()

In [None]:
model.predict(input_val)

##### dumping models using pickle file

In [None]:
import pickle as pk

In [None]:
pk.dump(model2,open("MultinomialNB.pkl",'wb'))

In [None]:
pk.dump(a,open("TfidfVectorizer.pkl",'wb'))

In [None]:
with open('MultinomialNB.pkl', 'rb') as file:
    obj = pk.load(file)

In [None]:
obj.predict(input_val)