In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt1
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [2]:
#import the dataset
path = "./datasets/P6_P7_spam.csv"
df = pd.read_csv(path,encoding = 'latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.shape

(5572, 5)

In [5]:
df = df[['v1','v2']]
# df['length']=df['v2'].apply(len)

In [6]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# remove punctuations and stopwords
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    if(len(clean) < 9):
        for i in range(9-len(clean)):
            clean.append(11304)
    return clean[:9]

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
df['v2'] = df['v2'].apply(text_process)
df['length'] = df['v2'].apply(len)
unique_words = []
for msg in df['v2']:
    for word in msg:
        unique_words.append(word)
unique_words = set(unique_words)
df.head()

Unnamed: 0,v1,v2,length
0,ham,"[Go, jurong, point, crazy, Available, bugis, n...",9
1,ham,"[Ok, lar, Joking, wif, u, oni, 11304, 11304, 1...",9
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, final]",9
3,ham,"[U, dun, say, early, hor, U, c, already, say]",9
4,ham,"[Nah, dont, think, goes, usf, lives, around, t...",9


In [10]:
len(unique_words)

8508

In [11]:
vocab = {}
i=0
for word in unique_words:
    vocab[word]=i
    i+=1

In [24]:
vocab

{'oil': 0,
 'magical': 1,
 'narcotics': 2,
 '08702490080': 3,
 'forwarding': 4,
 'subtoitles': 5,
 'thinks': 6,
 'youd': 7,
 'site': 8,
 'hsbc': 9,
 'bold2': 10,
 'Sexy': 11,
 'humans': 12,
 'caps': 13,
 'directly': 14,
 'speechless': 15,
 'babyjontet': 16,
 'Nah': 17,
 'Oi': 18,
 'rightly': 19,
 'gotta': 20,
 'ron': 21,
 'King': 22,
 'demand': 23,
 'huai': 24,
 'tablets': 25,
 'begun': 26,
 'eating': 27,
 'knew': 28,
 'tops': 29,
 'bbs': 30,
 'laden': 31,
 'Kkyesterday': 32,
 'looks': 33,
 'stops': 34,
 'Erm': 35,
 'knickers': 36,
 'gay': 37,
 '1hr': 38,
 'kids': 39,
 'Jesus': 40,
 'syrup': 41,
 'Aft': 42,
 'Anything': 43,
 'Ure': 44,
 'sunlight': 45,
 'chicken': 46,
 'Summer': 47,
 'Plsi': 48,
 'loyal': 49,
 'NO165': 50,
 'semester': 51,
 'youwhen': 52,
 'Jolly': 53,
 'Superb': 54,
 'Old': 55,
 '7634': 56,
 'Detroit': 57,
 'unique': 58,
 'Persian': 59,
 'correctly': 60,
 'treadmill': 61,
 '01223585334': 62,
 'watches': 63,
 'ettans': 64,
 'India': 65,
 'bored': 66,
 'varaya': 67,
 'd

In [31]:
def transform_data(data_set,vocab):
    ds=[]
    for row in data_set:
        temp = []
        for word in row:
            temp.append(vocab[word])
        ds.append(temp)
    return ds
            

In [32]:
msg_train,msg_test,label_train,label_test = train_test_split(transform_data(df['v2'],vocab),df['v1'],test_size=0.2, random_state=1)

In [33]:
# improves accuracy significantly
# tfidf_transformer = TfidfTransformer(use_idf = False)
# msg_train=tfidf_transformer.transform(msg_train)
# msg_test=tfidf_transformer.transform(msg_test)
# msg_train.shape

In [34]:
msg_train[:5]

[[7574, 344, 1755, 4547, 2925, 2925, 2925, 2925, 2925],
 [4235, 1082, 4017, 6344, 3373, 2925, 2925, 2925, 2925],
 [7531, 1221, 3541, 3928, 2925, 2925, 2925, 2925, 2925],
 [199, 5822, 3953, 2925, 2925, 2925, 2925, 2925, 2925],
 [2901, 3681, 3210, 1720, 4404, 3221, 5301, 2520, 4237]]

In [35]:
spam_detect_model = MultinomialNB().fit(msg_train,label_train)

In [36]:
y_pred = spam_detect_model.predict(msg_test)
y_pred[:5]

array(['ham', 'spam', 'spam', 'ham', 'spam'], dtype='<U4')

In [37]:
print("accuracy: ",accuracy_score(label_test,y_pred))
print(classification_report(label_test,y_pred))
print(confusion_matrix(label_test,y_pred))

accuracy:  0.6053811659192825
              precision    recall  f1-score   support

         ham       0.91      0.61      0.73       976
        spam       0.17      0.58      0.27       139

    accuracy                           0.61      1115
   macro avg       0.54      0.59      0.50      1115
weighted avg       0.82      0.61      0.67      1115

[[595 381]
 [ 59  80]]
