In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt1
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [2]:
#import the dataset
path = "./datasets/P6_P7_spam.csv"
df = pd.read_csv(path,encoding = 'latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.shape

(5572, 5)

In [5]:
df = df[['v1','v2']]
# df['length']=df['v2'].apply(len)

In [6]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# remove punctuations and stopwords
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    if(len(clean) < 9):
        for i in range(9-len(clean)):
            clean.append(11304)
    return clean[:9]

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/saoalo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
df['v2'] = df['v2'].apply(text_process)
df['length'] = df['v2'].apply(len)
unique_words = []
for msg in df['v2']:
    for word in msg:
        unique_words.append(word)
unique_words = set(unique_words)
df.head()

Unnamed: 0,v1,v2,length
0,ham,"[Go, jurong, point, crazy, Available, bugis, n...",9
1,ham,"[Ok, lar, Joking, wif, u, oni, 11304, 11304, 1...",9
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, final]",9
3,ham,"[U, dun, say, early, hor, U, c, already, say]",9
4,ham,"[Nah, dont, think, goes, usf, lives, around, t...",9


In [10]:
len(unique_words)

8508

In [11]:
vocab = {}
i=0
for word in unique_words:
    vocab[word]=i
    i+=1

In [12]:
vocab

{'Waaaat': 0,
 'ILEAVE': 1,
 'ESPECIALLY': 2,
 'miwa': 3,
 'Kkits': 4,
 'itlet': 5,
 'CLAIRE': 6,
 'max6month': 7,
 'youmoney': 8,
 'pure': 9,
 'ystrdayice': 10,
 'safety': 11,
 'swashbuckling': 12,
 '450': 13,
 'receipts\x89ÛÓwell': 14,
 'helpful': 15,
 'Luckily': 16,
 'tacos': 17,
 'tactless': 18,
 'Plyr': 19,
 'training': 20,
 '077xxx': 21,
 'specific': 22,
 'required': 23,
 'indicate': 24,
 'mids': 25,
 'impression': 26,
 'Canada': 27,
 'DISCOUNT': 28,
 'shu': 29,
 'OREOS': 30,
 'senrddnot': 31,
 'cares': 32,
 'hont': 33,
 'vday': 34,
 'Holy': 35,
 'youdoing': 36,
 'Busy': 37,
 'MANEESHA': 38,
 'Lmaonice': 39,
 'trust': 40,
 'korte': 41,
 'outsider': 42,
 'shut': 43,
 '2bold': 44,
 'appropriate': 45,
 'banned': 46,
 'Normally': 47,
 'owe': 48,
 'respond': 49,
 'deleted': 50,
 'textin': 51,
 'LOST': 52,
 'Sunday': 53,
 'tirupur': 54,
 'åÒIts': 55,
 'drivin': 56,
 'logging': 57,
 'Aiyo': 58,
 'SAY': 59,
 'reslove': 60,
 'ave': 61,
 'isare': 62,
 'Brother': 63,
 'Genius': 64,
 '5years

In [13]:
def transform_data(data_set,vocab):
    ds=[]
    for row in data_set:
        temp = []
        for word in row:
            temp.append(vocab[word])
        ds.append(temp)
    return ds
            

In [14]:
msg_train,msg_test,label_train,label_test = train_test_split(transform_data(df['v2'],vocab),df['v1'],test_size=0.2)

In [15]:
# improves accuracy significantly
# tfidf_transformer = TfidfTransformer(use_idf = False)
# msg_train=tfidf_transformer.transform(msg_train)
# msg_test=tfidf_transformer.transform(msg_test)
# msg_train.shape

(4457, 9)

In [16]:
msg_train[:5]

<5x9 sparse matrix of type '<class 'numpy.float64'>'
	with 45 stored elements in Compressed Sparse Row format>

In [17]:
spam_detect_model = MultinomialNB().fit(msg_train,label_train)

In [18]:
y_pred = spam_detect_model.predict(msg_test)
y_pred[:5]

array(['ham', 'ham', 'ham', 'ham', 'ham'], dtype='<U4')

In [19]:
print("accuracy: ",accuracy_score(label_test,y_pred))
print(classification_report(label_test,y_pred))
print(confusion_matrix(label_test,y_pred))

accuracy:  0.8493273542600897
              precision    recall  f1-score   support

         ham       0.85      1.00      0.92       947
        spam       0.00      0.00      0.00       168

    accuracy                           0.85      1115
   macro avg       0.42      0.50      0.46      1115
weighted avg       0.72      0.85      0.78      1115

[[947   0]
 [168   0]]


  _warn_prf(average, modifier, msg_start, len(result))
