In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt1
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
%matplotlib inline

In [2]:
#import the dataset
path = "./datasets/P6_P7_spam.csv"
df = pd.read_csv(path,encoding = 'latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df = df[['v1','v2']]

In [5]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# remove punctuations and stopwords
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    if(len(clean) < 9):
        for i in range(9-len(clean)):
            clean.append(11304)
    return clean[:9]

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/saoalo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
df['v2'] = df['v2'].apply(text_process)
df['length'] = df['v2'].apply(len)
unique_words = []
for msg in df['v2']:
    for word in msg:
        unique_words.append(word)
unique_words = set(unique_words)
df.head()

Unnamed: 0,v1,v2,length
0,ham,"[Go, jurong, point, crazy, Available, bugis, n...",9
1,ham,"[Ok, lar, Joking, wif, u, oni, 11304, 11304, 1...",9
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, final]",9
3,ham,"[U, dun, say, early, hor, U, c, already, say]",9
4,ham,"[Nah, dont, think, goes, usf, lives, around, t...",9


In [9]:
len(unique_words)

8508

In [10]:
vocab = {}
i=0
for word in unique_words:
    vocab[word]=i
    i+=1

In [11]:
vocab

{'school': 0,
 'Yummmm': 1,
 'toDo': 2,
 'convey': 3,
 'transfr': 4,
 'bold2': 5,
 'Holy': 6,
 'wasn\x89Û÷t': 7,
 'mondaynxt': 8,
 'LONG': 9,
 'lovable': 10,
 'awesome': 11,
 'loverakhesh': 12,
 'Rajnikant': 13,
 'missed': 14,
 'losers': 15,
 'authorise': 16,
 'ref': 17,
 'firmware': 18,
 'reminds': 19,
 'success': 20,
 'subscribers': 21,
 'Oru': 22,
 'proof': 23,
 'baskets': 24,
 'Celebrated': 25,
 '09058097218': 26,
 'Aiyo': 27,
 'normal': 28,
 'ryans': 29,
 'request': 30,
 'takes': 31,
 'watching': 32,
 '44': 33,
 'taunton': 34,
 'squeezed': 35,
 'lots': 36,
 'kwish': 37,
 'leonardo': 38,
 'Euro': 39,
 'improved': 40,
 'Prakesh': 41,
 'reckon': 42,
 'happiest': 43,
 'tomorrowtoday': 44,
 'diff': 45,
 'daytime': 46,
 'DAY': 47,
 'feel': 48,
 'Ah': 49,
 'bread': 50,
 'phne': 51,
 'since': 52,
 'mapquest': 53,
 'smidgin': 54,
 'toshiba': 55,
 'Nimbomsons': 56,
 'Chk': 57,
 'storelike': 58,
 'linear': 59,
 'shelf': 60,
 'frosty': 61,
 'volcanoes': 62,
 'housework': 63,
 'uv': 64,
 'snow

In [12]:
def transform_data(data_set,vocab):
    ds=[]
    for row in data_set:
        temp = []
        for word in row:
            temp.append(vocab[word])
        ds.append(temp)
    return ds

In [13]:
msg_train,msg_test,label_train,label_test = train_test_split(transform_data(df['v2'],vocab),df['v1'],test_size=0.2)

In [14]:
msg_train[:5]

[[1432, 173, 6740, 1849, 3070, 3070, 3070, 3070, 3070],
 [8121, 30, 1855, 1855, 22, 785, 2916, 1560, 4473],
 [5960, 2859, 6674, 1938, 8278, 2900, 8028, 2413, 3070],
 [854, 4858, 4839, 6566, 3062, 7248, 6566, 15, 3070],
 [6764, 2891, 6217, 5297, 3193, 7868, 1407, 6886, 7238]]

In [15]:
# improves accuracy
# tfidf_transformer = TfidfTransformer(use_idf = False)
# msg_train=tfidf_transformer.transform(msg_train)
# msg_test=tfidf_transformer.transform(msg_test)
# msg_train.shape

(4457, 9)

In [16]:
spam_detect_model = SVC()
spam_detect_model.fit(msg_train,label_train)

SVC()

In [17]:
y_pred = spam_detect_model.predict(msg_test)
y_pred[:5]

array(['ham', 'ham', 'ham', 'ham', 'ham'], dtype=object)

In [18]:
print("accuracy: ",accuracy_score(label_test,y_pred))
print(classification_report(label_test,y_pred))
print(confusion_matrix(label_test,y_pred))

accuracy:  0.8690582959641255
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       969
        spam       0.00      0.00      0.00       146

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.76      0.87      0.81      1115

[[969   0]
 [146   0]]


  _warn_prf(average, modifier, msg_start, len(result))
