In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt1
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
%matplotlib inline

In [3]:
#import the dataset
path = "./datasets/P6_P7_spam.csv"
df = pd.read_csv(path,encoding = 'latin-1')

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df = df[['v1','v2']]

In [6]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# remove punctuations and stopwords
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    if(len(clean) < 9):
        for i in range(9-len(clean)):
            clean.append(11304)
    return clean[:9]

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
df['v2'] = df['v2'].apply(text_process)
df['length'] = df['v2'].apply(len)
unique_words = []
for msg in df['v2']:
    for word in msg:
        unique_words.append(word)
unique_words = set(unique_words)
df.head()

Unnamed: 0,v1,v2,length
0,ham,"[Go, jurong, point, crazy, Available, bugis, n...",9
1,ham,"[Ok, lar, Joking, wif, u, oni, 11304, 11304, 1...",9
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, final]",9
3,ham,"[U, dun, say, early, hor, U, c, already, say]",9
4,ham,"[Nah, dont, think, goes, usf, lives, around, t...",9


In [10]:
len(unique_words)

8508

In [11]:
vocab = {}
i=0
for word in unique_words:
    vocab[word]=i
    i+=1

In [12]:
vocab

{'step': 0,
 'Yup': 1,
 '44': 2,
 'Monthly': 3,
 'INDIA': 4,
 'calls': 5,
 'draws': 6,
 'Theory': 7,
 'Message': 8,
 'Sexy': 9,
 'sha': 10,
 'boundaries': 11,
 'discuss': 12,
 'thread': 13,
 'Concentrate': 14,
 'cardin': 15,
 'logoff': 16,
 'permission': 17,
 'simonwatson5120': 18,
 '120': 19,
 '0ANETWORKS': 20,
 'avoid': 21,
 'sar': 22,
 'httpalto18coukwavewaveaspo44345': 23,
 'boston': 24,
 'points': 25,
 'goggles': 26,
 'Guessin': 27,
 'nights': 28,
 'like': 29,
 'workAnd': 30,
 'happiness': 31,
 'Ha': 32,
 'blankets': 33,
 'broke': 34,
 'Simpsons': 35,
 'Rock': 36,
 'PASS': 37,
 'suffering': 38,
 'springs': 39,
 'THING': 40,
 'Wright': 41,
 'extract': 42,
 'alert': 43,
 '530': 44,
 'SURPRISE': 45,
 'sat': 46,
 'sonathaya': 47,
 'Boss': 48,
 'jay': 49,
 'sub': 50,
 'newsBy': 51,
 'latebut': 52,
 'flowing': 53,
 'YES762': 54,
 'BSLVYL': 55,
 'duo': 56,
 'blur': 57,
 'kuch': 58,
 'juan': 59,
 'Show': 60,
 'include': 61,
 '8714714': 62,
 'avatar': 63,
 'beendropping': 64,
 'messagesTex

In [13]:
def transform_data(data_set,vocab):
    ds=[]
    for row in data_set:
        temp = []
        for word in row:
            temp.append(vocab[word])
        ds.append(temp)
    return ds

In [14]:
msg_train,msg_test,label_train,label_test = train_test_split(transform_data(df['v2'],vocab),df['v1'],test_size=0.2)

In [15]:
msg_train

[[2987, 5940, 1585, 5023, 993, 7734, 4018, 2596, 7590],
 [7124, 4780, 6247, 8154, 2996, 2996, 2996, 2996, 2996],
 [4839, 8428, 1625, 6146, 2522, 2994, 1570, 8273, 2996],
 [8199, 3102, 7547, 309, 3633, 5654, 2996, 2996, 2996],
 [1229, 6055, 431, 5717, 6221, 8505, 6351, 6450, 2996],
 [6016, 6106, 3523, 8250, 2616, 3780, 2996, 2996, 2996],
 [5897, 8080, 429, 7969, 6131, 2996, 2996, 2996, 2996],
 [1588, 6731, 1826, 994, 507, 4432, 3435, 5535, 136],
 [7737, 507, 4714, 7119, 1390, 292, 7622, 1378, 8195],
 [1588, 5217, 4256, 5217, 7523, 4557, 5209, 2996, 2996],
 [7198, 6314, 287, 4308, 6016, 7014, 1097, 6578, 2298],
 [1, 3104, 7067, 5539, 2996, 2996, 2996, 2996, 2996],
 [7057, 2616, 7090, 6578, 5421, 8109, 1597, 5875, 2996],
 [2445, 4675, 5625, 5952, 8041, 5625, 2996, 2996, 2996],
 [4223, 7067, 4223, 7676, 5414, 3297, 1836, 4675, 4066],
 [993, 8267, 7327, 2996, 2996, 2996, 2996, 2996, 2996],
 [7800, 1812, 2575, 4223, 8428, 1051, 2996, 2996, 2996],
 [4307, 3219, 2907, 1702, 287, 7943, 2996, 29

In [16]:
msg_train[:5]

[[2987, 5940, 1585, 5023, 993, 7734, 4018, 2596, 7590],
 [7124, 4780, 6247, 8154, 2996, 2996, 2996, 2996, 2996],
 [4839, 8428, 1625, 6146, 2522, 2994, 1570, 8273, 2996],
 [8199, 3102, 7547, 309, 3633, 5654, 2996, 2996, 2996],
 [1229, 6055, 431, 5717, 6221, 8505, 6351, 6450, 2996]]

In [17]:
# improves accuracy
# tfidf_transformer = TfidfTransformer(use_idf = False)
# msg_train=tfidf_transformer.transform(msg_train)
# msg_test=tfidf_transformer.transform(msg_test)
# msg_train.shape

In [18]:
spam_detect_model = SVC()
spam_detect_model.fit(msg_train,label_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [19]:
y_pred = spam_detect_model.predict(msg_test)
y_pred[:5]

array(['ham', 'ham', 'ham', 'ham', 'ham'], dtype=object)

In [20]:
print("accuracy: ",accuracy_score(label_test,y_pred))
print(classification_report(label_test,y_pred))
print(confusion_matrix(label_test,y_pred))

accuracy:  0.9094170403587444
              precision    recall  f1-score   support

         ham       0.91      1.00      0.95       963
        spam       1.00      0.34      0.50       152

    accuracy                           0.91      1115
   macro avg       0.95      0.67      0.73      1115
weighted avg       0.92      0.91      0.89      1115

[[963   0]
 [101  51]]
