# SML project - sms spam detection

In [62]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.neural_network import MLPClassifier

import re # regex

In [13]:
#Import spam data and shaping it
smsList = pd.read_table("SMSSpamCollection.txt", header=None, names=['label', 'message'])

smsList["label"] = smsList["label"].map({"ham":0,"spam":1})

X = smsList.message
y = smsList.label

## Naive bayes

In [14]:
#Splitting dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
vect = CountVectorizer()

In [16]:
X_train_vect = vect.fit_transform(X_train)

In [17]:
X_test_vect = vect.transform(X_test)

In [18]:
nb = MultinomialNB()

nb.fit(X_train_vect, y_train)

y_prediction_class = nb.predict(X_test_vect)

metrics.accuracy_score(y_test, y_prediction_class)

0.98851399856424982

### With 10-fold cross validation :

In [11]:
kf = KFold(n_splits=10)
for train_i, test_i in kf.split(X, y):
    cv = CountVectorizer()
    train_set = cv.fit_transform([X[i] for i in train_i])
    train_class = [y[i] for i in train_i]
    test_set = cv.transform([X[i] for i in test_i])
    test_class = [y[i] for i in test_i]
    nb.fit(train_set,train_class)
    pred = nb.predict(test_set)
    print(metrics.accuracy_score(test_class, pred))

0.989247311828
0.978494623656
0.980251346499
0.991023339318
0.982046678636
0.992818671454
0.983842010772
0.989228007181
0.980251346499
0.992818671454


## Backpropagation neural network
We're implementing the technique presented in this article :
[RUAN, Guangchen et TAN, Ying. A three-layer back-propagation neural network for spam detection using artificial immune concentration. Soft computing, 2010, vol. 14, no 2, p. 139-150.](https://link.springer.com/content/pdf/10.1007%2Fs00500-009-0440-2.pdf)

In [118]:
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"])

In [128]:
regex = re.compile('\w+')
words = {}
N = 0 # number of non-spam
S = 0 # number of spam

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

"""count words support in spam and non-spam sms
words['toto'][0] = number of non-spam sms containing the word 'toto'
words['toto'][1] = number of spam sms containing the word 'toto'
words['toto'][0] + words['toto'][1] = number of sms containing the word 'toto'
"""
for i in X_train.index:
    X_train[i] = X_train[i].lower()
    message = X_train[i]
    spam = y_train[i]
    # print(str(i) + " (" + str(spam) + ") - " + message)
    if spam == 0:
        N += 1
    else:
        S += 1
    messageWords = list(set(regex.findall(message))) # doesn't take duplicate
    for word in messageWords:
        if word in words:
            words[word][spam] += 1
        else:
            words[word] = [(spam + 1) % 2, spam]

# we only keep words that appears in less than 95% of sms
genes = []
for i in words:
    if words[i][0] + words[i][1] >= 0.95 * len(X_train):
        del words[i]
    else:
        words[i].append(words[i][0] / N) # frequency appearing in non-spam
        words[i].append(words[i][1] / S) # frequency appearing in spam
        words[i].append(words[i][2] - words[i][3]) # calculate "proclivity" Cf "4.2 Generation of gene libraries"
        genes.append(i)
    
genes.sort(key = lambda a: -words[a][4])

selfGenes = genes[:100]
nonSelfGenes = genes[-100:]

def getFeatures(message):
    self = 0
    nonSelf = 0
    messageWords = list(set(regex.findall(message)))
    for word in messageWords:
        if word in selfGenes:
            self += 1
        elif word in nonSelfGenes:
            nonSelf += 1
    #return [self / max(1, len(messageWords)), nonSelf / max(1, len(messageWords))]
    return [self, nonSelf] # got better results than the concentration proposed in the paper

In [129]:
X_train_features = []

for message in X_train:
    X_train_features.append(getFeatures(message))
    
X_test_features = []

for message in X_test:
    X_test_features.append(getFeatures(message))

In [147]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2), random_state=1, activation='logistic')
clf.fit(X_train_features, y_train)

test = clf.predict(X_test_features)
correct = 0

for i in range(len(test)):
    if test[i] == y_test[X_test.index[i]]:
        correct += 1
        
print(correct / len(test))

0.957847533632287


In [148]:
clf.predict([getFeatures('hey honey can u buy milk on your way home?')])

array([0])

In [149]:
clf.predict([getFeatures('wanna win a new car? call this phone number asap 01245145624!!!')])

array([1])