# Spam filter

We will work with the file `spam.csv`.

Each message is tagged as either ham or spam. Let's build a Naive Bayes classifier that predicts the tag of a message.

The classification accuracy must be greater than 95% on both the validation and the test splits of the dataset.

In [1]:
import pandas as pd
import numpy as np
# Read the labelled messages and discard the three unnamed extra columns,
# keeping only the tag column (v1) and the message-text column (v2).
# NOTE(review): tokens such as 'å' and 'û' show up in the fitted vocabulary
# further down — confirm that latin-1 is the right encoding for this file.
data = pd.read_csv("spam.csv", encoding="latin-1").drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
)

In [2]:
# Preview the first rows: v1 holds the ham/spam tag, v2 the message text.
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Features are the message texts (column v2); targets are the ham/spam tags
# (column v1).
# ``to_numpy()`` is used rather than ``.values`` so that a plain NumPy array is
# always handed on, whatever pandas dtype backs the column.
X = data["v2"].to_numpy()
y = data["v1"].to_numpy()

In [4]:
from dataset import Dataset
# Wrap the message texts (X) and their tags (y) in a Dataset object; the later
# cells read the train/val/test splits and fit the model through this object.
dataset = Dataset(X,y)

In [5]:
# Split the data. The splits are read back from the ``train``, ``val`` and
# ``test`` attributes of ``dataset``; the return value is kept in ``datasets``.
datasets = dataset.split_dataset()
# Print the length of the first component of each split, one per line,
# in the order train, val, test.
print(
    *(len(part[0]) for part in (dataset.train, dataset.val, dataset.test)),
    sep="\n",
)

4458
557
557


In [6]:
from model import Model

In [7]:
# Instantiate the classifier (Model is imported from the `model` module above).
model = Model()

In [8]:
# Fit the model on the Dataset object.
# Presumably only the training split is used and this call populates
# model.vocab / model.spam / model.ham inspected below — TODO confirm in model.py.
model.fit(dataset)

In [9]:
# Number of entries in the fitted model's vocabulary.
len(model.vocab)

7815

In [10]:
# Token -> integer mapping held by the model for the spam class
# (presumably per-token occurrence counts — confirm in model.py).
# NOTE(review): this displays the whole dictionary; consider showing only the
# most frequent tokens to keep the notebook output short.
model.spam

{'free': 172,
 'ring': 4,
 'tone': 52,
 'just': 69,
 'text': 101,
 'polys': 8,
 'to': 542,
 '87131': 4,
 'then': 8,
 'every': 28,
 'week': 48,
 'get': 68,
 'a': 306,
 'new': 54,
 '0870737910216yrs': 1,
 'only': 69,
 'å': 247,
 '1': 84,
 '50': 45,
 'wk': 27,
 'camera': 27,
 'you': 239,
 'are': 65,
 'awarded': 29,
 'sipix': 6,
 'digital': 6,
 'call': 294,
 '09061221066': 4,
 'fromm': 4,
 'landline': 27,
 'delivery': 17,
 'within': 7,
 '28': 6,
 'days': 11,
 'tbs': 1,
 'persolvo': 1,
 'been': 32,
 'chasing': 1,
 'us': 7,
 'since': 2,
 'sept': 3,
 'forå': 1,
 '38': 1,
 'definitely': 1,
 'not': 21,
 'paying': 1,
 'now': 167,
 'thanks': 14,
 'your': 213,
 'information': 7,
 'we': 44,
 'will': 37,
 'ignore': 1,
 'them': 2,
 'kath': 1,
 'manchester': 1,
 'know': 21,
 'someone': 12,
 'who': 42,
 'that': 22,
 'fancies': 1,
 '09058097218': 1,
 'find': 24,
 'out': 44,
 'pobox': 12,
 '6': 7,
 'ls15hb': 2,
 '150p': 60,
 'sorry': 3,
 'i': 45,
 'missed': 2,
 'let': 4,
 's': 71,
 'talk': 5,
 'when': 9,

In [11]:
# Token -> integer mapping held by the model for the ham class
# (presumably per-token occurrence counts — confirm in model.py).
# NOTE(review): this displays the whole dictionary; consider showing only the
# most frequent tokens to keep the notebook output short.
model.ham

{'yeah': 64,
 'do': 314,
 'don': 124,
 'û': 26,
 't': 308,
 'stand': 5,
 'to': 1266,
 'close': 13,
 'tho': 15,
 'you': 1568,
 'll': 231,
 'catch': 7,
 'something': 55,
 'sleeping': 13,
 'nt': 12,
 'feeling': 14,
 'well': 88,
 'come': 178,
 'aftr': 5,
 'lt': 248,
 'decimal': 20,
 'gt': 249,
 'now': 235,
 'i': 2365,
 'm': 331,
 'cleaning': 4,
 'the': 916,
 'house': 30,
 'almost': 12,
 'there': 175,
 'see': 121,
 'u': 788,
 'in': 658,
 'a': 863,
 'sec': 4,
 'probably': 26,
 'earlier': 14,
 'than': 29,
 'that': 453,
 'hello': 41,
 'my': 566,
 'love': 156,
 'what': 226,
 'are': 303,
 'doing': 73,
 'did': 101,
 'get': 243,
 'interview': 4,
 'today': 112,
 'happy': 82,
 'being': 31,
 'good': 187,
 'boy': 26,
 'think': 101,
 'of': 413,
 'me': 595,
 'missing': 17,
 '2mro': 1,
 'am': 166,
 'not': 304,
 'coming': 39,
 'gym': 9,
 'machan': 3,
 'goodnight': 7,
 'small': 8,
 'problem': 31,
 'auction': 2,
 'punj': 1,
 'asking': 8,
 'tiwary': 2,
 'still': 123,
 'chance': 9,
 'if': 294,
 'search': 12,


In [12]:
# Sanity check: classify a lower-cased fragment of the message shown as row 2
# (tagged spam) in the data preview above.
model.inference('free entry in 2 a ')

'spam'

In [13]:
# Score of the fitted model on the validation split (presumably accuracy —
# confirm in model.py); the target stated at the top of the notebook is > 95%.
model.validation()

0.9748653500897666

In [14]:
# Score of the fitted model on the held-out test split (presumably accuracy —
# confirm in model.py); the target stated at the top of the notebook is > 95%.
model.test()

0.9838420107719928