In [361]:
import pandas as pd
import numpy as np

In [362]:
with open('../datasets/sms_spam/SMSSpamCollection.txt') as infile:
    data = pd.read_csv(infile, delimiter='\t', header=-1)

data.columns = ['spam', 'message']
data['spam'] = (data['spam'] == 'spam').astype(int)

In [363]:
data.head()

Unnamed: 0,spam,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Spam-filter

## Ord som aktiverer spam-filteret

In [382]:
free_list = ['free']
winner_list =  ['winner', 'win', 'won', 'award', 'selected'] #+ ['prize', 'www', 'awarded', 'cash']
congrats_list = ['congrats', 'congratulations'] #+ ['draw', 'free', 'claim', 'gift', 'guaranteed']
adult_list = ['xxx', 'babe', 'naked', 'dirty', 'flirty']
attention_list = ['urgent', 'attention', 'bonus', 'immediately']
ringtone_list = ['ringtone'] #+ ['unsubscribe', 'subscription', 'subscribe']
 
variable_names = ['free', 'winner', 'congrats', 'adult', 'attention', 'ringtone']
lists = (free_list, winner_list, congrats_list, attention_list, ringtone_list, news_list)
spam_lists = {}
for name, wordlist in zip(variable_names, lists):
    spam_lists[name] = wordlist

## Sett variablene til å være 'nei' som standard

In [383]:
for name in variable_names:
    data[name] = 0
data.head()

Unnamed: 0,spam,message,free,winner,congrats,adult,attention,ringtone
0,0,"Go until jurong point, crazy.. Available only ...",0,0,0,0,0,0
1,0,Ok lar... Joking wif u oni...,0,0,0,0,0,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,0,0,0,0,0,0
3,0,U dun say so early hor... U c already then say...,0,0,0,0,0,0
4,0,"Nah I don't think he goes to usf, he lives aro...",0,0,0,0,0,0


## Finnes ordene i meldingene?

In [384]:
from collections import Counter
word_counter = {}

for index, data_item in data.iterrows():
    message = data_item['message'].lower()
    
    for variable_name, word_list in spam_lists.items():
        for word in word_list:
            if word in message:
                word_counter[word] = word_counter.get(word, 0) + 1
        if any(word in message for word in word_list):
            data.loc[index, variable_name] = 1

data.head()
Counter(word_counter).most_common()

[('free', 265),
 ('win', 167),
 ('won', 167),
 ('urgent', 69),
 ('award', 59),
 ('ringtone', 40),
 ('selected', 29),
 ('subscribe', 28),
 ('winner', 23),
 ('bonus', 21),
 ('unsubscribe', 19),
 ('congrats', 19),
 ('congratulations', 15),
 ('immediately', 10),
 ('attention', 1)]

## Oppdeling i trening- og testsett

In [385]:
from sklearn.model_selection import train_test_split

np.random.seed(0)
train, test = train_test_split(data, test_size=0.2)

train_vars = train.drop(['spam', 'message'], axis=1)
train_response = train['spam']
test_vars = test.drop(['spam', 'message'], axis=1)
test_response = test['spam']

# Besluttningstrær

In [386]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(train_vars, train_response)

tree_predict = tree_model.predict(test_vars)
sum(tree_predict == test_response)/len(test)

0.90672645739910318

In [401]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_response, tree_predict)

array([[941,  14],
       [ 90,  70]])

## Logistisk regresjon

In [404]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model.fit(train_vars, train_response)
log_predict = log_model.predict(test_vars)

sum(log_predict == test_response)/len(test)

0.90672645739910318

In [403]:
confusion_matrix(test_response, tree_predict)

array([[941,  14],
       [ 90,  70]])

In [412]:
spam_weight = len(train[train['spam'] == 0])/len(train[train['spam'] == 1])

log_model_weight = LogisticRegression(class_weight={0: spam_weight})

log_model_weight.fit(train_vars, train_response)
log_weight_predict = log_model_weight.predict(test_vars)
sum(log_weight_predict == test_response)/len(test)

0.88968609865470849

## Forbedring av filteret

In [390]:
import re
from collections import Counter

def show_common_words(category, n=None):
    all_relevant_messages = ''
    for index, row in data[data[category] == 1].iterrows():
        all_relevant_messages += ' ' + row['message'].lower()
    
    words = re.findall(r'\w+', all_relevant_messages)
    counter = Counter(words)
    print('Word\tCount')
    for word, count in counter.most_common(n):
        print('%-12s%-12i' % (word, count))

In [392]:
show_common_words('adult')

Word	Count
urgent      70          
to          65          
call        62          
a           61          
your        44          
you         39          
prize       36          
from        31          
claim       30          
is          28          
i           24          
won         24          
2           23          
4           22          
bonus       21          
u           20          
contact     20          
150ppm      19          
or          19          
guaranteed  19          
awarded     18          
was         18          
mobile      18          
we          18          
only        17          
t           17          
collection  17          
have        17          
no          17          
holiday     16          
000         16          
are         15          
line        15          
land        15          
valid       15          
shows       15          
landline    14          
12hrs       14          
sae         14          
cash        14