In [1]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as ply
%matplotlib inline
import os
import pandas as pd
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

import re

## Get data

In [2]:
home = os.path.expanduser('~')
spam_data_location = f"{home}/Dropbox/My-Portfolio/DataScience/Data/SMSSpamCollection.csv"

spam_data = pd.read_csv(spam_data_location, sep='\t', names=['Label','SMS'])[['SMS','Label']]
print(spam_data.shape)
spam_data.head()

(5572, 2)


Unnamed: 0,SMS,Label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


### Probability of getting a spam or ham message

In [3]:
prob_spam_ham = spam_data['Label'].value_counts(normalize=True).round(2).to_dict()
prob_spam_ham

{'ham': 0.87, 'spam': 0.13}

## Train test split

In [4]:
train = spam_data.sample(frac=0.8, random_state=1)
test = spam_data[~spam_data.index.isin(train.index)]
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)
print('training samples',train.shape[0])
print('training class balance',train['Label'].value_counts(normalize=True).round(2).to_dict())
print('testing samples',test.shape[0])
print('testing class balance',test['Label'].value_counts(normalize=True).round(2).to_dict())

training samples 4458
training class balance {'ham': 0.87, 'spam': 0.13}
testing samples 1114
testing class balance {'ham': 0.87, 'spam': 0.13}


## Extract words for anyalsis

In [5]:
def filter_words(words, vocabulary):
    words = [w for w in words if len(w) > 1]
    vocabulary.update(set(words))
    return words

def extract_words(df):
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame([{'SMS':df}])
    # remove any strings that are not words
    df['SMS'] = df['SMS'].str.lower().str.replace('\W',' ')
    df['SMS'] = df['SMS'].str.findall('[a-z ]').str.join('').str.split(' ')
    # gather a set of all unique words in entire dataset
    vocabulary = set()
    df['SMS'] = df['SMS'].map(lambda x: filter_words(x, vocabulary))
    return sorted(list(set(vocabulary))), df
    
def create_word_cols(vocabulary, df):
    # create dummy columns for each word
    for word in vocabulary:
        df[word] = df['SMS'].astype(str).str.count(word)
    return vocabulary, df

vocabulary, train = extract_words(train)
vocabulary, train = create_word_cols(vocabulary, train)
print('vocabulary:',vocabulary[10:20])
train.head()

vocabulary: ['abi', 'ability', 'abiola', 'abj', 'able', 'abnormally', 'about', 'aboutas', 'above', 'abroad']


Unnamed: 0,SMS,Label,aa,aah,aaniye,aathi,ab,abbey,abdomen,abeg,abel,aberdeen,abi,ability,abiola,abj,able,abnormally,about,aboutas,above,abroad,absolutely,abt,abta,aburo,abuse,abusers,ac,academic,acc,accent,accenture,accept,access,accessible,accidant,accident,accidentally,accommodation,accommodationvouchers,accomodate,accomodations,accordin,accordingly,account,accounting,accounts,accumulation,achan,...,yijue,ym,ymca,yo,yoga,yogasana,yor,yorge,you,youdoing,youi,young,younger,youphone,your,youre,yourinclusive,yourjob,yours,yourself,youuuuu,youwanna,yoville,yowifes,yoyyooo,yr,yrs,ystrday,ything,yummy,yun,yunny,yuo,yuou,yup,yupz,zac,zaher,zealand,zebra,zed,zeros,zf,zhong,zindgi,zoe,zogtorius,zouk,zs,zyada
0,"[yep, by, the, pretty, sculpture]",ham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"[yes, princess, are, you, going, to, make, me,...",ham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"[welp, apparently, he, retired]",ham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,[havent],ham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"[forgot, ask, all, smth, there, card, on, da, ...",ham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Train

### Number of occourances of each word in either spam on ham messages

In [6]:
spam_ham = train.groupby('Label').sum().transpose()
spam_ham.head()

Label,ham,spam
aa,31,1
aah,5,0
aaniye,1,0
aathi,4,0
ab,408,36


### Total number of words (not unuque) in spam and ham messages

In [7]:
total_nums = spam_ham.sum(axis=0).to_dict()
total_nums

{'ham': 188704, 'spam': 51786}

### Calculate Naive Bayes probabilities for each word

In [8]:
p_ham, p_spam = prob_spam_ham.values()
n_ham, n_spam = total_nums.values()
alpha = 1

In [9]:
ham_words = (spam_ham['ham'] + alpha) / (n_ham + (1 * len(vocabulary)))
spam_words = (spam_ham['spam'] + alpha) / (n_spam + (1 * len(vocabulary)))
spam_words

aa           0.000034
aah          0.000017
aaniye       0.000017
aathi        0.000017
ab           0.000630
               ...   
zoe          0.000017
zogtorius    0.000017
zouk         0.000034
zs           0.000034
zyada        0.000017
Name: spam, Length: 6954, dtype: float64

In [10]:
ham_words = ham_words.to_dict()
spam_words = spam_words.to_dict()

In [11]:
def message_probability(words):
    words = list(words)
    spam_prob = 1
    ham_prob = 1
    for w in words:
        spam_prob *= spam_words.setdefault(w,1)
        ham_prob *= ham_words.setdefault(w,1)
    return {'spam_prob':spam_prob,'ham_prob':ham_prob}

vocab, df = extract_words(test)
probabilities = list(df['SMS'].map(message_probability).values)
df = pd.concat([df, pd.DataFrame(probabilities)],axis=1)
df['prediction'] = (df['spam_prob'] > df['ham_prob'])
df['prediction_bool'] = df['prediction'].map({True:'spam',False:'ham'})
df

Unnamed: 0,SMS,Label,spam_prob,ham_prob,prediction
0,"[free, entry, in, wkly, comp, to, win, fa, cup...",spam,5.736256e-78,1.025116e-92,spam
1,"[have, date, on, sunday, with, will]",ham,2.224103e-19,2.419818e-19,ham
2,"[xxxmobilemovieclub, to, use, your, credit, cl...",spam,4.890696e-61,2.002799e-68,spam
3,"[fine, if, that, the, way, feel, that, the, wa...",ham,3.670827e-36,9.398280e-29,ham
4,"[is, that, seriously, how, you, spell, his, name]",ham,8.681442e-28,4.900348e-26,ham
...,...,...,...,...,...
1109,"[ic, there, are, lotta, childporn, cars, then]",ham,3.536286e-18,1.128531e-16,ham
1110,"[you, know, wot, people, wear, shirts, jumpers...",ham,1.460457e-39,9.775078e-37,ham
1111,"[have, safe, trip, to, nigeria, wish, you, hap...",ham,2.300133e-58,1.499233e-55,ham
1112,"[yeh, indians, was, nice, tho, it, did, kane, ...",ham,2.237745e-98,3.018767e-88,ham


## Model accuracy

In [None]:
#### accurancy = # correctly classified messages / total number of messages

In [49]:
accuracy = (df['prediction'] == df['Label']).sum(axis=0) / len(df)
print('accuracy:', round(accuracy * 100,1),"%")

accuracy: 94.8 %


### Classification report

In [47]:
from sklearn.metrics import classification_report

df['prediction_bool'] = df['prediction'].map({'spam':True,'ham':False})
df['label_bool'] = df['Label'].map({'spam':True,'ham':False})

report = classification_report(df['label_bool'].values, df['prediction_bool'].values, target_names=['ham','spam'])
print(report)

              precision    recall  f1-score   support

         ham       0.99      0.95      0.97       967
        spam       0.73      0.95      0.83       147

   micro avg       0.95      0.95      0.95      1114
   macro avg       0.86      0.95      0.90      1114
weighted avg       0.96      0.95      0.95      1114



In [52]:
from sklearn.metrics import precision_recall_curve

disp = precision_recall_curve(df['label_bool'].values, df['prediction_bool'].values)
disp

(array([0.13195691, 0.73298429, 1.        ]),
 array([1.        , 0.95238095, 0.        ]),
 array([False,  True]))