In [1]:
import requests
import zipfile
import io
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, HTML
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Downloading the file

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
response = requests.get(url)

In [3]:
folder_name = url.split('/')[-1][:-4]
z = zipfile.ZipFile(io.BytesIO(response.content))
z.extractall("./" + folder_name)

In [4]:
! ls

Untitled.ipynb    [1m[36msmsspamcollection[m[m


## Exploring the data contents 

In [5]:
# Check file structure
! cd smsspamcollection; head -n 10 SMSSpamCollection; cd ../

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only

In [6]:
df = pd.read_csv("./smsspamcollection/SMSSpamCollection", 
                 sep="\t", 
                 header=None, 
                 names= ["label", "SMS"]
                )
df.head()

Unnamed: 0,label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
df.label.value_counts(normalize=True).round(2)

ham     0.87
spam    0.13
Name: label, dtype: float64

In [9]:
## Divide data into training and test

In [10]:
# Randomize entire dataframe
randomized_df = df.sample(frac=1, random_state=1)
randomized_df

Unnamed: 0,label,SMS
1078,ham,"Yep, by the pretty sculpture"
4028,ham,"Yes, princess. Are you going to make me moan?"
958,ham,Welp apparently he retired
4642,ham,Havent.
4674,ham,I forgot 2 ask ü all smth.. There's a card on ...
...,...,...
905,ham,"We're all getting worried over here, derek and..."
5192,ham,Oh oh... Den muz change plan liao... Go back h...
3980,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235,spam,Text & meet someone sexy today. U can find a d...


In [11]:
num_records = randomized_df.shape[0]
trainset_size = round(0.8*num_records)

training_df = randomized_df.iloc[:trainset_size].reset_index(drop=True)
test_df = randomized_df.iloc[trainset_size:].reset_index(drop=True)

print("TRAINING_DF SHAPE -->", training_df.shape)
print("TEST_DF SHAPE -->", test_df.shape)

TRAINING_DF SHAPE --> (4458, 2)
TEST_DF SHAPE --> (1114, 2)


## Check the distribution of Spam in training and test set

In [12]:
display(HTML("Proportion of spam messages in the three datasets:"))

pd.concat([df.label.value_counts(normalize=True).round(3),
           training_df.label.value_counts(normalize=True).round(3),
           test_df.label.value_counts(normalize=True).round(3)], 
           axis=1, 
           keys=['full_dataset', 'training_set', 'test_set']
         )

Unnamed: 0,full_dataset,training_set,test_set
ham,0.866,0.865,0.868
spam,0.134,0.135,0.132


## Letter case and punctuation

In [13]:
training_df["SMS"] = (training_df.SMS
                         .str.replace("\W", " ", regex=True)
                         .str.lower()
                     )
training_df.head()

Unnamed: 0,label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [14]:
training_df["SMS"] = training_df.SMS.str.split()
vocabulary = []

In [15]:
training_df.SMS.apply(lambda x: [vocabulary.append(item) for item in x]);

In [16]:
vocabulary = list(set(vocabulary))

In [17]:
len(vocabulary)

7783

In [18]:
word_count_per_SMS = {word: [0]* training_df.shape[0] for word in vocabulary}

In [19]:
for index, words in enumerate(training_df.SMS):
    for word in (words):
        word_count_per_SMS[word][index] += 1

In [20]:
word_count_df = pd.DataFrame(word_count_per_SMS)

In [21]:
training_set_clean = pd.concat([training_df, word_count_df], axis=1)
#pd.set_option('display.max_columns', None)

## Formulas

$$ P(Spam\mid w_1, w_2,...,w_n) \propto P(Spam) \cdot \prod\limits_{i=1}^{n}P(w_i\mid Spam)$$

$$ P(Ham\mid w_1, w_2,...,w_n) \propto P(Ham) \cdot \prod\limits_{i=1}^{n}P(w_i\mid Ham)$$



$$ P(w_i\mid Spam) = \frac{N_{(w_i\mid Spam)} + \alpha}{N_{spam} + \alpha \cdot N_{Vocabulary}} $$

$$ P(w_i\mid Ham) = \frac{N_{(w_i\mid Ham)} + \alpha}{N_{Ham} + \alpha \cdot N_{Vocabulary}} $$

## Calculating constants first

In [22]:
constants = {
    # Probability of spam
    'p_spam': training_set_clean.label.value_counts(normalize=True)['spam'],
    # Probability of non spam
    'p_ham':  training_set_clean.label.value_counts(normalize=True)['ham'],
    # number of words in all spam messages
    'n_spam': training_set_clean.groupby('label').sum().sum(axis=1)['spam'],
    # number of words in all non spam messages
    'n_ham': training_set_clean.groupby('label').sum().sum(axis=1)['ham'],
    # number of unique words in vocabulary
    'n_vocabulary': len(word_count_per_SMS),
    # Laplace smooting constant
    'alpha': 1
}
pd.DataFrame(constants, index=['values'])

Unnamed: 0,p_spam,p_ham,n_spam,n_ham,n_vocabulary,alpha
values,0.13459,0.86541,15190,57237,7783,1


## Calculating Parameters

In [23]:
spam_parameters = {item: 0 for item in vocabulary}
ham_parameters = {item: 0 for item in vocabulary}

In [24]:
spam_messages = training_set_clean.loc[training_set_clean['label'] == 'spam']
ham_messages = training_set_clean.loc[training_set_clean['label'] == 'ham']

In [25]:
for word in vocabulary:
    p_word_given_spam = ((spam_messages[word].sum() + constants['alpha'])/
                         (constants['n_spam'] + constants['alpha']*constants['n_vocabulary'])
                        )
    
    p_word_given_ham  = ((ham_messages[word].sum() + constants['alpha'])/
                         (constants['n_ham'] + constants['alpha']*constants['n_vocabulary'])
                        )
    
    spam_parameters[word] = p_word_given_spam
    ham_parameters[word] = p_word_given_ham

## Classifying a new message

In [26]:
def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()


    p_spam_given_message = constants['p_spam']
    p_ham_given_message = constants['p_ham']
    
    for word in message:
        if word in spam_parameters:
            p_spam_given_message *= spam_parameters[word]
            
        if word in ham_parameters:
            p_ham_given_message *= ham_parameters[word]
            

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [27]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [28]:
classify('Sounds good, Tom, then see u there')

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


## Measuring the spam filter's accuracy

In [29]:
def classify_test_set(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()


    p_spam_given_message = constants['p_spam']
    p_ham_given_message = constants['p_ham']
    
    for word in message:
        if word in spam_parameters:
            p_spam_given_message *= spam_parameters[word]
            
        if word in ham_parameters:
            p_ham_given_message *= ham_parameters[word]
            
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [30]:
test_df['predicted'] = test_df.SMS.apply(classify_test_set)
test_df

Unnamed: 0,label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham
...,...,...,...
1109,ham,"We're all getting worried over here, derek and...",ham
1110,ham,Oh oh... Den muz change plan liao... Go back h...,ham
1111,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...,ham
1112,spam,Text & meet someone sexy today. U can find a d...,spam


In [31]:
correct = (test_df.label == test_df.predicted).sum()
incorrect = test_df.shape[0] - correct
accuracy = correct/test_df.shape[0]

print(f'Correctly classified records: {correct}')
print(f'InCorrectly classified records: {incorrect}')
print(f'Filter accuracy: {round(accuracy, 3)}')

Correctly classified records: 1100
InCorrectly classified records: 14
Filter accuracy: 0.987


## Exploring wrong classifications

In [32]:
wrongly_classified = test_df[test_df.label != test_df.predicted]

In [33]:
for case in wrongly_classified['predicted'].unique():
    case_SMS = wrongly_classified.query('predicted == @case')
    print(f'Algorithm says {case}, human says {case_SMS.label.unique()[0]}')
    for index, sms in enumerate(case_SMS['SMS']):
        print(f'({index+1}.) {sms}')
        
    print('\n'*2)

Algorithm says ham, human says spam
(1.) Not heard from U4 a while. Call me now am here all night with just my knickers on. Make me beg for it like U did last time 01223585236 XX Luv Nikiyu4.net
(2.) More people are dogging in your area now. Call 09090204448 and join like minded guys. Why not arrange 1 yourself. There's 1 this evening. A£1.50 minAPN LS278BB
(3.) Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg £1.50
(4.) Hi babe its Chloe, how r u? I was smashed on saturday night, it was great! How was your weekend? U been missing me? SP visionsms.com Text stop to stop 150p/text
(5.) 0A$NETWORKS allow companies to bill for SMS, so they are responsible for their "suppliers", just as a shop has to give a guarantee on what they sell. B. G.
(6.) RCT' THNQ Adrian for U text. Rgds Vatian
(7.) 2/2 146tf150p
(8.) Hello. We need some posh birds and chaps to user trial prods for champneys. Can i put you down? I need your address and dob asap. T