In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string

### Read Data

In [2]:
df = pd.read_csv('spam.csv', encoding="ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Renaming the coulmns and dropping the irrelevant columns

In [3]:
df.rename(columns={'v1': 'spam', 'v2': 'text' },inplace =True)
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1, inplace=True)

### Converting 'spam' column to boolean

In [4]:
df.spam = df.spam.apply(lambda s: True if s=='spam' else False)

### Lowercase the text and remove punctuations

In [5]:
df.text = df.text.apply(lambda t: t.lower().translate(str.maketrans('', '', string.punctuation)))

In [6]:
# shuffling the dataset
df = df.sample(frac=1)

In [7]:
df

Unnamed: 0,spam,text
3442,False,your board is working fine the issue of overhe...
5066,True,83039 62735å£450 uk break accommodationvoucher...
1252,False,yeah like if it goes like it did with my frien...
4591,False,right it wasnt you who phoned it was someone w...
3804,False,dude while were makin those weirdy brownies my...
...,...,...
328,False,cool text me when youre parked
2768,False,i am on the way to ur home
2189,False,yup i shd haf ard 10 pages if i add figures ìï...
4059,True,this weeks savamob member offers are now acces...


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5572 entries, 3442 to 4256
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   spam    5572 non-null   bool  
 1   text    5572 non-null   object
dtypes: bool(1), object(1)
memory usage: 92.5+ KB


In [9]:
df.describe()

Unnamed: 0,spam,text
count,5572,5572
unique,2,5142
top,False,sorry ill call later
freq,4825,30


In [10]:
df.spam.value_counts()

False    4825
True      747
Name: spam, dtype: int64

### Divide the dataset into Train set and Test set

In [11]:
# Train set
train_df = df.iloc[:int(len(df)*0.7)]

# Test set
test_df = df.iloc[int(len(df)*0.7):]

In [12]:
# Fraction of spam in the train set
frac_spam_texts = train_df.spam.mean()
print(frac_spam_texts)

0.13384615384615384


### Create Spam Bag of Words and Non-Spam Bag of Words

In [13]:
# Get all the words from spam and non-spam datasets
train_spam_words = ' '.join(train_df[train_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_df[train_df.spam == False].text).split(' ')

common_words = set(train_spam_words).intersection(set(train_non_spam_words))

In [14]:
train_spam_bow = dict()
for w in common_words:
  train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [15]:
train_non_spam_bow = dict()
for w in common_words:
  train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

In [16]:
def predict_text(t, verbose=False):
    # remove words that are neither in spam norin non-spam
    valid_words = [w for w in t if w in train_spam_bow]
    
    # probabilities of every valid word in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)
     
    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(frac_spam_texts)
    
    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-frac_spam_texts)
    
    #if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)
   
    #if spam score is higher, mark this as spam
    return (spam_score >= non_spam_score)

In [17]:
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob      ratio
0  urgent   0.003296       0.000042  79.185370
1    call   0.019373       0.003538   5.475947
2    this   0.004582       0.003351   1.367540
3  number   0.001929       0.000895   2.155926
Spam Score: -23.30627961103032
Non-Spam Score: -28.592298783826827


True

In [18]:
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)

      word  spam_prob  non_spam_prob     ratio
0      hey   0.000161       0.001623  0.099044
1       do   0.001206       0.005307  0.227218
2      you   0.014871       0.026721  0.556542
3     want   0.001367       0.002539  0.538245
4       to   0.038344       0.022413  1.710778
5       go   0.001929       0.003746  0.515027
6        a   0.021222       0.015504  1.368796
7    movie   0.000080       0.000312  0.257513
8  tonight   0.000080       0.000853  0.094212
Spam Score: -60.492829565694336
Non-Spam Score: -50.094417151257524


False

In [19]:
predict_text('offer for unlimited money call now'.split(), verbose=True)

        word  spam_prob  non_spam_prob      ratio
0      offer   0.001367       0.000062  21.888639
1        for   0.011334       0.007242   1.565060
2  unlimited   0.000723       0.000062  11.588103
3      money   0.000241       0.000812   0.297131
4       call   0.019373       0.003538   5.475947
5        now   0.010932       0.004495   2.432071
Spam Score: -37.107836522328704
Non-Spam Score: -42.59986140782282


True

In [20]:
predict_text('are you at class yet'.split(), verbose=True)

    word  spam_prob  non_spam_prob     ratio
0    are   0.004180       0.005723  0.730402
1    you   0.014871       0.026721  0.556542
2     at   0.000965       0.005702  0.169169
3  class   0.000161       0.000562  0.286126
4    yet   0.000161       0.000749  0.214594
Spam Score: -36.111625521133305
Non-Spam Score: -28.776897426191336


False

In [21]:
predictions = test_df.text.apply(lambda t: predict_text(t.split()))

In [22]:
frac_spam_messages_correctly_detected = np.sum((predictions == True) & (test_df.spam == True)) / np.sum(test_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected)

Fraction Spam Correctly Detected: 0.9288888888888889


In [23]:
frac_valid_sent_to_spam = np.sum((predictions == True) & (test_df.spam == False)) / np.sum(test_df.spam == False)
print('Fraction Valid Messages Sent to Spam: %s'%frac_valid_sent_to_spam)

Fraction Valid Messages Sent to Spam: 0.028334485141672427
