In [1]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("sms_train.csv")
df_train.columns = ['Class', 'Text']
df_train.head()


Unnamed: 0,Class,Text
0,ham,convey my regards to him
1,ham,û_ anyway many good evenings to u s
2,ham,my sort code is and acc no is the bank is n...
3,ham,sorry i din lock my keypad
4,spam,hi babe its chloe how r u i was smashed on s...


In [2]:
# make all volabulary list

vocabulary = []

df_train['Text'].head()

for sms in df_train['Text']:
    for word in str(sms).split():
        vocabulary.append(word)

vocabulary = list(set(vocabulary))
#print(vocabulary)
N_vocabulary = len(vocabulary)

print("Unique words in all messages", N_vocabulary)

Unique words in all messages 7766


In [3]:
spam_messages = df_train[df_train['Class'] == 'spam']
ham_messages = df_train[df_train['Class'] == 'ham']

n_words_per_spam_message = spam_messages['Text'].str.split().apply(len)

spam_total = np.sum(n_words_per_spam_message)

print("Number of words in spam", spam_total)

n_words_per_ham_message = ham_messages['Text'].str.split().apply(len)

ham_total = np.sum(n_words_per_ham_message)
print("Number of words in ham", ham_total)



Number of words in spam 15595
Number of words in ham 56168


In [4]:
# bag of words 

word_count_per_sms = {unique_word: [0] * len(df_train['Text']) for unique_word in vocabulary}


In [5]:
# word_count_per_sms[word][i] = num   means i-th sms messages have num words.

wordlist = []
for index, sms in enumerate(df_train['Text']):
    for word in str(sms).split():
        word_count_per_sms[word][index] = word_count_per_sms[word][index] + 1
        # write code here to make the bag of words dictionary.
        

print("Number of stop" , np.sum(word_count_per_sms["stop"]))
# convert dictionary to pandas dataframe
word_counts = pd.DataFrame(word_count_per_sms)

print(word_counts.head())

df_train_wcount = pd.concat([df_train, word_counts], axis=1)
df_train_wcount.head()

# Isolating spam and ham messages 
spam_messages = df_train_wcount[df_train_wcount['Class'] == 'spam']
ham_messages = df_train_wcount[df_train_wcount['Class'] == 'ham']

#print(spam_messages.head())

Number of stop 123
   angels  shaved  ollu  green  wamma  09061790126  sashimi  moseley  pop  \
0       0       0     0      0      0            0        0        0    0   
1       0       0     0      0      0            0        0        0    0   
2       0       0     0      0      0            0        0        0    0   
3       0       0     0      0      0            0        0        0    0   
4       0       0     0      0      0            0        0        0    0   

   4goten  ...  colleg  sorting  anyways  miles  round  a30  large  arent  \
0       0  ...       0        0        0      0      0    0      0      0   
1       0  ...       0        0        0      0      0    0      0      0   
2       0  ...       0        0        0      0      0    0      0      0   
3       0  ...       0        0        0      0      0    0      0      0   
4       0  ...       0        0        0      0      0    0      0      0   

   your  dearer  
0     0       0  
1     0       0  
2

In [6]:
callspam = np.sum(spam_messages["call"])
callham = np.sum(ham_messages["call"])

print("Number of call in spam" , callspam)
print("Number of call in ham" , callham)

Number of call in spam 282
Number of call in ham 190


In [7]:
alpha = 1
# Initiate parameters
# p_wi_spam[wi] means P(wi|Spam)
p_wi_spam = {unique_word:0 for unique_word in vocabulary}
p_wi_ham = {unique_word:0 for unique_word in vocabulary}

for word in vocabulary:
    n_wi_spam = np.sum(spam_messages[word])
    p_wi_spam[word] = (n_wi_spam + alpha)/(spam_total + alpha*N_vocabulary)
    # calculate P(wi|Spam) and P(wi|Ham) under alpha = 1
    # P(wi|Spam) = (# of times the word wi occurs in spam message) + alpha / n_spam + alpha * n_volabulary
    # P(wi|Ham) = (# of times the word wi occurs in ham message) + alpha / n_ham + alpha * n_volabulary
    
for word in vocabulary:
    n_wi_ham = np.sum(ham_messages[word])
    p_wi_ham[word] = (n_wi_ham + alpha)/(ham_total + alpha*N_vocabulary)
    # calculate P(wi|Spam) and P(wi|Ham) under alpha = 1
    # P(wi|Spam) = (# of times the word wi occurs in spam message) + alpha / n_spam + alpha * n_volabulary
    # P(wi|Ham) = (# of times the word wi occurs in ham message) + alpha / n_ham + alpha * n_volabulary
    
print("P(call|spam) is ", p_wi_spam["call"])
print("P(call|ham) is ", p_wi_ham["call"])

P(call|spam) is  0.01211420743975001
P(call|ham) is  0.0029874558138079893


In [8]:
p_spam = spam_total/(spam_total + ham_total)

p_ham = ham_total/(spam_total + ham_total)

print("p(spam) is ", p_spam)
print("p(ham) is ", p_ham)

p(spam) is  0.21731254267519473
p(ham) is  0.7826874573248053


In [9]:
import math

# logP_wi_spam = {unique_word:0 for unique_word in vocabulary}
# logP_wi_ham = {unique_word:0 for unique_word in vocabulary}

# for word in vocabulary:
#     logP_wi_spam[word] = math.log(p_wi_spam[word])
    
# for word in vocabulary:
#     logP_wi_ham[word] = math.log(p_wi_ham[word])
    
# P_spam_free_video = math.log(p_spam) + logP_wi_spam["free"] + logP_wi_spam["video"]

# P_ham_free_video = math.log(p_ham) + logP_wi_ham["free"] + logP_wi_ham["video"]


# print("P(spam|free, video) is ", P_spam_free_video)
# print("P(ham|free, video) is ", P_ham_free_video)


P_spam_free_video = math.exp(math.log(p_spam) + math.log(p_wi_spam["free"]) + math.log(p_wi_spam["video"]))

P_ham_free_video = math.exp(math.log(p_ham) + math.log(p_wi_ham["free"]) + math.log(p_wi_ham["video"]))


print("P(spam|free, video) is ", P_spam_free_video)
print("P(ham|free, video) is ", P_ham_free_video)


# calculate P(Spam|w1, w2, w3, ...) and P(Ham|w1, w2, w3, ...) and compare them

# if P(Spam|w1, w2, w3, ...) >= P(Ham|w1, w2, w3, ...) : Spam message
# if P(Spam|w1, w2, w3, ...) < P(Ham|w1, w2, w3, ...) : Ham message

P(spam|free, video) is  1.6119161193766106e-06
P(ham|free, video) is  3.216872903478259e-08


In [10]:
from collections import Counter


allspam_messages = df_train[df_train['Class'] == 'spam']
allham_messages = df_train[df_train['Class'] == 'ham']

#find most common spam words

most_occur_spam = Counter(" ".join(allspam_messages["Text"]).split()).most_common(100)

print(most_occur_spam)

most_word_spam = []
for i in most_occur_spam:
    a, b = i
    most_word_spam.append(a)
print(most_word_spam)

[('to', 561), ('a', 315), ('call', 282), ('å', 249), ('you', 238), ('your', 219), ('free', 175), ('2', 168), ('now', 166), ('the', 163), ('for', 160), ('or', 146), ('u', 142), ('txt', 138), ('is', 133), ('4', 120), ('ur', 117), ('on', 113), ('have', 108), ('from', 107), ('mobile', 103), ('text', 98), ('claim', 95), ('1', 94), ('and', 91), ('stop', 90), ('with', 88), ('reply', 83), ('of', 81), ('s', 76), ('prize', 75), ('www', 74), ('this', 73), ('t', 71), ('only', 66), ('get', 66), ('in', 65), ('are', 65), ('our', 64), ('won', 63), ('just', 63), ('no', 61), ('cash', 60), ('new', 60), ('win', 57), ('send', 57), ('nokia', 55), ('i', 53), ('150p', 53), ('urgent', 52), ('uk', 51), ('out', 49), ('week', 49), ('tone', 48), ('c', 47), ('contact', 47), ('com', 46), ('50', 46), ('service', 45), ('be', 43), ('16', 42), ('18', 41), ('customer', 41), ('3', 40), ('we', 40), ('phone', 39), ('please', 39), ('msg', 38), ('guaranteed', 38), ('per', 37), ('will', 36), ('100', 36), ('co', 35), ('500', 35

In [11]:
#find most common ham words

most_occur_ham = Counter(" ".join(allham_messages["Text"]).split()).most_common(100)

print(most_occur_ham)

most_word_ham = []
for i in most_occur_ham:
    a, b = i
    most_word_ham.append(a)
print(most_word_ham)

[('i', 2305), ('you', 1531), ('to', 1223), ('the', 877), ('a', 828), ('u', 819), ('and', 684), ('in', 643), ('me', 623), ('my', 607), ('is', 567), ('it', 563), ('of', 428), ('that', 421), ('for', 394), ('but', 350), ('s', 344), ('so', 344), ('can', 340), ('have', 334), ('not', 332), ('are', 331), ('your', 329), ('on', 314), ('m', 311), ('at', 304), ('do', 303), ('t', 286), ('we', 277), ('will', 273), ('be', 271), ('if', 266), ('gt', 256), ('2', 254), ('lt', 253), ('how', 251), ('just', 245), ('no', 239), ('get', 238), ('up', 237), ('now', 235), ('when', 226), ('ok', 223), ('this', 213), ('with', 212), ('what', 205), ('ll', 202), ('all', 195), ('good', 194), ('ur', 194), ('go', 193), ('got', 192), ('call', 190), ('like', 189), ('was', 187), ('know', 183), ('then', 180), ('come', 178), ('he', 178), ('or', 177), ('its', 174), ('out', 174), ('am', 170), ('love', 167), ('day', 166), ('time', 161), ('there', 155), ('4', 140), ('she', 136), ('home', 135), ('need', 132), ('lor', 128), ('from',

In [12]:
#find intersect of list

bothspamham = list(set(most_word_spam) & set(most_word_ham))
print(bothspamham)

['you', 'get', 'me', 'it', 'out', 'for', 'of', 'on', 'from', 'send', 'we', 'have', 's', 'ur', 'i', 'as', 'all', 'just', 'the', 'this', 'call', 'are', 'u', 'and', '4', 'now', 't', 'no', '2', 'by', 'or', 'only', 'in', 'is', 'be', 'with', 'to', 'a', 'your', 'will']


In [13]:
df_test = pd.read_csv("sms_test_dist.csv")

spamham = []
for sms in df_test['Text']:
    p_wi_spam_sum = 0
    p_wi_ham_sum = 0
    for word in str(sms).split():
        if word in vocabulary:
            #don't consider words that are common in both spam and ham
            if word not in bothspamham:
                p_wi_spam_sum = math.log(p_wi_spam[word]) +  p_wi_spam_sum 
                p_wi_ham_sum = math.log(p_wi_ham[word]) +  p_wi_ham_sum 
    p_spam_wi = math.exp(math.log(p_spam) + p_wi_spam_sum)
    p_ham_wi = math.exp(math.log(p_ham) + p_wi_ham_sum)
    if(p_spam_wi >= p_ham_wi):
        spamham.append("spam")
    else:
        spamham.append("ham")

df_test1 = df_test
df_test1['Class'] = spamham

df_test1.to_csv('full_data.csv')
