In [1]:
import pandas as pd
import nltk
from nltk import word_tokenize
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suryakant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def clean_text(text):
  """Cleans the given text by removing stop words, punctuation, and normalizing the text.

  Args:
    text: The text to be cleaned.

  Returns:
    The cleaned text.
  """

  # Remove stop words.
  stop_words = nltk.corpus.stopwords.words("english")
  text = " ".join([word for word in text.split() if word not in stop_words])

  # Remove punctuation.
  text = text.replace(",", " ").replace(".", " ").replace("!", " ").replace("?", " ")

  # Normalize the text.
  text = text.lower()

  return text


In [3]:
if __name__ == "__main__":
   # Load the sms_spam.csv dataset.
  df = pd.read_csv("sms_spam.csv")

  # Clean the text in the dataset.
  df["text"] = df["text"].apply(clean_text)

  # Save the cleaned dataset to a new file.
  df.to_csv("sms_spam_cleaned.csv", index=False)

In [4]:
df

Unnamed: 0,type,text
0,ham,go jurong point crazy available bugis n gre...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah i think goes usf lives around though
...,...,...
5569,spam,this 2nd time tried 2 contact u u £750 pound ...
5570,ham,will ü b going esplanade fr home
5571,ham,pity * mood that so any suggestions
5572,ham,the guy bitching i acted like i'd interested b...


In [5]:
df = df.sample(frac=1, random_state=42)

# Split the data into a training set and a testing set
train_size = int(len(df) * 0.9)
train_df = df.head(train_size)
test_df = df.tail(len(df) - train_size)

In [6]:
train_df

Unnamed: 0,type,text
3690,ham,you still coming tonight
3527,ham,"""hey babe far 2 spun-out 2 spk at da mo de..."
724,ham,ya even cookies jelly
3370,ham,sorry i've gone place i ll tomorrow really s...
468,ham,when going ride bike
...,...,...
1680,ham,"today ""song dedicated day "" which song u dedi..."
499,ham,kate jackson rec center 7ish right
5058,ham,hey next sun 1030 there's basic yoga course ...
1856,ham,k :)you girl waiting reception ah


In [7]:
test_df

Unnamed: 0,type,text
3719,ham,cool do like swimming i pool jacuzzi house
2386,spam,someone contacted dating service entered phone...
3236,ham,yes princess toledo
1540,ham,you're sure i'm trying make xavier smoke i wan...
1217,spam,you 1 new voicemail please call 08719181513
...,...,...
3772,ham,hi wlcome back wonder got eaten lion somethi...
5191,spam,ree entry 2 weekly comp chance win ipod txt p...
5226,ham,"""oh fuck juswoke up in a bed on a boatin the ..."
5390,ham,not much no fights it was a good nite


In [8]:
spam={}
ham={}
for type1,sentence in train_df.iterrows():
    stri=sentence['text']
    tokens = nltk.word_tokenize(stri)
    if sentence['type'] == "spam":
        for token in tokens:
            if token in spam:
                spam[token] += 1
            else:
                spam[token]=1
    else:
        for token in tokens:
            if token in ham:
                ham[token] += 1
            else:
                ham[token]=1

                    
 

In [9]:
all_words={}
for token in spam:
    if token in all_words:
                all_words[token] += 1
    else:
                 all_words[token]=1
for token in ham:
    if token in all_words:
                all_words[token] += 1
    else:
                all_words[token]=1

In [10]:
spam

{'free': 200,
 'message': 29,
 'activate': 4,
 '500': 22,
 'text': 106,
 'messages': 11,
 'replying': 5,
 'word': 23,
 'for': 32,
 'terms': 6,
 '&': 164,
 'conditions': 3,
 'visit': 6,
 'www': 73,
 '07781482378': 2,
 'com': 46,
 't-mobile': 3,
 'customer': 46,
 'may': 6,
 'claim': 102,
 'camera': 31,
 'phone': 37,
 'upgrade': 3,
 'pay': 1,
 'go': 24,
 'sim': 2,
 'card': 4,
 'loyalty': 7,
 'call': 309,
 '0845': 3,
 '021': 2,
 '3680': 2,
 'offer': 26,
 'ends': 4,
 '28thfeb': 1,
 't': 66,
 'c': 47,
 "'s": 65,
 'apply': 26,
 'kit': 1,
 'strip': 1,
 '-': 54,
 'billed': 2,
 '150p': 21,
 'netcollex': 4,
 'ltd': 14,
 'po': 30,
 'box': 32,
 '1013': 1,
 'ig11': 1,
 'oja': 1,
 'urgent': 57,
 'this': 35,
 '2nd': 18,
 'attempt': 20,
 'contact': 51,
 'u': 141,
 'your': 73,
 '£900': 7,
 'prize': 87,
 'yesterday': 3,
 'still': 6,
 'awaiting': 8,
 'collection': 25,
 'to': 79,
 'now': 118,
 '09061702893': 2,
 'acl03530150pm': 2,
 'germany': 3,
 '1': 40,
 'pence': 3,
 'per': 41,
 'minute': 7,
 'fixed': 2

In [11]:
all_words

{'free': 2,
 'message': 2,
 'activate': 1,
 '500': 1,
 'text': 2,
 'messages': 2,
 'replying': 2,
 'word': 2,
 'for': 2,
 'terms': 2,
 '&': 2,
 'conditions': 2,
 'visit': 2,
 'www': 2,
 '07781482378': 1,
 'com': 2,
 't-mobile': 1,
 'customer': 2,
 'may': 2,
 'claim': 1,
 'camera': 2,
 'phone': 2,
 'upgrade': 1,
 'pay': 2,
 'go': 2,
 'sim': 2,
 'card': 2,
 'loyalty': 1,
 'call': 2,
 '0845': 1,
 '021': 1,
 '3680': 1,
 'offer': 2,
 'ends': 2,
 '28thfeb': 1,
 't': 2,
 'c': 2,
 "'s": 2,
 'apply': 2,
 'kit': 1,
 'strip': 1,
 '-': 2,
 'billed': 1,
 '150p': 1,
 'netcollex': 1,
 'ltd': 1,
 'po': 2,
 'box': 2,
 '1013': 1,
 'ig11': 1,
 'oja': 1,
 'urgent': 2,
 'this': 2,
 '2nd': 2,
 'attempt': 2,
 'contact': 2,
 'u': 2,
 'your': 2,
 '£900': 1,
 'prize': 1,
 'yesterday': 2,
 'still': 2,
 'awaiting': 1,
 'collection': 1,
 'to': 2,
 'now': 2,
 '09061702893': 1,
 'acl03530150pm': 1,
 'germany': 1,
 '1': 2,
 'pence': 1,
 'per': 2,
 'minute': 2,
 'fixed': 2,
 'line': 2,
 'via': 2,
 'access': 2,
 'numbe

In [12]:
ham

{'you': 376,
 'still': 137,
 'coming': 46,
 'tonight': 53,
 '``': 203,
 'hey': 94,
 'babe': 65,
 'far': 14,
 '2': 278,
 'spun-out': 1,
 'spk': 3,
 'at': 42,
 'da': 130,
 'mo': 9,
 'dead': 8,
 'wrld': 1,
 'been': 10,
 'sleeping': 17,
 'on': 35,
 'sofa': 7,
 'all': 50,
 'day': 178,
 'had': 11,
 'a': 79,
 'cool': 38,
 'nytho': 1,
 'tx': 1,
 '4': 170,
 'fonin': 1,
 'hon': 3,
 'call': 217,
 '2mwen': 1,
 'im': 73,
 'bk': 2,
 'frmcloud': 1,
 '9': 27,
 'j': 6,
 'x': 39,
 "''": 99,
 'ya': 53,
 'even': 54,
 'cookies': 2,
 'jelly': 1,
 'sorry': 138,
 'i': 1989,
 "'ve": 70,
 'gone': 12,
 'place': 49,
 'll': 31,
 'tomorrow': 71,
 'really': 80,
 'when': 57,
 'going': 151,
 'ride': 5,
 'bike': 1,
 'daddy': 6,
 'shu': 2,
 'looking': 18,
 'u': 913,
 'wan': 74,
 'tell': 105,
 "'re": 37,
 'singapore': 1,
 'wat': 95,
 'do': 167,
 "n't": 105,
 'think': 113,
 'about': 16,
 'what': 126,
 'have': 86,
 'got': 224,
 'how': 170,
 'use': 37,
 'it': 251,
 'that': 149,
 'good': 212,
 'ni8': 12,
 'the': 85,
 'lay': 

In [13]:
len(all_words)
# len(ham)
# len(spam)

8549

In [14]:
values = list(ham.values())
count_ham= sum(values)
values = list(spam.values())
count_spam= sum(values)
vocab=len(all_words)
count_spam



13482

Prior probAbilities

In [15]:
prob_spam= len(spam)/(len(ham)+len(spam))
prob_ham= len(ham)/(len(ham)+len(spam))

In [16]:
cond_prob_spam={}
for item in spam:
    cond_prob_spam[item]=(spam[item]+1)/(count_spam+vocab)
cond_prob_ham={}
for item in ham:
    cond_prob_ham[item]=(ham[item]+1)/(count_spam+vocab)

In [17]:
cond_prob_spam

{'free': 0.009123507784485497,
 'message': 0.001361717579773955,
 'activate': 0.0002269529299623258,
 '500': 0.0010439834778266988,
 'text': 0.004856792701193773,
 'messages': 0.0005446870319095819,
 'replying': 0.00027234351595479097,
 'word': 0.0010893740638191639,
 'for': 0.0014978893377513504,
 'terms': 0.00031773410194725616,
 '&': 0.007489446688756752,
 'conditions': 0.00018156234396986065,
 'visit': 0.00031773410194725616,
 'www': 0.003358903363442422,
 '07781482378': 0.00013617175797739548,
 'com': 0.0021333575416458625,
 't-mobile': 0.00018156234396986065,
 'customer': 0.0021333575416458625,
 'may': 0.00031773410194725616,
 'claim': 0.004675230357223912,
 'camera': 0.0014524987517588852,
 'phone': 0.0017248422677136762,
 'upgrade': 0.00018156234396986065,
 'pay': 9.078117198493032e-05,
 'go': 0.0011347646498116291,
 'sim': 0.00013617175797739548,
 'card': 0.0002269529299623258,
 'loyalty': 0.0003631246879397213,
 'call': 0.0140710816576642,
 '0845': 0.00018156234396986065,
 '0

In [18]:
import random
# Create a list of all the sentences in the dataframe
sentences = test_df['text'].tolist()
classes=test_df['type'].tolist()
# Generate a random number between 0 and the number of sentences in the list
random_index = random.randint(0, len(sentences))

# Use the list index to access the sentence at the random index
random_sentence = sentences[random_index]
random_class =classes[random_index]

# Print the sentence
print(random_sentence +'"class"'+random_class)

st andre  virgil's cream"class"ham


In [19]:
# checking class of random selected sentence.
prob_test_spam=prob_spam
prob_test_ham=prob_ham
tokens = nltk.word_tokenize(random_sentence)
for token in tokens:
    if token in ham:
        prob_test_ham*=cond_prob_ham[token]
    else:
        prob_test_ham*=(1/(count_ham+vocab))
for token in tokens:
    if token in spam:
        prob_test_spam*=cond_prob_spam[token]
    else:
        prob_test_spam*=(1/(count_spam+vocab))
print("sentence taken: "+ random_sentence + " \n Belongs to class: "+ random_class)
print("probaility of sentence being ham: ")
print(prob_test_ham) 
print("Probablity of test being spam: ")
print(prob_test_spam)
if(prob_test_ham>prob_test_spam):
    print("Sentence predicted to be of class ham")
else:
    print("sentence predicted to be of class spam")

sentence taken: st andre  virgil's cream 
 Belongs to class: ham
probaility of sentence being ham: 
2.0222482420077829e-19
Probablity of test being spam: 
1.1332583553985182e-20
Sentence predicted to be of class ham


In [22]:
true_ham=0
true_spam=0
false_ham=0
false_spam=0
for type1,sentence in test_df.iterrows():
    stri=sentence['text']
    class_og= sentence['type']
    prob_test_spam=prob_spam
    prob_test_ham=prob_ham
    tokens = nltk.word_tokenize(stri)
    for token in tokens:
        if token in ham:
            prob_test_ham*=cond_prob_ham[token]
        else:
            prob_test_ham*=(1/(count_ham+vocab))
    for token in tokens:
        if token in spam:
            prob_test_spam*=cond_prob_spam[token]
        else:
            prob_test_spam*=(1/(count_spam+vocab))
    if(prob_test_ham>prob_test_spam):
        if(class_og=='ham'):
            true_ham+=1
        else:
            false_ham+=1
    else:
        if(class_og=='spam'):
            true_spam+=1
        else:
            false_spam+=1

In [23]:
accuracy=(true_spam+true_ham)/(true_ham+true_spam+false_spam+false_ham)
accuracy

0.96415770609319