<a href="https://colab.research.google.com/github/maryamshahani/google_colab/blob/main/SimpleSmsSpamFilter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Can you use SMSspam.csv dataset to build a prediction model that will accurately classify which texts are spam?


In [37]:
#connect to google drive:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
#dataset file path:
file_url = '/content/drive/MyDrive/GoogleColabFiles/SMSspam.csv'

In [39]:
#If you get a unicode decode error while loading dataset using Pandas: https://www.kaggle.com/paultimothymooney/how-to-resolve-a-unicodedecodeerror-for-a-csv-file
import chardet
with open(file_url, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'confidence': 0.7270322499829184, 'encoding': 'Windows-1252', 'language': ''}

# Step 1: Loading the Dataset using Pandas:

In [40]:
import pandas as pd
data = pd.read_csv(file_url,encoding='Windows-1252') # names=['LABEL', 'SMS','NO','NOO','NOOO']
data.head() 

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Step 2: Pre-Processing to make our data easiest to process


In [41]:
#load stopwords and punctuations
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

print(stopwords[:10])
print(len(stopwords))
print(stopwords[170:179])
print(punctuation[:5])
print(punctuation)
print(len(punctuation))
print(punctuation[22:32])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
179
["shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
!"#$%
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
32
[\]^_`{|}~


In [42]:
#removing stopwords and punctuations from our dataset
def pre_process(v2):
  lowercase_remove_punkt = "".join([char.lower() for char in v2 if char not in punctuation])
  tokenize = nltk.tokenize.word_tokenize(lowercase_remove_punkt)
  remove_stopwords = [word for word in tokenize if word not in stopwords]
  return remove_stopwords

 #adding a column to our data with our processed messages 
data['processed'] = data['v2'].apply(lambda x : pre_process(x))
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,processed
0,ham,"Go until jurong point, crazy.. Available only ...",,,,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,,,,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,,,,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,"[nah, dont, think, goes, usf, lives, around, t..."


In [43]:
data['processed'].head()

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, 2, wkly, comp, win, fa, cup, fin...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: processed, dtype: object

# Step 3: Categorizing and Counting Tokens

In [44]:
#categorizing ham/spam associated words
def categorize_words():
  spam_words = []
  ham_words = []
  
  for x in data['processed'][data['v1'] == 'spam']:
    for word in x:
      spam_words.append(word)

  for x in data['processed'][data['v1'] == 'ham']:
    for word in x:
      ham_words.append(word)

  return spam_words, ham_words

spam_words, ham_words = categorize_words()

print(spam_words[:6])
print(ham_words[:6])

['free', 'entry', '2', 'wkly', 'comp', 'win']
['go', 'jurong', 'point', 'crazy', 'available', 'bugis']


# Step 4: Predict Function


In [45]:
#itterate from all the words from the user input and count their accurances in both spam_words and ham_words
def predict(user_input):
  spam_counter = 0
  ham_counter = 0

  for word in user_input:
    spam_counter += spam_words.count(word)
    ham_counter += ham_words.count(word)

  print('***RESULTS***')
  #if the message is ham
  if ham_counter > spam_counter:
      accuracy = round((ham_counter / (ham_counter + spam_counter) * 100))
      print('messege is not spam, with {}% certainty'.format(accuracy))
  #if the message could be equally spam and ham
  elif ham_counter == spam_counter:
      print('message could be spam')
  #if the message is spam
  else:
      accuracy = round((spam_counter / (ham_counter + spam_counter)* 100))
      print('message is spam, with {}% certainty'.format(accuracy))


#Collecting User Input & Results
user_input = input("Please type a spam or ham message to check if our function predicts accurately\n") 

#pre-processing the input before prediction
processed_input = pre_process(user_input)

predict(processed_input)  


Please type a spam or ham message to check if our function predicts accurately
my name is Jack Capitan Jack
***RESULTS***
messege is not spam, with 73% certainty
