## Import the Dependencies

In [26]:
import pandas as pd
import numpy as np
import nltk                # natural language tool kit
import matplotlib.pyplot as plt

## Data collection and processing

In [27]:
messages = pd.read_csv(r"C:\Users\lenovo\Python data\Imarticus Learning\spam1.csv", encoding='cp1252')

In [28]:
# Printing the first 5 rows of the dataframe
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [29]:
# Checking the number of rows and columns
messages.shape

(6776, 5)

In [30]:
# Check the number of missing values in each columns
messages.isnull().sum()

v1               0
v2               0
Unnamed: 2    6720
Unnamed: 3    6760
Unnamed: 4    6768
dtype: int64

In [31]:
# There are so many nulls in last three columns so just drop them

In [32]:
messages = messages.iloc[:,[0,1]]
messages.head(3)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [33]:
# Rename the columns
messages.rename(columns={'v1':'label','v2':'message'},inplace=True)

In [34]:
messages.head(2)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [35]:
# checking the distribution of categorical data
messages.label.value_counts()

ham     5854
spam     922
Name: label, dtype: int64

In [36]:
messages.label.replace({'spam':1,'ham':0}, inplace=True)

In [37]:
messages.label.value_counts()

0    5854
1     922
Name: label, dtype: int64

- whenever you are doing NLP either convert all data into small case or upper case

### NLP data cleaning start

In [38]:
# covert all data in lower case
messages.message = messages.message.str.lower()
messages.message

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
6771    this is the 2nd time we have tried 2 contact u...
6772                will ì_ b going to esplanade fr home?
6773    pity, * was in mood for that. so...any other s...
6774    the guy did some bitching but i acted like i'd...
6775                           rofl. its true to its name
Name: message, Length: 6776, dtype: object

In [39]:
from nltk.corpus import stopwords

In [40]:
# We are just check the stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [41]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
import string

In [43]:
string.punctuation # leter we will remove them also

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [44]:
# we will remove the stop word, punctuation and 
# tokenization (assign id to each word)
# create a tdm
# then do sampling, 
# build the model, 
# do the prediction


In [45]:
def text_process(mess):            ### creating a function
    """                                                        ## a docstring
    1. remove the punctuation
    2. remove the stopwords
    3. return the list of clean textwords
    
    """
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = "".join(nopunc)
    
    return [ word for word in nopunc.split() if word not in stopwords.words("english")]


In [46]:
# messages['message'].apply(text_process) # only for explination purpose

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
# it is used to give count for each unique word

In [48]:
after_trans = CountVectorizer(analyzer=text_process).fit(messages['message'])

In [49]:
after_trans.vocabulary_

{'go': 3744,
 'jurong': 4640,
 'point': 6371,
 'crazy': 2459,
 'available': 1373,
 'bugis': 1836,
 'n': 5620,
 'great': 3841,
 'world': 9100,
 'la': 4799,
 'e': 2984,
 'buffet': 1834,
 'cine': 2174,
 'got': 3801,
 'amore': 1140,
 'wat': 8863,
 'ok': 5936,
 'lar': 4838,
 'joking': 4608,
 'wif': 8996,
 'u': 8527,
 'oni': 5968,
 'free': 3535,
 'entry': 3119,
 '2': 414,
 'wkly': 9052,
 'comp': 2290,
 'win': 9010,
 'fa': 3257,
 'cup': 2514,
 'final': 3381,
 'tkts': 8304,
 '21st': 434,
 'may': 5283,
 '2005': 421,
 'text': 8143,
 '87121': 836,
 'receive': 6769,
 'questionstd': 6660,
 'txt': 8511,
 'ratetcs': 6713,
 'apply': 1226,
 '08452810075over18s': 71,
 'dun': 2970,
 'say': 7123,
 'early': 2991,
 'hor': 4171,
 'c': 1885,
 'already': 1114,
 'nah': 5627,
 'dont': 2878,
 'think': 8217,
 'goes': 3758,
 'usf': 8657,
 'lives': 5000,
 'around': 1277,
 'though': 8236,
 'freemsg': 3543,
 'hey': 4067,
 'darling': 2578,
 '3': 518,
 'weeks': 8916,
 'word': 9086,
 'back': 1424,
 'id': 4290,
 'like': 4

In [50]:
len(after_trans.vocabulary_)
# when we create TDM it should have 9422 columns and 
# 6776 rows

9422

In [51]:
TdmSpamData = after_trans.transform(messages['message'])

In [52]:
TdmSpamData.shape
# TDM will act as my X values

(6776, 9422)

In [53]:
type(TdmSpamData)

scipy.sparse.csr.csr_matrix

In [54]:
TdmSpamData.toarray()[0::]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Spliting the data into Train and Test data

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
train_x , test_x , train_y , test_y = train_test_split(TdmSpamData, messages.label, test_size=.2)

### Apply Naive Bayes

In [57]:
from sklearn.naive_bayes import MultinomialNB

In [58]:
nb_spam = MultinomialNB()

In [59]:
nb_spam.fit(train_x,train_y)

MultinomialNB()

In [60]:
pred = nb_spam.predict(test_x)

In [61]:
from sklearn.metrics import confusion_matrix

In [62]:
tab = confusion_matrix(test_y,pred)
tab

array([[1153,   20],
       [  12,  171]], dtype=int64)

In [63]:
from sklearn.metrics import accuracy_score

In [64]:
accuracy_score(test_y,pred)*100

97.6401179941003

### Apply Decision Tree

In [65]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()                     # Default as gini

In [66]:
dt.fit(train_x,train_y)

DecisionTreeClassifier()

In [67]:
pred1 = dt.predict(test_x)
pred1

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [68]:
pred1.shape

(1356,)

In [69]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_y,pred)

array([[1153,   20],
       [  12,  171]], dtype=int64)

In [70]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y,pred) *100

97.6401179941003

### Apply Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier
rfc_trf = RandomForestClassifier()

In [72]:
rfc_trf.fit(train_x,train_y)

RandomForestClassifier()

In [73]:
pred2 = rfc_trf.predict(test_x)
pred2

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [74]:
from sklearn.metrics import confusion_matrix

tab_trf2 = confusion_matrix(test_y,pred2) 
tab_trf2

array([[1173,    0],
       [  26,  157]], dtype=int64)

In [75]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y,pred2) *100

98.08259587020649

### Apply LogisticRegression 

In [76]:
from sklearn.linear_model import LogisticRegression 

In [77]:
log_cr= LogisticRegression()

In [78]:
log_cr.fit(train_x,train_y) 

LogisticRegression()

In [80]:
pred_test = log_cr.predict(test_x)
pred_test

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [81]:
from sklearn.metrics import confusion_matrix  # use this for matrix

In [82]:
tab_cr = confusion_matrix(test_y,pred_test)
tab_cr

array([[1171,    2],
       [  20,  163]], dtype=int64)

In [83]:
from sklearn.metrics import accuracy_score

In [93]:
accuracy_score(test_y,pred_test)*100

98.37758112094396

- We can see the Logistic regression model giving us good result