# Reading a text-based dataset into pandas

In [None]:
!pip install cufflinks

Collecting cufflinks
  Downloading https://files.pythonhosted.org/packages/1a/18/4d32edaaf31ba4af9745dac676c4a28c48d3fc539000c29e855bd8db3b86/cufflinks-0.17.3.tar.gz (81kB)
Collecting colorlover>=0.2.1 (from cufflinks)
  Downloading https://files.pythonhosted.org/packages/9a/53/f696e4480b1d1de3b1523991dea71cf417c8b19fe70c704da164f3f90972/colorlover-0.3.0-py3-none-any.whl
Building wheels for collected packages: cufflinks
  Building wheel for cufflinks (setup.py): started
  Building wheel for cufflinks (setup.py): finished with status 'done'
  Stored in directory: C:\Users\Software\AppData\Local\pip\Cache\wheels\7d\ba\8d\38b672c3e40d8bd22dd60b8e6e29965b43f2b4be4d064e44d5
Successfully built cufflinks
Installing collected packages: colorlover, cufflinks
Successfully installed colorlover-0.3.0 cufflinks-0.17.3


In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly 
import plotly.graph_objects as go
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# set up plotly for inline notebook rendering (connected=True: plotly.js is loaded from the web)
init_notebook_mode(connected=True)
# put cufflinks in offline mode; Series.iplot() is used further down
cf.go_offline()

In [None]:
# read the raw SMS collection (decoded as Latin-1) and preview the first rows
raw_csv_path = 'spam.csv'
df = pd.read_csv(raw_csv_path, encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Remove the `Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4` columns, because almost all of their entries are null.

In [None]:
# number of missing values in each column
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [None]:
# column names, dtypes and non-null counts; info must be *called* —
# without the parentheses the cell only echoes the bound method
df.info()

<bound method DataFrame.info of         v1                                                 v2
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
10     ham  I'm gonna be home soon and i don't want to tal...
11    spam  SIX chances to win CASH! From 100 to 20,000 po...
12    spam  URGENT! You have won a 1 week FREE membership ...
13     ham  I've been searching for the right words to tha...
14     ham                I HAVE A DAT

In [None]:
# Drop the three almost-empty columns. Later cells expect them to be gone, so
# this step must actually run. Re-assigning (instead of inplace=True) together
# with errors='ignore' keeps the cell safe to execute more than once.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# summary of the message column: count, number of unique texts, most frequent text
df.v2.describe()

count                       5572
unique                      5169
top       Sorry, I'll call later
freq                          30
Name: v2, dtype: object

In [None]:
# how many messages carry each label (ham / spam)
df.v1.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [None]:
# encode the text label as a new integer column: ham -> 0, spam -> 1
label_to_number = {'ham': 0, 'spam': 1}
df['v1_nm'] = df['v1'].map(label_to_number)
df.head()

Unnamed: 0,v1,v2,v1_nm
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# interactive plotly histogram of the numeric v1_nm column (0 = ham, 1 = spam)
df.v1_nm.iplot(kind='hist')

In [None]:
# add the length of every message, in characters, as column v2_le
df['v2_le'] = df['v2'].map(len)
df.head()

Unnamed: 0,v1,v2,v1_nm,v2_le
0,ham,"Go until jurong point, crazy.. Available only ...",0,111
1,ham,Ok lar... Joking wif u oni...,0,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,155
3,ham,U dun say so early hor... U c already then say...,0,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,61


In [None]:
# Histogram plot for spam and ham labels with respect to message length
#plt.figure(figsize=(12,8))
#df[df['v1']=='ham'].v2_le.plot(bins = 50, kind= 'hist', color='blue', label='ham', alpha=0.75)
#df[df['v1']=='spam'].v2_le.plot(bins=50, kind= 'hist', color='red', label = 'spam', alpha=0.75)
#plt.legend()
#plt.xlabel('Message length')

In [None]:
# numerical summary of the ham messages; describe must be *called* —
# without the parentheses the cell only echoes the bound method
df[df['v1']=='ham'].describe()

<bound method NDFrame.describe of        v1                                                 v2  v1_nm  v2_le
0     ham  Go until jurong point, crazy.. Available only ...      0    111
1     ham                      Ok lar... Joking wif u oni...      0     29
3     ham  U dun say so early hor... U c already then say...      0     49
4     ham  Nah I don't think he goes to usf, he lives aro...      0     61
6     ham  Even my brother is not like to speak with me. ...      0     77
7     ham  As per your request 'Melle Melle (Oru Minnamin...      0    160
10    ham  I'm gonna be home soon and i don't want to tal...      0    109
13    ham  I've been searching for the right words to tha...      0    196
14    ham                I HAVE A DATE ON SUNDAY WITH WILL!!      0     35
16    ham                         Oh k...i'm watching here:)      0     26
17    ham  Eh u remember how 2 spell his name... Yes i di...      0     81
18    ham  Fine if that?s the way u feel. That?s the way ...      

In [None]:
# numerical summary of the spam messages; describe must be *called* —
# without the parentheses the cell only echoes the bound method
df[df['v1']=='spam'].describe()

<bound method NDFrame.describe of         v1                                                 v2  v1_nm  v2_le
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...      1    155
5     spam  FreeMsg Hey there darling it's been 3 week's n...      1    147
8     spam  WINNER!! As a valued network customer you have...      1    157
9     spam  Had your mobile 11 months or more? U R entitle...      1    154
11    spam  SIX chances to win CASH! From 100 to 20,000 po...      1    136
12    spam  URGENT! You have won a 1 week FREE membership ...      1    155
15    spam  XXXMobileMovieClub: To use your credit, click ...      1    149
19    spam  England v Macedonia - dont miss the goals/team...      1    156
34    spam  Thanks for your subscription to Ringtone UK yo...      1    158
42    spam  07732584351 - Rodger Burns - MSG = We tried to...      1    172
54    spam  SMS. ac Sptv: The New Jersey Devils and the De...      1    120
56    spam  Congrats! 1 year special cinema pass for 2

In [None]:
# numerical summary of both numeric columns (v1_nm and v2_le); describe must be
# *called* — without the parentheses the cell only echoes the bound method
df.describe()

<bound method NDFrame.describe of         v1                                                 v2  v1_nm  v2_le
0      ham  Go until jurong point, crazy.. Available only ...      0    111
1      ham                      Ok lar... Joking wif u oni...      0     29
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...      1    155
3      ham  U dun say so early hor... U c already then say...      0     49
4      ham  Nah I don't think he goes to usf, he lives aro...      0     61
5     spam  FreeMsg Hey there darling it's been 3 week's n...      1    147
6      ham  Even my brother is not like to speak with me. ...      0     77
7      ham  As per your request 'Melle Melle (Oru Minnamin...      0    160
8     spam  WINNER!! As a valued network customer you have...      1    157
9     spam  Had your mobile 11 months or more? U R entitle...      1    154
10     ham  I'm gonna be home soon and i don't want to tal...      0    109
11    spam  SIX chances to win CASH! From 100 to 20,00

In [None]:
# describe() reports a maximum v2_le of 910 — that is 910 characters, not words,
# because v2_le is len() of the text. Look the message up by the maximum itself
# instead of a hard-coded number so the cell keeps working if the data changes.
df.loc[df['v2_le'].idxmax(), 'v2']

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

# Text Pre-processing

Our main issue with our data is that it is all in text format (strings). The classification algorithms that we usually use need some sort of numerical feature vector in order to perform the classification task. There are actually many methods to convert a corpus to a vector format. The simplest is the bag-of-words approach, where each unique word in a text will be represented by one number.

In this section we'll convert the raw messages (sequence of characters) into vectors (sequences of numbers).

As a first step, let's write a function that will split a message into its individual words and return a list. We'll also remove very common words, ('the', 'a', etc..). To do this we will take advantage of the NLTK library. It's pretty much the standard library in Python for processing text and has a lot of useful features. We'll only use some of the basic ones here.

Let's create a function that will process the string in the message column, then we can just use apply() in pandas to process all the text in the DataFrame.

First removing punctuation. We can just take advantage of Python's built-in string library to get a quick list of all the possible punctuation:

In [None]:
import string 
from nltk.corpus import stopwords

_STOPWORDS_CACHE = None


def _get_stopwords():
    """Return the stop-word set used by text_process, building it on first use."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        # NLTK's English stop words plus a few extra SMS shorthand tokens.
        # Built once and kept as a frozenset: rebuilding the list on every call
        # and searching it linearly for every word is needlessly slow.
        _STOPWORDS_CACHE = frozenset(
            stopwords.words('english')
            + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
        )
    return _STOPWORDS_CACHE


def text_process(mess):
    """
    Strip punctuation and stop words from a single message.

    1. Delete every character found in string.punctuation (nothing is put in
       its place, so "Hey!Great" becomes "HeyGreat").
    2. Split on whitespace and drop words whose lowercase form is a stop word.
    3. Join the remaining words, original casing kept, with single spaces.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message as one space-separated string (not a list).
    """
    stop_words = _get_stopwords()

    # a deletion table removes all punctuation characters in one pass
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))

    # keep only the words that are not stop words (compared case-insensitively)
    return ' '.join(word for word in nopunc.split() if word.lower() not in stop_words)

Now let's "tokenize" these messages. Tokenization is just the term used to describe the process of converting the normal text strings into a list of tokens (words that we actually want).

In [None]:
# clean every raw message and keep the result in a new column
df['clean_msg'] = df['v2'].map(text_process)

In [None]:
# preview the frame, now including the clean_msg column
df.head()

Unnamed: 0,v1,v2,v1_nm,v2_le,clean_msg
0,ham,"Go until jurong point, crazy.. Available only ...",0,111,Go jurong point crazy Available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,0,29,Ok lar Joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,155,Free entry wkly comp win FA Cup final tkts 21s...
3,ham,U dun say so early hor... U c already then say...,0,49,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,61,Nah think goes usf lives around though


In [None]:
# sanity check: which container type does NLTK return for its stop words?
type(stopwords.words('english'))

list

In [None]:
from collections import Counter

# tally the lower-cased tokens of all cleaned ham messages
ham_mask = df['v1'] == 'ham'
words = df.loc[ham_mask, 'clean_msg'].map(
    lambda text: [token.lower() for token in text.split()]
)
ham_words = Counter(token for tokens in words for token in tokens)

# the 50 most frequent ham tokens with their counts
print(ham_words.most_common(50))

[('get', 303), ('ltgt', 276), ('ok', 272), ('go', 247), ('ill', 236), ('know', 232), ('got', 231), ('like', 229), ('call', 229), ('come', 224), ('good', 222), ('time', 189), ('day', 187), ('love', 185), ('going', 167), ('want', 163), ('one', 162), ('home', 160), ('lor', 160), ('need', 156), ('sorry', 153), ('still', 146), ('see', 137), ('n', 134), ('later', 134), ('da', 131), ('r', 131), ('back', 129), ('think', 128), ('well', 126), ('today', 125), ('send', 123), ('tell', 121), ('cant', 119), ('hi', 117), ('take', 112), ('much', 112), ('oh', 111), ('night', 107), ('hey', 106), ('happy', 105), ('great', 100), ('way', 100), ('hope', 99), ('thats', 98), ('pls', 98), ('work', 96), ('wat', 95), ('dear', 94), ('give', 92)]


In [None]:
# tally the lower-cased tokens of all cleaned spam messages
spam_mask = df['v1'] == 'spam'
words = df.loc[spam_mask, 'clean_msg'].map(
    lambda text: [token.lower() for token in text.split()]
)
spam_words = Counter(token for tokens in words for token in tokens)

# the 50 most frequent spam tokens with their counts
print(spam_words.most_common(50))

[('call', 347), ('free', 216), ('txt', 150), ('mobile', 123), ('text', 120), ('claim', 113), ('stop', 113), ('reply', 101), ('prize', 92), ('get', 83), ('new', 69), ('send', 67), ('nokia', 65), ('urgent', 63), ('cash', 62), ('win', 60), ('contact', 56), ('service', 55), ('please', 52), ('guaranteed', 50), ('customer', 49), ('16', 49), ('week', 49), ('tone', 48), ('per', 46), ('phone', 45), ('500', 44), ('18', 43), ('chat', 42), ('1000', 41), ('150', 39), ('awarded', 38), ('draw', 38), ('100', 37), ('latest', 36), ('1', 36), ('line', 35), ('150ppm', 34), ('2000', 34), ('mins', 34), ('receive', 33), ('camera', 33), ('every', 33), ('message', 32), ('holiday', 32), ('landline', 32), ('shows', 31), ('go', 31), ('box', 30), ('number', 30)]


# Vectorization

Currently, we have the messages as lists of tokens (also known as lemmas) and now we need to convert each of those messages into a vector the SciKit Learn's algorithm models can work with.

Now we'll convert each message, represented as a list of tokens (lemmas) above, into a vector that machine learning models can understand.

We'll do that in three steps using the bag-of-words model:

*    Count how many times a word occurs in each message (known as term frequency)
*    Weigh the counts, so that frequent tokens get lower weight (inverse document frequency)
*    Normalize the vectors to unit length, to abstract from the original text length (L2 norm)

Let's begin the first step:

Each vector will have as many dimensions as there are unique words in the SMS corpus. We will first use SciKit Learn's CountVectorizer. This model will convert a collection of text documents to a matrix of token counts.

We can imagine this as a 2-dimensional matrix, where one dimension is the entire vocabulary (1 row per word) and the other dimension is the actual documents, in this case one column per text message.

In [None]:
# features: the cleaned message text; target: the numeric ham/spam label
X, y = df['clean_msg'], df['v1_nm']
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [None]:
# hold out a test set; the fixed random_state makes the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(4179,)
(1393,)
(4179,)
(1393,)


There are a lot of arguments and parameters that can be passed to the CountVectorizer. Because the messages were already cleaned with `text_process`, we will just use the default settings here:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# create a vectorizer with default settings and learn its vocabulary
# from the training messages only
vect = CountVectorizer().fit(X_train)
vect

CountVectorizer()

In [None]:
# the vocabulary was already learned when vect was fitted in the previous cell;
# transform() only uses it to build the training document-term matrix
X_train_dtm = vect.transform(X_train)



# equivalently: fit and transform in a single step (this learns the vocabulary
# again from the same data and overwrites X_train_dtm)
X_train_dtm = vect.fit_transform(X_train)

# examine the document-term matrix
X_train_dtm

<4179x7949 sparse matrix of type '<class 'numpy.int64'>'
	with 34707 stored elements in Compressed Sparse Row format>

In [None]:
# transform testing data into a document-term matrix using the vocabulary
# fitted on the training data (transform only — the vectorizer is not refitted)
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7949 sparse matrix of type '<class 'numpy.int64'>'
	with 9935 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw training counts with TF-IDF. The transformed matrix is only
# displayed, not stored: the standalone models fitted later on X_train_dtm
# still use the raw counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)

<4179x7949 sparse matrix of type '<class 'numpy.float64'>'
	with 34707 stored elements in Compressed Sparse Row format>

# Building and evaluating a model

We will use multinomial Naive Bayes:

*    The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.


In [None]:
# import and instantiate a Multinomial Naive Bayes model (default parameters)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# train the model on the raw-count matrix X_train_dtm;
# %time is an IPython magic that reports how long the statement took
%time nb.fit(X_train_dtm, y_train)

Wall time: 5 ms


MultinomialNB()

In [None]:
# make class predictions (0 = ham, 1 = spam) for X_test_dtm with Naive Bayes
y_pred_class = nb.predict(X_test_dtm)

In [None]:
# calculate accuracy of the Naive Bayes class predictions on the test set
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9820531227566404

In [None]:
# print the confusion matrix: rows are the true class, columns the predicted
# class, both in the order 0 = ham, 1 = spam
metrics.confusion_matrix(y_test, y_pred_class)

array([[1205,    8],
       [  17,  163]], dtype=int64)

In [None]:
# print message text for false positives (ham incorrectly classified as spam):
# predicted 1 while the true label is 0
# X_test[(y_pred_class==1) & (y_test==0)]
X_test[y_pred_class > y_test]

2418    Madamregret disturbancemight receive reference...
4598                                laid airtel line rest
386                                   Customer place call
1289    HeyGreat dealFarm tour 9am 5pm 95pax 50 deposi...
5094    Hi ShanilRakhesh herethanksi exchanged uncut d...
494                                      free nowcan call
759     Call youcarlos isare phones vibrate acting mig...
3140                                  Customer place call
Name: clean_msg, dtype: object

In [None]:
# print message text for false negatives (spam incorrectly classified as ham):
# predicted 0 while the true label is 1
X_test[y_pred_class < y_test]

4674    Hi babe Chloe r smashed saturday night great w...
3528    Xmas New Years Eve tickets sale club day 10am ...
1662    Hi lookin saucy daytime fun wiv busty married ...
3417    LIFE never much fun great came made truly spec...
2773    come takes little time child afraid dark becom...
1960    Guess Somebody know secretly fancies Wanna fin...
5       FreeMsg Hey darling 3 weeks word back Id like ...
2078                         85233 FREERingtoneReply REAL
1457    CLAIRE havin borin time alone wanna cum 2nite ...
190     unique enough Find 30th August wwwareyouunique...
2429    Guess IThis first time created web page WWWASJ...
3057    unsubscribed services Get tons sexy babes hunk...
1021    Guess Somebody know secretly fancies Wanna fin...
4067    TBSPERSOLVO chasing us since Sept for38 defini...
3358         Sorry missed call lets talk time 07090201529
2821    ROMCAPspam Everyone around responding well pre...
2247    Back work 2morro half term C 2nite sexy passio...
Name: clean_ms

In [None]:
# Example of a false negative (spam that the model predicted as ham). It is
# taken from the false-negative mask itself instead of a hard-coded index
# label, so it is guaranteed to be a real false negative and stays valid when
# the split, preprocessing or model changes.
false_negatives = X_test[y_pred_class < y_test]
false_negatives.iloc[0] if len(false_negatives) else None

'Hi probably much fun get message thought id txt cos bored james farting night'

In [None]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated);
# column 1 of predict_proba is the probability of class 1 (spam)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([2.12185035e-02, 3.98910124e-04, 1.05093852e-03, ...,
       1.32292747e-02, 1.00253243e-04, 5.90549255e-06])

In [None]:
# calculate AUC of the Naive Bayes model from its predicted spam probabilities
metrics.roc_auc_score(y_test, y_pred_prob)

0.9767014747641293

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# chain token counting, TF-IDF weighting and the classifier so that all three
# are fitted together on the cleaned training text
pipeline_steps = [
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', MultinomialNB()),
]
pipe = Pipeline(pipeline_steps)
pipe.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfid', TfidfTransformer()),
                ('model', MultinomialNB())])

In [None]:
# class predictions of the TF-IDF pipeline; it takes the cleaned text directly
y_pred = pipe.predict(X_test)

In [None]:
# accuracy of the TF-IDF pipeline on the test set
metrics.accuracy_score(y_test, y_pred)

0.9669777458722182

In [None]:
# confusion matrix of the TF-IDF pipeline (rows: true class, columns: predicted class)
metrics.confusion_matrix(y_test, y_pred)

array([[1213,    0],
       [  46,  134]], dtype=int64)

# Comparing models

We will compare multinomial Naive Bayes with logistic regression:

*    Logistic regression, despite its name, is a linear model for classification rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.


In [None]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')

In [None]:
# train the model on the raw-count matrix X_train_dtm (timed with the %time magic)
%time logreg.fit(X_train_dtm, y_train)

Wall time: 384 ms


LogisticRegression(solver='liblinear')

In [None]:
# make class predictions for X_test_dtm with logistic regression
# NOTE: this overwrites y_pred_class, which previously held the Naive Bayes predictions
y_pred_class = logreg.predict(X_test_dtm)

In [None]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
# NOTE: this overwrites y_pred_prob, which previously held the Naive Bayes probabilities
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([0.01683212, 0.01552902, 0.07417823, ..., 0.02178356, 0.00539848,
       0.00667062])

In [None]:
# calculate accuracy of the logistic regression predictions on the test set
metrics.accuracy_score(y_test, y_pred_class)

0.9842067480258435

In [None]:
# confusion matrix of the logistic regression model (rows: true class, columns: predicted class)
metrics.confusion_matrix(y_test, y_pred_class)

array([[1213,    0],
       [  22,  158]], dtype=int64)

In [None]:
# calculate AUC of the logistic regression model from its predicted spam probabilities
metrics.roc_auc_score(y_test, y_pred_prob)

0.9833287533205093