# Machine learning with text

In [8]:
import pandas as pd
# Tiny three-document toy corpus used to demonstrate how CountVectorizer works.
simple_train=['call you tonight', 'Call me a cab', 'please call me... PLEASE!']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer   

In [3]:
# Instantiate the vectorizer with its default settings
vec=CountVectorizer()

# Learn the vocabulary from the toy corpus (the fitted vectorizer is displayed as the cell output)
vec.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# Inspect the vocabulary learned by the fitted vectorizer
vec.get_feature_names()

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [6]:
# Convert the training data into a document-term matrix
dtm_simple_train=vec.transform(simple_train)

In [7]:
# Convert the document-term matrix into a dense numeric array for display
dtm_simple_train.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [14]:
# View the document-term matrix as a table: one row per document,
# one column per vocabulary term.
dense_counts = dtm_simple_train.toarray()
pd.DataFrame(data=dense_counts, columns=vec.get_feature_names())


Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [15]:
# Check what kind of object transform() returned.
type(dtm_simple_train)

scipy.sparse.csr.csr_matrix

### Reading a text-based file using pandas

In [17]:
# Remote location of the SMS dataset (.tsv file).
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
# The file is read without a header row, so the two column names are supplied explicitly.
sms = pd.read_table(url, header=None, names=['label', 'message'])

In [18]:
# Number of rows and columns in the loaded table.
sms.shape

(5572, 2)

In [19]:
# Preview the first few rows.
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Encode the text label numerically (ham -> 0, spam -> 1) in a new column,
# then preview the result.
label_codes = {'ham': 0, 'spam': 1}
sms['label_num'] = sms['label'].map(label_codes)
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


#### Splitting the data for training and testing.


In [21]:
# Features are the raw message texts; the target is the numeric label.
X = sms['message']
y = sms['label_num']

In [23]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split

# Split messages and labels into training and test sets.
# No test_size is given, so the library default split is used;
# random_state=1 makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Print the size of each piece, in the order train/test features then train/test labels.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(4179,)
(1393,)
(4179,)
(1393,)


#### Vectorizing the data

In [24]:
# Create a fresh vectorizer (this rebinds `vec`, replacing the one fitted on the toy corpus).
vec=CountVectorizer()

# Learn the vocabulary from the training messages only.
vec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
# Peek at a few learned tokens (positions 1 through 4 of the vocabulary list).
vec.get_feature_names()[1:5]

['000', '008704050406', '0121', '01223585236']

In [31]:
# Transform the training messages into a document-term matrix and convert it
# to a dense NumPy array.
# NOTE(review): the test matrix (X_test_dtm) is kept in the sparse form returned by
# transform(); densifying the whole training matrix looks memory-hungry and is
# presumably only needed for the DataFrame preview -- confirm before removing .toarray().
X_train_dtm=vec.transform(X_train).toarray()
X_train_dtm

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [32]:
# Preview the first rows of the training document-term matrix,
# with each column labelled by its token.
pd.DataFrame(data=X_train_dtm, columns=vec.get_feature_names()).head()

Unnamed: 0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,02072069400,...,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada,èn,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Vectorize the test data with the vocabulary already learned from the training set
# (transform only -- the vectorizer is not re-fitted here).
X_test_dtm=vec.transform(X_test)

In [34]:
from sklearn.naive_bayes import MultinomialNB

In [35]:
# Instantiate a Multinomial Naive Bayes classifier with default settings
nb=MultinomialNB()

In [38]:
# Fit the model on the training document-term matrix and labels;
# the %time magic reports how long the fit takes.
%time nb.fit(X_train_dtm,y_train)

Wall time: 3.3 s


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
# Predict class labels for the test messages with the Naive Bayes model.
y_pred=nb.predict(X_test_dtm)

In [40]:
from sklearn  import metrics

In [41]:
# Accuracy of the predictions against the true test labels.
metrics.accuracy_score(y_test, y_pred)

0.98851399856424982

In [43]:
# Confusion matrix of true test labels versus predicted labels.
metrics.confusion_matrix(y_test,y_pred)

array([[1203,    5],
       [  11,  174]])

In [46]:
# False positives: ham messages (label 0) incorrectly classified as spam (label 1).
# y_test < y_pred is only true where the actual label is 0 and the prediction is 1.
X_test[y_test < y_pred]

574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [47]:
# False negatives: spam messages (label 1) incorrectly classified as ham (label 0).
# y_test > y_pred is only true where the actual label is 1 and the prediction is 0.
X_test[y_test> y_pred]

3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: message, dtype: object

In [49]:
# Example of a false negative: full text of the message with index label 4514
X_test[4514]

'Money i have won wining number 946 wot do i do next'

In [50]:
# Example of a false positive: full text of the message with index label 1988
X_test[1988]

'No calls..messages..missed calls'

In [53]:
# Predicted probabilities for the test messages; column 1 is kept,
# which corresponds to class 1 (spam) under the ham=0/spam=1 encoding.
y_pred_prob=nb.predict_proba(X_test_dtm)[:,1]


In [54]:
# Calculate AUC (area under the ROC curve) from the predicted probabilities
metrics.roc_auc_score(y_test,y_pred_prob)

0.98664310005369604

#### Comparing the model

We now compare the Naive Bayes model with a logistic regression model trained on the same data.

In [55]:
from sklearn.linear_model import LogisticRegression

In [56]:
# Instantiate a logistic regression classifier with default settings.
logreg=LogisticRegression()

In [57]:
# Fit on the same training matrix and labels used for the Naive Bayes model.
logreg.fit(X_train_dtm,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [58]:
# Predict test labels with logistic regression.
# NOTE: this rebinds y_pred, replacing the Naive Bayes predictions stored under the same name.
y_pred=logreg.predict(X_test_dtm)

In [59]:
# Confusion matrix for the logistic regression predictions.
metrics.confusion_matrix(y_test,y_pred)

array([[1207,    1],
       [  16,  169]])

In [60]:
# Precision of the logistic regression predictions
metrics.precision_score(y_test,y_pred)

0.99411764705882355

In [61]:
# Sensitivity (recall) of the logistic regression predictions
metrics.recall_score(y_test,y_pred)

0.91351351351351351

In [65]:
# Predicted probabilities from logistic regression; column 1 is kept (class 1, spam).
# NOTE: this rebinds y_pred_prob, replacing the Naive Bayes probabilities.
y_pred_prob=logreg.predict_proba(X_test_dtm)[:,1]

In [66]:
# AUC for the logistic regression probabilities.
metrics.roc_auc_score(y_test,y_pred_prob)

0.99368176123143015

## Examining a model for further insight

In [75]:
# Store the vocabulary learned from the training data and show ten of its entries
# (positions 1 through 10).
X_train_token=vec.get_feature_names()
X_train_token[1:11]

['000',
 '008704050406',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414']

In [84]:
# nb.feature_count_ holds the per-class token counts; the length of its first row
# is the number of tokens (features) the model was trained on.
len(nb.feature_count_[0])

7456

In [86]:
# Build a table of per-class token counts from the fitted Naive Bayes model.
# Rows 0 and 1 of nb.feature_count_ are labelled Ham and Spam, following the
# ham=0 / spam=1 encoding used for label_num; tokens become the index.
ham_counts = nb.feature_count_[0]
spam_counts = nb.feature_count_[1]
token = pd.DataFrame(
    {
        'Ham': ham_counts,
        'Spam': spam_counts,
        'Token': vec.get_feature_names(),
    }
).set_index('Token')
token.head()

Unnamed: 0_level_0,Ham,Spam
Token,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0,5.0
0,0.0,23.0
8704050406,0.0,2.0
121,0.0,1.0
1223585236,0.0,1.0


In [103]:
import numpy as np

# Show the Ham/Spam counts for 5 randomly chosen tokens.
# The upper bound is taken from the table itself instead of a hard-coded
# vocabulary size, so the cell keeps working if the vocabulary changes.
positions = np.random.randint(0, len(token), size=5)

# The table's index already holds the token strings in vocabulary order, so the
# labels are read from it directly rather than rebuilding the feature-name list
# once per sampled position.
sampled_tokens = token.index[positions]
token.loc[sampled_tokens, ['Ham', 'Spam']]

Unnamed: 0_level_0,Ham,Spam
Token,Unnamed: 1_level_1,Unnamed: 2_level_1
even,44.0,4.0
swollen,1.0,0.0
swalpa,1.0,0.0
2nite,4.0,4.0
intha,1.0,0.0


In [105]:
# Alternative: let pandas draw 5 random rows; random_state=6 makes the draw repeatable
token.sample(5,random_state=6)

Unnamed: 0_level_0,Ham,Spam
Token,Unnamed: 1_level_1,Unnamed: 2_level_1
very,64.0,2.0
nasty,1.0,1.0
villa,0.0,1.0
beloved,1.0,0.0
textoperator,0.0,2.0
