In [87]:
import numpy as np
import sklearn as sk
import pandas as pd

#### testing some phrases

In [88]:
# Three tiny example messages (mixed case, repeated words, punctuation) used
# to demonstrate how text is turned into token counts.
sample_train = [
    'call you tonight',
    'CALL me a cab',
    'please call me...PLEASE',
]

## Feature extraction
### Convert a collection of text documents to a matrix of token counts

In [89]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with all-default settings; the repr shown after fitting lists them
# (lowercase=True, unigrams only, tokens of two or more word characters).
vect = CountVectorizer()

#### fitting the data

In [90]:
# Learn the vocabulary from the sample messages. The call is the cell's last
# expression, so the returned vectorizer's repr is displayed.
vect.fit(sample_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

#### getting the feature names

In [91]:
# Show the vocabulary learned by fit().
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use get_feature_names_out() instead.
vect.get_feature_names()

['cab', 'call', 'me', 'please', 'tonight', 'you']

#### Converting the documents into a sparse document-term matrix

In [92]:
# Encode the sample messages as a document-term matrix using the fitted vocabulary.
simple_train_dtm = vect.transform(sample_train)
# Display the dense form. Only a cell's last expression is rendered, so a bare
# `simple_train_dtm` line placed before this one would be evaluated and discarded.
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [93]:
# Label the dense counts with the vocabulary terms so each column is readable.
pd.DataFrame(
    simple_train_dtm.toarray(),
    columns=vect.get_feature_names(),
)

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [94]:
# Inspect the container type returned by transform().
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [95]:
# Printing the matrix lists each stored entry as (row, column) followed by its count.
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


#### Testing the document-term matrix by converting a new message

In [96]:
# A new, unseen message containing a word that is not in the sample messages.
simple_test = [
    'please don’t call me',
]

In [97]:
# transform() only (no fit): the new message is encoded against the vocabulary
# learned from sample_train.
simple_test_dtm = vect.transform(simple_test)
# Dense view of the single-row result.
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [98]:
# Same labelled view as for the training sample, now for the new message.
pd.DataFrame(
    simple_test_dtm.toarray(),
    columns=vect.get_feature_names(),
)

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


#### Only words that were present in the training documents (the fitted vocabulary) are used as features; unseen words in a new message are ignored

## Getting the data for project

In [99]:
# Load the SMS spam dataset.
# The path is a raw string so its backslashes are taken literally; in a normal
# string "\B", "\D", "\S" and "\s" are invalid escape sequences that Python
# warns about. The resulting path value is unchanged.
# NOTE(review): absolute, machine-specific path - adjust to where spam.csv lives.
sms = pd.read_csv(r"G:\Big Data\Data\Spam messages\spam.csv", encoding="ISO-8859-1")
# Preview the first rows.
sms.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [100]:
# Row/column count of the loaded frame.
print("Shape of the Dataset: ", sms.shape)
# Number of messages per label in the target column.
print(
    "\n\nDistribution of the Target variable:\n",
    sms['v1'].value_counts(),
)

Shape of the Dataset:  (5572, 2)


Distribution of the Target variable:
 ham     4825
spam     747
Name: v1, dtype: int64


In [101]:
# Numeric target column: ham -> 0, spam -> 1.
sms['v1_num'] = sms['v1'].map({'ham': 0, 'spam': 1})
sms.head()

Unnamed: 0,v1,v2,v1_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [102]:
# Features are the raw message text; the target is the numeric label.
X, y = sms['v2'], sms['v1_num']
# One shape per line, features first.
print(X.shape, y.shape, sep="\n")

(5572,)
(5572,)


In [103]:
# train_test_split is imported from sklearn.model_selection (available since
# scikit-learn 0.18); the old sklearn.cross_validation module was removed in 0.20.
from sklearn.model_selection import train_test_split

# Split messages and labels into train/test parts; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [104]:
# Fresh vectorizer (replaces the one fitted on the toy sample); its vocabulary
# is learned from the training messages only.
vect = CountVectorizer()
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [105]:
# Encode the training messages with the vocabulary fitted in the previous cell.
X_train_dtm = vect.transform(X_train)

### Doing both fit and transform in same step

In [106]:
# Single-call alternative to fit() followed by transform(); this re-fits the
# vectorizer on X_train and overwrites X_train_dtm from the previous cell.
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm

<4179x7496 sparse matrix of type '<class 'numpy.int64'>'
	with 55614 stored elements in Compressed Sparse Row format>

In [107]:
# transform() only - the test messages are encoded with the training
# vocabulary, so the matrix has the same number of columns as X_train_dtm.
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7496 sparse matrix of type '<class 'numpy.int64'>'
	with 17010 stored elements in Compressed Sparse Row format>

### Naive Bayes --- Multinomial

In [108]:
from sklearn.naive_bayes import MultinomialNB

In [109]:
# Multinomial Naive Bayes with default hyperparameters, trained on the
# training document-term matrix and numeric labels.
nb = MultinomialNB()
nb.fit(X_train_dtm,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [110]:
# Predicted class labels for the held-out test messages.
y_pred_class = nb.predict(X_test_dtm)

In [111]:
from sklearn import metrics

# Share of test messages whose predicted label matches the true label.
print(
    "Accurace for the model: ",
    metrics.accuracy_score(y_test, y_pred_class),
)

# Counts of true vs. predicted labels for the test set.
print(
    "\n\nHere goes the confusion metrix:\n",
    metrics.confusion_matrix(y_test, y_pred_class),
)

Accurace for the model:  0.985642498205


Here goes the confusion metrix:
 [[1205    8]
 [  12  168]]


### Checking the misclassified ones

In [112]:
# False positives: ham (0) messages that were predicted as spam (1).
X_test[(y_pred_class == 1) & (y_test == 0)]

325                      No calls..messages..missed calls
4598              Have you laid your airtel line to rest?
1289    Hey...Great deal...Farm tour 9am to 5pm $95/pa...
45                       No calls..messages..missed calls
573                                Waiting for your call.
3373                              Also andros ice etc etc
1081                    Can u get pic msgs to your phone?
494                      Are you free now?can i call now?
Name: v2, dtype: object

In [113]:
# False negatives: spam (1) messages that were predicted as ham (0).
X_test[(y_pred_class == 0) & (y_test == 1)]

4674    Hi babe its Chloe, how r u? I was smashed on s...
3528    Xmas & New Years Eve tickets are now on sale f...
1499    SMS. ac JSco: Energy is high, but u may not kn...
3417    LIFE has never been this much fun and great un...
2773    How come it takes so little time for a child w...
5       FreeMsg Hey there darling it's been 3 week's n...
1457    CLAIRE here am havin borin time & am now alone...
2429    Guess who am I?This is the first time I create...
4067    TBS/PERSOLVO. been chasing us since Sept forå£...
3358    Sorry I missed your call let's talk when you h...
2821    ROMCAPspam Everyone around should be respondin...
2247    Back 2 work 2morro half term over! Can U C me ...
Name: v2, dtype: object

#### Checking ROC

In [114]:
# ROC AUC of the current predictions against the true labels.
# NOTE(review): y_pred_class holds hard 0/1 labels, so this scores a single
# operating point. Passing scores such as nb.predict_proba(X_test_dtm)[:, 1]
# would give the usual ranking-based AUC (and a different number) - confirm
# which one is intended.
metrics.roc_auc_score(y_test,y_pred_class)

0.96336905743336076

### Logistic Regression

In [115]:
from sklearn import linear_model

In [116]:
# Logistic regression with default hyperparameters, trained on the same
# document-term matrix and labels as the Naive Bayes model.
logreg = linear_model.LogisticRegression()
logreg.fit(X_train_dtm,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [117]:
# Overwrites y_pred_class (previously the Naive Bayes predictions) with the
# logistic regression predictions for the test set.
y_pred_class = logreg.predict(X_test_dtm)
print(
    "Accuracy is: ",
    metrics.accuracy_score(y_test, y_pred_class),
)
print("And the Confustion Matrix")
# Last expression of the cell, so the matrix is shown as the cell's output.
metrics.confusion_matrix(y_test, y_pred_class)

Accuracy is:  0.987796123475
And the Confustion Matrix


array([[1211,    2],
       [  15,  165]])

### Checking ROC

In [118]:
# ROC AUC of the current (logistic regression) predictions against the true labels.
# NOTE(review): y_pred_class holds hard 0/1 labels, so this scores a single
# operating point. Passing scores such as logreg.predict_proba(X_test_dtm)[:, 1]
# would give the usual ranking-based AUC (and a different number) - confirm
# which one is intended.
metrics.roc_auc_score(y_test,y_pred_class)

0.95750893102500667

### Checking the misclassified ones

In [119]:
# False positives: ham (0) messages that were predicted as spam (1).
X_test[(y_pred_class == 1) & (y_test == 0)]

2520    Misplaced your number and was sending texts to...
494                      Are you free now?can i call now?
Name: v2, dtype: object

In [120]:
# False negatives: spam (1) messages that were predicted as ham (0).
X_test[(y_pred_class == 0) & (y_test == 1)]

4674    Hi babe its Chloe, how r u? I was smashed on s...
1662    Hi if ur lookin 4 saucy daytime fun wiv busty ...
1448    As a registered optin subscriber ur draw 4 å£1...
4247    accordingly. I repeat, just text the word ok o...
3417    LIFE has never been this much fun and great un...
2773    How come it takes so little time for a child w...
5       FreeMsg Hey there darling it's been 3 week's n...
4071    Loans for any purpose even if you have Bad Cre...
1457    CLAIRE here am havin borin time & am now alone...
4067    TBS/PERSOLVO. been chasing us since Sept forå£...
1044    We know someone who you know that fancies you....
3358    Sorry I missed your call let's talk when you h...
2821    ROMCAPspam Everyone around should be respondin...
659     88800 and 89034 are premium phone services cal...
2247    Back 2 work 2morro half term over! Can U C me ...
Name: v2, dtype: object

## Checking the timings for both models

In [121]:
# Wall-clock time to re-fit each model on the training matrix. These lines
# measure training (fit) cost only; prediction time is not measured here.
%time nb.fit(X_train_dtm,y_train)
%time logreg.fit(X_train_dtm,y_train)

Wall time: 3.51 ms
Wall time: 42.6 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Although the accuracy of the logistic regression model is slightly better than that of Naive Bayes, we will still use the Naive Bayes model, as this kind of text classification is mostly used in email spam filtering
## and timing plays an important role in prediction