In [1]:
import numpy as np
import pandas as pd
from sklearn import naive_bayes
from sklearn import cross_validation
from sklearn.metrics import accuracy_score



In [2]:
df = pd.read_csv('smsspam',names=['status','messages'],delimiter='\t')

In [3]:
df.head()

Unnamed: 0,status,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['status'][df['status']=='spam'] = 1
df['status'][df['status']=='ham'] = 0
y = df['status'].values
X = df['messages']
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: messages, dtype: object

In [5]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=object)

In [6]:
y = y.astype('int64')

In [7]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [8]:
df.head()

Unnamed: 0,status,messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cv = CountVectorizer()

In [13]:
X_cv = cv.fit_transform(X)

In [14]:
X_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
X_cv.shape

(5572, 8713)

In [16]:
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X_cv,y,test_size=0.2)

In [17]:
X_train.shape

(4457, 8713)

In [18]:
y_train.shape

(4457,)

In [22]:
nbm = naive_bayes.MultinomialNB()

In [23]:
nbm.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
nbm.score(X_test,y_test)

0.9820627802690582

In [26]:
list(X_cv)

[<1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 18 stored elements in Compressed Sparse Row format>,
 <1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 5 stored elements in Compressed Sparse Row format>,
 <1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 23 stored elements in Compressed Sparse Row format>,
 <1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 7 stored elements in Compressed Sparse Row format>,
 <1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 11 stored elements in Compressed Sparse Row format>,
 <1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 28 stored elements in Compressed Sparse Row format>,
 <1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 14 stored elements in Compressed Sparse Row format>,
 <1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 20 stored elements in Compressed Sparse Row format>,
 <1x8713 sparse matrix of type '<class 'numpy.int64'>'
 	with 23 stored elements i

In [31]:
a = ['Go until jurong point', 'crazy Available only']

In [32]:
cv1 = CountVectorizer()

In [33]:
test_a = cv1.fit_transform(a)

In [34]:
test_a.toarray()

array([[0, 0, 1, 1, 0, 1, 1],
       [1, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [35]:
cv1.get_feature_names()

['available', 'crazy', 'go', 'jurong', 'only', 'point', 'until']

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: messages, dtype: object

In [38]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [48]:
tv = TfidfVectorizer()

In [49]:
x_tv = tv.fit_transform(a)

In [50]:
x_f = x_tv.toarray()

In [51]:
list(x_f[0])

[0.0, 0.0, 0.5, 0.5, 0.0, 0.5, 0.5]

In [53]:
x_f

array([[0.        , 0.        , 0.5       , 0.5       , 0.        ,
        0.5       , 0.5       ],
       [0.57735027, 0.57735027, 0.        , 0.        , 0.57735027,
        0.        , 0.        ]])

In [54]:
tv.get_feature_names()

['available', 'crazy', 'go', 'jurong', 'only', 'point', 'until']