In [7]:
'''
Get data set: read the tab-separated SMS file into a two-column DataFrame
'''
import pandas as pd

filepath = "smsspamcollection/SMSSpamCollection"
column_names = ['label', 'sms_message']

# The file has no header row, so the column names are supplied explicitly.
# NOTE(review): default quote handling is left on; if any raw message contains
# an unbalanced double quote, adjacent lines could be merged into one row.
# Compare the row count with the dataset's documented size (TODO confirm).
df = pd.read_csv(filepath, sep='\t', header=None, names=column_names)
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [52]:
'''
Map ham=0 spam=1
'''
# Values that are already encoded are passed through unchanged, so running this
# cell a second time is a no-op. A plain dict .map() would send every value that
# is not a key (including 0 and 1) to NaN and wipe the column on re-run.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly if anything other than the two known classes is present.
assert df['label'].isin([0, 1]).all(), 'unexpected value in label column'

print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [53]:
'''
Training sets & testing sets
'''
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20. The fallback keeps the cell working on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# No test_size is passed, so the library default split ratio applies;
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                    df['label'],
                                                    random_state=1)

print('Rows in total set: {}'.format(df.shape[0]))
print('Rows in training set: {}'.format(X_train.shape[0]))
print('Rows in test set: {}'.format(X_test.shape[0]))

# Preview the first rows of each split.
for split_name, split in [('X_train', X_train),
                          ('X_test', X_test),
                          ('y_train', y_train),
                          ('y_test', y_test)]:
    print('===={}===='.format(split_name))
    print(split.head())

Rows in total set: 5572
Rows in training set: 4179
Rows in test set: 1393
====X_train====
710     4mths half price Orange line rental & latest c...
3740                           Did you stitch his trouser
2711    Hope you enjoyed your new content. text stop t...
3155    Not heard from U4 a while. Call 4 rude chat pr...
3748    Ü neva tell me how i noe... I'm not at home in...
Name: sms_message, dtype: object
====X_test====
1078                         Yep, by the pretty sculpture
4028        Yes, princess. Are you going to make me moan?
958                            Welp apparently he retired
4642                                              Havent.
4674    I forgot 2 ask ü all smth.. There's a card on ...
Name: sms_message, dtype: object
====y_train====
710     1
3740    0
2711    1
3155    1
3748    0
Name: label, dtype: int64
====y_test====
1078    0
4028    0
958     0
4642    0
4674    0
Name: label, dtype: int64


In [54]:
'''
Transform: turn the raw messages into token-count matrices
'''
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()

# The vectorizer is fitted on the training messages only; both splits are then
# encoded with that same fitted vectorizer.
count_vector.fit(X_train)
training_data = count_vector.transform(X_train)
testing_data = count_vector.transform(X_test)

In [55]:
'''
Naive Bayes training & prediction
'''
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained.
naive_bayes = MultinomialNB().fit(training_data, y_train)

# Predicted labels for the held-out messages.
predictions = naive_bayes.predict(testing_data)

In [56]:
'''
Evaluate model
'''
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each metric is called as metric(y_test, predictions), i.e. true labels first.
score_functions = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
]

# Label and value are formatted into ONE string before printing, so the line
# reads "Accuracy score: <value>" instead of being printed as a two-item tuple.
for metric_name, metric in score_functions:
    print('{} score: {}'.format(metric_name, metric(y_test, predictions)))

('Accuracy score: ', '0.988513998564')
('Precision score: ', '0.972067039106')
('Recall score: ', '0.940540540541')
('F1 score: ', '0.956043956044')
