In [1]:
# Data from https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
import pandas as pd
import codecs
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Split X/y into train and test parts, fit the given classifier on the train part
# and report its accuracy on the held-out test part.
def train(classifier, X, y, test_size=0.25, random_state=33):
    """Fit ``classifier`` on a train split of ``X``/``y`` and evaluate it on the test split.

    Parameters
    ----------
    classifier : object
        Any estimator exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
    X : samples handed to ``train_test_split``.
    y : targets handed to ``train_test_split`` together with ``X``.
    test_size : forwarded to ``train_test_split``; defaults to 0.25.
    random_state : forwarded to ``train_test_split``; defaults to 33 so that
        repeated runs produce the same split.

    Returns
    -------
    tuple
        ``(classifier, predicted, X_test)``: the fitted classifier, its
        predictions for the test split, and the test-split samples themselves.

    Side effects
    ------------
    Prints ``Accuracy: <score>`` for the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    classifier.fit(X_train, y_train)
    # Single-argument parenthesised form: prints the same text on Python 2 and 3.
    print("Accuracy: %s" % classifier.score(X_test, y_test))
    predicted = classifier.predict(X_test)
    return classifier, predicted, X_test

In [3]:
# Read the tab-delimited corpus into a two-column frame (label, message text).
# codecs.open always opens the underlying file in binary mode and decodes it itself,
# so a "universal newline" flag has no effect here; plain 'r' is used instead.
# The with-block closes the file handle as soon as the frame has been built.
with codecs.open('SMSSpamCollection', 'r', 'UTF-8') as doc:
    df = pd.read_csv(doc, sep='\t', header=None, names=['email_labels', 'message'])
df.head(10)

Unnamed: 0,email_labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
# Predict the label column from the message text:
# x holds the raw messages (features), y the corresponding labels (target).
x, y = df['message'], df['email_labels']

In [5]:
# First trial: TF-IDF vectorisation of the messages followed by a
# multinomial Naive Bayes classifier (alpha=0.03).
trial1 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB(alpha=0.03)),
])

# Fit the pipeline and keep the test-split samples and predictions for inspection.
classifier, predicted, X_test = train(trial1, x, y)

Accuracy: 0.986360373295


In [6]:
# Pull the test-split rows out of the full frame by index label (.loc; the
# deprecated .ix accessor is avoided) and take an explicit copy so that adding
# the prediction column below never writes into a slice of df.
df_check = df.loc[X_test.index].copy()
df_check['predicted_email_labels'] = predicted

In [7]:
# Test-split rows where the predicted label differs from the true label,
# followed by how many such rows there are.
df_not_matching = df_check.loc[
    df_check['email_labels'].ne(df_check['predicted_email_labels'])
]
len(df_not_matching.index)

19

In [8]:
# Show the first five misclassified messages.
df_not_matching.head(n=5)

Unnamed: 0,email_labels,message,predicted_email_labels
991,ham,26th OF JULY,spam
4703,ham,Anytime...,spam
2248,spam,Back 2 work 2morro half term over! Can U C me ...,ham
4821,spam,Check Out Choose Your Babe Videos @ sms.shsex....,ham
3574,spam,You won't believe it but it's true. It's Incre...,ham


In [9]:
# Test-split rows where the predicted label equals the true label,
# followed by how many such rows there are.
df_matching = df_check.loc[
    df_check['email_labels'].eq(df_check['predicted_email_labels'])
]
len(df_matching.index)

1374

In [10]:
# Show the first five correctly classified messages.
df_matching.head(n=5)

Unnamed: 0,email_labels,message,predicted_email_labels
3538,ham,Mmmmm ... It was sooooo good to wake to your w...,ham
4487,ham,have * good weekend.,ham
3738,ham,Plz note: if anyone calling from a mobile Co. ...,ham
4634,ham,Oh wow thats gay. Will firmware update help,ham
748,spam,U are subscribed to the best Mobile Content Se...,spam
