In [1]:
# Data from https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
import pandas as pd
import codecs
import string

from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from nltk import FreqDist
from nltk.corpus import stopwords



In [2]:
# open for reading with "universal" type set
# reading the file and parsing with tab delimited 
doc = codecs.open('SMSSpamCollection','rU','UTF-8') 
df = pd.read_csv(doc, sep='\t', header=None, names=['email_labels', 'message'])
df.head(5)

Unnamed: 0,email_labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# preprocessing the message columns by removing stopwords, punctuation, lowercase
english_stop_words = set(stopwords.words('english') ).union( set(ENGLISH_STOP_WORDS))
df['processed_message'] = df['message'].apply(lambda v: [k for k in v.split() if k not in english_stop_words])
df['processed_message'] = df['processed_message'].apply(lambda v: [k for k in v if k not in string.punctuation])
df['processed_message'] = df['processed_message'].apply(lambda v: ' '.join([k.lower() for k in v]))
df['processed_message'].head(5)

0    go jurong point, crazy.. available bugis n gre...
1                        ok lar... joking wif u oni...
2    free entry 2 wkly comp win fa cup final tkts 2...
3                    u dun say early hor... u c say...
4                          nah i think goes usf, lives
Name: processed_message, dtype: object

In [4]:
# separate out spam, ham messages
# get frequencey of words to understand which words are contributing to spam
df_spam = df[df['email_labels'] == 'spam']
df_ham = df[df['email_labels'] == 'ham']
len(df_spam['processed_message']), len(df_ham['processed_message'])

(747, 4825)

In [5]:
spam_words = df_spam['processed_message'].str.split(expand=True).stack().value_counts()
spam_unique_words = spam_words.keys()
ham_unique_words = df_ham['processed_message'].str.split(expand=True).stack().value_counts().keys()
spam_inc_words = spam_words.loc[set(spam_unique_words) - set(ham_unique_words) ].sort_values(ascending=False).head(15)
spam_inc_words

claim         106
prize          58
urgent!        43
tone           40
awarded        38
£1000          33
150ppm         30
guaranteed     29
entry          26
ringtone       24
4*             24
tones          24
valid          23
500            23
£100           22
dtype: int64

In [6]:
# the above spam words makes sense as message with subject claim your prize, you have won this much, 
# congrats you are awarded are most likely to be spam

In [7]:
# let us try to predict the email labels based on preprocessed message
x = df['processed_message']
y = df['email_labels']

In [8]:
# Training function which takes the train and test data along with the classifier model that we choose to use
def train(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
 
    classifier.fit(X_train, y_train)
    print "Accuracy: %s" % classifier.score(X_test, y_test)
    predicted = classifier.predict(X_test)
    return classifier, predicted, X_test, y_test

In [9]:
clf = RandomForestClassifier()
trial1 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', clf),
])
 
classifier, predicted, X_test, y_test = train(trial1, x, y)

Accuracy: 0.9734386216798278


In [10]:
print metrics.classification_report(y_test, predicted, target_names=df['email_labels'].unique())

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1195
       spam       1.00      0.81      0.90       198

avg / total       0.97      0.97      0.97      1393



In [11]:
print confusion_matrix(y_test, predicted) 

[[1195    0]
 [  37  161]]


In [12]:
trial1 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB(alpha=0.03)),
])
 
classifier, predicted, X_test, y_test = train(trial1, x, y)

Accuracy: 0.9856424982053122


In [13]:
print metrics.classification_report(y_test, predicted, target_names=df['email_labels'].unique())

             precision    recall  f1-score   support

        ham       0.99      1.00      0.99      1195
       spam       0.97      0.92      0.95       198

avg / total       0.99      0.99      0.99      1393



In [14]:
print confusion_matrix(y_test, predicted) 

[[1190    5]
 [  15  183]]
