In [54]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [23]:
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=['type', 'message'])

In [24]:
messages.head()

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [25]:
def text_process(message):
    # Remove punctuations()
    message = ''.join([c for c in message if c not in string.punctuation])
    # Clean message from 'Stop Words'
    cleaned_message = [word for word in message.split() if word.lower() not in stopwords.words('english')]
    # return cleaned text
    return cleaned_message

In [55]:
msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['type'], test_size=0.33)

In [56]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

In [57]:
pipeline.fit(msg_train, label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x7fbeaf3dea60>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=No...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [65]:
predictions = pipeline.predict(msg_test)

In [66]:
print(classification_report(label_test, predictions))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.97      1591
       spam       1.00      0.65      0.79       248

avg / total       0.96      0.95      0.95      1839

