In [1]:
import numpy as np
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import os
import random



In [2]:
df = pd.read_csv('sms_spam.csv')
df.shape

(5574, 2)

In [3]:
# Create feature set

# tokenize
# tolower
# remove nums
# remove punctuation
# remove stopwords
# stem

# create TF-IDF matrix

PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()

def tokenize(text):
    tokens = word_tokenize(text)
    lowercased = [t.lower() for t in tokens]
    no_punctuation = []
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
    stemmed = [STEMMER.stem(w) for w in no_stopwords]
    return [w for w in stemmed if w]



In [4]:
df['type'] = np.where(df['type']=='spam',1,0)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=2, stop_words='english')
dfTMD = vectorizer.fit_transform(df['text'])
dfTMD.shape

(5574, 5860)

In [6]:
# Split data into training, test
dfTMD
np.random.seed(100)
#df_shuff = dfTMD.iloc[np.random.permutation(len(dfTMD))]
trainSize = int(0.8*(dfTMD.shape[0]))
train_data = dfTMD[:trainSize]
test_data = dfTMD[trainSize:]

In [7]:
# Separate labels

y_train = df['type'][:trainSize]
X_train = train_data

#lt = y_train.shape[0]
#y_train = np.reshape(lt,1)

y_test = df['type'][trainSize:]
X_test = test_data


In [8]:
print("X_train", X_train.shape, "y_train", y_train.shape, "X_test", X_test.shape, "y_test", y_test.shape)


X_train (4459, 5860) y_train (4459,) X_test (1115, 5860) y_test (1115,)


In [9]:
# Train the model

clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# Evaluate the model
pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred)
precision = metrics.precision_score
recall = metrics.recall_score
f1Score = metrics.f1_score

print("Summary Stats")
print('Model Accuracy = ' + str(float(accuracy)))
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score )

Summary Stats
Model Accuracy = 0.8699551569506726
Precision = <function precision_score at 0x10d5e2ea0>
Recall = <function recall_score at 0x10d5e2f28>
F1 Score = <function f1_score at 0x10d5e2c80>


In [12]:
# Tokenize (we use the tfidfVectorizer above instead)
texts = list(df['text'])
tokens = []

for t in texts: 
    a = tokenize(t)
    tokens.append(a)
    
df['text'] = tokens
# not used
#df['text'] = t.tokenize() for t in texts
#df['text'] = for idx in df.index:
#    tokenize(df.ix[idx]['text'])

In [14]:
# Manual evaluation -- not using

confmx = metrics.confusion_matrix(y_test, pred)
TP = 
TN = 
FP = 
FN = 
metrics.precision_score
print("TP = ", TP)
print("TN = ", TN)
print("FP = ", FP)
print("FN = ", FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1Score = 2*(precision*recall/(precision+recall))
print("Summary Stats")
print('Model Accuracy = ' + str(float(accuracy)))
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score )

NameError: name 'TP' is not defined