In [63]:
#import pandas
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

dataset_dir='/Users/kiruthikasekar/Workspace/workshop/datasets/'

df = pd.read_table(dataset_dir+'SMSspam.txt',
                   sep='\t', 
                   header=None,
                   names=['label', 'message'])
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [64]:
# preprocessing data
# mapping string labels to numeric values
df['label']=df['label'].map({'ham': 0, 'spam': 1})

# remove punctuations
df['message'] = df.message.str.replace('[^\w\s]', '')

# converting all messages to lower case
df.dtypes
df['message']=df['message'].str.lower()

df


label       int64
message    object
dtype: object

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say


In [18]:
import nltk
nltk.download()


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [65]:
# tekenize the words in the message
df['message']
df['message'].dropna(inplace=True)
df['message'] = df['message'].apply(nltk.word_tokenize)


0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
Name: message, dtype: object

In [66]:
# apply stemming on the tokenized words
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
 
df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x])
df

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,0,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."


In [67]:
# transforming into features
from sklearn.feature_extraction.text import CountVectorizer

# This converts the list of words into space-separated strings
df['message'] = df['message'].apply(lambda x: ' '.join(x))

count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['message'])

In [68]:
# using TFIDF
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer().fit(counts)

counts = transformer.transform(counts)

In [69]:
# training model
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.3, random_state=0)

In [70]:
# Model fitting
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)

In [71]:
# evaluating model
import numpy as np

predicted = model.predict(X_test)

print(np.mean(predicted == y_test))

0.5


In [72]:
# performance measure
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, predicted))

[[1 0]
 [1 0]]
