# Naive Bayes

**Note: The following cell imports scikit-learn's Naive Bayes classifiers and defines the helper functions used to load and vectorize the text data set. Please make sure you have run this cell before you run other cells in this notebook.**

In [36]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

def loadDataSet(dataset):
    """Read a tab-separated labelled text file and turn it into feature vectors.

    Each non-blank line is expected to hold ``label<TAB>text``. The label is
    the first tab-separated field and the text is the second; any further
    fields are ignored. The text is tokenised with ``textParse``, a
    vocabulary is built over all messages with ``createVocabList``, and each
    message is encoded with ``setOfWords2Vec``.

    Args:
        dataset: path of the file to read.

    Returns:
        A ``(returnVec, labels)`` tuple: one 0/1 vector per message and the
        matching list of label strings, in file order.

    Raises:
        IndexError: if a non-blank line contains no tab character.
    """
    with open(dataset) as f:
        # Whitespace-only lines carry no record; skipping them avoids an
        # IndexError on, for example, a trailing blank line.
        records = [line.split('\t') for line in f if line.strip()]

    labels = [fields[0] for fields in records]
    # Build a real list rather than using map(): the parsed messages are
    # walked twice below, and a Python 3 map object would already be
    # exhausted on the second pass.
    parsedText = [textParse(fields[1].rstrip()) for fields in records]
    vocabList = createVocabList(parsedText)
    returnVec = [setOfWords2Vec(vocabList, parsedSMS) for parsedSMS in parsedText]
    return returnVec, labels
         
def createVocabList(dataSet):
    """Collect every distinct token that appears in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list of the unique tokens, in whatever order set iteration yields.
    """
    vocabulary = set()
    for tokens in dataSet:
        vocabulary = vocabulary.union(set(tokens))
    return list(vocabulary)
        
def textParse(bigString):
    """Split raw text into lowercase alphabetic tokens longer than two characters.

    Every run of characters outside ``A-Z`` / ``a-z`` acts as a separator.
    Tokens of length two or less (including the empty strings produced at
    the edges of the text) are dropped.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lowercase tokens in order of appearance.
    """
    import re
    # Use '+' rather than '*': a pattern that can match the empty string makes
    # re.split on Python 3.7+ cut between every character, leaving only
    # one-letter pieces that the length filter below would throw away.
    listOfTokens = re.split(r'[^A-Za-z]+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def setOfWords2Vec(vocabList,inputSet):
    """Encode a tokenised document as a 0/1 presence vector over a vocabulary.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: iterable of words found in the document.

    Returns:
        A list of ``len(vocabList)`` integers, 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere. Words that are not in the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each word to its first position (the same slot list.index() would
    # pick) so every lookup is a dict access instead of a scan of the list.
    firstIndex = {}
    for position, term in enumerate(vocabList):
        firstIndex.setdefault(term, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is None:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('the word: %s is not in my Vocabulary' % word)
        else:
            returnVec[position] = 1
    return returnVec

## Build a classifier

In [26]:
# raw_input only exists on Python 2; fall back to input, its Python 3
# equivalent, so this cell runs on either interpreter.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Path of the tab-separated data file and the number of cross-validation folds.
dataset = read_line('Please Enter Your Data Set:')
n_foldCV = int(read_line("Please Enter the Number of Folds:"))
# Feature vectors and labels shared by every classifier cell below.
returnVec, labels = loadDataSet(dataset)

Please Enter Your Data Set:SMSSpamCollection


In [30]:
# Fit a Bernoulli Naive Bayes model on the full data set, then compute
# n_foldCV-fold cross-validated scores for it.
clf = BernoulliNB()
clf.fit(returnVec, labels)
scores = cross_val_score(clf, returnVec, labels, cv=n_foldCV)
# `scores` is overwritten by every classifier cell; keep a model-specific
# copy so this result survives running the other classifiers.
bernoulli_scores = scores

In [34]:
# Fit a Gaussian Naive Bayes model on the full data set, then compute
# n_foldCV-fold cross-validated scores for it.
clf = GaussianNB()
clf.fit(returnVec, labels)
scores = cross_val_score(clf, returnVec, labels, cv=n_foldCV)
# `scores` is overwritten by every classifier cell; keep a model-specific
# copy so this result survives running the other classifiers.
gaussian_scores = scores

In [38]:
# Fit a Multinomial Naive Bayes model on the full data set, then compute
# n_foldCV-fold cross-validated scores for it.
clf = MultinomialNB()
clf.fit(returnVec, labels)
scores = cross_val_score(clf, returnVec, labels, cv=n_foldCV)
# `scores` is overwritten by every classifier cell; keep a model-specific
# copy so this result survives running the other classifiers.
multinomial_scores = scores

## Evaluate a classifier

In [31]:
# Reports whichever classifier's `scores` was computed most recently, so run
# this cell directly after the classifier cell you want to evaluate.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(scores)
# Mean score with a +/- two-standard-deviation spread across the folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.98207885  0.97043011  0.97217235  0.97486535  0.97935368]
Accuracy: 0.98 (+/- 0.01)


In [35]:
# Reports whichever classifier's `scores` was computed most recently, so run
# this cell directly after the classifier cell you want to evaluate.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(scores)
# Mean score with a +/- two-standard-deviation spread across the folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.88799283  0.87455197  0.88240575  0.88509874  0.89856373]
Accuracy: 0.89 (+/- 0.02)


In [39]:
# Reports whichever classifier's `scores` was computed most recently, so run
# this cell directly after the classifier cell you want to evaluate.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(scores)
# Mean score with a +/- two-standard-deviation spread across the folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.97580645  0.97401434  0.96140036  0.97307002  0.96947935]
Accuracy: 0.97 (+/- 0.01)
