In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one, so a cell with several bare expressions produces one output per expression.
InteractiveShell.ast_node_interactivity = 'all'

# Chapter 6 - Other Popular Machine Learning Methods
## Segment 5 - Naive Bayes Classifiers

In [2]:
import numpy as np
import pandas as pd
import urllib
import sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

## Naive Bayes
### Using Naive Bayes to predict spam
### read data and explore

In [4]:
# Spambase data set. A local copy is read below; the commented-out lines show
# how the same data could be streamed from its source URL instead.
# url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# import urllib.request

# raw_data = urllib.request.urlopen(url)
raw_data = 'spambase.data'  # relative path, resolved against the working directory
dataset = np.loadtxt(raw_data, delimiter=',')  # comma-separated values parsed into a float array
print(dataset[0])  # peek at the first record

[  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


In [5]:
# Quick structural checks on the loaded array. Each bare expression is shown
# separately because ast_node_interactivity is set to 'all' at the top.
type(dataset)       # container type
dataset.shape       # (rows, columns)
len(dataset[0])     # number of values in the first row
len(dataset[690])   # same check on another row (index 690 looks arbitrary)

numpy.ndarray

(4601, 58)

58

58

In [6]:
# First record, restricted to the leading 48 columns that are used as features.
dataset[0, :48]

array([0.  , 0.64, 0.64, 0.  , 0.32, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.64, 0.  , 0.  , 0.  , 0.32, 0.  , 1.29, 1.93, 0.  , 0.96, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  ])

In [7]:
# Last column of the first record; this column is used as the target below.
dataset[0, -1]

1.0

### create feature and target arrays
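The selected features (columns 0–47) are word frequencies: each value is the percentage of words in the e-mail that match a given word, not a raw count. The target is the last column (1 = spam, 0 = not spam).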

In [8]:
# Column split: the leading 48 columns are the predictors, the final column is the label.
X, y = dataset[:, :48], dataset[:, -1]

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

### `BernoulliNB`
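`binarize` is the threshold used to binarize the *features*: values greater than the threshold become 1 and all others become 0. It does not bin the target. Passing `True` is equivalent to a threshold of `1.0`.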

In [10]:
# Bernoulli Naive Bayes works on binary (on/off) features. `binarize` is the
# threshold applied to X before fitting and predicting: values greater than the
# threshold become 1, everything else 0. It is a float, so pass an explicit
# number -- a bool such as True would silently be treated as the threshold 1.0.
BernNB = BernoulliNB(binarize=1.0)
BernNB.fit(X_train, y_train)   # bare expression: displays the fitted estimator
BernNB.get_params()            # bare expression: displays the hyper-parameters

# Score the model on the held-out split.
y_expect = y_test
y_pred = BernNB.predict(X_test)

print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=True)

{'alpha': 1.0, 'binarize': True, 'class_prior': None, 'fit_prior': True}

0.8577633007600435


In [15]:
# Inspect what the fitted Bernoulli model learned.
# NOTE(review): BernNB is re-created further down with a different `binarize`
# value; these attributes describe whichever fit ran last, so run the notebook
# top to bottom before reading the numbers.
BernNB.feature_count_        # per class, how many training samples had each binarized feature on
BernNB.class_count_          # number of training samples per class
BernNB.classes_              # class labels, in the row order used by the arrays in this cell
# `coef_` was deprecated in scikit-learn 0.24 and removed in 1.1, so accessing it
# raises AttributeError on current versions. For a two-class model it was just
# feature_log_prob_[1:], so display that slice to keep the same output.
BernNB.feature_log_prob_[1:]
BernNB.class_log_prior_      # log prior probability of each class
BernNB.feature_log_prob_     # log probability of each feature being on, one row per class

array([[ 261.,  181.,  586.,    5.,  473.,  202.,   34.,  131.,  144.,
         353.,   93.,  952.,  219.,   75.,   29.,  176.,  180.,  240.,
        1303.,   26.,  754.,   21.,   36.,   37.,  825.,  626.,  571.,
         330.,  277.,  342.,  235.,  165.,  254.,  167.,  352.,  364.,
         552.,   24.,  252.,  180.,  105.,  238.,  215.,  185.,  637.,
         350.,   23.,  127.],
       [ 406.,  429.,  852.,   28.,  847.,  470.,  520.,  430.,  371.,
         605.,  390.,  884.,  379.,  142.,  191.,  732.,  486.,  499.,
        1265.,  279., 1153.,   69.,  441.,  456.,   33.,   18.,    6.,
          25.,    5.,   14.,    3.,    2.,   37.,    7.,   22.,   71.,
          70.,   21.,   34.,  129.,    0.,   12.,   49.,   15.,  293.,
          24.,    4.,   14.]])

array([2241., 1439.])

array([0., 1.])

array([[-1.26427941, -1.20930739, -0.52433305, -3.90579677, -0.53021196,
        -1.1182345 , -1.01734255, -1.20698451, -1.35419874, -0.86621261,
        -1.30438504, -0.48750495, -1.33292134, -2.31024797, -2.01559722,
        -0.67594689, -1.08482847, -1.0584845 , -0.12947499, -1.63830299,
        -0.22210315, -3.02459735, -1.18178271, -1.14840921, -3.74673207,
        -4.32865362, -5.32718245, -4.01499606, -5.48133313, -4.56504239,
        -5.88679823, -6.17448031, -3.63550644, -5.19365105, -4.13759838,
        -2.99642648, -3.01041272, -4.18205014, -3.71774453, -2.40555815,
        -7.2730926 , -4.70814324, -3.36106959, -4.50050387, -1.58951283,
        -4.05421677, -5.66365468, -4.56504239]])

array([-0.49599056, -0.93896432])

array([[-2.14722503, -2.51156285, -1.34054471, -5.92381007, -1.55436221,
        -2.40236356, -4.16022147, -2.83276761, -2.73883579, -1.84627262,
        -3.17227475, -0.85595463, -2.32194199, -3.38483619, -4.31437215,
        -2.5394198 , -2.5170725 , -2.2307726 , -0.54237779, -4.41973267,
        -1.08885179, -4.62452708, -4.10465162, -4.07798337, -0.99897476,
        -1.27462299, -1.36643054, -1.91345116, -2.08794842, -1.87783909,
        -2.25173773, -2.60358175, -2.17430599, -2.59160556, -1.84910148,
        -1.81567218, -1.40021153, -4.49669371, -2.18218005, -2.5170725 ,
        -3.05213044, -2.23910598, -2.34029113, -2.48982286, -1.25723125,
        -1.85478331, -4.5375157 , -2.86353927],
       [-1.26427941, -1.20930739, -0.52433305, -3.90579677, -0.53021196,
        -1.1182345 , -1.01734255, -1.20698451, -1.35419874, -0.86621261,
        -1.30438504, -0.48750495, -1.33292134, -2.31024797, -2.01559722,
        -0.67594689, -1.08482847, -1.0584845 , -0.12947499, -1.63830299,
   

### `MultinomialNB`

In [12]:
# Multinomial Naive Bayes, fitted on the feature values as they are (no thresholding).
MultiNB = MultinomialNB()
MultiNB.fit(X_train, y_train)   # bare expression: displays the fitted estimator
MultiNB.get_params()            # bare expression: displays the hyper-parameters

# Pair the reference labels with the predictions, then report the accuracy.
y_expect, y_pred = y_test, MultiNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

MultinomialNB()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

0.8816503800217155


### `GaussianNB`

In [13]:
# Gaussian Naive Bayes, which treats every feature as a continuous value.
GausNB = GaussianNB()
GausNB.fit(X_train, y_train)   # bare expression: displays the fitted estimator
GausNB.get_params()            # bare expression: displays the hyper-parameters

# Pair the reference labels with the predictions, then report the accuracy.
y_expect, y_pred = y_test, GausNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

GaussianNB()

{'priors': None, 'var_smoothing': 1e-09}

0.8197611292073833


### `BernoulliNB` with a lower `binarize` threshold (0.1)

In [14]:
# Bernoulli Naive Bayes again, this time binarizing the features at 0.1.
# Note that this rebinds the name BernNB to the new model.
BernNB = BernoulliNB(binarize=0.1)
BernNB.fit(X_train, y_train)   # bare expression: displays the fitted estimator
BernNB.get_params()            # bare expression: displays the hyper-parameters

# Pair the reference labels with the predictions, then report the accuracy.
y_expect, y_pred = y_test, BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=0.1)

{'alpha': 1.0, 'binarize': 0.1, 'class_prior': None, 'fit_prior': True}

0.9109663409337676
