# [機械学習とPythonとの出会い](http://www.kamishima.net/mlmpyja/) をちゃんとよむ
numpy初心者には何も言わずこれを勧めておけばいいような気がする

In [2]:
import numpy as np
import scipy as sp
import os, sys
import matplotlib
matplotlib.rcParams['svg.fonttype'] = 'none'
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette("colorblind")
sns.set_context("paper", font_scale=2)
sns.set_style("white")
%matplotlib inline

## 単純ベイズ：入門編

### 型について

In [3]:
# With no explicit dtype, NumPy infers the element type from the data;
# integer literals therefore produce an integer dtype.
a = np.array([1, 2, 3])
a.dtype

dtype('int64')

In [4]:
# Passing dtype=float forces floating-point storage even though the
# literals are integers.
a = np.array([1, 2, 3], dtype=float)
a.dtype

dtype('float64')

In [7]:
# Calling np.array() on an existing array with a different dtype builds a
# converted array (float -> complex here).
a = np.array(a, dtype=complex)
a.dtype, a

(dtype('complex128'), array([ 1.+0.j,  2.+0.j,  3.+0.j]))

In [8]:
# astype() returns a new array converted to the requested dtype.
# Converting complex to float keeps only the real part, and NumPy emits a
# warning about the discarded imaginary part.
a = a.astype(float)
a.dtype, a

  if __name__ == '__main__':


(dtype('float64'), array([ 1.,  2.,  3.]))

## 単純ベイズ

学習データ$\mathcal{D}$は
$$\mathcal{D}=\{\mathbf{x}_i, y_i\},\,i=1,\ldots,N$$
と表される．

$$
\text{特徴量}: \mathbf{x}_i  = (x_{i1}, x_{i2}, \dots, x_{iK})
$$
where $x_{ij} \in \{1, 2, \dots, F_j\}$.

$$\text{ラベル}: y_i \in \{1, 2, \dots, C\}$$

$$
X = \begin{pmatrix}
\mathbf{x}_1 \\
\mathbf{x}_2 \\
\dots \\
\mathbf{x}_N 
\end{pmatrix}
$$
$$
\mathbf{y} = \begin{pmatrix}
y_1 \\
y_2 \\
\dots \\
y_N
\end{pmatrix}
$$

$$
\Pr[\mathbf{X}, Y] = \Pr[Y] \prod_{j=1}^K \Pr[X_j | Y]
$$

$$
\begin{aligned}
\Pr[\mathbf{x}_i, y_i] &= p(y_i)\,p(x_{i1}, x_{i2}, \dots, x_{iK}|y_i) \\
&= p(y_i)\,p(x_{i1}|y_i)\,p(x_{i2}, \dots, x_{iK}|y_i, x_{i1}) \\
&= p(y_i)\,p(x_{i1}|y_i)\,p(x_{i2}|y_i, x_{i1})\,p(x_{i3}, \dots, x_{iK}|y_i, x_{i1}, x_{i2}) \\
&= p(y_i)\prod_{j=1}^K p(x_{ij} | y_i) \quad \text{単純ベイズの仮定により} \\
&= \Pr[Y] \prod_{j=1}^K \Pr[X_j | Y]
\end{aligned}
$$

ここで$\Pr[Y]$と$\Pr[X_j|Y]$がカテゴリ分布（離散分布）である場合，パラメータは次のとおり．

\begin{split}\Pr[y],&\quad y=1, \ldots, C\\
\Pr[x_j | y],&\quad y=1,\ldots,C,\;x_j=1,\ldots,F_j,\; j=1,\ldots,K\end{split}


以下簡単のため
$$
\text{特徴量}: \mathbf{x}_i  = (x_{i1}, x_{i2}, \dots, x_{iK})
$$
where $x_{ij} \in \{0, 1\}$.

$$\text{ラベル}: y_i \in \{0, 1\}$$
とする．

これにより$\Pr[Y]$と$\Pr[X_j|Y]$はベルヌーイ分布に従う．

$\mathcal{D}=\{\mathbf{x}_i, y_i\},\,i=1,\ldots,N$ に対する対数尤度は

$$
\mathcal{L}(\mathcal{D}; \{\Pr[y]\}, \{\Pr[x_j | y]\}) = \sum_{(\mathbf{x}_i, y_i)\in\mathcal{D}} \ln\Pr[\mathbf{x}_i, y_i]
$$

入力ベクトル$\mathbf{x}^{new}$が与えられたときにクラス事後確率を最大にするクラスは

\begin{split}\hat{y} &= \arg\max_y \Pr[y|\mathbf{x}^\mathrm{new}] \\
        &= \arg\max_y \frac{\Pr[y]\Pr[\mathbf{x}^\mathrm{new}|y]}{\sum_{y'} \Pr[y']\Pr[\mathbf{x}^\mathrm{new} | y']} \\
        &= \arg\max_y \Pr[y]\Pr[\mathbf{x}^\mathrm{new}|y] \\
        &= \arg\max_y \Big(\Pr[y]
           \prod_j \Pr[x_j^\mathrm{new}|y]\Big) \\
        &= \arg\max_y \Big(\log\Pr[y] +
           \sum_j \log\Pr[x_j^\mathrm{new}|y]\Big)\end{split}

ここで
$$
\Pr[y]=\frac{N[y_i=y]}{N},\quad y\in\{0,1\}
$$

$$
\Pr[x_j | y]=\frac{N[x_{ij}=x_j, y_i=y]}{N[y_i=y]},
\quad y\in\{0,1\},\;x_j\in\{0,1\},\;j=1,\ldots,K
$$

### 実装

In [12]:
# Demonstrates assert: the condition is false, so AssertionError is raised
# carrying the message "hum".
assert 1 == 2, "hum"

AssertionError: hum

In [15]:
# 2x2 array filled with zeros (np.zeros defaults to a float dtype).
a = np.zeros((2, 2))

In [17]:
class NaiveBayes1(object):
    """
    Naive Bayes classifier for binary class labels and binary features.

    Every class label and every feature value must be 0 or 1.  All
    probabilities are plain relative frequencies (maximum-likelihood
    estimates); no smoothing is applied, so a probability can be exactly 0.

    Attributes
    ----------
    `pY_` : ndarray, shape=(n_classes), dtype=float
        pmf of a class; None until `fit` has been called
    `pXgY_` : ndarray, shape=(n_features, n_fvalues, n_classes), dtype=float
        pmf of feature values given a class, indexed as
        ``pXgY_[feature, feature_value, class]``; None until `fit` has been
        called
    """

    def __init__(self):
        self.pY_ = None
        self.pXgY_ = None

    def fit(self, X, y):
        """
        Fitting model

        Estimates `pY_` and `pXgY_` from the training data by counting.

        Parameters
        ----------
        X : array_like, shape=(n_samples, n_features), dtype=int
            feature values of training samples (each 0 or 1)
        y : array_like, shape=(n_samples), dtype=int
            class labels of training samples (each 0 or 1)

        Raises
        ------
        ValueError
            if `X` and `y` do not contain the same number of samples
        """

        # accept lists / nested lists as well as ndarrays
        X = np.asarray(X)
        y = np.asarray(y)

        # constants
        n_samples = X.shape[0]
        n_features = X.shape[1]
        n_classes = 2     # number of values a class label can take
        n_fvalues = 2     # number of values a feature can take

        # check the size of y
        if n_samples != len(y):
            raise ValueError('Mismatched number of samples.')

        # tolerate labels given as a column vector of shape (n_samples, 1)
        y = y.ravel()

        # count up n[yi=y]; np.add.at accumulates repeated indices correctly
        nY = np.zeros(n_classes, dtype=int)
        np.add.at(nY, y, 1)

        # calc pY_ (explicit float division)
        self.pY_ = nY / float(n_samples)

        # count up n[x_ij=xj, yi=y]: for every feature j, the number of
        # samples whose j-th feature equals xj and whose label equals y.
        # The three index arrays broadcast to shape (n_samples, n_features).
        nXY = np.zeros((n_features, n_fvalues, n_classes), dtype=int)
        np.add.at(nXY, (np.arange(n_features), X, y[:, np.newaxis]), 1)

        # calc pXgY_.  A class that never occurs in y has nY == 0; its
        # conditional probabilities are left at 0 instead of 0/0 = NaN,
        # because a NaN score would win np.argmax in predict().
        self.pXgY_ = np.divide(nXY, nY,
                               out=np.zeros(nXY.shape, dtype=float),
                               where=nY > 0)

    def predict(self, X):
        """
        Predict class

        Returns, for each sample, the class maximizing
        ``log Pr[y] + sum_j log Pr[x_j | y]``.

        Parameters
        ----------
        X : array_like, shape=(n_samples, n_features), dtype=int
            feature values of unseen samples (each 0 or 1)

        Returns
        -------
        y : array_like, shape=(n_samples), dtype=int
            predicted class labels

        Raises
        ------
        RuntimeError
            if the model has not been fitted yet
        """

        if self.pY_ is None or self.pXgY_ is None:
            raise RuntimeError('fit() must be called before predict().')

        X = np.asarray(X)
        n_features = X.shape[1]

        # Zero probabilities are legitimate without smoothing; log(0) = -inf
        # is the intended score, so silence the divide-by-zero warning.
        with np.errstate(divide='ignore'):
            log_pY = np.log(self.pY_)
            log_pXgY = np.log(self.pXgY_)

        # log_pXgY[j, X[i, j], :] for every sample i and feature j
        # -> shape (n_samples, n_features, n_classes); summing over the
        # feature axis gives the log joint probability per sample and class.
        logpXY = log_pY + np.sum(log_pXgY[np.arange(n_features), X, :],
                                 axis=1)

        # predict class
        return np.argmax(logpXY, axis=1)