# the problem setup
suppose a sample x = (x0,x1,...,xn)  
there are m+1 classes c0,c1,...,cm  

for unknown sample x_k, we compute p(c|x_k), where c=(c0,c1,...,cm)  
and c_pred = argmax_c{p(c|x_k)}  


# the bayes thm.

$ p(c_j|x_k)p(x_k) =  p(c_j, x_k) $  

$ =  p(x_k|c_j)p(c_j) $  


$p(x_k)$
==============================
for $0<=j<=m$, $p(x_k)$ remains the same and thus can be omitted;  

$p(c_j)$
==============================
$p(c_j)$ is the prior and can be computed on the training data;

$p(x_k|c_j)$  
==============================
according to the 'naive' assumption,  
i.e., attributes are conditionally independent of each other given the class
    
 $ p(x_k|c_j) = p(x0_k,x1_k,...,xn_k|c_j)$  
 
 $=  p(x0_k|c_j)*p(x1_k|c_j)*...*p(xn_k|c_j)$
      

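for each attribute's conditional probability,
1. if the attribute takes discrete values, the probability is estimated as
    the count of class-c_j training samples having that value divided by
    the total count of class-c_j training samples
2. if the attribute takes continuous values, its per-class distribution is
    modelled as a gaussian whose mean and standard deviation are estimated
    from the class-c_j training samples.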
  
all of the above computation is done on the training data

In [None]:
import numpy as np
import logging

class NaiveBayes(object):
    """Naive Bayes classifier for comma-separated records with mixed attributes.

    Each record is a line of comma-separated fields whose last field is the
    class label.  Attributes whose column index is listed in
    ``continuous_attr`` are modelled with a per-class Gaussian; every other
    attribute is treated as discrete and modelled with per-class value counts
    using add-one smoothing.
    """

    # Lower bound applied to a class-conditional standard deviation so that a
    # constant-valued attribute does not cause a division by zero.
    _MIN_STD = 1e-9

    def __init__(self, continuous_attr, logger='logger.txt'):
        """Configure logging and initialise an untrained model.

        :param continuous_attr: column indices of the continuous attributes
        :param logger: path of the log file passed to ``logging.basicConfig``
        """
        logging.basicConfig(filename=logger, filemode='w',
                            level=logging.DEBUG, format='%(message)s')  # debug < info < warning < error < critical
        self.continuous = continuous_attr
        self.samples = None
        self.labels = None
        self.cls = []  # class names; the position of a name is its numeric label
        self.n_samples = None
        self.n_attribute = None
        self.n_classes = None
        self.class_priors = None
        self.class_counts = None  # number of training samples per class
        # n_classes x n_attribute object array: a dict {value: count} for a
        # discrete attribute, a (mean, std) tuple for a continuous one
        self.c_x_priors = None

    def _read_data(self, file, train=True):
        """Load samples and labels from a comma-separated file.

        Rows containing a ``'?'`` field or fewer than two attributes are
        skipped.  When ``train`` is true, labels are converted to integer
        indices into ``self.cls`` (new class names are appended); otherwise
        labels are kept as strings.  ``self.samples``, ``self.labels``,
        ``self.n_samples``, ``self.n_attribute`` and ``self.n_classes`` are
        overwritten.

        :param file: path of the file to read
        :param train: whether to register labels as training classes
        """
        samples = []
        labels = []
        with open(file) as f:
            for line in f:
                # a trailing '.' is stripped from every field (labels included)
                fields = [x.strip().strip('.') for x in line.split(',')]
                sample, label = fields[:-1], fields[-1]

                if '?' in sample or len(sample) < 2:
                    continue
                # only convert label to numbers during training
                if train:
                    if label not in self.cls:
                        self.cls.append(label)
                    label = self.cls.index(label)

                samples.append(sample)
                labels.append(label)

        self.samples = np.array(samples)
        self.labels = np.array(labels)
        self.n_classes = len(self.cls)
        self.n_samples = len(samples)
        # guard against a file with no usable rows
        self.n_attribute = len(samples[0]) if samples else 0

    def _compute_class_prior(self):
        """Compute per-class sample counts and prior probabilities.

        Sets ``self.class_counts`` and ``self.class_priors`` from
        ``self.labels``, which must hold integer class indices.

        :raises ValueError: if there are no samples
        """
        if not self.n_samples:
            raise ValueError('cannot compute class priors without samples')
        labels = np.asarray(self.labels, dtype=np.int64)
        self.class_counts = np.bincount(labels, minlength=self.n_classes)
        self.class_priors = (self.class_counts / float(self.n_samples)).astype(np.float32)

    def _compute_c_x_prior(self):
        """Estimate the class-conditional model of every attribute.

        Fills ``self.c_x_priors``: value counts for discrete attributes and
        ``(mean, std)`` for continuous ones, both computed per class from
        ``self.samples`` / ``self.labels``.
        """
        continuous = set(self.continuous)
        discrete = [i for i in range(self.n_attribute) if i not in continuous]

        table = np.empty((self.n_classes, self.n_attribute), dtype=object)
        for c in range(self.n_classes):
            for i in discrete:
                table[c, i] = {}

        for sample, label in zip(self.samples, self.labels):
            for i in discrete:
                counts = table[label, i]
                counts[sample[i]] = counts.get(sample[i], 0) + 1

        for i in self.continuous:
            # parse as float64: a 16-bit integer cannot hold large values
            column = self.samples[:, i].astype(np.float64)
            for c in range(self.n_classes):
                values = column[self.labels == c]
                table[c, i] = (float(np.mean(values)), float(np.std(values)))

        self.c_x_priors = table

    def _gaussian(self, x, c, i):
        """Return the Gaussian density of value ``x`` for class ``c``, attribute ``i``.

        :param x: attribute value (number or numeric string)
        :param c: class index
        :param i: attribute index, expected to be a continuous attribute
        """
        mean, sigma = self.c_x_priors[c, i]
        # floor the deviation so a constant attribute does not divide by zero
        sigma = max(float(sigma), self._MIN_STD)
        value = float(x)
        fraction = 1. / (np.sqrt(2 * np.pi) * sigma)
        exp = np.exp(-(value - mean) ** 2 / (2 * sigma ** 2))
        return fraction * exp

    def _predict(self, sample):
        """Score one sample against every class.

        The score of class ``c`` is ``p(c) * prod_i p(x_i | c)``; the common
        factor ``p(x)`` is omitted.  Discrete likelihoods use add-one
        smoothing with one extra bucket reserved for values unseen in
        training: ``(count + 1) / (class_count + n_seen_values + 1)``.

        :param sample: sequence of attribute values
        :return: ``(proba, pred)`` - the list of per-class scores and the
            name of the highest-scoring class
        """
        continuous = set(self.continuous)
        # Use the counts recorded at training time: self.n_samples is
        # overwritten when a test file is loaded.
        counts = getattr(self, 'class_counts', None)
        if counts is None:
            counts = np.asarray(self.n_samples * self.class_priors)

        proba = []
        for c in range(self.n_classes):
            p = float(self.class_priors[c])
            total = float(counts[c])
            for idx, attr in enumerate(sample):
                if idx in continuous:
                    # the gaussian density for class c attr idx
                    p *= self._gaussian(attr, c, idx)
                    continue
                entry = self.c_x_priors[c, idx] or {}
                p *= (entry.get(attr, 0) + 1.) / (total + len(entry) + 1)
            proba.append(p)

        pred = self.cls[int(np.argmax(proba))]
        return proba, pred

    def train(self, file='adult.data'):
        """Read the training file and fit the priors and conditional models.

        :param file: path of the training file
        """
        self._read_data(file=file)
        self._compute_class_prior()
        self._compute_c_x_prior()

    def test(self, file='adult.test'):
        """Classify every sample of a test file and log the results.

        Loading the test file replaces ``self.samples`` and ``self.labels``.

        :param file: path of the test file
        :return: fraction of correctly classified samples
        """
        self._read_data(file=file, train=False)
        correct = 0
        for sample, label in zip(self.samples, self.labels):
            proba, predicted = self._predict(sample)
            if predicted == label:
                correct += 1
                logging.info('sample={} is predict {}'.format(sample, predicted))
            else:
                logging.warning('=====================================\n'
                                + 'sample={} is predict {}\nwrong answer, should be {}. the estimated proba is {}\n'.format(sample, predicted, label, proba)
                                + '=====================================')

        n_tested = len(self.samples)
        accuracy = np.float32(correct) / n_tested if n_tested else np.float32(0)
        logging.info('the overall accuracy is {}'.format(accuracy))
        return accuracy