## Naive Bayes Classifier
- for categorical data held in pandas objects
- works with log-probabilities, so products of many small probabilities do not underflow

In [1]:
import numpy as np
import pandas as pd
import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [2]:
class NBC_category():
    '''
    Naive Bayes classifier for categorical pandas data, evaluated in log space.

    log P(y) and log P(x_i|y) are estimated from value counts of the training
    data; predict() returns the label that maximises their sum. Values passed
    to predict() are matched to the training columns by position.

    alpha : additive (Laplace/Lidstone) smoothing added to every
            (feature value, label) count. The default 0.0 gives the plain
            relative frequencies, so a (value, label) pair that never occurs
            together in training has log-probability -inf.
    '''

    def __init__(self, alpha=0.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        self.features = pd.DataFrame()
        self.label = pd.DataFrame()
        self.log_prob_label_ = {}    # {y: log P(y)}
        self.log_prob_feature_ = {}  # {column: {(x, y): log P(x|y)}}
        self.idx_col = {}            # {position: column}

    def train(self, features, label):
        '''
        Estimate log P(y) and, for every column, log P(x_i|y).

        features : pandas DataFrame (except label)
        label : pandas Series
        '''
        self.features = features
        self.label = label
        self.log_prob_label_ = self.column_log_prob(label)

        # Start from empty tables so that re-training on other columns does
        # not leave entries of a previous fit behind.
        self.log_prob_feature_ = {}
        self.idx_col = {}
        for idx, col in enumerate(features):
            self.log_prob_feature_[col] = self.feature_log_prob(features[col], label)
            self.idx_col[idx] = col

    def predict(self, data):
        '''
        y = argmax{ log(P(y)) + log(P(x_1|y)) + log(P(x_2|y)) + ... }

        data : integer-indexable container with one value per trained
               feature, in the column order that was given to train()
        return : the label with the highest score; when several labels tie,
                 the one that appears last among the training labels wins.
                 None if the model holds no labels.
        raise : ValueError if len(data) differs from the number of trained
                features; KeyError if a value was never seen for its feature
        '''
        if len(data) != len(self.idx_col):
            raise ValueError(
                f"expected {len(self.idx_col)} feature values, got {len(data)}"
            )

        scores = {}
        for y, log_prior in self.log_prob_label_.items():
            score = log_prior
            for i, col in self.idx_col.items():
                value = data[i]
                table = self.log_prob_feature_[col]
                try:
                    score += table[(value, y)]
                except KeyError:
                    raise KeyError(
                        f"value {value!r} of feature {col!r} was not seen during training"
                    ) from None
            scores[y] = score

        # -np.inf is the identity for max and, unlike np.log(0), is obtained
        # without triggering a divide-by-zero RuntimeWarning.
        best_score = -np.inf
        best_label = None
        for y, score in scores.items():
            if score >= best_score:
                best_score = score
                best_label = y

        return best_label

    def column_log_prob(self, column):
        '''
        calculate log(p(y))
        The column does not have to be the label: for each unique value of a
        single column (Series) the relative frequency is computed and its
        log is returned.
        return : {'y_1':log(P(y_1)) , 'y_2': log(P(y_2)), ...}
        '''
        total_len = len(column)
        log_prob_dict = {}

        # A unique value with zero matches (e.g. NaN, which never compares
        # equal to itself) yields -inf; errstate keeps that log(0) silent.
        with np.errstate(divide='ignore'):
            for val in column.unique():
                count = int((column == val).sum())
                log_prob_dict[val] = np.log(count) - np.log(total_len)

        return log_prob_dict

    def feature_log_prob(self, feature, label):
        '''
        calculate log(p(x_i|y))
        The values are computed and returned as a dictionary. With
        self.alpha > 0 every count is increased by alpha before normalising.
        return : {(x_i, y_j): log(P(x_i|y_j)) ... }   for x_i in feature, y_j in label
        '''
        # concat pairs the two Series by index label, not by position.
        paired = pd.concat([feature, label], axis=1)
        feature_vals = list(feature.unique())
        label_vals = list(label.unique())

        f_pos = {val: pos for pos, val in enumerate(feature_vals)}
        l_pos = {val: pos for pos, val in enumerate(label_vals)}

        # counts[l, f] = number of rows with label l and feature value f
        counts = np.zeros((len(label_vals), len(feature_vals)), dtype=float)
        for f_val, l_val in paired.values:
            counts[l_pos[l_val], f_pos[f_val]] += 1

        smoothed = counts + self.alpha
        per_label_total = smoothed.sum(axis=1, keepdims=True)
        # Zero counts legitimately map to -inf; do not warn about log(0).
        with np.errstate(divide='ignore'):
            log_probs = np.log(smoothed) - np.log(per_label_total)

        log_prob_dict = {(f_val, l_val): log_probs[l, f]
                         for f, f_val in enumerate(feature_vals)
                         for l, l_val in enumerate(label_vals)}

        return log_prob_dict

## Try with a sample
### Data documentation: [Car Evaluation](https://archive.ics.uci.edu/ml/datasets/Car+Evaluation)
(https://archive.ics.uci.edu/ml/datasets/Car+Evaluation)

* Car Evaluation의 feature: `buying`, `maint`, `doors`, `persons`, `lug_boot`, `safety`

* Car Evaluation의 label: `acceptability`

In [3]:
# Location of the raw Car Evaluation data; column names are supplied explicitly.
sample_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'

sample_df = pd.read_csv(
    sample_url,
    names=["buying", "maint", "doors", "persons", "lug_boot", "safety", "acceptability"],
)

### Pre-processing

In [4]:
# Cast every column to the pandas categorical dtype, one column at a time.
for col in sample_df.columns:
    as_category = sample_df[col].astype('category')
    sample_df[col] = as_category

# Split the frame: everything except the target column, and the target itself.
sample_feature = sample_df.drop('acceptability', axis=1)
sample_label = sample_df['acceptability']

In [5]:
# Preview the first three rows of the feature frame.
sample_feature.head(3)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high


In [6]:
# Preview the first three entries of the label Series.
sample_label.head(3)

0    unacc
1    unacc
2    unacc
Name: acceptability, dtype: category
Categories (4, object): ['acc', 'good', 'unacc', 'vgood']

### Create a model

In [7]:
# Instantiate an untrained classifier.
model = NBC_category()

### Train the model

In [8]:
# Estimate the label priors and per-column likelihoods from the whole data set.
model.train(sample_feature, sample_label)

### Prediction with test data

In [9]:
# One value per feature column, in the same order as the training frame.
test_data = ['med', 'med', '2', '4', 'big', 'high']
test_predict = model.predict(test_data)

# Emit the header and the result line with a single print call.
print(
    ' - Prediction of Test data',
    f'test data: {test_data} --> predict: {test_predict}',
    sep='\n',
)

 - Prediction of Test data
test data: ['med', 'med', '2', '4', 'big', 'high'] --> predict: acc
