In [132]:
import numpy as np
from scipy.stats import bernoulli,  norm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class binary_naive_bayes():
    """
    A class which can fit data set according to the Naive Bayes method and predict new unseen data
    (USING MLE)
    Atributes:
        data : Contains the Data which the model was fit with
    Methods:
        fit(data)
        predit(new_data)
    """
    def __init__(self):
        """Initializes the class with data, model and Class atributes"""
        self.data = None
        self.model = {'discrete_feature_MLE' : [], 'continuous_feature_MLE' : []}
        self.classes = [0, 1] #Note that by making this dynamic we can take this from a binary classifyer to an arbitary one
    def fit(self, discrete_features, continuous_features, output):
        """
        Description:
            fit(data): Creates a model for prediction
        Parameters:
            discrete_features: A $nxm_0$ numpy array which contains the data to train on 
                              (m is number of discrete_features n is the number of entries)
            continuous_features: A $nxm_1$ numpy array which contains the data to train on 
                                 ($m_1$ is number of discrete_features n is the number of entries)
            output: A $nx1$ numpy array which contains either 0 or 1 for the output of the data given
        """
        for i in range(continuous_features.shape[1]):
            for _class in self.classes:
                col = continuous_features[np.where(output == _class)[0]][:, i]
                self.model['continuous_feature_MLE'].append((_class, np.mean(col), np.std(col, ddof=1)))
        for i in range(discrete_features.shape[1]):
            for _class in self.classes:
                col = discrete_features[np.where(output == _class)[0]][:, i]
                self.model['discrete_feature_MLE'].append((_class, np.mean(col)))
        self.data = np.hstack((discrete_features, continuous_features))
    def predict(self, new_data_discrete_features, new_data_continuous_features):
        """
        Description:
            predict(new_data): Predicts which of the two categories `new_data` will fall into
                               Raises Values error if no data has been fit yet
        Parameters:
            new_data_discrete_features: A new nxm_0 numpy vector which contains all of the atributes 
            new_data_continuous_features: A new nxm_1 numpy vector which contains all of the atributes 
            
        """
        if self.data is None:
            raise ValueError("You must successfully fit data before calling this method")
        if new_data_discrete_features.shape[0] != new_data_continuous_features.shape[0]:
            raise ValueError("Discrete and continous features don't match in the first dimension")
        def get_prob_for_single(new_data_discrete_feature, new_data_continuous_feature):
            prob = []
            for _class in self.classes:
                class_prob = 0
                discrete_features = [j for j in self.model['discrete_feature_MLE'] if j[0]==_class]
                continuous_features = [j for j in self.model['continuous_feature_MLE'] if j[0]==_class]
                for i, discrete_params in enumerate(discrete_features):
                    class_prob += np.log(bernoulli.pmf(new_data_discrete_feature[i], discrete_params[1]))
                for i, continuous_params in enumerate(continuous_features):
                    _, mean, std = continuous_params
                    class_prob += np.log(norm.pdf(new_data_continuous_feature[i], mean, std))
                prob.append(class_prob)
            prob = np.exp(prob)
            return prob/(np.sum(prob)) #Scale
        prob_array = []
        for k in range(new_data_discrete_features.shape[0]):
            prob_array.append(get_prob_for_single(new_data_discrete_features[k, :], new_data_continuous_features[k, :]))
        return np.array(prob_array)

[See adult data set here -- classification problem to determine if individual makes over 50K a year](https://archive.ics.uci.edu/ml/datasets/Adult)

In [54]:
data = pd.read_csv('./adult.data', delimiter=", ")#Import data 

  """Entry point for launching an IPython kernel.


In [55]:
y = data[["output"]]
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] 
X = data.drop(['fnlwgt', 'output'], axis=1)
X = pd.get_dummies(X, columns = categorical_cols)


In [59]:
discrete_var_column_names = [column_name for column_name in X.columns if "_" in column_name]
continuous_var_column_names = [column_name for column_name in X.columns if "_" not in column_name]

In [67]:
X_discrete = X[discrete_var_column_names].to_numpy()
X_continuous = X[continuous_var_column_names].to_numpy()
y = (y=="<=50K").to_numpy() + 0

In [78]:
X_continuous.shape 
#We're going to stack our X_discrete and X_continuous -- so we can test, train, split -- 
#we'll just tack the continous variables at the end and remember to break it back up after the split

(32561, 5)

In [81]:
X_train, X_test, y_train, y_test = train_test_split(np.hstack((X_discrete, X_continuous)), y, test_size=0.33, random_state=42)

In [130]:
X_discrete_trian = X_train[:, :102]
X_continuous_trian = X_train[:, 102:107]

personal_inmplementaiton_clf = binary_naive_bayes()
personal_inmplementaiton_clf.fit(X_discrete_trian, X_continuous_trian, y_train)

In [131]:
X_discrete_test = X_test[:, :102]
X_continuous_test = X_test[:, 102:107]

y_pred = personal_inmplementaiton_clf.predict(X_discrete_test, X_continuous_test)



In [133]:
accuracy_score(y_test, np.argmax(y_pred, axis=1))

0.8305415968732551

You'll notice that compared to the out of the Box Naive Bayes provided by sklearn will actually perform worse then the model above. This is because it (perhaps wrongly) assumes that even the discrete data is drawn from a normal distrobution. Thus arguably the above model above is `more` correct.

In [134]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

  return f(**kwargs)


0.801321421924437