In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [3]:
from sklearn.model_selection import train_test_split


# Load the training features, training labels and test features,
# each indexed by the building_id column.
X = pd.read_csv("train_values.csv", index_col="building_id")
y = pd.read_csv("train_labels.csv", index_col="building_id")
X_test = pd.read_csv("test_values.csv", index_col="building_id")

# Re-label the damage grades 1, 2, 3 as the zero-based classes 0, 1, 2.
y = y.assign(damage_grade=y['damage_grade'].replace({1: 0, 2: 1, 3: 2}))

# One-hot encode the training features as integer indicator columns.
# NOTE(review): X_test is not encoded in this cell - confirm it is handled
# before it is passed to a model.
X = pd.get_dummies(X, dtype=int)
print(X.columns)

# Standardise every column: z = (x - mean) / population standard deviation.
X_scaler = X.sub(X.mean()).div(X.std(ddof=0))

# Correlation matrix of the features: (1 / n) * Z^T Z on the standardised data.
X_corr = (1 / len(X_scaler)) * (X_scaler.T @ X_scaler)

# Hold out 20% of the (unscaled) encoded data for validation; fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=40
)


Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
       'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other', 'land_surface_condition_n',
       'land

In [4]:
class MixedNB():
    """Naive Bayes classifier mixing Gaussian and Bernoulli feature likelihoods.

    Each feature column is modelled, independently given the class, either as
    a Gaussian (continuous features) or as a Bernoulli (0/1 features).

    Parameters
    ----------
    gauss_index : sequence of int, optional
        Column positions modelled with a Gaussian likelihood.
    bernouli_index : sequence of int, optional
        Column positions modelled with a Bernoulli likelihood.  A position
        listed in both sequences is treated as Gaussian.
    alpha : float, default 1
        Additive (Laplace) smoothing applied to the class priors and to the
        Bernoulli success probabilities.

    If both index arguments are ``None`` the split is inferred during
    ``fit``: columns containing only 0/1 become Bernoulli, the rest Gaussian.
    If only one is given, the other is treated as empty and columns listed in
    neither are ignored.

    Attributes set by ``fit``
    -------------------------
    classes : ndarray, sorted unique labels.
    n_features : int, number of feature columns seen during ``fit``.
    class_priors : ndarray (n_classes,), smoothed class priors.
    P_ic : ndarray (n_classes, n_features), smoothed P(x_j = 1 | class).
    means, vars : ndarray (n_classes, n_features), per-class mean and
        variance (the variance includes a small positive floor).
    """

    # Fraction of the largest feature variance (at least 1.0) added to every
    # per-class variance so a constant feature never yields a zero variance.
    _VAR_SMOOTHING = 1e-9
    # Bernoulli probabilities are clipped to [eps, 1 - eps] before taking
    # logs, which keeps the result finite even when alpha == 0.
    _PROB_EPS = 1e-12

    def __init__(self, gauss_index=None, bernouli_index=None, alpha=1):
        self.gauss_index = gauss_index
        self.bernouli_index = bernouli_index
        self.alpha = alpha


    def fit(self, X, Y):
        """Estimate priors and per-class feature statistics.

        Parameters
        ----------
        X : array-like or DataFrame, shape (n_samples, n_features)
            Numeric feature matrix.
        Y : array-like, Series or single-column DataFrame, n_samples labels.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, disagrees with ``Y`` in length, or
            a configured feature index is out of range.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a single-column DataFrame/2-D array acts as a label vector.
        Y = np.asarray(Y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit MixedNB on an empty dataset")
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                "X and Y disagree on the number of samples: "
                f"{X.shape[0]} != {Y.shape[0]}"
            )

        self.classes = np.unique(Y)
        self.n_features = X.shape[1]
        n_class = len(self.classes)
        self.class_priors = np.empty(shape=(n_class))
        self._gauss_idx, self._bern_idx = self._resolve_feature_indices(X)

        self._calculate_stuffs(X, Y)
        return self

    def _resolve_feature_indices(self, X):
        """Return ``(gauss_idx, bern_idx)`` as disjoint sorted integer arrays."""
        n_features = X.shape[1]
        if self.gauss_index is None and self.bernouli_index is None:
            # Nothing configured: infer the split from the training data.
            is_binary = np.isin(X, (0, 1)).all(axis=0)
            return np.flatnonzero(~is_binary), np.flatnonzero(is_binary)

        resolved = []
        for label, index in (("gauss_index", self.gauss_index),
                             ("bernouli_index", self.bernouli_index)):
            idx = np.unique(np.asarray([] if index is None else list(index), dtype=int))
            if idx.size and (idx.min() < 0 or idx.max() >= n_features):
                raise ValueError(
                    f"{label} must contain column positions in [0, {n_features})"
                )
            resolved.append(idx)
        gauss_idx, bern_idx = resolved
        # Gaussian wins when a column is listed in both sets.
        return gauss_idx, np.setdiff1d(bern_idx, gauss_idx)

    def _calculate_stuffs(self, X, Y):
        """Fill ``class_priors``, ``P_ic``, ``means`` and ``vars`` from the data.

        Expects ``self.classes`` and ``self.class_priors`` to be initialised
        (done by ``fit``).
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y).ravel()
        n_class = len(self.classes)
        n_features = X.shape[1]

        P_ic = np.empty((n_class, n_features))
        means = np.empty((n_class, n_features))
        vars_ = np.empty((n_class, n_features))

        for k, c in enumerate(self.classes):
            # Boolean row mask: selects exactly the samples labelled c.
            Xc = X[Y == c]
            self.class_priors[k] = (len(Xc) + self.alpha) / (len(X) + self.alpha * n_class)
            # Smoothed Bernoulli estimate; two outcomes, hence 2 * alpha.
            P_ic[k] = (Xc.sum(axis=0) + self.alpha) / (len(Xc) + 2 * self.alpha)
            means[k] = Xc.mean(axis=0)
            vars_[k] = Xc.var(axis=0)

        var_floor = self._VAR_SMOOTHING * np.max(X.var(axis=0), initial=1.0)

        self.P_ic = P_ic
        self.means = means
        self.vars = vars_ + var_floor


    def _pdf_binomial(self, x, feat_probs_class):
        """Bernoulli pmf ``p**x * (1 - p)**(1 - x)`` for ``x`` in {0, 1}."""
        return (feat_probs_class ** x) * ((1 - feat_probs_class) ** (1 - x))
    

    def _pdf_gauss(self, x, mean, variance):
        """Density of the normal distribution N(mean, variance) at ``x``."""
        exponent = np.exp(-((x - mean) ** 2) / (2 * variance))
        return (1 / np.sqrt(2 * np.pi * variance)) * exponent
    
    
    def _get_joint_probs(self, X):
        """Return the joint log-probability log P(class) + sum_j log P(x_j | class).

        Parameters
        ----------
        X : array-like or DataFrame, shape (n_samples, n_features)

        Returns
        -------
        ndarray, shape (n_samples, n_classes)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D with the number of columns seen in ``fit``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.n_features:
            raise ValueError(
                f"X must have shape (n_samples, {self.n_features}), got {X.shape}"
            )

        # Every row starts from the log prior of each class.
        log_joint = np.tile(np.log(self.class_priors), (X.shape[0], 1))

        if self._gauss_idx.size:
            x_gauss = X[:, self._gauss_idx]
            for k in range(len(self.class_priors)):
                mean = self.means[k, self._gauss_idx]
                var = self.vars[k, self._gauss_idx]
                # Gaussian log-density computed directly (no exp then log),
                # so far-away points do not underflow to log(0).
                log_pdf = -0.5 * (np.log(2 * np.pi * var) + (x_gauss - mean) ** 2 / var)
                log_joint[:, k] += log_pdf.sum(axis=1)

        if self._bern_idx.size:
            x_bern = X[:, self._bern_idx]
            p = np.clip(self.P_ic[:, self._bern_idx], self._PROB_EPS, 1 - self._PROB_EPS)
            # x*log(p) + (1-x)*log(1-p), summed over features, for all classes.
            log_joint += x_bern @ np.log(p).T + (1 - x_bern) @ np.log(1 - p).T

        return log_joint
    
    
    def predict(self, X):
        """Return a list with the most probable class label for each row of ``X``."""
        probs = self._get_joint_probs(X)
        return list(self.classes[np.argmax(probs, axis=1)])

In [5]:
# Columns that contain only 0/1 are modelled as Bernoulli features, every
# other column as Gaussian.
binary_mask = X_train.isin([0, 1]).all().to_numpy()

# Pass the feature split explicitly instead of relying on the constructor
# defaults (both index arguments default to None).
test = MixedNB(
    gauss_index=np.flatnonzero(~binary_mask).tolist(),
    bernouli_index=np.flatnonzero(binary_mask).tolist(),
)

# Labels are passed as a 1-D Series so each class selects rows only; the
# validation features are passed as a plain array so samples (rows) are
# iterated rather than column labels.
test.fit(X_train, y_train['damage_grade'])
valid_preds = np.asarray(test.predict(X_valid.to_numpy()))

# Show a short preview and a summary metric instead of dumping every prediction.
print(valid_preds[:20])
print("validation accuracy:", (valid_preds == y_valid['damage_grade'].to_numpy()).mean())

: 