In [10]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [11]:
# Load the full data set from the CSV file that sits next to the notebook.
data = pd.read_csv('caesarian.csv')

# Name of the decision (class label) column.
decision_column_name = 'Caesarian'

# Features listed here are treated as non-categorical; every other feature
# is handled through a per-value probability table.
non_categorical = ['Age']

# Module-level state that the helper functions fill in while the model is
# being built: per-class row counts and per-feature probability tables.
tables = {}
count_of_yes_no = []

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
train, test = train_test_split(data, random_state=10, test_size=0.20)
print(test)

    Age  Delivery  Delivery1  Blood  Heart  Caesarian
41   19         1          0      1      0          1
46   26         1          0      1      0          0
26   18         1          1      2      1          1
32   32         2          0      2      1          1
35   28         3          0      2      0          1
65   35         2          0      1      0          1
21   33         2          0      0      1          1
3    28         1          0      2      0          0
37   31         2          2      1      0          0
19   24         1          2      0      1          1
44   36         4          0      2      1          1
34   26         2          2      1      0          1
43   22         1          0      1      0          1
74   38         3          2      2      1          1
2    26         2          1      1      0          0
6    27         2          0      1      0          0


In [13]:
# X_of_test=test.iloc[:,:-1].values  # X > features
# Y_actual_of_test=test.iloc[:,-1].values   # Y > decision column 
# Y_actual_of_train = train[decision_column_name]
# print(X_of_test)

In [14]:
# All support functions
def extract_features():
  """Return the feature column names of the global ``train`` frame.

  Every column except the last one is returned; the last column is left
  out (the notebook reads the decision values from that position).
  """
  column_names = train.columns.tolist()
  return column_names[:-1]

def find_unique(Y_actual_of_train):
  """Return the distinct values of a pandas Series as an ascending list."""
  distinct_values = list(Y_actual_of_train.unique())
  distinct_values.sort()
  return distinct_values

def filter_column(data, column, filter_value):
  """Return the rows of ``data`` where ``column`` equals ``filter_value``.

  ``column`` is the column itself (not its name); it is compared element-wise
  with ``filter_value`` and the resulting boolean mask selects rows of ``data``.
  """
  row_mask = (column == filter_value)
  return data[row_mask]

def prior_probability_calculator(Y_actual_of_train):
  """Compute the prior probability of every class in the training labels.

  Side effect: the module-level list ``count_of_yes_no`` is emptied and then
  refilled with the number of training rows per class, in the same (sorted)
  class order as the returned priors. Resetting it makes the function safe
  to call more than once; previously every call appended another set of
  counts, so stale values from an earlier run stayed at the front of the list.

  Args:
    Y_actual_of_train: decision column used to mask the global ``train``
      frame (it must be aligned with ``train``).

  Returns:
    A list with one prior per class, ordered like ``find_unique`` orders the
    classes. Each prior is the class row count divided by ``len(train)``.
  """
  classes = find_unique(Y_actual_of_train)
  # Clear in place so every other reference to the global list sees the reset.
  count_of_yes_no.clear()
  prior_probabilities = []
  for c in classes:
    class_size = len(filter_column(train, Y_actual_of_train, c))
    count_of_yes_no.append(class_size)
    prior_probabilities.append(class_size / len(train))
  return prior_probabilities

def is_feature_categorical(feature_name):
  """Return False when ``feature_name`` is listed in the global
  ``non_categorical`` list, True otherwise."""
  return not any(feature_name == name for name in non_categorical)


def make_table(feature_column, decision_column):
  """Build the conditional-probability table of one categorical feature.

  Reads the globals ``train``, ``decision_column_name`` and
  ``count_of_yes_no``; the latter must already hold the per-class row counts
  in sorted class order (``prior_probability_calculator`` fills it).

  Args:
    feature_column: the feature's column of ``train``.
    decision_column: the decision column of ``train``.

  Returns:
    A dict mapping each distinct feature value to a list
    ``[p(value | class_0), ..., p(value | class_k-1), n_rows_with_value]``
    where classes are in sorted order and each probability is the number of
    rows having both the value and the class divided by the class row count.
  """
  unique_attributes = find_unique(feature_column)
  decision_values = find_unique(decision_column)
  feat_table = {}
  for i in unique_attributes:
    feature_data = filter_column(train, feature_column, i)
    probablity_feature_data_with_decision = []
    # Look the class count up by *position*: count_of_yes_no is ordered like
    # decision_values, so indexing it with the label itself is only right
    # when the labels happen to be exactly 0..k-1.
    for position, j in enumerate(decision_values):
      feature_data_with_decision = filter_column(
          feature_data, feature_data[decision_column_name], j)
      probablity_feature_data_with_decision.append(
          len(feature_data_with_decision) / count_of_yes_no[position])
    # The trailing entry is the number of training rows with this value.
    probablity_feature_data_with_decision.append(len(feature_data))
    feat_table[i] = probablity_feature_data_with_decision
  return feat_table

def calculate_probablity(feat_name,feat_value,decision_value,decision_column):
  """Evaluate a normal density for ``feat_value`` within one class.

  The mean and standard deviation come from column ``feat_name`` of the
  rows of the global ``train`` frame whose ``decision_column`` entry equals
  ``decision_value``.
  """
  class_values = filter_column(train, decision_column, decision_value)[feat_name]
  mean = class_values.mean()
  std = class_values.std()
  normaliser = 1 / (np.sqrt(2 * np.pi) * std)
  exponent = -((feat_value - mean) ** 2 / (2 * std ** 2))
  return normaliser * np.exp(exponent)

In [15]:
def naiveBayes(train, test_features, Y_actual_of_train):
    """Predict a class for every row of ``test_features`` with naive Bayes.

    Categorical features are scored from per-feature probability tables,
    which are stored in the module-level ``tables`` dict as a side effect.
    Non-categorical features are scored by ``calculate_probablity``. A
    categorical value that has no table entry contributes a factor of 0.

    NOTE: the helper functions read the module-level ``train`` frame, so the
    ``train`` argument is expected to be that same frame.

    Args:
        train: training frame; feature columns are looked up by name.
        test_features: iterable of rows; each row is indexed by position in
            the order returned by ``extract_features``.
        Y_actual_of_train: decision column of the training frame.

    Returns:
        A list with one predicted class label per test row. Labels are the
        actual class values (not their positions), so the result is also
        correct when the classes are not numbered 0..k-1.
    """
    X_of_train = extract_features()
    # Sorted class labels; priors, table columns and posteriors all share
    # this ordering.
    classes = find_unique(Y_actual_of_train)
    prior_probabilities = prior_probability_calculator(Y_actual_of_train)

    for feature_name in X_of_train:
        if is_feature_categorical(feature_name):
            tables[feature_name] = make_table(train[feature_name], Y_actual_of_train)

    Y_pred_of_train = []
    for test_feature in test_features:
        posterior_probabilities = []
        for i, prior in enumerate(prior_probabilities):
            likelihood = 1
            for j, feature_name in enumerate(X_of_train):
                feature_value = test_feature[j]
                if is_feature_categorical(feature_name):
                    table_row = tables[feature_name].get(feature_value)
                    # A value absent from the table zeroes the likelihood.
                    likelihood *= table_row[i] if table_row is not None else 0
                else:
                    # Pass the class label itself, not its position.
                    likelihood *= calculate_probablity(
                        feature_name, feature_value, classes[i], Y_actual_of_train)
            posterior_probabilities.append(likelihood * prior)
        # Map the winning position back to its class label.
        Y_pred_of_train.append(classes[np.argmax(posterior_probabilities)])
    return Y_pred_of_train

In [16]:
# Training labels, taken by column name.
Y_actual_of_train = train[decision_column_name]

# Held-out rows: every column but the last is a feature, the last column
# holds the true decision.
X_of_test = test.iloc[:, :-1].values
Y_actual_of_test = test.iloc[:, -1].values

# NOTE: despite its name, this holds the predictions for the *test* rows.
Y_pred_of_train = naiveBayes(train, X_of_test, Y_actual_of_train)

# print(Y_actual_of_test, Y_pred_of_train)
# print(confusion_matrix(Y_actual_of_test, Y_pred_of_train))

print(accuracy_score(Y_actual_of_test, Y_pred_of_train))

0.75


In [17]:
# Manual confusion matrix for the held-out rows.
# Class 1 is the positive class, class 0 the negative class.
TP = 0
FP = 0
FN = 0
TN = 0

# Every branch compares against the *test* labels. Two branches used to read
# Y_actual_of_train[i], a label-indexed pandas Series from the training split,
# which compares against an unrelated training row or raises KeyError when
# label i is not in the training split.
for actual, predicted in zip(Y_actual_of_test, Y_pred_of_train):
    if actual == 1 and predicted == 1:
        TP += 1
    elif actual == 1 and predicted == 0:
        FN += 1
    elif actual == 0 and predicted == 1:
        FP += 1
    elif actual == 0 and predicted == 0:
        TN += 1


print("\nConfusion Matrix:" + "\ntrue positive: "+str(TP) + "\nfalse negative: "+str(FN) 
      + "\nfalse positive: "+str(FP) + "\ntrue negative: "+str(TN) )

print("\nAccuracy: " + str((TP+TN)/(TP+FN+FP+TN))) 


Confusion Matrix:
true positive: 8
false negative: 3
false positive: 1
true negative: 4

Accuracy: 0.75
