In [1]:
import pandas as pd
import numpy as np
import scipy.stats as s

In [2]:
class Gausian_NB():
    
    def __init__(self,features,labels,data_split_ratio,Apply_PCA,n_components):
        self.X = features
        self.Y = np.array(labels).reshape(labels.shape[0],1) 
        self.ratio = data_split_ratio
        self.n_components = n_components
        if Apply_PCA == True:
            self.X = self.pca(self.X, self.n_components)
            
            
    def pca(self,data, n_components):
        X = np.array(data)
        X_dash = X - X.mean(axis=0)
        cov = (1/X.shape[0]) * np.matmul(X_dash.T, X_dash)
        Q = np.linalg.svd(cov)[0]
        Q_tilda = Q[:,:n_components]
        X_new = np.matmul(X_dash, Q_tilda)
        data_new = pd.DataFrame(X_new)
        return data_new
    
    
    def split_data(self):
        data = pd.DataFrame(self.X)
        data['diagnosis'] = self.Y
        Data_pos = data[data['diagnosis'] == np.unique(self.Y)[0]]
        Data_neg = data[data['diagnosis'] == np.unique(self.Y)[1]]
        
        Training_data_count = int(data.shape[0]*self.ratio[0])
        CV_data_count = int(data.shape[0]*self.ratio[1])
        Test_data_count = data.shape[0] - (Training_data_count + CV_data_count)
        
        Training_data = pd.concat([Data_pos.iloc[:Training_data_count//2,:], Data_neg.iloc[:Training_data_count//2,:]])
        
        Remaing_data = pd.concat([Data_pos.iloc[Training_data_count//2:,:],Data_neg.iloc[Training_data_count//2:,:]])
        
        Random_number = np.random.choice(np.arange(0,171), size=171, replace=False)
        Remaing_data = Remaing_data.iloc[Random_number]
        
        CV_data = Remaing_data.iloc[:CV_data_count,:]
        
        Testing_data = Remaing_data.iloc[CV_data_count:,:]
        
        return Training_data, CV_data, Testing_data
    
    
    def fit(self,data):
        training_data_pos = data[data['diagnosis'] == np.unique(self.Y)[0]].iloc[:,:-1]
        training_data_neg = data[data['diagnosis'] == np.unique(self.Y)[0]].iloc[:,:-1]
        self.mean = np.array([training_data_pos.mean(axis=0), training_data_neg.mean(axis=0)])
        self.cov = np.array([training_data_pos.cov(), training_data_neg.cov()])
        self.prior = np.array([training_data_pos.shape[0]/data.shape[0], training_data_neg.shape[0]/data.shape[0]])
#         print('Mean :- ', self.mean, '\nCOV :- ', self.cov, '\nPrior :- ', self.prior)

        
    def evaluate(self, data):
        posterior_m = np.array(s.multivariate_normal.pdf(data.iloc[:,:-1], self.mean[0], self.cov[0]) * self.prior[0])
        posterior_b = np.array(s.multivariate_normal.pdf(data.iloc[:,:-1], self.mean[1], self.cov[1]) * self.prior[1])
#         print(posterior_m, posterior_b)
        Boolean_mask = pd.Series(posterior_b > posterior_m)
        predicted = np.array(Boolean_mask.replace(to_replace=[True, False], value=['B', 'M']))
        self.performance(predicted, data.iloc[:,-1])
        print('\nPredicted = ', predicted, '\n\nActual =',np.array(data.iloc[:,-1]))
    
        
    def performance(self, actual,predicted,):

        TP = np.count_nonzero((predicted == 'M') & (actual == 'M'))
        TN = np.count_nonzero((predicted == 'B') & (actual == 'B'))
        FP = np.count_nonzero((predicted == 'M') & (actual == 'B'))
        FN = np.count_nonzero((predicted == 'B') & (actual == 'M'))

        if (TP+TN+FP+FN) == 0:
            Accuracy = 0
        else:
            Accuracy = (TP+TN)/(TP+TN+FP+FN)

        if (TP+FP) == 0:
            Precision = 0
        else:
            Precision = TP/(TP+FP)

        if (TP+FN) == 0:
            Recall = 0
        else:
            Recall = TP/(TP+FN)

        if (Precision+Recall) == 0:
            F1_Score = 0
        else:
            F1_Score = (2*Precision*Recall)/(Precision+Recall)

        print(f' Accuracy = {Accuracy} \n Precision = {Precision} \n Recall = {Recall} \n F1-Score = {F1_Score}')

        return {'Accuracy':Accuracy, 'Precision':Precision, 'Recall':Recall, 'F1-Score':F1_Score}

In [3]:
data = pd.read_csv('data.csv')

In [4]:
data.drop([data.columns[0], data.columns[-1]], inplace=True, axis=1)

In [5]:
clf = Gausian_NB(data.iloc[:,1:], data.iloc[:,0], (0.7, 0.2, 0.1), True, 15)

In [6]:
Train, CV, Test = clf.split_data()

In [7]:
clf.fit(Train)

In [8]:
clf.evaluate(CV)

 Accuracy = 0.061946902654867256 
 Precision = 1.0 
 Recall = 0.061946902654867256 
 F1-Score = 0.11666666666666665

Predicted =  ['M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'
 'M' 'M' 'M' 'M' 'M'] 

Actual = ['B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B'
 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' '