## Loading dataset

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer(as_frame=True)

In [3]:
dataset['data'].head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [10]:
dataset['target'].head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [11]:
dataset['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

## Pre-Processing

In [2]:
X = dataset['data']
y = np.array(dataset['target'])

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=0)

In [4]:
from sklearn.preprocessing import StandardScaler

ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)

ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)

In [5]:
print(X_train[0].shape)
print(X_train.shape)
print(type(y_train))

(30,)
(426, 30)
<class 'numpy.ndarray'>


In [5]:
A = np.array([[1,2,3],[4,5,6],[7,8,9],[10,11,12]])
y = np.array([0,1,1,0], dtype=np.int64)

B = np.zeros(tuple([len(set(y))]+list(A.shape[1:])))

for i in range(A.shape[0]):
    B[y[i]] += A[i]

print(B)

[[11. 13. 15.]
 [11. 13. 15.]]


## Fit into different classification models

In [5]:
import sys
sys.path.append("/home/joelm/ml")

from supervised.classification.full_bayes import BayesClassifer
from supervised.classification.naive_bayes import NBClassifer
from supervised.classification.k_means_classification import KMeansClassifier

kmeans = KMeansClassifier()
naive_bayes = NBClassifer()
full_bayes = BayesClassifer()



In [6]:
kmeans.fit(X_train, y_train)
y_pred_kmeans = kmeans.predict(X_test)

naive_bayes.fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)

full_bayes.fit(X_train, y_train)
y_pred_fb = full_bayes.predict(X_test)

  nll_j = -np.log(self.dist(X_test[i], self.mu[j], self.var[j]) * self.prior[j])
  nll_j = -np.log(self.dist(X_test[i], self.mu[j], self.cov[j]) * self.prior[j])


## Compare performance using metrics

In [7]:
from utils.binary_classification_metrics import *

performance_df = pd.DataFrame({'Model':[],'Accuracy':[],'Precision':[],'Recall':[],'F1Score':[]})

predictions = [y_pred_kmeans,y_pred_nb,y_pred_fb]
model = ['K-Means','Naive Bayes', 'Full Bayes']

for i in range(3):
    row = []
    row.append(model[i])
    row.append(Accuracy(y_test,predictions[i]))
    row.append(Precision(y_test,predictions[i]))
    row.append(Recall(y_test,predictions[i]))
    row.append(F1score(y_test,predictions[i]))
    
    performance_df.loc[i] = row


performance_df.head()


Unnamed: 0,Model,Accuracy,Precision,Recall,F1Score
0,K-Means,0.013986,0.011111,0.007143,0.008696
1,Naive Bayes,0.937063,0.955556,0.945055,0.950276
2,Full Bayes,0.909091,0.877778,0.975309,0.923977
