In [5]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn import metrics

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from cluster_data_creation import data_for_cluster, cluster_extract

In [6]:
# Notebook-level dataset constants: 21 feature columns, 3 target classes.
feat_num, classes = 21, 3

# load data from df to arrays to train and test

def xls2df(df, N, split=0.8, seed=None):
    """Turn a spreadsheet-style DataFrame into shuffled train/test arrays.

    Cells are looked up by label (``df[column][row]``): columns ``1..21``
    are read as features, column ``23`` as the class label, and rows
    ``1..N`` as the samples. Row ``0`` is not read here (the earlier version
    of this function treated it as the row of feature names).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from. Only this argument is consulted; the function no
        longer depends on a notebook-global ``data`` variable.
    N : int
        Number of sample rows to read (row labels ``1..N``).
    split : float, default 0.8
        Fraction of the ``N`` samples assigned to the training set; the
        training size is ``int(N * split)`` and the remainder is the test set.
    seed : int or None, default None
        When ``None`` the shuffle draws from NumPy's global random state
        (the original behaviour). When given, a private
        ``numpy.random.RandomState(seed)`` is used so the split is
        reproducible and the global state is left untouched.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_data, train_labels, test_data, test_labels)``.
    """
    feat_num = 21
    label_col = 23

    # Fetch each column once instead of once per cell.
    feature_cols = [df[i] for i in range(1, feat_num + 1)]
    label_series = df[label_col]

    # One row per sample, one entry per feature.
    all_data = np.array([[col[j] for col in feature_cols]
                         for j in range(1, N + 1)])
    all_labels = np.array([label_series[j] for j in range(1, N + 1)])

    # Shuffle so that samples of one class are not huddled together.
    idxs = list(range(N))
    if seed is None:
        np.random.shuffle(idxs)
    else:
        np.random.RandomState(seed).shuffle(idxs)
    all_data = all_data[idxs]
    all_labels = all_labels[idxs]

    # The first `split` fraction is the training set, the rest is the test
    # set (80:20 train:test with the default).
    train_num = int(N * split)

    train_data, test_data = all_data[:train_num], all_data[train_num:]
    train_labels, test_labels = all_labels[:train_num], all_labels[train_num:]

    return train_data, train_labels, test_data, test_labels

# Test on Original Dataset

In [36]:
# Open the CTG workbook and read its 'Data' sheet into a DataFrame.
xls = pd.ExcelFile('../CTG.xls')
data = pd.read_excel(xls, sheet_name='Data')

# Shuffle and split 2126 sample rows using xls2df's default train fraction.
x_train, y_train, x_test, y_test = xls2df(df=data, N=2126)

In [50]:
# Fit Gaussian naive Bayes on the training split, then predict the held-out split.
model = GaussianNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Report accuracy and the macro-averaged F-score on the test split.
print("Accuracy of Guassian NB:", accuracy_score(y_test, y_pred))
print("F-score of Guassian NB:", f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

Accuracy of Guassian NB: 0.8145539906103286
F-score of Guassian NB: 0.730946248600224


In [51]:
# Fit Bernoulli naive Bayes on the training split, then predict the held-out split.
# NOTE(review): BernoulliNB binarizes inputs at its default threshold — confirm
# that this is appropriate for these features.
model = BernoulliNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Report accuracy and the macro-averaged F-score on the test split.
print("Accuracy of Bernoulli NB:", accuracy_score(y_test, y_pred))
print("F-score of Bernoulli NB:", f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

Accuracy of Bernoulli NB: 0.852112676056338
F-score of Bernoulli NB: 0.7432343263489369


# Test on Artificial Dataset

In [7]:
# Open the artificial-data workbook and read 'Sheet1' into a DataFrame.
xls = pd.ExcelFile('../data_creation/artificial_FINAL.xlsx')
data = pd.read_excel(xls, sheet_name='Sheet1')

# Shuffle and split 5400 sample rows using xls2df's default train fraction.
x_train, y_train, x_test, y_test = xls2df(df=data, N=5400)

In [8]:
# Fit Gaussian naive Bayes on the training split, then predict the held-out split.
model = GaussianNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Report accuracy and the macro-averaged F-score on the test split.
print("Accuracy of Guassian NB:", accuracy_score(y_test, y_pred))
print("F-score of Guassian NB:", f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

Accuracy of Guassian NB: 0.8842592592592593
F-score of Guassian NB: 0.8834033460075634


In [9]:
# Fit Bernoulli naive Bayes on the training split, then predict the held-out split.
# NOTE(review): BernoulliNB binarizes inputs at its default threshold — confirm
# that this is appropriate for these features.
model = BernoulliNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Report accuracy and the macro-averaged F-score on the test split.
print("Accuracy of Bernoulli NB:", accuracy_score(y_test, y_pred))
print("F-score of Bernoulli NB:", f1_score(y_true=y_test, y_pred=y_pred, average='macro'))

Accuracy of Bernoulli NB: 0.8851851851851852
F-score of Bernoulli NB: 0.8878249441664946
