In [1]:
import numpy as np
import pandas as pd
import warnings
import csv
from itertools import groupby

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import recall_score as rec, precision_score as pre, f1_score as f1, accuracy_score as acc

In [2]:
# Load the labelled signatures (tab-separated) and the per-amino-acid z scores
# (space-separated). Later cells read the 'sub' and 'sig' columns of the first
# table and the 'aa_short', 'z1', 'z2', 'z3' columns of the second.
csv_name = "../data/labeled_sigs"
z_score_csv_name = "../data/z_score_aa"
df_init = pd.read_csv(csv_name, sep='\t')
z_scores_df = pd.read_csv(z_score_csv_name, sep=' ')

In [3]:
# Display the raw table exactly as read from csv_name (before any filtering).
df_init

Unnamed: 0,sub,sig,seq
0,pro,LWHAFDVAAQESYAAQAGEHNHYGPAETHVMTGI,DVQYAAHVMK
1,phe,VPLAFDAALWELTLVVAGETNAYGPTEAAVCTTI,DAWTVAAVCK
2,lys,YDHWFDAAWQPADTALGGEFNCYGPTETTVEAVV,DAQDAGCVEK
3,pro,LWHTFDVAAQEAYAAQAGEHNHYGPAETHVMTGT,DVQYAAHVMK
4,phe,TAQAFDAAVWESALIVAGDVNAYGLTETTVCATM,DAWAIAAVCK
...,...,...,...
449,leu,LWHAFDASVWEPFILTGGDVNNYGPTENTVVTTS,DAWFLGNVVK
450,orn,AGWAFDVFAGDREFVVGSDINSYGLSEATIDSTY,DVGEVGSIDK
451,pip,LWQAFDISLQESFVSQAGEHNHYGPSEAHVVTSY,DIQFSAHVVK
452,betaKala,RWMTFVDHVAESVLFCSGEFNLYGSSEVAADVTC,VDAVFSLADK


In [4]:
# Drop rows whose signature ('sig') already appeared earlier in the table.
# drop_duplicates keeps the first occurrence by default, so when the same
# signature is listed more than once only the first row (and its 'sub' label)
# is retained.
df_original = df_init.drop_duplicates(subset='sig')
print("Reduced data size from ",len(df_init)," to ", len(df_original), " due to signature duplicates")

Reduced data size from  454  to  350  due to signature duplicates


In [10]:
# Exploratory cell: it builds a GroupBy over 'sig' but neither aggregates nor
# assigns it, so its only effect is to display the GroupBy object's repr.
# NOTE(review): nothing below reads a result from this cell; the commented-out
# lines look like earlier experiments -- confirm and consider removing the cell.
#pd.DataFrame(df_init[['sub', 'sig']].groupby('sig').last())#.sort_values(['sig'])
df_init[['sub', 'sig']].groupby('sig')
#df_init.drop_duplicates(subset='sig')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EE027E4748>

In [6]:
# Display the table after duplicate signatures were dropped.
df_original

Unnamed: 0,sub,sig,seq
0,pro,LWHAFDVAAQESYAAQAGEHNHYGPAETHVMTGI,DVQYAAHVMK
1,phe,VPLAFDAALWELTLVVAGETNAYGPTEAAVCTTI,DAWTVAAVCK
2,lys,YDHWFDAAWQPADTALGGEFNCYGPTETTVEAVV,DAQDAGCVEK
3,pro,LWHTFDVAAQEAYAAQAGEHNHYGPAETHVMTGT,DVQYAAHVMK
4,phe,TAQAFDAAVWESALIVAGDVNAYGLTETTVCATM,DAWAIAAVCK
...,...,...,...
449,leu,LWHAFDASVWEPFILTGGDVNNYGPTENTVVTTS,DAWFLGNVVK
450,orn,AGWAFDVFAGDREFVVGSDINSYGLSEATIDSTY,DVGEVGSIDK
451,pip,LWQAFDISLQESFVSQAGEHNHYGPSEAHVVTSY,DIQFSAHVVK
452,betaKala,RWMTFVDHVAESVLFCSGEFNLYGSSEVAADVTC,VDAVFSLADK


In [7]:
# Allocate the property table: nrow property rows by m per-letter slots, all zero.
m = 26
nrow = 12
D = np.zeros((nrow, m))

# D1..D12 are row slices (views) of D, so assigning into Dk[j] writes D[k-1, j].
D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12 = (
    D[row_idx, :] for row_idx in range(nrow)
)

In [8]:
# Hard coded values, taken from NRPSPredictor2 github
#
# Layout: each Dk is one row of D (a row slice, so these assignments write
# straight into D) holding a single per-residue property; the property source
# file is named in the comment above each row. The slot index is the letter's
# position in the alphabet (A=0 ... Z=25), which is the indexing aa2ind_map
# uses. Slots 1, 9, 14, 20, 23 and 25 are 0.000 in every row and are not
# referenced by aa2ind_map.

#1 aa-alpha-helix.aaindex
D1[0]=1.420;D1[1]=0.000;D1[2]=0.700;D1[3]=1.010;D1[4]=1.510;D1[5]=1.130;D1[6]=0.570;D1[7]=1.000;D1[8]=1.080;D1[9]=0.000;D1[10]=1.160;D1[11]=1.210;D1[12]=1.450;D1[13]=0.670;D1[14]=0.000;D1[15]=0.570;D1[16]=1.110;D1[17]=0.980;D1[18]=0.770;D1[19]=0.830;D1[20]=0.000;D1[21]=1.060;D1[22]=1.080;D1[23]=0.000;D1[24]=0.690;D1[25]=0.000;
#2 aa-beta-sheet.aaindex
D2[0]=0.830;D2[1]=0.000;D2[2]=1.190;D2[3]=0.540;D2[4]=0.370;D2[5]=1.380;D2[6]=0.750;D2[7]=0.870;D2[8]=1.600;D2[9]=0.000;D2[10]=0.740;D2[11]=1.300;D2[12]=1.050;D2[13]=0.890;D2[14]=0.000;D2[15]=0.550;D2[16]=1.100;D2[17]=0.930;D2[18]=0.750;D2[19]=1.190;D2[20]=0.000;D2[21]=1.700;D2[22]=1.370;D2[23]=0.000;D2[24]=1.470;D2[25]=0.000;
#3 aa-beta-turn.aaindex
D3[0]=0.740;D3[1]=0.000;D3[2]=0.960;D3[3]=1.520;D3[4]=0.950;D3[5]=0.660;D3[6]=1.560;D3[7]=0.950;D3[8]=0.470;D3[9]=0.000;D3[10]=1.190;D3[11]=0.500;D3[12]=0.600;D3[13]=1.460;D3[14]=0.000;D3[15]=1.560;D3[16]=0.960;D3[17]=1.010;D3[18]=1.430;D3[19]=0.980;D3[20]=0.000;D3[21]=0.590;D3[22]=0.600;D3[23]=0.000;D3[24]=1.140;D3[25]=0.000;
#4 aa-hydrogenbond.aaindex
D4[0]=0.000;D4[1]=0.000;D4[2]=0.000;D4[3]=1.000;D4[4]=1.000;D4[5]=0.000;D4[6]=0.000;D4[7]=1.000;D4[8]=0.000;D4[9]=0.000;D4[10]=2.000;D4[11]=0.000;D4[12]=0.000;D4[13]=2.000;D4[14]=0.000;D4[15]=0.000;D4[16]=2.000;D4[17]=4.000;D4[18]=1.000;D4[19]=1.000;D4[20]=0.000;D4[21]=0.000;D4[22]=1.000;D4[23]=0.000;D4[24]=1.000;D4[25]=0.000;
#5 aa-hydrophobicity-neu1.aaindex
D5[0]=0.060;D5[1]=0.000;D5[2]=-0.560;D5[3]=0.970;D5[4]=0.850;D5[5]=-0.990;D5[6]=0.320;D5[7]=0.150;D5[8]=-1.000;D5[9]=0.000;D5[10]=1.000;D5[11]=-0.830;D5[12]=-0.680;D5[13]=0.700;D5[14]=0.000;D5[15]=0.450;D5[16]=0.710;D5[17]=0.800;D5[18]=0.480;D5[19]=0.380;D5[20]=0.000;D5[21]=-0.750;D5[22]=-0.570;D5[23]=0.000;D5[24]=-0.350;D5[25]=0.000;
#6 aa-hydrophobicity-neu2.aaindex
D6[0]=-0.250;D6[1]=0.000;D6[2]=-0.400;D6[3]=-0.080;D6[4]=-0.100;D6[5]=0.180;D6[6]=-0.320;D6[7]=-0.030;D6[8]=-0.030;D6[9]=0.000;D6[10]=0.320;D6[11]=0.050;D6[12]=-0.010;D6[13]=-0.060;D6[14]=0.000;D6[15]=0.230;D6[16]=-0.020;D6[17]=0.190;D6[18]=-0.150;D6[19]=-0.100;D6[20]=0.000;D6[21]=-0.190;D6[22]=0.310;D6[23]=0.000;D6[24]=0.400;D6[25]=0.000;
#7 aa-hydrophobicity-neu3.aaindex
D7[0]=0.250;D7[1]=0.000;D7[2]=-0.140;D7[3]=0.080;D7[4]=-0.050;D7[5]=0.150;D7[6]=0.280;D7[7]=-0.100;D7[8]=0.100;D7[9]=0.000;D7[10]=0.110;D7[11]=0.010;D7[12]=0.040;D7[13]=0.170;D7[14]=0.000;D7[15]=0.410;D7[16]=0.120;D7[17]=-0.410;D7[18]=0.230;D7[19]=0.290;D7[20]=0.000;D7[21]=0.030;D7[22]=0.340;D7[23]=0.000;D7[24]=-0.020;D7[25]=0.000;
#8 aa-isoelectric.aaindex
D8[0]=6.000;D8[1]=0.000;D8[2]=5.050;D8[3]=2.770;D8[4]=3.220;D8[5]=5.480;D8[6]=5.970;D8[7]=7.590;D8[8]=6.020;D8[9]=0.000;D8[10]=9.740;D8[11]=5.980;D8[12]=5.740;D8[13]=5.410;D8[14]=0.000;D8[15]=6.300;D8[16]=5.650;D8[17]=10.760;D8[18]=5.680;D8[19]=5.660;D8[20]=0.000;D8[21]=5.960;D8[22]=5.890;D8[23]=0.000;D8[24]=5.660;D8[25]=0.000;
#9 aa-polar-grantham.aaindex
D9[0]=8.100;D9[1]=0.000;D9[2]=5.500;D9[3]=13.000;D9[4]=12.300;D9[5]=5.200;D9[6]=9.000;D9[7]=10.400;D9[8]=5.200;D9[9]=0.000;D9[10]=11.300;D9[11]=4.900;D9[12]=5.700;D9[13]=11.600;D9[14]=0.000;D9[15]=8.000;D9[16]=10.500;D9[17]=10.500;D9[18]=9.200;D9[19]=8.600;D9[20]=0.000;D9[21]=5.900;D9[22]=5.400;D9[23]=0.000;D9[24]=6.200;D9[25]=0.000;
#10 aa-polar-radzicka.aaindex
D10[0]=-0.060;D10[1]=0.000;D10[2]=1.360;D10[3]=-0.800;D10[4]=-0.770;D10[5]=1.270;D10[6]=-0.410;D10[7]=0.490;D10[8]=1.310;D10[9]=0.000;D10[10]=-1.180;D10[11]=1.210;D10[12]=1.270;D10[13]=-0.480;D10[14]=0.000;D10[15]=1.100;D10[16]=-0.730;D10[17]=-0.840;D10[18]=-0.500;D10[19]=-0.270;D10[20]=0.000;D10[21]=1.090;D10[22]=0.880;D10[23]=0.000;D10[24]=0.330;D10[25]=0.000;
#11 aa-polar-zimmerman.aaindex
D11[0]=0.000;D11[1]=0.000;D11[2]=1.480;D11[3]=49.700;D11[4]=49.900;D11[5]=0.350;D11[6]=0.000;D11[7]=51.600;D11[8]=0.130;D11[9]=0.000;D11[10]=49.500;D11[11]=0.130;D11[12]=1.430;D11[13]=3.380;D11[14]=0.000;D11[15]=1.580;D11[16]=3.530;D11[17]=52.000;D11[18]=1.670;D11[19]=1.660;D11[20]=0.000;D11[21]=0.130;D11[22]=2.100;D11[23]=0.000;D11[24]=1.610;D11[25]=0.000;
#12 aa-volume.aaindex
D12[0]=90.000;D12[1]=0.000;D12[2]=103.300;D12[3]=117.300;D12[4]=142.200;D12[5]=191.900;D12[6]=64.900;D12[7]=160.000;D12[8]=163.900;D12[9]=0.000;D12[10]=167.300;D12[11]=164.000;D12[12]=167.000;D12[13]=124.700;D12[14]=0.000;D12[15]=122.900;D12[16]=149.400;D12[17]=194.000;D12[18]=95.400;D12[19]=121.500;D12[20]=0.000;D12[21]=139.000;D12[22]=228.200;D12[23]=0.000;D12[24]=197.000;D12[25]=0.000;



In [9]:
# One-letter amino-acid code -> column index into D[i, :].
# Every index equals the letter's position in the alphabet (A=0 ... Z=25).
aa2ind_map = {
    'A': 0,
    'R': 17,
    'D': 3,
    'N': 13,
    'C': 2,
    'E': 4,
    'Q': 16,
    'G': 6,
    'H': 7,
    'I': 8,
    'L': 11,
    'K': 10,
    'M': 12,
    'F': 5,
    'P': 15,
    'S': 18,
    'T': 19,
    'W': 22,
    'Y': 24,
    'V': 21,
}

In [10]:
# Per-amino-acid feature vector: the three z scores followed by that residue's
# column of D (one value per property row), 15 numbers in total.
aa_dict = {}
for _, z_row in z_scores_df.iterrows():
    residue = z_row['aa_short']
    features = [z_row['z1'], z_row['z2'], z_row['z3']]  # z values first
    features.extend(D[:, aa2ind_map[residue]])  # then the 12 hard-coded properties
    aa_dict[residue] = features


In [11]:
# Build position -> value and value -> position lookups for a list
def get_dict_from_list(mylist):
    """Return ``(idx_to_value, value_to_idx)`` dictionaries for ``mylist``.

    ``idx_to_value`` maps each list position to its element; ``value_to_idx``
    maps each element back to its position (the last position wins if an
    element is repeated). An empty list trips the assertion.
    """
    assert len(mylist) > 0
    idx_to_value_dict = dict(enumerate(mylist))
    value_to_idx_dict = {value: position for position, value in enumerate(mylist)}
    return idx_to_value_dict, value_to_idx_dict

# Get index-value maps for substrates
# The distinct labels are sorted before indices are assigned: iterating a set
# of strings directly gives an order that can differ between interpreter
# sessions, which would make the label ids irreproducible from run to run.
# (Assumes every 'sub' value is a string, so the labels are mutually comparable.)
sub_vocab_set = set(df_original['sub'].tolist())
sub_vocab_list = sorted(sub_vocab_set)
sub_idx_to_value_dict, sub_value_to_idx_dict = get_dict_from_list(sub_vocab_list)

In [12]:
# Keep only rows whose substrate label occurs more than once in df_original;
# a substrate with a single example is dropped entirely.
substrate_groups = df_original.groupby('sub')
df_cnt_filtered = substrate_groups.filter(lambda group: len(group) > 1)
print("Reduced data size from ",len(df_original)," to ", len(df_cnt_filtered), " due to count filtering")

Reduced data size from  350  to  326  due to count filtering


In [13]:
# Re-Create substrate index-value maps after removing the substrates with count 1
# The distinct labels are sorted before indices are assigned: iterating a set
# of strings directly gives an order that can differ between interpreter
# sessions, which would make the integer class ids irreproducible from run to run.
# (Assumes every 'sub' value is a string, so the labels are mutually comparable.)

sub_vocab_set = set(df_cnt_filtered['sub'].tolist())
sub_vocab_list = sorted(sub_vocab_set)
sub_idx_to_value_dict, sub_value_to_idx_dict = get_dict_from_list(sub_vocab_list)

In [14]:
# Display the table after single-occurrence substrates were removed.
df_cnt_filtered

Unnamed: 0,sub,sig,seq
0,pro,LWHAFDVAAQESYAAQAGEHNHYGPAETHVMTGI,DVQYAAHVMK
1,phe,VPLAFDAALWELTLVVAGETNAYGPTEAAVCTTI,DAWTVAAVCK
2,lys,YDHWFDAAWQPADTALGGEFNCYGPTETTVEAVV,DAQDAGCVEK
3,pro,LWHTFDVAAQEAYAAQAGEHNHYGPAETHVMTGT,DVQYAAHVMK
4,phe,TAQAFDAAVWESALIVAGDVNAYGLTETTVCATM,DAWAIAAVCK
...,...,...,...
446,gly,FSMTFDIAGLELQALCGGEWNLYGPTETTIWSTV,DILQLGLIWK
448,ala,MWSTFDLSVFELNTNLAGEYNLYGPSEATTYSTS,DLFNNALTYK
449,leu,LWHAFDASVWEPFILTGGDVNNYGPTENTVVTTS,DAWFLGNVVK
450,orn,AGWAFDVFAGDREFVVGSDINSYGLSEATIDSTY,DVGEVGSIDK


In [15]:
# Given a signature of amino acids (34 aa long), construct the 34x15=510 dimensional representation
def get_encoding(signature):
    """Concatenate the per-residue feature vectors of ``signature`` into one array.

    Each character of ``signature`` is looked up in ``aa_dict`` and the
    resulting vectors are laid end to end in signature order. A character
    missing from ``aa_dict`` raises ``KeyError``.
    """
    flattened = [value for residue in signature for value in aa_dict[residue]]
    return np.asarray(flattened)


In [16]:
# Format the data
# Produces three row-aligned arrays from df_cnt_filtered:
#   raw_data - the signature strings (used later for Hamming distances)
#   data_np  - one encoded feature vector per signature
#   label_np - the integer class id of each row's substrate
sig_len = 34
signatures = df_cnt_filtered['sig'].tolist()
substrates = df_cnt_filtered['sub'].tolist()

# Every signature must have the expected fixed width before it is encoded.
for signature in signatures:
    assert len(signature) == sig_len

data_np = np.asarray([get_encoding(signature) for signature in signatures])
label_np = np.asarray([sub_value_to_idx_dict[substrate] for substrate in substrates]).astype('int')
raw_data = np.array(signatures)

In [17]:
def get_hamming_distance(str1, str2):
    """Count the positions at which ``str1`` and ``str2`` differ.

    The two inputs are walked in parallel with ``zip``, so when their lengths
    differ only the common prefix length is compared.
    """
    return sum(1 for left, right in zip(str1, str2) if left != right)

# For every test point, find its smallest Hamming distance to any training point
def get_hamming_distance_bucket_info(test_data, train_data):
    """Return an array with one entry per element of ``test_data``.

    Entry ``k`` is the minimum of ``get_hamming_distance`` between
    ``test_data[k]`` and every element of ``train_data``. ``min`` raises
    ``ValueError`` if ``train_data`` is empty while ``test_data`` is not.
    """
    nearest_distances = [
        min(get_hamming_distance(test_sig, train_sig) for train_sig in train_data)
        for test_sig in test_data
    ]
    return np.array(nearest_distances)

def round_dec(num, dec=2):
    """Round ``num`` to ``dec`` decimal places and return a float.

    The value is scaled up by ``10**dec``, rounded to the nearest integer with
    the built-in ``round``, then scaled back down.
    """
    scaled = round(num * 10.**dec)
    return float(scaled) / (10**dec)

In [18]:
# Registry of sklearn multiclass classifiers, keyed by the short name that is
# passed to train_and_validate and embedded in the result file names.
# Key order matters: it is the order in which the classifiers are evaluated.
clf_dict = {
    'lr': LogisticRegression(random_state=0, max_iter=400, multi_class='multinomial', solver='newton-cg'),
    'svm': make_pipeline(
        StandardScaler(),
        LinearSVC(random_state=0, multi_class='crammer_singer', tol=1e-9, max_iter=2000),
    ),
    'knn': KNeighborsClassifier(weights='distance'),
    'mlp_sklearn': MLPClassifier(random_state=1, max_iter=400, early_stopping=False),
    'rand_for': RandomForestClassifier(max_depth=4, criterion='entropy'),
    'dec_tree': DecisionTreeClassifier(random_state=0, criterion='entropy'),
    'ber_nb': BernoulliNB(),
    'xtra_tree': ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini'),
    'gau_nb': GaussianNB(),
    'label_prop': LabelPropagation(kernel='knn'),
    'label_spread': LabelSpreading(kernel='knn'),
    'lda': LinearDiscriminantAnalysis(),
    'ridge_cv': RidgeClassifierCV(),
    'n_cent': NearestCentroid(),
    'ridge': RidgeClassifier(),
}

In [19]:
def print_stats(metric_list, metric_list_bucket, n_bucket):
    """Print the averaged metrics, first overall and then bucket by bucket.

    ``metric_list`` holds one dict per test percentage with scalar 'pct',
    'acc', 'rec', 'pre' and 'f1' entries. ``metric_list_bucket`` has the same
    keys, but every metric is a sequence indexed by bucket. Bucket headings
    are printed 1-based (index 0 is shown as "Bucket 1").
    """
    line_template = (
        "Test percentage: {pct:.2f} Average Accuracy: {acc:.3f} "
        "Average Recall: {rec:.3f} Average Precision: {pre:.3f} "
        "Average F1 Score: {f1:.3f}"
    )

    print("\n\nOverall Stats:")
    for entry in metric_list:
        print(line_template.format(
            pct=entry['pct'], acc=entry['acc'], rec=entry['rec'],
            pre=entry['pre'], f1=entry['f1'],
        ))

    for bucket_idx in range(n_bucket):
        print("\n\nBucket {} Stats:".format(bucket_idx + 1))
        for entry in metric_list_bucket:
            print(line_template.format(
                pct=entry['pct'],
                acc=entry['acc'][bucket_idx],
                rec=entry['rec'][bucket_idx],
                pre=entry['pre'][bucket_idx],
                f1=entry['f1'][bucket_idx],
            ))

def save_stats(metric_list, metric_list_bucket, n_bucket, pct_list, csv_name='stats'):
    """Write overall and per-bucket accuracy to ``csv_name + '.csv'``.

    One row is written per entry of ``metric_list_bucket``; its overall
    accuracy is looked up in ``metric_list`` by the shared 'pct' value. All
    accuracies are rounded with ``round_dec``. Bucket columns are named
    'Bucket 0' .. 'Bucket n_bucket-1'. ``pct_list`` is accepted but not read.
    """
    overall_acc_by_pct = {entry['pct']: entry['acc'] for entry in metric_list}

    with open(csv_name + '.csv', 'w', newline='') as csvfile:
        bucket_columns = ['Bucket ' + str(idx) for idx in range(n_bucket)]
        writer = csv.DictWriter(csvfile, fieldnames=['pct', 'overall_acc'] + bucket_columns)
        writer.writeheader()

        for entry in metric_list_bucket:
            record = {
                'pct': entry['pct'],
                'overall_acc': round_dec(overall_acc_by_pct[entry['pct']]),
            }
            for idx, column in enumerate(bucket_columns):
                record[column] = round_dec(entry['acc'][idx])
            writer.writerow(record)

# n_iter signifies number of iterations
# test_percentage_list is the list of percentages of test data with respect to total data
# Code will iterate for each percentage for n_iter iterations
def train_and_validate(clf_type, n_iter=10, test_percentage_list = [5, 10, 25, 40, 50], print_stat=True, save_stat=True):
    """Fit and score one classifier on repeated random train/test splits.

    For every percentage in ``test_percentage_list`` the data (``data_np``,
    ``label_np``, ``raw_data``) is split ``n_iter`` times. Each sample is sent
    to the test set independently with probability ``percentage / 100``, so
    the test-set size varies between iterations. The classifier registered as
    ``clf_dict[clf_type]`` is refit on each training split and scored with
    accuracy and micro-averaged precision / recall / F1.

    Test points are also grouped by their minimum Hamming distance to the
    training signatures: bucket ``b`` (``b`` in 0..5) holds the points at
    distance exactly ``b``; points further away are left out of the bucket
    statistics. A bucket's average is taken only over the iterations in which
    that bucket was non-empty; a bucket that was never populated reports 0.

    Parameters:
        clf_type: key into ``clf_dict``.
        n_iter: number of random splits per test percentage.
        test_percentage_list: test-set percentages, each strictly between 0 and 100.
        print_stat: if true, print the averaged metrics via ``print_stats``.
        save_stat: if true, write the accuracies to a CSV via ``save_stats``.

    Returns:
        ``{'overall': metric_list, 'bucket': metric_list_bucket}``, each a list
        with one dict per test percentage (keys 'pct', 'acc', 'pre', 'rec',
        'f1'); in the bucket list every metric is an array of length 6.
    """
    print(f'\nUsing {clf_type} classifier')
    n=len(label_np)
    avg = 'micro'
    # metric_list contains overall accuracy, precision, recall and f1 score for all data
    # metric_list_bucket contains Hamming distance bucket-wise accuracy, precision, recall and f1 score
    metric_list = []
    metric_list_bucket = []
    n_bucket = 6
    eps = 1e-8  # keeps the per-bucket division finite when a bucket was never populated

    for test_percentage in test_percentage_list:
        assert test_percentage>0 and test_percentage<100
        acc_sum = 0
        pre_sum = 0
        rec_sum = 0
        f1_sum = 0
        print(f'Test data percentage wrt total data: {test_percentage}')
        # n_iter_ar[b] counts the iterations in which bucket b contained test points
        n_iter_ar = np.zeros(n_bucket)
        acc_sum_ar = np.zeros(n_bucket)
        pre_sum_ar = np.zeros(n_bucket)
        rec_sum_ar = np.zeros(n_bucket)
        f1_sum_ar = np.zeros(n_bucket)
        dist_buckets_all = []

        for _ in range(n_iter):
            # Boolean masks for a random split; the training mask is the complement
            test_elig = np.random.random(size=(n)) <= (test_percentage/100)
            train_elig = ~test_elig
            test_data = data_np[test_elig]
            test_label = label_np[test_elig]
            train_data = data_np[train_elig]
            train_label = label_np[train_elig]

            raw_train_data = raw_data[train_elig]
            raw_test_data = raw_data[test_elig]
            dist_buckets = get_hamming_distance_bucket_info(raw_test_data, raw_train_data)
            dist_buckets_all.extend(list(dist_buckets))

            clf = clf_dict[clf_type]
            with warnings.catch_warnings():
                # Suppress warnings emitted while fitting
                warnings.simplefilter("ignore")
                clf.fit(train_data, train_label)
            test_predicted = clf.predict(test_data)

            accuracy = acc(test_label, test_predicted)
            precision = pre(test_label, test_predicted, average=avg)
            recall = rec(test_label, test_predicted, average=avg)
            f1_score = f1(test_label, test_predicted, average=avg)
            acc_sum += accuracy
            pre_sum += precision
            rec_sum += recall
            f1_sum += f1_score
            print(f'Accuracy: {accuracy:.3f} Recall: {recall:.3f} Precision: {precision:.3f} F1 Score: {f1_score:.3f}')

            for bucket in range(n_bucket):
                b_filter = dist_buckets == bucket
                filtered_test_label = test_label[b_filter]
                filtered_test_predicted = test_predicted[b_filter]
                # Only iterations that actually have points in this bucket contribute
                if len(filtered_test_label):
                    accuracy = acc(filtered_test_label, filtered_test_predicted)
                    precision = pre(filtered_test_label, filtered_test_predicted, average=avg)
                    recall = rec(filtered_test_label, filtered_test_predicted, average=avg)
                    f1_score = f1(filtered_test_label, filtered_test_predicted, average=avg)
                    acc_sum_ar[bucket] += accuracy
                    pre_sum_ar[bucket] += precision
                    rec_sum_ar[bucket] += recall
                    f1_sum_ar[bucket] += f1_score
                    n_iter_ar[bucket] += 1


        metric_list.append({'pct':test_percentage, 'acc':acc_sum/n_iter, 'pre':pre_sum/n_iter, 'rec':rec_sum/n_iter, 'f1':f1_sum/n_iter})
        # Average each bucket over the iterations that contributed to it (n_iter_ar),
        # not over all n_iter iterations; otherwise sparsely populated buckets are
        # pulled towards zero.
        bucket_denominator = n_iter_ar + eps
        metric_list_bucket.append({'pct':test_percentage, 'acc':acc_sum_ar/bucket_denominator, 'pre':pre_sum_ar/bucket_denominator, 'rec':rec_sum_ar/bucket_denominator, 'f1':f1_sum_ar/bucket_denominator})
        # Percentage of all test points (across iterations) at each minimum Hamming distance
        freq_stat = {value: round_dec(len(list(freq))*100./len(dist_buckets_all), 2) for value, freq in groupby(sorted(dist_buckets_all))}
        print(f"Frequency of hamming distance: ", freq_stat)


    if print_stat:
        print_stats(metric_list, metric_list_bucket, n_bucket)
    if save_stat:
        save_stats(metric_list, metric_list_bucket, n_bucket, test_percentage_list, csv_name='../results/inspect_stats_'+clf_type+'_iter_'+str(n_iter)+'_bucket_'+str(n_bucket)+'_pct_'+'_'.join(map(str, test_percentage_list)))
    return {'overall':metric_list, 'bucket':metric_list_bucket}


In [20]:
def save_consolidated_stats(metric_struct, metric_bucket_struct):
    """Write one comparison CSV per test percentage, one row per algorithm.

    ``metric_struct`` and ``metric_bucket_struct`` map an algorithm name to
    its list of overall / per-bucket metric dicts. The percentages and the
    bucket count are taken from the first algorithm's entries. Each file has
    the columns 'Algo', 'Overall' and 'Bucket 0' .. 'Bucket n-1', filled with
    accuracies rounded by ``round_dec``. Nothing is written when
    ``metric_struct`` is empty.
    """
    algos = list(metric_struct.keys())
    if not algos:
        return

    first_algo = algos[0]
    pct_list = [entry['pct'] for entry in metric_struct[first_algo]]
    n_bucket = len(metric_bucket_struct[first_algo][0]['acc'])

    # algorithm -> {pct: rounded overall accuracy}
    overall_struct = {
        algo: {entry['pct']: round_dec(entry['acc']) for entry in metric_struct[algo]}
        for algo in algos
    }
    # algorithm -> {pct: [rounded accuracy per bucket]}
    bucket_struct = {
        algo: {
            entry['pct']: [round_dec(value) for value in entry['acc']]
            for entry in metric_bucket_struct[algo]
        }
        for algo in algos
    }

    for pct in pct_list:
        # NOTE(review): there is no '_' between the joined algorithm names and
        # 'pct_' in this file name; kept as-is so existing result files still match.
        out_path = '../results/inspect_consolidated_stats_algo_'+'_'.join(map(str, algos))+'pct_'+str(pct)+'_bucket_'+str(n_bucket)+ '.csv'
        with open(out_path, 'w', newline='') as csvfile:
            Bucket_fields = ['Bucket '+str(idx) for idx in range(n_bucket)]
            writer = csv.DictWriter(csvfile, fieldnames=['Algo', 'Overall'] + Bucket_fields)
            writer.writeheader()
            for algo in algos:
                record = {'Algo': algo, 'Overall': overall_struct[algo][pct]}
                record.update(zip(Bucket_fields, bucket_struct[algo][pct]))
                writer.writerow(record)

In [21]:
# Evaluate every registered classifier with the default settings, collect the
# overall and per-bucket metrics by classifier name, then write the
# cross-classifier comparison files.
algo_overall_metric, algo_bucket_metric = {}, {}
for clf in clf_dict:
    ret = train_and_validate(clf)
    algo_overall_metric[clf], algo_bucket_metric[clf] = ret['overall'], ret['bucket']
save_consolidated_stats(algo_overall_metric, algo_bucket_metric)


Using lr classifier
Test data percentage wrt total data: 5
Accuracy: 0.786 Recall: 0.786 Precision: 0.786 F1 Score: 0.786
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.600 Recall: 0.600 Precision: 0.600 F1 Score: 0.600
Accuracy: 0.864 Recall: 0.864 Precision: 0.864 F1 Score: 0.864
Accuracy: 0.792 Recall: 0.792 Precision: 0.792 F1 Score: 0.792
Accuracy: 0.444 Recall: 0.444 Precision: 0.444 F1 Score: 0.444
Accuracy: 0.765 Recall: 0.765 Precision: 0.765 F1 Score: 0.765
Accuracy: 0.765 Recall: 0.765 Precision: 0.765 F1 Score: 0.765
Accuracy: 0.529 Recall: 0.529 Precision: 0.529 F1 Score: 0.529
Accuracy: 0.769 Recall: 0.769 Precision: 0.769 F1 Score: 0.769
Frequency of hamming distance:  {1: 24.1, 2: 15.66, 3: 12.05, 4: 5.42, 5: 7.23, 6: 3.01, 7: 6.63, 8: 2.41, 9: 2.41, 10: 4.82, 11: 3.61, 12: 3.01, 13: 1.81, 14: 1.81, 15: 1.2, 16: 1.2, 17: 0.6, 18: 3.01}
Test data percentage wrt total data: 10
Accuracy: 0.769 Recall: 0.769 Precision: 0.769 F1 Score: 0.769
Accu

Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.706 Recall: 0.706 Precision: 0.706 F1 Score: 0.706
Accuracy: 0.789 Recall: 0.789 Precision: 0.789 F1 Score: 0.789
Accuracy: 0.571 Recall: 0.571 Precision: 0.571 F1 Score: 0.571
Accuracy: 0.700 Recall: 0.700 Precision: 0.700 F1 Score: 0.700
Accuracy: 0.786 Recall: 0.786 Precision: 0.786 F1 Score: 0.786
Accuracy: 0.778 Recall: 0.778 Precision: 0.778 F1 Score: 0.778
Accuracy: 0.733 Recall: 0.733 Precision: 0.733 F1 Score: 0.733
Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Frequency of hamming distance:  {1: 25.62, 2: 15.62, 3: 12.5, 4: 6.25, 5: 5.62, 6: 3.75, 7: 3.12, 8: 3.12, 9: 5.0, 10: 3.75, 11: 4.38, 12: 0.62, 13: 2.5, 14: 3.12, 15: 1.25, 16: 2.5, 18: 1.25}
Test data percentage wrt total data: 10
Accuracy: 0.875 Recall: 0.875 Precision: 0.875 F1 Score: 0.875
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0

Accuracy: 0.800 Recall: 0.800 Precision: 0.800 F1 Score: 0.800
Accuracy: 0.714 Recall: 0.714 Precision: 0.714 F1 Score: 0.714
Accuracy: 0.846 Recall: 0.846 Precision: 0.846 F1 Score: 0.846
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.882 Recall: 0.882 Precision: 0.882 F1 Score: 0.882
Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Frequency of hamming distance:  {1: 31.33, 2: 16.27, 3: 9.04, 4: 7.83, 5: 4.22, 6: 2.41, 7: 4.82, 8: 1.2, 9: 4.82, 10: 3.01, 11: 1.81, 12: 3.61, 13: 1.81, 14: 3.61, 15: 0.6, 16: 2.41, 18: 0.6, 20: 0.6}
Test data percentage wrt total data: 10
Accuracy: 0.719 Recall: 0.719 Precision: 0.719 F1 Score: 0.719
Accuracy: 0.804 Recall: 0.804 Precision: 0.804 F1 Score: 0.804
Accuracy: 0.800 Recall: 0.800 Precision: 0.800 F1 Score: 0.800
Accuracy: 0.724 Recall: 0.724 Precision: 0.724 F1 Score: 0.724
Accuracy: 0.781 Recall: 0.781 Precision: 0.781 F1 Score: 0.781
Accuracy: 0.652 Recall: 0.652 Precision: 0.652 F1 Score: 0.652
Ac

Accuracy: 0.778 Recall: 0.778 Precision: 0.778 F1 Score: 0.778
Accuracy: 0.643 Recall: 0.643 Precision: 0.643 F1 Score: 0.643
Accuracy: 0.727 Recall: 0.727 Precision: 0.727 F1 Score: 0.727
Accuracy: 0.842 Recall: 0.842 Precision: 0.842 F1 Score: 0.842
Accuracy: 0.706 Recall: 0.706 Precision: 0.706 F1 Score: 0.706
Accuracy: 0.333 Recall: 0.333 Precision: 0.333 F1 Score: 0.333
Accuracy: 0.619 Recall: 0.619 Precision: 0.619 F1 Score: 0.619
Accuracy: 0.933 Recall: 0.933 Precision: 0.933 F1 Score: 0.933
Accuracy: 0.684 Recall: 0.684 Precision: 0.684 F1 Score: 0.684
Accuracy: 0.833 Recall: 0.833 Precision: 0.833 F1 Score: 0.833
Frequency of hamming distance:  {1: 31.71, 2: 23.17, 3: 3.66, 4: 4.27, 5: 6.71, 6: 4.27, 7: 4.27, 8: 2.44, 9: 4.88, 10: 5.49, 11: 1.83, 12: 2.44, 14: 0.61, 15: 1.22, 16: 2.44, 18: 0.61}
Test data percentage wrt total data: 10
Accuracy: 0.735 Recall: 0.735 Precision: 0.735 F1 Score: 0.735
Accuracy: 0.600 Recall: 0.600 Precision: 0.600 F1 Score: 0.600
Accuracy: 0.808 Re

Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Accuracy: 0.765 Recall: 0.765 Precision: 0.765 F1 Score: 0.765
Accuracy: 0.385 Recall: 0.385 Precision: 0.385 F1 Score: 0.385
Accuracy: 0.900 Recall: 0.900 Precision: 0.900 F1 Score: 0.900
Accuracy: 0.870 Recall: 0.870 Precision: 0.870 F1 Score: 0.870
Accuracy: 0.714 Recall: 0.714 Precision: 0.714 F1 Score: 0.714
Accuracy: 0.650 Recall: 0.650 Precision: 0.650 F1 Score: 0.650
Frequency of hamming distance:  {1: 33.33, 2: 12.82, 3: 10.26, 4: 7.69, 5: 5.77, 6: 3.21, 7: 6.41, 8: 3.21, 9: 5.77, 10: 1.92, 11: 1.28, 12: 1.92, 13: 0.64, 14: 3.21, 15: 0.64, 16: 0.64, 18: 1.28}
Test data percentage wrt total data: 10
Accuracy: 0.613 Recall: 0.613 Precision: 0.613 F1 Score: 0.613
Accuracy: 0.756 Recall: 0.756 Precision: 0.756 F1 Score: 0.756
Accuracy: 0.696 Recall: 0.696 Precision: 0.696 F1 Score: 0.696
Accuracy: 0.590 Recall: 0.590 Precision: 0.590 F1 Score: 0.590
Accuracy: 0.538 Recall: 0.538 Precision: 0.538 F1 Score: 0.538
Accurac

Accuracy: 0.824 Recall: 0.824 Precision: 0.824 F1 Score: 0.824
Accuracy: 0.786 Recall: 0.786 Precision: 0.786 F1 Score: 0.786
Accuracy: 0.611 Recall: 0.611 Precision: 0.611 F1 Score: 0.611
Accuracy: 0.786 Recall: 0.786 Precision: 0.786 F1 Score: 0.786
Accuracy: 0.533 Recall: 0.533 Precision: 0.533 F1 Score: 0.533
Accuracy: 0.900 Recall: 0.900 Precision: 0.900 F1 Score: 0.900
Accuracy: 0.765 Recall: 0.765 Precision: 0.765 F1 Score: 0.765
Frequency of hamming distance:  {1: 23.53, 2: 17.65, 3: 9.8, 4: 6.54, 5: 7.19, 6: 1.31, 7: 5.23, 8: 5.23, 9: 4.58, 10: 5.88, 11: 1.96, 12: 1.96, 13: 0.65, 14: 1.96, 15: 1.96, 16: 1.96, 17: 0.65, 18: 1.96}
Test data percentage wrt total data: 10
Accuracy: 0.875 Recall: 0.875 Precision: 0.875 F1 Score: 0.875
Accuracy: 0.625 Recall: 0.625 Precision: 0.625 F1 Score: 0.625
Accuracy: 0.677 Recall: 0.677 Precision: 0.677 F1 Score: 0.677
Accuracy: 0.786 Recall: 0.786 Precision: 0.786 F1 Score: 0.786
Accuracy: 0.605 Recall: 0.605 Precision: 0.605 F1 Score: 0.605

Accuracy: 0.562 Recall: 0.562 Precision: 0.562 F1 Score: 0.562
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.688 Recall: 0.688 Precision: 0.688 F1 Score: 0.688
Accuracy: 0.905 Recall: 0.905 Precision: 0.905 F1 Score: 0.905
Accuracy: 0.714 Recall: 0.714 Precision: 0.714 F1 Score: 0.714
Accuracy: 0.625 Recall: 0.625 Precision: 0.625 F1 Score: 0.625
Accuracy: 0.619 Recall: 0.619 Precision: 0.619 F1 Score: 0.619
Accuracy: 0.833 Recall: 0.833 Precision: 0.833 F1 Score: 0.833
Frequency of hamming distance:  {1: 30.54, 2: 19.16, 3: 10.18, 4: 7.78, 5: 5.39, 6: 1.2, 7: 2.99, 8: 1.2, 9: 2.99, 10: 3.59, 11: 4.19, 12: 0.6, 13: 2.4, 14: 1.8, 15: 0.6, 16: 2.99, 18: 1.2, 20: 1.2}
Test data percentage wrt total data: 10
Accuracy: 0.731 Recall: 0.731 Precision: 0.731 F1 Score: 0.731
Accuracy: 0.795 Recall: 0.795 Precision: 0.795 F1 Score: 0.795
Accuracy: 0.791 Recall: 0.791 Precision: 0.791 F1 Score: 0.791
Accuracy: 0.833 Recall: 0.833 Precision: 0.833 F1 Score: 0.833
Accur

Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.889 Recall: 0.889 Precision: 0.889 F1 Score: 0.889
Accuracy: 0.889 Recall: 0.889 Precision: 0.889 F1 Score: 0.889
Accuracy: 0.818 Recall: 0.818 Precision: 0.818 F1 Score: 0.818
Accuracy: 0.800 Recall: 0.800 Precision: 0.800 F1 Score: 0.800
Accuracy: 0.882 Recall: 0.882 Precision: 0.882 F1 Score: 0.882
Accuracy: 0.875 Recall: 0.875 Precision: 0.875 F1 Score: 0.875
Accuracy: 0.786 Recall: 0.786 Precision: 0.786 F1 Score: 0.786
Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Accuracy: 0.950 Recall: 0.950 Precision: 0.950 F1 Score: 0.950
Frequency of hamming distance:  {1: 28.31, 2: 18.67, 3: 9.04, 4: 4.82, 5: 4.82, 6: 2.41, 7: 8.43, 8: 5.42, 9: 4.22, 10: 3.01, 11: 4.22, 12: 1.81, 13: 0.6, 15: 1.81, 16: 1.81, 18: 0.6}
Test data percentage wrt total data: 10
Accuracy: 0.765 Recall: 0.765 Precision: 0.765 F1 Score: 0.765
Accuracy: 0.917 Recall: 0.917 Precision: 0.917 F1 Score: 0.917
Accuracy: 0.750 Reca

Accuracy: 0.818 Recall: 0.818 Precision: 0.818 F1 Score: 0.818
Accuracy: 0.690 Recall: 0.690 Precision: 0.690 F1 Score: 0.690
Accuracy: 0.909 Recall: 0.909 Precision: 0.909 F1 Score: 0.909
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Frequency of hamming distance:  {1: 20.25, 2: 20.86, 3: 12.88, 4: 6.75, 5: 5.52, 6: 4.91, 7: 5.52, 8: 2.45, 9: 4.29, 10: 2.45, 11: 2.45, 12: 2.45, 13: 1.23, 14: 1.84, 15: 2.45, 16: 2.45, 18: 1.23}
Test data percentage wrt total data: 10
Accuracy: 0.710 Recall: 0.710 Precision: 0.710 F1 Score: 0.710
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.641 Recall: 0.641 Precision: 0.641 F1 Score: 0.641
Accuracy: 0.763 Recall: 0.763 Precision: 0.763 F1 Score: 0.763
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.742 Recall: 0.742 Precision: 0.742 F1 Score: 0.742
Accuracy: 0.714 Recall: 0.714 Precision: 0.714 F1 Score: 0.714
Accuracy: 0.565 Recall: 0.565 Precision: 0.565 F1 Score: 0.565
Accurac

Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Accuracy: 0.600 Recall: 0.600 Precision: 0.600 F1 Score: 0.600
Accuracy: 0.700 Recall: 0.700 Precision: 0.700 F1 Score: 0.700
Accuracy: 0.625 Recall: 0.625 Precision: 0.625 F1 Score: 0.625
Accuracy: 0.643 Recall: 0.643 Precision: 0.643 F1 Score: 0.643
Accuracy: 0.545 Recall: 0.545 Precision: 0.545 F1 Score: 0.545
Accuracy: 0.824 Recall: 0.824 Precision: 0.824 F1 Score: 0.824
Accuracy: 0.739 Recall: 0.739 Precision: 0.739 F1 Score: 0.739
Accuracy: 0.538 Recall: 0.538 Precision: 0.538 F1 Score: 0.538
Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Frequency of hamming distance:  {1: 31.41, 2: 19.23, 3: 9.62, 4: 8.33, 5: 5.77, 6: 3.21, 7: 4.49, 8: 1.92, 9: 2.56, 10: 3.21, 11: 2.56, 12: 1.28, 14: 0.64, 15: 3.21, 16: 1.28, 18: 1.28}
Test data percentage wrt total data: 10
Accuracy: 0.645 Recall: 0.645 Precision: 0.645 F1 Score: 0.645
Accuracy: 0.562 Recall: 0.562 Precision: 0.562 F1 Score: 0.562
Accuracy: 0.733 Re

Accuracy: 0.611 Recall: 0.611 Precision: 0.611 F1 Score: 0.611
Accuracy: 0.500 Recall: 0.500 Precision: 0.500 F1 Score: 0.500
Accuracy: 0.800 Recall: 0.800 Precision: 0.800 F1 Score: 0.800
Accuracy: 0.625 Recall: 0.625 Precision: 0.625 F1 Score: 0.625
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.789 Recall: 0.789 Precision: 0.789 F1 Score: 0.789
Accuracy: 0.727 Recall: 0.727 Precision: 0.727 F1 Score: 0.727
Accuracy: 0.818 Recall: 0.818 Precision: 0.818 F1 Score: 0.818
Frequency of hamming distance:  {1: 26.28, 2: 16.67, 3: 14.74, 4: 8.97, 5: 3.85, 6: 2.56, 7: 3.85, 8: 2.56, 9: 4.49, 10: 3.85, 11: 3.21, 12: 3.21, 13: 0.64, 14: 0.64, 15: 0.64, 16: 3.21, 20: 0.64}
Test data percentage wrt total data: 10
Accuracy: 0.541 Recall: 0.541 Precision: 0.541 F1 Score: 0.541
Accuracy: 0.615 Recall: 0.615 Precision: 0.615 F1 Score: 0.615
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.710 Recall: 0.710 Precision: 0.710 F1 Score: 0.710
Accurac

Accuracy: 0.692 Recall: 0.692 Precision: 0.692 F1 Score: 0.692
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.625 Recall: 0.625 Precision: 0.625 F1 Score: 0.625
Accuracy: 0.800 Recall: 0.800 Precision: 0.800 F1 Score: 0.800
Accuracy: 0.550 Recall: 0.550 Precision: 0.550 F1 Score: 0.550
Accuracy: 0.882 Recall: 0.882 Precision: 0.882 F1 Score: 0.882
Accuracy: 0.833 Recall: 0.833 Precision: 0.833 F1 Score: 0.833
Frequency of hamming distance:  {1: 24.68, 2: 20.78, 3: 11.04, 4: 7.14, 5: 5.19, 6: 3.9, 7: 3.9, 8: 3.9, 9: 2.6, 10: 3.25, 11: 3.25, 12: 3.25, 13: 1.3, 14: 2.6, 15: 0.65, 16: 1.95, 18: 0.65}
Test data percentage wrt total data: 10
Accuracy: 0.724 Recall: 0.724 Precision: 0.724 F1 Score: 0.724
Accuracy: 0.625 Recall: 0.625 Precision: 0.625 F1 Score: 0.625
Accuracy: 0.632 Recall: 0.632 Precision: 0.632 F1 Score: 0.632
Accuracy: 0.935 Recall: 0.935 Precision: 0.935 F1 Score: 0.935
Accuracy: 0.5

Accuracy: 0.857 Recall: 0.857 Precision: 0.857 F1 Score: 0.857
Accuracy: 0.833 Recall: 0.833 Precision: 0.833 F1 Score: 0.833
Accuracy: 0.429 Recall: 0.429 Precision: 0.429 F1 Score: 0.429
Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Accuracy: 0.941 Recall: 0.941 Precision: 0.941 F1 Score: 0.941
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.800 Recall: 0.800 Precision: 0.800 F1 Score: 0.800
Accuracy: 0.905 Recall: 0.905 Precision: 0.905 F1 Score: 0.905
Accuracy: 0.733 Recall: 0.733 Precision: 0.733 F1 Score: 0.733
Accuracy: 0.765 Recall: 0.765 Precision: 0.765 F1 Score: 0.765
Frequency of hamming distance:  {1: 27.27, 2: 15.58, 3: 6.49, 4: 9.09, 5: 5.19, 6: 1.3, 7: 5.19, 8: 3.9, 9: 4.55, 10: 3.9, 11: 2.6, 12: 1.3, 13: 3.25, 14: 3.9, 15: 0.65, 16: 1.3, 18: 3.25, 20: 1.3}
Test data percentage wrt total data: 10
Accuracy: 0.871 Recall: 0.871 Precision: 0.871 F1 Score: 0.871
Accuracy: 0.930 Recall: 0.930 Precision: 0.930 F1 Score: 0.930
Accura

Accuracy: 0.600 Recall: 0.600 Precision: 0.600 F1 Score: 0.600
Accuracy: 0.609 Recall: 0.609 Precision: 0.609 F1 Score: 0.609
Accuracy: 0.789 Recall: 0.789 Precision: 0.789 F1 Score: 0.789
Accuracy: 0.824 Recall: 0.824 Precision: 0.824 F1 Score: 0.824
Accuracy: 0.783 Recall: 0.783 Precision: 0.783 F1 Score: 0.783
Accuracy: 0.739 Recall: 0.739 Precision: 0.739 F1 Score: 0.739
Accuracy: 0.500 Recall: 0.500 Precision: 0.500 F1 Score: 0.500
Accuracy: 0.769 Recall: 0.769 Precision: 0.769 F1 Score: 0.769
Accuracy: 0.706 Recall: 0.706 Precision: 0.706 F1 Score: 0.706
Accuracy: 0.737 Recall: 0.737 Precision: 0.737 F1 Score: 0.737
Frequency of hamming distance:  {1: 25.4, 2: 25.93, 3: 7.94, 4: 4.76, 5: 7.41, 6: 5.29, 7: 3.17, 8: 1.59, 9: 4.23, 10: 3.17, 11: 1.59, 12: 4.76, 13: 1.06, 14: 1.06, 15: 1.06, 16: 1.06, 18: 0.53}
Test data percentage wrt total data: 10
Accuracy: 0.733 Recall: 0.733 Precision: 0.733 F1 Score: 0.733
Accuracy: 0.784 Recall: 0.784 Precision: 0.784 F1 Score: 0.784
Accuracy:

Accuracy: 0.895 Recall: 0.895 Precision: 0.895 F1 Score: 0.895
Accuracy: 0.833 Recall: 0.833 Precision: 0.833 F1 Score: 0.833
Accuracy: 0.667 Recall: 0.667 Precision: 0.667 F1 Score: 0.667
Accuracy: 0.833 Recall: 0.833 Precision: 0.833 F1 Score: 0.833
Accuracy: 0.810 Recall: 0.810 Precision: 0.810 F1 Score: 0.810
Accuracy: 0.706 Recall: 0.706 Precision: 0.706 F1 Score: 0.706
Frequency of hamming distance:  {1: 23.24, 2: 23.78, 3: 6.49, 4: 7.03, 5: 3.78, 6: 3.24, 7: 6.49, 8: 4.86, 9: 5.41, 10: 1.08, 11: 3.78, 12: 2.16, 13: 2.16, 14: 0.54, 15: 1.62, 16: 2.16, 18: 2.16}
Test data percentage wrt total data: 10
Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.824 Recall: 0.824 Precision: 0.824 F1 Score: 0.824
Accuracy: 0.923 Recall: 0.923 Precision: 0.923 F1 Score: 0.923
Accuracy: 0.744 Recall: 0.744 Precision: 0.744 F1 Score: 0.744
Accuracy: 0.816 Recall: 0.816 Precision: 0.816 F1 Score: 0.816
Accuracy