In [1]:
import numpy as np
import pandas as pd
import warnings
import csv
from itertools import groupby

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import recall_score as rec, precision_score as pre, f1_score as f1, accuracy_score as acc

In [2]:
# Read Signatures and z scores
csv_name = "../data/labeled_sigs"
csv_name = "../data/Adomain_Substrate_labeled_sigs.report"
z_score_csv_name = "../data/z_score_aa"
df_original = pd.read_csv(csv_name, delimiter='\t')
z_scores_df = pd.read_csv(z_score_csv_name, delimiter=' ')

In [3]:
# Prep for hardcoded values
m=26
nrow = 12
D = np.zeros((nrow, m))
D1 = D[0 ,:]
D2 = D[1 ,:]
D3 = D[2 ,:]
D4 = D[3 ,:]
D5 = D[4 ,:]
D6 = D[5 ,:]
D7 = D[6 ,:]
D8 = D[7 ,:]
D9 = D[8 ,:]
D10 = D[9 ,:]
D11 = D[10 ,:]
D12 = D[11 ,:]

In [4]:
# Hard coded values, taken from NRPSPredictor2 github

#1 aa-alpha-helix.aaindex
D1[0]=1.420;D1[1]=0.000;D1[2]=0.700;D1[3]=1.010;D1[4]=1.510;D1[5]=1.130;D1[6]=0.570;D1[7]=1.000;D1[8]=1.080;D1[9]=0.000;D1[10]=1.160;D1[11]=1.210;D1[12]=1.450;D1[13]=0.670;D1[14]=0.000;D1[15]=0.570;D1[16]=1.110;D1[17]=0.980;D1[18]=0.770;D1[19]=0.830;D1[20]=0.000;D1[21]=1.060;D1[22]=1.080;D1[23]=0.000;D1[24]=0.690;D1[25]=0.000;
#2 aa-beta-sheet.aaindex
D2[0]=0.830;D2[1]=0.000;D2[2]=1.190;D2[3]=0.540;D2[4]=0.370;D2[5]=1.380;D2[6]=0.750;D2[7]=0.870;D2[8]=1.600;D2[9]=0.000;D2[10]=0.740;D2[11]=1.300;D2[12]=1.050;D2[13]=0.890;D2[14]=0.000;D2[15]=0.550;D2[16]=1.100;D2[17]=0.930;D2[18]=0.750;D2[19]=1.190;D2[20]=0.000;D2[21]=1.700;D2[22]=1.370;D2[23]=0.000;D2[24]=1.470;D2[25]=0.000;
#3 aa-beta-turn.aaindex
D3[0]=0.740;D3[1]=0.000;D3[2]=0.960;D3[3]=1.520;D3[4]=0.950;D3[5]=0.660;D3[6]=1.560;D3[7]=0.950;D3[8]=0.470;D3[9]=0.000;D3[10]=1.190;D3[11]=0.500;D3[12]=0.600;D3[13]=1.460;D3[14]=0.000;D3[15]=1.560;D3[16]=0.960;D3[17]=1.010;D3[18]=1.430;D3[19]=0.980;D3[20]=0.000;D3[21]=0.590;D3[22]=0.600;D3[23]=0.000;D3[24]=1.140;D3[25]=0.000;
#4 aa-hydrogenbond.aaindex
D4[0]=0.000;D4[1]=0.000;D4[2]=0.000;D4[3]=1.000;D4[4]=1.000;D4[5]=0.000;D4[6]=0.000;D4[7]=1.000;D4[8]=0.000;D4[9]=0.000;D4[10]=2.000;D4[11]=0.000;D4[12]=0.000;D4[13]=2.000;D4[14]=0.000;D4[15]=0.000;D4[16]=2.000;D4[17]=4.000;D4[18]=1.000;D4[19]=1.000;D4[20]=0.000;D4[21]=0.000;D4[22]=1.000;D4[23]=0.000;D4[24]=1.000;D4[25]=0.000;
#5 aa-hydrophobicity-neu1.aaindex
D5[0]=0.060;D5[1]=0.000;D5[2]=-0.560;D5[3]=0.970;D5[4]=0.850;D5[5]=-0.990;D5[6]=0.320;D5[7]=0.150;D5[8]=-1.000;D5[9]=0.000;D5[10]=1.000;D5[11]=-0.830;D5[12]=-0.680;D5[13]=0.700;D5[14]=0.000;D5[15]=0.450;D5[16]=0.710;D5[17]=0.800;D5[18]=0.480;D5[19]=0.380;D5[20]=0.000;D5[21]=-0.750;D5[22]=-0.570;D5[23]=0.000;D5[24]=-0.350;D5[25]=0.000;
#6 aa-hydrophobicity-neu2.aaindex
D6[0]=-0.250;D6[1]=0.000;D6[2]=-0.400;D6[3]=-0.080;D6[4]=-0.100;D6[5]=0.180;D6[6]=-0.320;D6[7]=-0.030;D6[8]=-0.030;D6[9]=0.000;D6[10]=0.320;D6[11]=0.050;D6[12]=-0.010;D6[13]=-0.060;D6[14]=0.000;D6[15]=0.230;D6[16]=-0.020;D6[17]=0.190;D6[18]=-0.150;D6[19]=-0.100;D6[20]=0.000;D6[21]=-0.190;D6[22]=0.310;D6[23]=0.000;D6[24]=0.400;D6[25]=0.000;
#7 aa-hydrophobicity-neu3.aaindex
D7[0]=0.250;D7[1]=0.000;D7[2]=-0.140;D7[3]=0.080;D7[4]=-0.050;D7[5]=0.150;D7[6]=0.280;D7[7]=-0.100;D7[8]=0.100;D7[9]=0.000;D7[10]=0.110;D7[11]=0.010;D7[12]=0.040;D7[13]=0.170;D7[14]=0.000;D7[15]=0.410;D7[16]=0.120;D7[17]=-0.410;D7[18]=0.230;D7[19]=0.290;D7[20]=0.000;D7[21]=0.030;D7[22]=0.340;D7[23]=0.000;D7[24]=-0.020;D7[25]=0.000;
#8 aa-isoelectric.aaindex
D8[0]=6.000;D8[1]=0.000;D8[2]=5.050;D8[3]=2.770;D8[4]=3.220;D8[5]=5.480;D8[6]=5.970;D8[7]=7.590;D8[8]=6.020;D8[9]=0.000;D8[10]=9.740;D8[11]=5.980;D8[12]=5.740;D8[13]=5.410;D8[14]=0.000;D8[15]=6.300;D8[16]=5.650;D8[17]=10.760;D8[18]=5.680;D8[19]=5.660;D8[20]=0.000;D8[21]=5.960;D8[22]=5.890;D8[23]=0.000;D8[24]=5.660;D8[25]=0.000;
#9 aa-polar-grantham.aaindex
D9[0]=8.100;D9[1]=0.000;D9[2]=5.500;D9[3]=13.000;D9[4]=12.300;D9[5]=5.200;D9[6]=9.000;D9[7]=10.400;D9[8]=5.200;D9[9]=0.000;D9[10]=11.300;D9[11]=4.900;D9[12]=5.700;D9[13]=11.600;D9[14]=0.000;D9[15]=8.000;D9[16]=10.500;D9[17]=10.500;D9[18]=9.200;D9[19]=8.600;D9[20]=0.000;D9[21]=5.900;D9[22]=5.400;D9[23]=0.000;D9[24]=6.200;D9[25]=0.000;
#10 aa-polar-radzicka.aaindex
D10[0]=-0.060;D10[1]=0.000;D10[2]=1.360;D10[3]=-0.800;D10[4]=-0.770;D10[5]=1.270;D10[6]=-0.410;D10[7]=0.490;D10[8]=1.310;D10[9]=0.000;D10[10]=-1.180;D10[11]=1.210;D10[12]=1.270;D10[13]=-0.480;D10[14]=0.000;D10[15]=1.100;D10[16]=-0.730;D10[17]=-0.840;D10[18]=-0.500;D10[19]=-0.270;D10[20]=0.000;D10[21]=1.090;D10[22]=0.880;D10[23]=0.000;D10[24]=0.330;D10[25]=0.000;
#11 aa-polar-zimmerman.aaindex
D11[0]=0.000;D11[1]=0.000;D11[2]=1.480;D11[3]=49.700;D11[4]=49.900;D11[5]=0.350;D11[6]=0.000;D11[7]=51.600;D11[8]=0.130;D11[9]=0.000;D11[10]=49.500;D11[11]=0.130;D11[12]=1.430;D11[13]=3.380;D11[14]=0.000;D11[15]=1.580;D11[16]=3.530;D11[17]=52.000;D11[18]=1.670;D11[19]=1.660;D11[20]=0.000;D11[21]=0.130;D11[22]=2.100;D11[23]=0.000;D11[24]=1.610;D11[25]=0.000;
#12 aa-volume.aaindex
D12[0]=90.000;D12[1]=0.000;D12[2]=103.300;D12[3]=117.300;D12[4]=142.200;D12[5]=191.900;D12[6]=64.900;D12[7]=160.000;D12[8]=163.900;D12[9]=0.000;D12[10]=167.300;D12[11]=164.000;D12[12]=167.000;D12[13]=124.700;D12[14]=0.000;D12[15]=122.900;D12[16]=149.400;D12[17]=194.000;D12[18]=95.400;D12[19]=121.500;D12[20]=0.000;D12[21]=139.000;D12[22]=228.200;D12[23]=0.000;D12[24]=197.000;D12[25]=0.000;



In [5]:
# Map Amino acid abbreviations with indices of D[i, :]

aa2ind_map = {}
aa2ind_map['A'] = 0
aa2ind_map['R'] = 17
aa2ind_map['D'] = 3
aa2ind_map['N'] = 13
aa2ind_map['C'] = 2
aa2ind_map['E'] = 4
aa2ind_map['Q'] = 16
aa2ind_map['G'] = 6
aa2ind_map['H'] = 7
aa2ind_map['I'] = 8
aa2ind_map['L'] = 11
aa2ind_map['K'] = 10
aa2ind_map['M'] = 12
aa2ind_map['F'] = 5
aa2ind_map['P'] = 15
aa2ind_map['S'] = 18
aa2ind_map['T'] = 19
aa2ind_map['W'] = 22
aa2ind_map['Y'] = 24
aa2ind_map['V'] = 21

In [6]:
# Dictionary to store 15 dimensional vector values for each amino acid

aa_dict = {}
for i, row in z_scores_df.iterrows():
    aa_dict[row['aa_short']] = [row['z1'], row['z2'], row['z3'], ] # Store z values
    aa_dict[row['aa_short']].extend(D[:, aa2ind_map[row['aa_short']]]) # Store rest of the 12 hard-coded values
aa_dict['-'] = list(np.mean(np.asarray(list(aa_dict.values())), axis=0))

In [7]:
# Create index to value map and value to index map from a list using list index
def get_dict_from_list(mylist):
    assert len(mylist) > 0
    idx_to_value_dict = {}
    value_to_idx_dict = {}
    for i in range(len(mylist)):
        idx_to_value_dict[i] = mylist[i]
        value_to_idx_dict[mylist[i]] = int(i)
    return idx_to_value_dict, value_to_idx_dict

# Get index-value maps for substrates
sub_vocab_set = set(df_original['sub'].tolist())
sub_vocab_list = list(sub_vocab_set)
sub_idx_to_value_dict, sub_value_to_idx_dict = get_dict_from_list(sub_vocab_list)

In [8]:
sub_idx_to_value_dict

{0: 'asn',
 1: 'orn',
 2: 'dht',
 3: 'pro',
 4: 'hpg',
 5: 'dhpg',
 6: 'arg',
 7: 'asp',
 8: 'ile',
 9: 'lys',
 10: 'leu',
 11: 'cys',
 12: 'val',
 13: 'ser',
 14: 'ORN',
 15: 'ala',
 16: 'tyr',
 17: 'hyv-d',
 18: 'gln',
 19: 'beta-ala',
 20: 'pip',
 21: 'aad',
 22: 'dab',
 23: 'trp',
 24: 'dhb',
 25: 'glu',
 26: 'bht',
 27: 'gly',
 28: 'horn',
 29: 'thr',
 30: 'phe'}

In [9]:
# Remove the entries with only one substrate occurence
df_cnt_filtered = df_original.groupby('sub').filter(lambda x: len(x) > 1)
print("Reduced data size from ",len(df_original)," to ", len(df_cnt_filtered), " due to count filtering")

Reduced data size from  1534  to  1533  due to count filtering


In [10]:
# Re-Create substrate index-value maps after removing the substrates with count 1

sub_vocab_set = set(df_cnt_filtered['sub'].tolist())
sub_vocab_list = list(sub_vocab_set)
sub_idx_to_value_dict, sub_value_to_idx_dict = get_dict_from_list(sub_vocab_list)

In [11]:
# Given a signature of amino acids(34 aa long), construct the 34x15=510 dimentional representation
def get_encoding(signature):
    ret = []
    for i in signature:
        ret.extend(aa_dict[i])
    return np.asarray(ret)


In [12]:
# Format the data
def format_data_from_df(df_in):
    sig_len = 34
    raw_data = []
    data_np = []
    label_np = []
    j=0
    for i, row in df_cnt_filtered.iterrows():
        assert len(row['sig']) == sig_len
        raw_data.append(row['sig'])
        data_np.append(get_encoding(row['sig']))
        label_np.append(sub_value_to_idx_dict[row['sub']])
        j+=1
    return np.asarray(data_np), np.asarray(label_np).astype('int'), np.array(raw_data)

data_np, label_np, raw_data = format_data_from_df(df_original)

#data_np = np.asarray(data_np)
#label_np = np.asarray(label_np).astype('int')
#raw_data = np.array(raw_data)

In [13]:
def get_hamming_distance(str1, str2):
    return sum(i != j for i, j in zip(str1, str2))

# For all points in test_data, choose the minimum hamming distance from all of train data, and return the distance list
def get_hamming_distance_bucket_info(test_data, train_data):
    dist_list = []
    for test_data_pt in test_data:
        dist_list.append(min([get_hamming_distance(test_data_pt, train_data_pt) for train_data_pt in train_data]))
    return np.array(dist_list)

def round_dec(num, dec=2):
    return float(round(num* 10.**dec))/(10**dec)

In [14]:
# Build classifier dictionary using sklearn multiclass classifiers

clf_dict={}
clf_dict['lr'] = LogisticRegression(random_state=0, max_iter=400, multi_class='multinomial', solver='newton-cg')
clf_dict['svm'] = make_pipeline(StandardScaler(), LinearSVC(random_state=0, multi_class='crammer_singer', tol=1e-9, max_iter=2000))
clf_dict['knn'] = KNeighborsClassifier(weights='distance')
clf_dict['mlp_sklearn'] = MLPClassifier(random_state=1, max_iter=400, early_stopping=False, )
clf_dict['rand_for'] = RandomForestClassifier(max_depth=4, criterion='entropy')
clf_dict['dec_tree'] = DecisionTreeClassifier(random_state=0, criterion='entropy')
clf_dict['ber_nb'] = BernoulliNB()
clf_dict['xtra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')
clf_dict['gau_nb'] = GaussianNB()
clf_dict['label_prop'] = LabelPropagation(kernel='knn')
clf_dict['label_spread'] = LabelSpreading(kernel='knn')
clf_dict['lda'] = LinearDiscriminantAnalysis()
clf_dict['ridge_cv'] = RidgeClassifierCV()
clf_dict['n_cent'] = NearestCentroid()
clf_dict['ridge'] = RidgeClassifier()

In [15]:
def print_stats(metric_list, metric_list_bucket, n_bucket):
    print(f"\n\nOverall Stats:")

    for item in metric_list:
        print(f"Test percentage: {item['pct']:.2f} Average Accuracy: {item['acc']:.3f} Average Recall: {item['rec']:.3f} Average Precision: {item['pre']:.3f} Average F1 Score: {item['f1']:.3f}")

    for bucket in range(n_bucket):
        print(f"\n\nBucket {bucket+1} Stats:")
        for item in metric_list_bucket:
            print(f"Test percentage: {item['pct']:.2f} Average Accuracy: {item['acc'][bucket]:.3f} Average Recall: {item['rec'][bucket]:.3f} Average Precision: {item['pre'][bucket]:.3f} Average F1 Score: {item['f1'][bucket]:.3f}")

def save_stats(metric_list, metric_list_bucket, n_bucket, pct_list, csv_name='stats'):
    overall_dict = {}
    for item in metric_list:
        overall_dict[item['pct']] = item['acc']
    
    with open(csv_name + '.csv', 'w', newline='') as csvfile:
        fieldnames = ['pct', 'overall_acc'] + ['Bucket '+str(item) for item in range(n_bucket)]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for item in metric_list_bucket:
            temp_dict = {'pct': item['pct'], 'overall_acc': round_dec(overall_dict[item['pct']])}
            for bucket in range(n_bucket):
                temp_dict['Bucket ' + str(bucket)] = round_dec(item['acc'][bucket])
            writer.writerow(temp_dict)

# n_iter signifies number of iterations
# test_percentage_list is the list of percentages of test data with respect to total data
# Code will iterate for each percentage for n_iter iterations
def train_and_validate(clf_type, n_iter=10, test_percentage_list = [5, 10, 25, 40, 50], print_stat=True, save_stat=True):
    print(f'\nUsing {clf_type} classifier')
    n=len(label_np)
    avg = 'micro'
    # metric_list contains overall accuracy, precision, recall and f1 score for all data
    # metric_list_bucket contains hammning distance bucket-wise accuracy, precision, recall and f1 score
    metric_list = []
    metric_list_bucket = []
    n_bucket = 6
    eps = 1e-8

    for test_percentage in test_percentage_list:
        assert test_percentage>0 and test_percentage<100
        acc_sum = 0
        pre_sum = 0
        rec_sum = 0
        f1_sum = 0
        print(f'Test data percentage wrt total data: {test_percentage}')
        n_iter_ar = np.zeros(n_bucket)
        acc_sum_ar = np.zeros(n_bucket)
        pre_sum_ar = np.zeros(n_bucket)
        rec_sum_ar = np.zeros(n_bucket)
        f1_sum_ar = np.zeros(n_bucket)
        dist_buckets_all = []

        for iter in range(n_iter):
            # Create filter for random split
            test_elig = np.random.random(size=(n)) <= (test_percentage/100)
            test_data = data_np[test_elig]
            test_label = label_np[test_elig]
            train_data = data_np[(test_elig-1).astype('bool')]
            train_label = label_np[(test_elig-1).astype('bool')]

            raw_train_data = raw_data[(test_elig-1).astype('bool')]
            raw_test_data = raw_data[test_elig]
            dist_buckets = get_hamming_distance_bucket_info(raw_test_data, raw_train_data)
            dist_buckets_all.extend(list(dist_buckets))
            #print(dist_buckets)

            clf = clf_dict[clf_type]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                clf.fit(train_data, train_label)
            test_predicted = clf.predict(test_data)

            accuracy = acc(test_label, test_predicted)
            precision = pre(test_label, test_predicted, average=avg)
            recall = rec(test_label, test_predicted, average=avg)
            f1_score = f1(test_label, test_predicted, average=avg)
            acc_sum += accuracy
            pre_sum += precision
            rec_sum += recall
            f1_sum += f1_score
            print(f'Accuracy: {accuracy:.3f} Recall: {recall:.3f} Precision: {precision:.3f} F1 Score: {f1_score:.3f}')

            for bucket in range(n_bucket):
                b_filter = dist_buckets == bucket
                filtered_test_label = test_label[b_filter]
                filtered_test_predicted = test_predicted[b_filter]
                if len(filtered_test_label):
                    accuracy = acc(filtered_test_label, filtered_test_predicted)
                    precision = pre(filtered_test_label, filtered_test_predicted, average=avg)
                    recall = rec(filtered_test_label, filtered_test_predicted, average=avg)
                    f1_score = f1(filtered_test_label, filtered_test_predicted, average=avg)
                    acc_sum_ar[bucket] += accuracy
                    pre_sum_ar[bucket] += precision
                    rec_sum_ar[bucket] += recall
                    f1_sum_ar[bucket] += f1_score
                    n_iter_ar[bucket] += 1


        metric_list.append({'pct':test_percentage, 'acc':acc_sum/n_iter, 'pre':pre_sum/n_iter, 'rec':rec_sum/n_iter, 'f1':f1_sum/n_iter})
        metric_list_bucket.append({'pct':test_percentage, 'acc':acc_sum_ar/(n_iter+eps), 'pre':pre_sum_ar/(n_iter+eps), 'rec':rec_sum_ar/(n_iter+eps), 'f1':f1_sum_ar/(n_iter+eps)})
        freq_stat = {value: round_dec(len(list(freq))*100./len(dist_buckets_all), 2) for value, freq in groupby(sorted(dist_buckets_all))}
        print(f"Frequency of hamming distance: ", freq_stat)


    if print_stat:
        print_stats(metric_list, metric_list_bucket, n_bucket)
    if save_stat:
        save_stats(metric_list, metric_list_bucket, n_bucket, test_percentage_list, csv_name='../results/Adomain_Subs_stats_'+clf_type+'_iter_'+str(n_iter)+'_pct_'+'_'.join(map(str, test_percentage_list)))
    return {'overall':metric_list, 'bucket':metric_list_bucket}


In [16]:
def save_consolidated_stats(metric_struct, metric_bucket_struct):
    algos = list(metric_struct.keys())
    if algos == []:
        return
    pct_list = [item['pct'] for item in metric_struct[algos[0]]]
    n_bucket = len(metric_bucket_struct[algos[0]][0]['acc'])
    overall_struct = {}
    bucket_struct = {}
    for algo in algos:
        temp = {}
        for item in metric_struct[algo]:
            temp[item['pct']] = round_dec(item['acc'])
        overall_struct[algo] = temp
        temp = {}
        for item in metric_bucket_struct[algo]:
            temp[item['pct']] = [round_dec(it) for it in item['acc']]
        bucket_struct[algo] = temp
    for pct in pct_list:
        with open('../results/Adomain_Subs_consolidated_stats_algo_'+'_'.join(map(str, algos))+'pct_'+str(pct)+'_bucket_'+str(n_bucket)+ '.csv', 'w', newline='') as csvfile:
            Bucket_fields = ['Bucket '+str(item) for item in range(n_bucket)]
            fieldnames = ['Algo', 'Overall'] + Bucket_fields
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for algo in algos:
                temp = {'Algo': algo, 'Overall':overall_struct[algo][pct]}
                temp.update(dict(zip(Bucket_fields, bucket_struct[algo][pct])))
                writer.writerow(temp)

In [17]:
algo_overall_metric = {}
algo_bucket_metric = {}
for clf in list(clf_dict.keys()):
    ret = train_and_validate(clf)
    algo_overall_metric[clf] = ret['overall']
    algo_bucket_metric[clf] = ret['bucket']
save_consolidated_stats(algo_overall_metric, algo_bucket_metric)


Using lr classifier
Test data percentage wrt total data: 5
Accuracy: 0.818 Recall: 0.818 Precision: 0.818 F1 Score: 0.818
Accuracy: 0.784 Recall: 0.784 Precision: 0.784 F1 Score: 0.784
Accuracy: 0.841 Recall: 0.841 Precision: 0.841 F1 Score: 0.841
Accuracy: 0.885 Recall: 0.885 Precision: 0.885 F1 Score: 0.885
Accuracy: 0.831 Recall: 0.831 Precision: 0.831 F1 Score: 0.831
Accuracy: 0.894 Recall: 0.894 Precision: 0.894 F1 Score: 0.894
Accuracy: 0.900 Recall: 0.900 Precision: 0.900 F1 Score: 0.900
Accuracy: 0.869 Recall: 0.869 Precision: 0.869 F1 Score: 0.869
Accuracy: 0.850 Recall: 0.850 Precision: 0.850 F1 Score: 0.850
Accuracy: 0.886 Recall: 0.886 Precision: 0.886 F1 Score: 0.886
Frequency of hamming distance:  {0: 66.96, 1: 8.32, 2: 3.23, 3: 4.22, 4: 0.99, 5: 2.86, 6: 1.61, 7: 1.37, 8: 1.49, 9: 1.49, 10: 1.37, 11: 1.24, 12: 0.75, 13: 1.12, 14: 0.5, 15: 1.24, 16: 0.75, 18: 0.25, 20: 0.12, 23: 0.12}
Test data percentage wrt total data: 10
Accuracy: 0.854 Recall: 0.854 Precision: 0.854 

Accuracy: 0.868 Recall: 0.868 Precision: 0.868 F1 Score: 0.868
Accuracy: 0.884 Recall: 0.884 Precision: 0.884 F1 Score: 0.884
Accuracy: 0.863 Recall: 0.863 Precision: 0.863 F1 Score: 0.863
Accuracy: 0.928 Recall: 0.928 Precision: 0.928 F1 Score: 0.928
Accuracy: 0.823 Recall: 0.823 Precision: 0.823 F1 Score: 0.823
Accuracy: 0.833 Recall: 0.833 Precision: 0.833 F1 Score: 0.833
Accuracy: 0.831 Recall: 0.831 Precision: 0.831 F1 Score: 0.831
Accuracy: 0.882 Recall: 0.882 Precision: 0.882 F1 Score: 0.882
Accuracy: 0.871 Recall: 0.871 Precision: 0.871 F1 Score: 0.871
Accuracy: 0.805 Recall: 0.805 Precision: 0.805 F1 Score: 0.805
Frequency of hamming distance:  {0: 64.98, 1: 7.78, 2: 4.41, 3: 3.89, 4: 1.3, 5: 2.59, 6: 0.65, 7: 1.82, 8: 2.08, 9: 1.82, 10: 0.91, 11: 0.91, 12: 1.43, 13: 1.17, 14: 1.17, 15: 0.78, 16: 0.26, 17: 0.52, 18: 0.39, 21: 0.39, 22: 0.52, 23: 0.13, 24: 0.13}
Test data percentage wrt total data: 10
Accuracy: 0.861 Recall: 0.861 Precision: 0.861 F1 Score: 0.861
Accuracy: 0.84

Accuracy: 0.862 Recall: 0.862 Precision: 0.862 F1 Score: 0.862
Accuracy: 0.851 Recall: 0.851 Precision: 0.851 F1 Score: 0.851
Accuracy: 0.932 Recall: 0.932 Precision: 0.932 F1 Score: 0.932
Accuracy: 0.789 Recall: 0.789 Precision: 0.789 F1 Score: 0.789
Accuracy: 0.877 Recall: 0.877 Precision: 0.877 F1 Score: 0.877
Accuracy: 0.865 Recall: 0.865 Precision: 0.865 F1 Score: 0.865
Accuracy: 0.844 Recall: 0.844 Precision: 0.844 F1 Score: 0.844
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.899 Recall: 0.899 Precision: 0.899 F1 Score: 0.899
Accuracy: 0.785 Recall: 0.785 Precision: 0.785 F1 Score: 0.785
Frequency of hamming distance:  {0: 65.57, 1: 6.58, 2: 4.39, 3: 4.39, 4: 1.92, 5: 3.57, 6: 0.96, 7: 1.1, 8: 1.65, 9: 0.96, 10: 0.82, 11: 0.69, 12: 1.92, 13: 1.1, 14: 0.55, 15: 1.1, 16: 0.69, 17: 0.14, 18: 0.41, 20: 0.14, 22: 0.27, 23: 0.82, 24: 0.27}
Test data percentage wrt total data: 10
Accuracy: 0.856 Recall: 0.856 Precision: 0.856 F1 Score: 0.856
Accuracy: 0.870 

Accuracy: 0.861 Recall: 0.861 Precision: 0.861 F1 Score: 0.861
Accuracy: 0.890 Recall: 0.890 Precision: 0.890 F1 Score: 0.890
Accuracy: 0.800 Recall: 0.800 Precision: 0.800 F1 Score: 0.800
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.847 Recall: 0.847 Precision: 0.847 F1 Score: 0.847
Accuracy: 0.857 Recall: 0.857 Precision: 0.857 F1 Score: 0.857
Accuracy: 0.831 Recall: 0.831 Precision: 0.831 F1 Score: 0.831
Accuracy: 0.855 Recall: 0.855 Precision: 0.855 F1 Score: 0.855
Accuracy: 0.789 Recall: 0.789 Precision: 0.789 F1 Score: 0.789
Accuracy: 0.889 Recall: 0.889 Precision: 0.889 F1 Score: 0.889
Frequency of hamming distance:  {0: 64.64, 1: 7.48, 2: 4.44, 3: 4.56, 4: 1.27, 5: 2.66, 6: 1.01, 7: 1.14, 8: 1.77, 9: 0.89, 10: 0.89, 11: 2.03, 12: 1.9, 13: 0.63, 14: 1.27, 15: 0.63, 16: 1.01, 17: 0.13, 18: 0.38, 20: 0.63, 21: 0.13, 22: 0.13, 24: 0.38}
Test data percentage wrt total data: 10
Accuracy: 0.843 Recall: 0.843 Precision: 0.843 F1 Score: 0.843
Accuracy: 0.81

Accuracy: 0.764 Recall: 0.764 Precision: 0.764 F1 Score: 0.764
Accuracy: 0.753 Recall: 0.753 Precision: 0.753 F1 Score: 0.753
Accuracy: 0.806 Recall: 0.806 Precision: 0.806 F1 Score: 0.806
Accuracy: 0.689 Recall: 0.689 Precision: 0.689 F1 Score: 0.689
Accuracy: 0.789 Recall: 0.789 Precision: 0.789 F1 Score: 0.789
Accuracy: 0.877 Recall: 0.877 Precision: 0.877 F1 Score: 0.877
Accuracy: 0.810 Recall: 0.810 Precision: 0.810 F1 Score: 0.810
Accuracy: 0.775 Recall: 0.775 Precision: 0.775 F1 Score: 0.775
Accuracy: 0.775 Recall: 0.775 Precision: 0.775 F1 Score: 0.775
Accuracy: 0.852 Recall: 0.852 Precision: 0.852 F1 Score: 0.852
Frequency of hamming distance:  {0: 64.3, 1: 6.82, 2: 5.08, 3: 4.81, 4: 1.87, 5: 3.74, 6: 0.8, 7: 1.07, 8: 0.8, 9: 2.14, 10: 0.8, 11: 1.34, 12: 1.2, 13: 1.87, 14: 1.07, 15: 0.53, 16: 0.53, 17: 0.27, 18: 0.27, 20: 0.13, 21: 0.13, 22: 0.27, 23: 0.13}
Test data percentage wrt total data: 10
Accuracy: 0.740 Recall: 0.740 Precision: 0.740 F1 Score: 0.740
Accuracy: 0.720 Re

Accuracy: 0.890 Recall: 0.890 Precision: 0.890 F1 Score: 0.890
Accuracy: 0.909 Recall: 0.909 Precision: 0.909 F1 Score: 0.909
Accuracy: 0.803 Recall: 0.803 Precision: 0.803 F1 Score: 0.803
Accuracy: 0.898 Recall: 0.898 Precision: 0.898 F1 Score: 0.898
Accuracy: 0.902 Recall: 0.902 Precision: 0.902 F1 Score: 0.902
Accuracy: 0.894 Recall: 0.894 Precision: 0.894 F1 Score: 0.894
Accuracy: 0.813 Recall: 0.813 Precision: 0.813 F1 Score: 0.813
Accuracy: 0.851 Recall: 0.851 Precision: 0.851 F1 Score: 0.851
Accuracy: 0.868 Recall: 0.868 Precision: 0.868 F1 Score: 0.868
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Frequency of hamming distance:  {0: 68.77, 1: 3.94, 2: 3.54, 3: 5.51, 4: 1.31, 5: 3.15, 6: 1.05, 7: 2.49, 8: 1.31, 9: 0.92, 10: 0.39, 11: 1.57, 12: 1.44, 13: 0.66, 14: 1.18, 15: 0.79, 16: 0.66, 18: 0.26, 19: 0.13, 20: 0.26, 21: 0.13, 22: 0.13, 23: 0.26, 24: 0.13}
Test data percentage wrt total data: 10
Accuracy: 0.854 Recall: 0.854 Precision: 0.854 F1 Score: 0.854
Acc

Accuracy: 0.778 Recall: 0.778 Precision: 0.778 F1 Score: 0.778
Accuracy: 0.818 Recall: 0.818 Precision: 0.818 F1 Score: 0.818
Accuracy: 0.764 Recall: 0.764 Precision: 0.764 F1 Score: 0.764
Accuracy: 0.870 Recall: 0.870 Precision: 0.870 F1 Score: 0.870
Accuracy: 0.842 Recall: 0.842 Precision: 0.842 F1 Score: 0.842
Accuracy: 0.805 Recall: 0.805 Precision: 0.805 F1 Score: 0.805
Accuracy: 0.867 Recall: 0.867 Precision: 0.867 F1 Score: 0.867
Accuracy: 0.787 Recall: 0.787 Precision: 0.787 F1 Score: 0.787
Accuracy: 0.811 Recall: 0.811 Precision: 0.811 F1 Score: 0.811
Accuracy: 0.860 Recall: 0.860 Precision: 0.860 F1 Score: 0.860
Frequency of hamming distance:  {0: 64.35, 1: 8.52, 2: 5.24, 3: 3.93, 4: 1.97, 5: 3.01, 6: 1.05, 7: 1.44, 8: 0.92, 9: 1.31, 10: 1.18, 11: 1.05, 12: 1.18, 13: 1.18, 14: 1.31, 15: 0.26, 16: 0.52, 17: 0.13, 18: 0.39, 20: 0.26, 21: 0.26, 22: 0.26, 23: 0.26}
Test data percentage wrt total data: 10
Accuracy: 0.829 Recall: 0.829 Precision: 0.829 F1 Score: 0.829
Accuracy: 0.8

Accuracy: 0.865 Recall: 0.865 Precision: 0.865 F1 Score: 0.865
Accuracy: 0.861 Recall: 0.861 Precision: 0.861 F1 Score: 0.861
Accuracy: 0.854 Recall: 0.854 Precision: 0.854 F1 Score: 0.854
Accuracy: 0.923 Recall: 0.923 Precision: 0.923 F1 Score: 0.923
Accuracy: 0.893 Recall: 0.893 Precision: 0.893 F1 Score: 0.893
Accuracy: 0.840 Recall: 0.840 Precision: 0.840 F1 Score: 0.840
Accuracy: 0.896 Recall: 0.896 Precision: 0.896 F1 Score: 0.896
Accuracy: 0.877 Recall: 0.877 Precision: 0.877 F1 Score: 0.877
Accuracy: 0.889 Recall: 0.889 Precision: 0.889 F1 Score: 0.889
Accuracy: 0.939 Recall: 0.939 Precision: 0.939 F1 Score: 0.939
Frequency of hamming distance:  {0: 66.88, 1: 4.27, 2: 5.69, 3: 2.72, 4: 1.81, 5: 3.88, 6: 0.91, 7: 1.81, 8: 1.68, 9: 1.16, 10: 0.26, 11: 1.29, 12: 1.29, 13: 1.16, 14: 0.91, 15: 1.29, 16: 0.78, 17: 0.39, 18: 0.39, 19: 0.26, 20: 0.52, 21: 0.13, 22: 0.26, 24: 0.26}
Test data percentage wrt total data: 10
Accuracy: 0.897 Recall: 0.897 Precision: 0.897 F1 Score: 0.897
Acc

Accuracy: 0.731 Recall: 0.731 Precision: 0.731 F1 Score: 0.731
Accuracy: 0.747 Recall: 0.747 Precision: 0.747 F1 Score: 0.747
Accuracy: 0.793 Recall: 0.793 Precision: 0.793 F1 Score: 0.793
Accuracy: 0.756 Recall: 0.756 Precision: 0.756 F1 Score: 0.756
Accuracy: 0.753 Recall: 0.753 Precision: 0.753 F1 Score: 0.753
Accuracy: 0.675 Recall: 0.675 Precision: 0.675 F1 Score: 0.675
Accuracy: 0.818 Recall: 0.818 Precision: 0.818 F1 Score: 0.818
Accuracy: 0.817 Recall: 0.817 Precision: 0.817 F1 Score: 0.817
Accuracy: 0.795 Recall: 0.795 Precision: 0.795 F1 Score: 0.795
Accuracy: 0.867 Recall: 0.867 Precision: 0.867 F1 Score: 0.867
Frequency of hamming distance:  {0: 64.74, 1: 6.95, 2: 4.63, 3: 4.25, 4: 1.03, 5: 3.35, 6: 1.16, 7: 1.67, 8: 1.16, 9: 0.77, 10: 0.9, 11: 2.83, 12: 1.42, 13: 1.42, 14: 1.03, 15: 0.9, 16: 0.9, 17: 0.13, 18: 0.13, 19: 0.13, 20: 0.26, 21: 0.13, 22: 0.13}
Test data percentage wrt total data: 10
Accuracy: 0.773 Recall: 0.773 Precision: 0.773 F1 Score: 0.773
Accuracy: 0.809 

Accuracy: 0.775 Recall: 0.775 Precision: 0.775 F1 Score: 0.775
Accuracy: 0.824 Recall: 0.824 Precision: 0.824 F1 Score: 0.824
Accuracy: 0.857 Recall: 0.857 Precision: 0.857 F1 Score: 0.857
Accuracy: 0.772 Recall: 0.772 Precision: 0.772 F1 Score: 0.772
Accuracy: 0.863 Recall: 0.863 Precision: 0.863 F1 Score: 0.863
Accuracy: 0.830 Recall: 0.830 Precision: 0.830 F1 Score: 0.830
Accuracy: 0.792 Recall: 0.792 Precision: 0.792 F1 Score: 0.792
Accuracy: 0.821 Recall: 0.821 Precision: 0.821 F1 Score: 0.821
Accuracy: 0.788 Recall: 0.788 Precision: 0.788 F1 Score: 0.788
Accuracy: 0.852 Recall: 0.852 Precision: 0.852 F1 Score: 0.852
Frequency of hamming distance:  {0: 65.08, 1: 6.8, 2: 3.47, 3: 3.72, 4: 1.67, 5: 2.82, 6: 0.77, 7: 1.41, 8: 1.28, 9: 1.54, 10: 0.9, 11: 2.31, 12: 1.8, 13: 2.18, 14: 1.67, 15: 0.9, 16: 0.51, 17: 0.13, 18: 0.26, 20: 0.26, 24: 0.51}
Test data percentage wrt total data: 10
Accuracy: 0.850 Recall: 0.850 Precision: 0.850 F1 Score: 0.850
Accuracy: 0.848 Recall: 0.848 Precisi

Accuracy: 0.870 Recall: 0.870 Precision: 0.870 F1 Score: 0.870
Accuracy: 0.793 Recall: 0.793 Precision: 0.793 F1 Score: 0.793
Accuracy: 0.821 Recall: 0.821 Precision: 0.821 F1 Score: 0.821
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.744 Recall: 0.744 Precision: 0.744 F1 Score: 0.744
Accuracy: 0.855 Recall: 0.855 Precision: 0.855 F1 Score: 0.855
Accuracy: 0.810 Recall: 0.810 Precision: 0.810 F1 Score: 0.810
Accuracy: 0.809 Recall: 0.809 Precision: 0.809 F1 Score: 0.809
Accuracy: 0.778 Recall: 0.778 Precision: 0.778 F1 Score: 0.778
Accuracy: 0.823 Recall: 0.823 Precision: 0.823 F1 Score: 0.823
Frequency of hamming distance:  {0: 65.76, 1: 7.09, 2: 3.96, 3: 4.77, 4: 1.23, 5: 3.14, 6: 1.36, 7: 1.91, 8: 1.23, 9: 0.82, 10: 0.68, 11: 1.91, 12: 1.5, 13: 0.68, 14: 0.82, 15: 1.23, 16: 0.27, 17: 0.41, 18: 0.55, 20: 0.41, 21: 0.14, 22: 0.14}
Test data percentage wrt total data: 10
Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.764 Recall: 

Accuracy: 0.812 Recall: 0.812 Precision: 0.812 F1 Score: 0.812
Accuracy: 0.772 Recall: 0.772 Precision: 0.772 F1 Score: 0.772
Accuracy: 0.815 Recall: 0.815 Precision: 0.815 F1 Score: 0.815
Accuracy: 0.789 Recall: 0.789 Precision: 0.789 F1 Score: 0.789
Accuracy: 0.907 Recall: 0.907 Precision: 0.907 F1 Score: 0.907
Accuracy: 0.877 Recall: 0.877 Precision: 0.877 F1 Score: 0.877
Accuracy: 0.821 Recall: 0.821 Precision: 0.821 F1 Score: 0.821
Accuracy: 0.843 Recall: 0.843 Precision: 0.843 F1 Score: 0.843
Accuracy: 0.845 Recall: 0.845 Precision: 0.845 F1 Score: 0.845
Accuracy: 0.756 Recall: 0.756 Precision: 0.756 F1 Score: 0.756
Frequency of hamming distance:  {0: 65.17, 1: 7.26, 2: 4.49, 3: 3.43, 4: 0.92, 5: 3.83, 6: 1.06, 7: 1.58, 8: 1.06, 9: 1.19, 10: 1.06, 11: 1.72, 12: 1.85, 13: 1.58, 14: 1.06, 15: 0.66, 16: 0.79, 17: 0.26, 18: 0.13, 19: 0.26, 22: 0.13, 23: 0.4, 24: 0.13}
Test data percentage wrt total data: 10
Accuracy: 0.807 Recall: 0.807 Precision: 0.807 F1 Score: 0.807
Accuracy: 0.80

Accuracy: 0.810 Recall: 0.810 Precision: 0.810 F1 Score: 0.810
Accuracy: 0.862 Recall: 0.862 Precision: 0.862 F1 Score: 0.862
Accuracy: 0.899 Recall: 0.899 Precision: 0.899 F1 Score: 0.899
Accuracy: 0.901 Recall: 0.901 Precision: 0.901 F1 Score: 0.901
Accuracy: 0.829 Recall: 0.829 Precision: 0.829 F1 Score: 0.829
Accuracy: 0.863 Recall: 0.863 Precision: 0.863 F1 Score: 0.863
Accuracy: 0.803 Recall: 0.803 Precision: 0.803 F1 Score: 0.803
Accuracy: 0.905 Recall: 0.905 Precision: 0.905 F1 Score: 0.905
Accuracy: 0.892 Recall: 0.892 Precision: 0.892 F1 Score: 0.892
Accuracy: 0.859 Recall: 0.859 Precision: 0.859 F1 Score: 0.859
Frequency of hamming distance:  {0: 67.02, 1: 6.62, 2: 4.5, 3: 4.11, 4: 1.99, 5: 2.52, 6: 0.79, 7: 0.93, 8: 1.06, 9: 0.79, 10: 0.79, 11: 1.99, 12: 1.46, 13: 0.93, 14: 1.06, 15: 0.79, 16: 0.4, 17: 0.26, 18: 0.4, 19: 0.13, 20: 0.13, 22: 0.26, 23: 0.93, 24: 0.13}
Test data percentage wrt total data: 10
Accuracy: 0.844 Recall: 0.844 Precision: 0.844 F1 Score: 0.844
Accura

Accuracy: 0.820 Recall: 0.820 Precision: 0.820 F1 Score: 0.820
Accuracy: 0.806 Recall: 0.806 Precision: 0.806 F1 Score: 0.806
Accuracy: 0.795 Recall: 0.795 Precision: 0.795 F1 Score: 0.795
Accuracy: 0.793 Recall: 0.793 Precision: 0.793 F1 Score: 0.793
Accuracy: 0.877 Recall: 0.877 Precision: 0.877 F1 Score: 0.877
Accuracy: 0.750 Recall: 0.750 Precision: 0.750 F1 Score: 0.750
Accuracy: 0.776 Recall: 0.776 Precision: 0.776 F1 Score: 0.776
Accuracy: 0.770 Recall: 0.770 Precision: 0.770 F1 Score: 0.770
Accuracy: 0.716 Recall: 0.716 Precision: 0.716 F1 Score: 0.716
Accuracy: 0.689 Recall: 0.689 Precision: 0.689 F1 Score: 0.689
Frequency of hamming distance:  {0: 64.99, 1: 5.84, 2: 5.04, 3: 3.05, 4: 2.25, 5: 4.11, 6: 0.66, 7: 1.59, 8: 1.59, 9: 2.12, 10: 0.66, 11: 1.33, 12: 2.25, 13: 1.46, 14: 0.8, 15: 0.66, 16: 0.4, 18: 0.4, 20: 0.13, 21: 0.4, 23: 0.27}
Test data percentage wrt total data: 10
Accuracy: 0.800 Recall: 0.800 Precision: 0.800 F1 Score: 0.800
Accuracy: 0.767 Recall: 0.767 Precisi

Accuracy: 0.927 Recall: 0.927 Precision: 0.927 F1 Score: 0.927
Accuracy: 0.873 Recall: 0.873 Precision: 0.873 F1 Score: 0.873
Accuracy: 0.828 Recall: 0.828 Precision: 0.828 F1 Score: 0.828
Accuracy: 0.883 Recall: 0.883 Precision: 0.883 F1 Score: 0.883
Accuracy: 0.815 Recall: 0.815 Precision: 0.815 F1 Score: 0.815
Accuracy: 0.886 Recall: 0.886 Precision: 0.886 F1 Score: 0.886
Accuracy: 0.855 Recall: 0.855 Precision: 0.855 F1 Score: 0.855
Accuracy: 0.843 Recall: 0.843 Precision: 0.843 F1 Score: 0.843
Accuracy: 0.794 Recall: 0.794 Precision: 0.794 F1 Score: 0.794
Accuracy: 0.921 Recall: 0.921 Precision: 0.921 F1 Score: 0.921
Frequency of hamming distance:  {0: 65.55, 1: 6.28, 2: 4.41, 3: 4.01, 4: 1.6, 5: 3.47, 6: 0.4, 7: 1.47, 8: 1.07, 9: 1.07, 10: 1.47, 11: 2.8, 12: 1.47, 13: 1.34, 14: 0.8, 15: 1.34, 16: 0.53, 17: 0.13, 18: 0.13, 20: 0.13, 23: 0.53}
Test data percentage wrt total data: 10
Accuracy: 0.880 Recall: 0.880 Precision: 0.880 F1 Score: 0.880
Accuracy: 0.844 Recall: 0.844 Precisi