In [1]:
import numpy as np
import pandas as pd
import warnings
import csv
from itertools import groupby

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import recall_score as rec, precision_score as pre, f1_score as f1, accuracy_score as acc

In [2]:
# Read Signatures and z scores
csv_name = "../data/labeled_sigs"
new_csv_name = "../data/Adomain_Substrate_labeled_sigs.report"
z_score_csv_name = "../data/z_score_aa"
df_init = pd.read_csv(csv_name, delimiter='\t')
df_new = pd.read_csv(new_csv_name, delimiter='\t')
z_scores_df = pd.read_csv(z_score_csv_name, delimiter=' ')

In [3]:
df_original = df_init.drop_duplicates(subset='sig')

In [4]:
# Prep for hardcoded values
m=26
nrow = 12
D = np.zeros((nrow, m))
D1 = D[0 ,:]
D2 = D[1 ,:]
D3 = D[2 ,:]
D4 = D[3 ,:]
D5 = D[4 ,:]
D6 = D[5 ,:]
D7 = D[6 ,:]
D8 = D[7 ,:]
D9 = D[8 ,:]
D10 = D[9 ,:]
D11 = D[10 ,:]
D12 = D[11 ,:]

In [5]:
# Hard coded values, taken from NRPSPredictor2 github

#1 aa-alpha-helix.aaindex
D1[0]=1.420;D1[1]=0.000;D1[2]=0.700;D1[3]=1.010;D1[4]=1.510;D1[5]=1.130;D1[6]=0.570;D1[7]=1.000;D1[8]=1.080;D1[9]=0.000;D1[10]=1.160;D1[11]=1.210;D1[12]=1.450;D1[13]=0.670;D1[14]=0.000;D1[15]=0.570;D1[16]=1.110;D1[17]=0.980;D1[18]=0.770;D1[19]=0.830;D1[20]=0.000;D1[21]=1.060;D1[22]=1.080;D1[23]=0.000;D1[24]=0.690;D1[25]=0.000;
#2 aa-beta-sheet.aaindex
D2[0]=0.830;D2[1]=0.000;D2[2]=1.190;D2[3]=0.540;D2[4]=0.370;D2[5]=1.380;D2[6]=0.750;D2[7]=0.870;D2[8]=1.600;D2[9]=0.000;D2[10]=0.740;D2[11]=1.300;D2[12]=1.050;D2[13]=0.890;D2[14]=0.000;D2[15]=0.550;D2[16]=1.100;D2[17]=0.930;D2[18]=0.750;D2[19]=1.190;D2[20]=0.000;D2[21]=1.700;D2[22]=1.370;D2[23]=0.000;D2[24]=1.470;D2[25]=0.000;
#3 aa-beta-turn.aaindex
D3[0]=0.740;D3[1]=0.000;D3[2]=0.960;D3[3]=1.520;D3[4]=0.950;D3[5]=0.660;D3[6]=1.560;D3[7]=0.950;D3[8]=0.470;D3[9]=0.000;D3[10]=1.190;D3[11]=0.500;D3[12]=0.600;D3[13]=1.460;D3[14]=0.000;D3[15]=1.560;D3[16]=0.960;D3[17]=1.010;D3[18]=1.430;D3[19]=0.980;D3[20]=0.000;D3[21]=0.590;D3[22]=0.600;D3[23]=0.000;D3[24]=1.140;D3[25]=0.000;
#4 aa-hydrogenbond.aaindex
D4[0]=0.000;D4[1]=0.000;D4[2]=0.000;D4[3]=1.000;D4[4]=1.000;D4[5]=0.000;D4[6]=0.000;D4[7]=1.000;D4[8]=0.000;D4[9]=0.000;D4[10]=2.000;D4[11]=0.000;D4[12]=0.000;D4[13]=2.000;D4[14]=0.000;D4[15]=0.000;D4[16]=2.000;D4[17]=4.000;D4[18]=1.000;D4[19]=1.000;D4[20]=0.000;D4[21]=0.000;D4[22]=1.000;D4[23]=0.000;D4[24]=1.000;D4[25]=0.000;
#5 aa-hydrophobicity-neu1.aaindex
D5[0]=0.060;D5[1]=0.000;D5[2]=-0.560;D5[3]=0.970;D5[4]=0.850;D5[5]=-0.990;D5[6]=0.320;D5[7]=0.150;D5[8]=-1.000;D5[9]=0.000;D5[10]=1.000;D5[11]=-0.830;D5[12]=-0.680;D5[13]=0.700;D5[14]=0.000;D5[15]=0.450;D5[16]=0.710;D5[17]=0.800;D5[18]=0.480;D5[19]=0.380;D5[20]=0.000;D5[21]=-0.750;D5[22]=-0.570;D5[23]=0.000;D5[24]=-0.350;D5[25]=0.000;
#6 aa-hydrophobicity-neu2.aaindex
D6[0]=-0.250;D6[1]=0.000;D6[2]=-0.400;D6[3]=-0.080;D6[4]=-0.100;D6[5]=0.180;D6[6]=-0.320;D6[7]=-0.030;D6[8]=-0.030;D6[9]=0.000;D6[10]=0.320;D6[11]=0.050;D6[12]=-0.010;D6[13]=-0.060;D6[14]=0.000;D6[15]=0.230;D6[16]=-0.020;D6[17]=0.190;D6[18]=-0.150;D6[19]=-0.100;D6[20]=0.000;D6[21]=-0.190;D6[22]=0.310;D6[23]=0.000;D6[24]=0.400;D6[25]=0.000;
#7 aa-hydrophobicity-neu3.aaindex
D7[0]=0.250;D7[1]=0.000;D7[2]=-0.140;D7[3]=0.080;D7[4]=-0.050;D7[5]=0.150;D7[6]=0.280;D7[7]=-0.100;D7[8]=0.100;D7[9]=0.000;D7[10]=0.110;D7[11]=0.010;D7[12]=0.040;D7[13]=0.170;D7[14]=0.000;D7[15]=0.410;D7[16]=0.120;D7[17]=-0.410;D7[18]=0.230;D7[19]=0.290;D7[20]=0.000;D7[21]=0.030;D7[22]=0.340;D7[23]=0.000;D7[24]=-0.020;D7[25]=0.000;
#8 aa-isoelectric.aaindex
D8[0]=6.000;D8[1]=0.000;D8[2]=5.050;D8[3]=2.770;D8[4]=3.220;D8[5]=5.480;D8[6]=5.970;D8[7]=7.590;D8[8]=6.020;D8[9]=0.000;D8[10]=9.740;D8[11]=5.980;D8[12]=5.740;D8[13]=5.410;D8[14]=0.000;D8[15]=6.300;D8[16]=5.650;D8[17]=10.760;D8[18]=5.680;D8[19]=5.660;D8[20]=0.000;D8[21]=5.960;D8[22]=5.890;D8[23]=0.000;D8[24]=5.660;D8[25]=0.000;
#9 aa-polar-grantham.aaindex
D9[0]=8.100;D9[1]=0.000;D9[2]=5.500;D9[3]=13.000;D9[4]=12.300;D9[5]=5.200;D9[6]=9.000;D9[7]=10.400;D9[8]=5.200;D9[9]=0.000;D9[10]=11.300;D9[11]=4.900;D9[12]=5.700;D9[13]=11.600;D9[14]=0.000;D9[15]=8.000;D9[16]=10.500;D9[17]=10.500;D9[18]=9.200;D9[19]=8.600;D9[20]=0.000;D9[21]=5.900;D9[22]=5.400;D9[23]=0.000;D9[24]=6.200;D9[25]=0.000;
#10 aa-polar-radzicka.aaindex
D10[0]=-0.060;D10[1]=0.000;D10[2]=1.360;D10[3]=-0.800;D10[4]=-0.770;D10[5]=1.270;D10[6]=-0.410;D10[7]=0.490;D10[8]=1.310;D10[9]=0.000;D10[10]=-1.180;D10[11]=1.210;D10[12]=1.270;D10[13]=-0.480;D10[14]=0.000;D10[15]=1.100;D10[16]=-0.730;D10[17]=-0.840;D10[18]=-0.500;D10[19]=-0.270;D10[20]=0.000;D10[21]=1.090;D10[22]=0.880;D10[23]=0.000;D10[24]=0.330;D10[25]=0.000;
#11 aa-polar-zimmerman.aaindex
D11[0]=0.000;D11[1]=0.000;D11[2]=1.480;D11[3]=49.700;D11[4]=49.900;D11[5]=0.350;D11[6]=0.000;D11[7]=51.600;D11[8]=0.130;D11[9]=0.000;D11[10]=49.500;D11[11]=0.130;D11[12]=1.430;D11[13]=3.380;D11[14]=0.000;D11[15]=1.580;D11[16]=3.530;D11[17]=52.000;D11[18]=1.670;D11[19]=1.660;D11[20]=0.000;D11[21]=0.130;D11[22]=2.100;D11[23]=0.000;D11[24]=1.610;D11[25]=0.000;
#12 aa-volume.aaindex
D12[0]=90.000;D12[1]=0.000;D12[2]=103.300;D12[3]=117.300;D12[4]=142.200;D12[5]=191.900;D12[6]=64.900;D12[7]=160.000;D12[8]=163.900;D12[9]=0.000;D12[10]=167.300;D12[11]=164.000;D12[12]=167.000;D12[13]=124.700;D12[14]=0.000;D12[15]=122.900;D12[16]=149.400;D12[17]=194.000;D12[18]=95.400;D12[19]=121.500;D12[20]=0.000;D12[21]=139.000;D12[22]=228.200;D12[23]=0.000;D12[24]=197.000;D12[25]=0.000;



In [6]:
# Map Amino acid abbreviations with indices of D[i, :]

aa2ind_map = {}
aa2ind_map['A'] = 0
aa2ind_map['R'] = 17
aa2ind_map['D'] = 3
aa2ind_map['N'] = 13
aa2ind_map['C'] = 2
aa2ind_map['E'] = 4
aa2ind_map['Q'] = 16
aa2ind_map['G'] = 6
aa2ind_map['H'] = 7
aa2ind_map['I'] = 8
aa2ind_map['L'] = 11
aa2ind_map['K'] = 10
aa2ind_map['M'] = 12
aa2ind_map['F'] = 5
aa2ind_map['P'] = 15
aa2ind_map['S'] = 18
aa2ind_map['T'] = 19
aa2ind_map['W'] = 22
aa2ind_map['Y'] = 24
aa2ind_map['V'] = 21

In [7]:
# Dictionary to store 15 dimensional vector values for each amino acid

aa_dict = {}
for i, row in z_scores_df.iterrows():
    aa_dict[row['aa_short']] = [row['z1'], row['z2'], row['z3'], ] # Store z values
    aa_dict[row['aa_short']].extend(D[:, aa2ind_map[row['aa_short']]]) # Store rest of the 12 hard-coded values
aa_dict['-'] = list(np.mean(np.asarray(list(aa_dict.values())), axis=0))

In [8]:
# Create index to value map and value to index map from a list using list index
def get_dict_from_list(mylist):
    assert len(mylist) > 0
    idx_to_value_dict = {}
    value_to_idx_dict = {}
    for i in range(len(mylist)):
        idx_to_value_dict[i] = mylist[i]
        value_to_idx_dict[mylist[i]] = int(i)
    return idx_to_value_dict, value_to_idx_dict

# Get index-value maps for substrates
sub_vocab_set = set(df_original['sub'].tolist() + ['null'])
sub_vocab_list = list(sub_vocab_set)
sub_idx_to_value_dict, sub_value_to_idx_dict = get_dict_from_list(sub_vocab_list)

In [9]:
sub_value_to_idx_dict

{'aad': 0,
 'dhpg': 1,
 'hasn': 2,
 'glu': 3,
 'bht': 4,
 'tcl': 5,
 'val': 6,
 'betaKala': 7,
 'orn': 8,
 'dhb': 9,
 'hpg': 10,
 'vol': 11,
 'lys': 12,
 'NK(1,1KdimethylK1Kallyl)Trp': 13,
 'serKthr': 14,
 'tyr': 15,
 '4ppro': 16,
 'asp': 17,
 'arg': 18,
 'asn': 19,
 'pip': 20,
 'phe': 21,
 'hivKd': 22,
 'dKlyserg': 23,
 'dab': 24,
 'abu': 25,
 'leu': 26,
 'null': 27,
 'phg': 28,
 'thr': 29,
 'iva': 30,
 '3KmeKglu': 31,
 'alaninol': 32,
 'lysKb': 33,
 'sal': 34,
 'pro': 35,
 '2KoxoKisovalericKacid': 36,
 'aeo': 37,
 'cys': 38,
 'asn/gln': 39,
 'LDAP': 40,
 'ala': 41,
 'dpg': 42,
 'horn': 43,
 'gly': 44,
 'val/ile': 45,
 'ile': 46,
 'cap': 47,
 'alaKb': 48,
 'val/ile/alloile': 49,
 'ser': 50,
 'trp/tyr': 51,
 'his': 52,
 'alphaKhydroxyKisocaproic acid': 53,
 'trp': 54,
 'gln': 55,
 'haorn': 56}

In [10]:
# Given a signature of amino acids(34 aa long), construct the 34x15=510 dimentional representation
def get_encoding(signature):
    ret = []
    for i in signature:
        ret.extend(aa_dict[i])
    return np.asarray(ret)


In [11]:
def get_correct_sub(sub_in):
    if sub_in.lower() == 'beta-ala':
        ret = 'ala'
    elif sub_in.lower() in ['orn', 'horn']:
        ret = 'orn'
    elif sub_in.lower() == 'hyv-d':
        ret = 'null'
    elif sub_in.lower() == 'dht':
        ret = 'dhb'
    else:
        ret = sub_in
    
    return ret

# Format the data
def format_data_from_df(df_in):
    sig_len = 34
    raw_data = []
    data_np = []
    label_np = []
    j=0
    for i, row in df_in.iterrows():
        assert len(row['sig']) == sig_len
        raw_data.append(row['sig'])
        data_np.append(get_encoding(row['sig']))
        label_np.append(sub_value_to_idx_dict[get_correct_sub(row['sub'])])
        j+=1
    return np.asarray(data_np), np.asarray(label_np).astype('int'), np.array(raw_data)

data_np_nrps, label_np_nrps, raw_data_nrps = format_data_from_df(df_original)
data_np_test, label_np_test, raw_data_test = format_data_from_df(df_new)
#data_np = np.asarray(data_np)
#label_np = np.asarray(label_np).astype('int')
#raw_data = np.array(raw_data)

In [12]:
def get_hamming_distance(str1, str2):
    return sum(i != j for i, j in zip(str1, str2))

# For all points in test_data, choose the minimum hamming distance from all of train data, and return the distance list
def get_hamming_distance_bucket_info(test_data, train_data):
    dist_list = []
    for test_data_pt in test_data:
        dist_list.append(min([get_hamming_distance(test_data_pt, train_data_pt) for train_data_pt in train_data]))
    return np.array(dist_list)

def round_dec(num, dec=2):
    return float(round(num* 10.**dec))/(10**dec)

In [13]:
# Build classifier dictionary using sklearn multiclass classifiers

clf_dict={}
clf_dict['lr'] = LogisticRegression(random_state=0, max_iter=400, multi_class='multinomial', solver='newton-cg')
clf_dict['svm'] = make_pipeline(StandardScaler(), LinearSVC(random_state=0, multi_class='crammer_singer', tol=1e-9, max_iter=2000))
clf_dict['knn'] = KNeighborsClassifier(weights='distance')
clf_dict['mlp_sklearn'] = MLPClassifier(random_state=1, max_iter=400, early_stopping=False, )
clf_dict['rand_for'] = RandomForestClassifier(max_depth=4, criterion='entropy')
clf_dict['dec_tree'] = DecisionTreeClassifier(random_state=0, criterion='entropy')
clf_dict['ber_nb'] = BernoulliNB()
clf_dict['xtra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')
clf_dict['gau_nb'] = GaussianNB()
clf_dict['label_prop'] = LabelPropagation(kernel='knn')
clf_dict['label_spread'] = LabelSpreading(kernel='knn')
clf_dict['lda'] = LinearDiscriminantAnalysis()
clf_dict['ridge_cv'] = RidgeClassifierCV()
clf_dict['n_cent'] = NearestCentroid()
clf_dict['ridge'] = RidgeClassifier()

In [14]:
def print_stats(metric_list, metric_list_bucket, n_bucket):
    print(f"\n\nOverall Stats:")

    for item in metric_list:
        print(f"Test percentage: {item['pct']:.2f} Average Accuracy: {item['acc']:.3f} Average Recall: {item['rec']:.3f} Average Precision: {item['pre']:.3f} Average F1 Score: {item['f1']:.3f}")

    for bucket in range(n_bucket):
        print(f"\n\nBucket {bucket+1} Stats:")
        for item in metric_list_bucket:
            print(f"Test percentage: {item['pct']:.2f} Average Accuracy: {item['acc'][bucket]:.3f} Average Recall: {item['rec'][bucket]:.3f} Average Precision: {item['pre'][bucket]:.3f} Average F1 Score: {item['f1'][bucket]:.3f}")

def save_stats(metric_list, metric_list_bucket, n_bucket, pct_list, csv_name='stats'):
    overall_dict = {}
    for item in metric_list:
        overall_dict[item['pct']] = item['acc']
    
    with open(csv_name + '.csv', 'w', newline='') as csvfile:
        fieldnames = ['pct', 'overall_acc'] + ['Bucket '+str(item) for item in range(n_bucket)]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for item in metric_list_bucket:
            temp_dict = {'pct': item['pct'], 'overall_acc': round_dec(overall_dict[item['pct']])}
            for bucket in range(n_bucket):
                temp_dict['Bucket ' + str(bucket)] = round_dec(item['acc'][bucket])
            writer.writerow(temp_dict)

# n_iter signifies number of iterations
# test_percentage_list is the list of percentages of test data with respect to total data
# Code will iterate for each percentage for n_iter iterations
def train_and_validate(clf_type, n_iter=10, test_percentage_list = [5, 10, 25, 40, 50], print_stat=True, save_stat=True):
    print(f'\nUsing {clf_type} classifier')
    #n=len(label_np)
    avg = 'micro'
    # metric_list contains overall accuracy, precision, recall and f1 score for all data
    # metric_list_bucket contains hammning distance bucket-wise accuracy, precision, recall and f1 score
    metric_list = []
    metric_list_bucket = []
    n_bucket = 16
    eps = 1e-8
    test_percentage_list = [round_dec(len(data_np_test)/(len(data_np_test)+len(data_np_nrps)))]
    #n_iter = 5

    for test_percentage in test_percentage_list:
        assert test_percentage>0 and test_percentage<100
        acc_sum = 0
        pre_sum = 0
        rec_sum = 0
        f1_sum = 0
        print(f'Test data percentage wrt total data: {test_percentage}')
        n_iter_ar = np.zeros(n_bucket)
        acc_sum_ar = np.zeros(n_bucket)
        pre_sum_ar = np.zeros(n_bucket)
        rec_sum_ar = np.zeros(n_bucket)
        f1_sum_ar = np.zeros(n_bucket)
        bucket_pct_ar = np.zeros(n_bucket)
        dist_buckets_all = []

        for iter in range(n_iter):
            # Create filter for random split
            '''
            test_elig = np.random.random(size=(n)) <= (test_percentage/100)
            test_data = data_np[test_elig]
            test_label = label_np[test_elig]
            train_data = data_np[(test_elig-1).astype('bool')]
            train_label = label_np[(test_elig-1).astype('bool')]

            raw_train_data = raw_data[(test_elig-1).astype('bool')]
            raw_test_data = raw_data[test_elig]
            '''
            test_data, test_label, raw_test_data = data_np_test, label_np_test, raw_data_test
            train_data, train_label, raw_train_data = data_np_nrps, label_np_nrps, raw_data_nrps
            
            dist_buckets = get_hamming_distance_bucket_info(raw_test_data, raw_train_data)
            dist_buckets_all.extend(list(dist_buckets))
            #print(dist_buckets)

            clf = clf_dict[clf_type]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                clf.fit(train_data, train_label)
            test_predicted = clf.predict(test_data)

            accuracy = acc(test_label, test_predicted)
            precision = pre(test_label, test_predicted, average=avg)
            recall = rec(test_label, test_predicted, average=avg)
            f1_score = f1(test_label, test_predicted, average=avg)
            acc_sum += accuracy
            pre_sum += precision
            rec_sum += recall
            f1_sum += f1_score
            print(f'Accuracy: {accuracy:.3f} Recall: {recall:.3f} Precision: {precision:.3f} F1 Score: {f1_score:.3f}')

            for bucket in range(n_bucket):
                b_filter = dist_buckets == bucket
                filtered_test_label = test_label[b_filter]
                filtered_test_predicted = test_predicted[b_filter]
                if len(filtered_test_label):
                    accuracy = acc(filtered_test_label, filtered_test_predicted)
                    precision = pre(filtered_test_label, filtered_test_predicted, average=avg)
                    recall = rec(filtered_test_label, filtered_test_predicted, average=avg)
                    f1_score = f1(filtered_test_label, filtered_test_predicted, average=avg)
                    acc_sum_ar[bucket] += accuracy
                    pre_sum_ar[bucket] += precision
                    rec_sum_ar[bucket] += recall
                    f1_sum_ar[bucket] += f1_score
                    n_iter_ar[bucket] += 1
                    bucket_pct_ar[bucket] += len(filtered_test_label)


        metric_list.append({'pct':test_percentage, 'acc':acc_sum/n_iter, 'pre':pre_sum/n_iter, 'rec':rec_sum/n_iter, 'f1':f1_sum/n_iter})
        metric_list_bucket.append({'pct':test_percentage, 'acc':acc_sum_ar/(n_iter+eps), 'pre':pre_sum_ar/(n_iter+eps), 'rec':rec_sum_ar/(n_iter+eps), 'f1':f1_sum_ar/(n_iter+eps), 'bkt_pct':bucket_pct_ar/((n_iter+eps)*len(data_np_test))})
        freq_stat = {value: round_dec(len(list(freq))*100./len(dist_buckets_all), 2) for value, freq in groupby(sorted(dist_buckets_all))}
        print(f"Frequency of hamming distance: ", freq_stat)


    if print_stat:
        print_stats(metric_list, metric_list_bucket, n_bucket)
    if save_stat:
        save_stats(metric_list, metric_list_bucket, n_bucket, test_percentage_list, csv_name='../results/filter_Adomain_old_new_stats_'+clf_type+'_iter_'+str(n_iter)+'_pct_'+'_'.join(map(str, test_percentage_list)))
    return {'overall':metric_list, 'bucket':metric_list_bucket}


In [15]:
def save_consolidated_stats(metric_struct, metric_bucket_struct, n_iter):
    algos = list(metric_struct.keys())
    if algos == []:
        return
    pct_list = [item['pct'] for item in metric_struct[algos[0]]]
    n_bucket = len(metric_bucket_struct[algos[0]][0]['acc'])
    bucket_pct = [ str(round(item, 3)) for item in metric_bucket_struct[algos[0]][0]['bkt_pct'] ]
    overall_struct = {}
    bucket_struct = {}
    for algo in algos:
        temp = {}
        for item in metric_struct[algo]:
            temp[item['pct']] = round_dec(item['acc'])
        overall_struct[algo] = temp
        temp = {}
        for item in metric_bucket_struct[algo]:
            temp[item['pct']] = [round_dec(it) for it in item['acc']]
        bucket_struct[algo] = temp
    for pct in pct_list:
        with open('../results/filter_Adomain_old_new_consolidated_stats_algo_'+'_'.join(map(str, algos))+'_n_iter_'+str(n_iter)+'_pct_'+str(pct)+'_bucket_'+str(n_bucket)+ '.csv', 'w', newline='') as csvfile:
            Bucket_fields = ['Bucket '+str(item)+'('+bucket_pct[item]+')' for item in range(n_bucket)]
            fieldnames = ['Algo', 'Overall'] + Bucket_fields
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for algo in algos:
                temp = {'Algo': algo, 'Overall':overall_struct[algo][pct]}
                temp.update(dict(zip(Bucket_fields, bucket_struct[algo][pct])))
                writer.writerow(temp)

In [16]:
algo_overall_metric = {}
algo_bucket_metric = {}
n_iter = 3
for clf in list(clf_dict.keys()):
#for clf in ['lr', 'svm']:
#for clf in ['lr']:
    ret = train_and_validate(clf, n_iter=n_iter)
    algo_overall_metric[clf] = ret['overall']
    algo_bucket_metric[clf] = ret['bucket']
save_consolidated_stats(algo_overall_metric, algo_bucket_metric, n_iter)


Using lr classifier
Test data percentage wrt total data: 0.81
Accuracy: 0.814 Recall: 0.814 Precision: 0.814 F1 Score: 0.814
Accuracy: 0.814 Recall: 0.814 Precision: 0.814 F1 Score: 0.814
Accuracy: 0.814 Recall: 0.814 Precision: 0.814 F1 Score: 0.814
Frequency of hamming distance:  {0: 55.67, 1: 2.35, 2: 2.28, 3: 3.06, 4: 8.08, 5: 5.54, 6: 3.19, 7: 2.15, 8: 1.83, 9: 0.98, 10: 1.24, 11: 0.78, 12: 1.24, 13: 1.37, 14: 1.5, 15: 0.91, 16: 0.78, 17: 0.85, 18: 0.46, 19: 3.78, 20: 0.2, 21: 0.39, 22: 0.59, 23: 0.39, 24: 0.26, 26: 0.13}


Overall Stats:
Test percentage: 0.81 Average Accuracy: 0.814 Average Recall: 0.814 Average Precision: 0.814 Average F1 Score: 0.814


Bucket 1 Stats:
Test percentage: 0.81 Average Accuracy: 0.978 Average Recall: 0.978 Average Precision: 0.978 Average F1 Score: 0.978


Bucket 2 Stats:
Test percentage: 0.81 Average Accuracy: 0.833 Average Recall: 0.833 Average Precision: 0.833 Average F1 Score: 0.833


Bucket 3 Stats:
Test percentage: 0.81 Average Accuracy: 0.80

Accuracy: 0.817 Recall: 0.817 Precision: 0.817 F1 Score: 0.817
Accuracy: 0.817 Recall: 0.817 Precision: 0.817 F1 Score: 0.817
Accuracy: 0.817 Recall: 0.817 Precision: 0.817 F1 Score: 0.817
Frequency of hamming distance:  {0: 55.67, 1: 2.35, 2: 2.28, 3: 3.06, 4: 8.08, 5: 5.54, 6: 3.19, 7: 2.15, 8: 1.83, 9: 0.98, 10: 1.24, 11: 0.78, 12: 1.24, 13: 1.37, 14: 1.5, 15: 0.91, 16: 0.78, 17: 0.85, 18: 0.46, 19: 3.78, 20: 0.2, 21: 0.39, 22: 0.59, 23: 0.39, 24: 0.26, 26: 0.13}


Overall Stats:
Test percentage: 0.81 Average Accuracy: 0.817 Average Recall: 0.817 Average Precision: 0.817 Average F1 Score: 0.817


Bucket 1 Stats:
Test percentage: 0.81 Average Accuracy: 0.978 Average Recall: 0.978 Average Precision: 0.978 Average F1 Score: 0.978


Bucket 2 Stats:
Test percentage: 0.81 Average Accuracy: 0.833 Average Recall: 0.833 Average Precision: 0.833 Average F1 Score: 0.833


Bucket 3 Stats:
Test percentage: 0.81 Average Accuracy: 0.800 Average Recall: 0.800 Average Precision: 0.800 Average F1 Sco

Accuracy: 0.791 Recall: 0.791 Precision: 0.791 F1 Score: 0.791
Accuracy: 0.791 Recall: 0.791 Precision: 0.791 F1 Score: 0.791
Accuracy: 0.791 Recall: 0.791 Precision: 0.791 F1 Score: 0.791
Frequency of hamming distance:  {0: 55.67, 1: 2.35, 2: 2.28, 3: 3.06, 4: 8.08, 5: 5.54, 6: 3.19, 7: 2.15, 8: 1.83, 9: 0.98, 10: 1.24, 11: 0.78, 12: 1.24, 13: 1.37, 14: 1.5, 15: 0.91, 16: 0.78, 17: 0.85, 18: 0.46, 19: 3.78, 20: 0.2, 21: 0.39, 22: 0.59, 23: 0.39, 24: 0.26, 26: 0.13}


Overall Stats:
Test percentage: 0.81 Average Accuracy: 0.791 Average Recall: 0.791 Average Precision: 0.791 Average F1 Score: 0.791


Bucket 1 Stats:
Test percentage: 0.81 Average Accuracy: 0.910 Average Recall: 0.910 Average Precision: 0.910 Average F1 Score: 0.910


Bucket 2 Stats:
Test percentage: 0.81 Average Accuracy: 0.833 Average Recall: 0.833 Average Precision: 0.833 Average F1 Score: 0.833


Bucket 3 Stats:
Test percentage: 0.81 Average Accuracy: 0.886 Average Recall: 0.886 Average Precision: 0.886 Average F1 Sco

Accuracy: 0.770 Recall: 0.770 Precision: 0.770 F1 Score: 0.770
Accuracy: 0.770 Recall: 0.770 Precision: 0.770 F1 Score: 0.770
Accuracy: 0.770 Recall: 0.770 Precision: 0.770 F1 Score: 0.770
Frequency of hamming distance:  {0: 55.67, 1: 2.35, 2: 2.28, 3: 3.06, 4: 8.08, 5: 5.54, 6: 3.19, 7: 2.15, 8: 1.83, 9: 0.98, 10: 1.24, 11: 0.78, 12: 1.24, 13: 1.37, 14: 1.5, 15: 0.91, 16: 0.78, 17: 0.85, 18: 0.46, 19: 3.78, 20: 0.2, 21: 0.39, 22: 0.59, 23: 0.39, 24: 0.26, 26: 0.13}


Overall Stats:
Test percentage: 0.81 Average Accuracy: 0.770 Average Recall: 0.770 Average Precision: 0.770 Average F1 Score: 0.770


Bucket 1 Stats:
Test percentage: 0.81 Average Accuracy: 0.883 Average Recall: 0.883 Average Precision: 0.883 Average F1 Score: 0.883


Bucket 2 Stats:
Test percentage: 0.81 Average Accuracy: 0.833 Average Recall: 0.833 Average Precision: 0.833 Average F1 Score: 0.833


Bucket 3 Stats:
Test percentage: 0.81 Average Accuracy: 0.886 Average Recall: 0.886 Average Precision: 0.886 Average F1 Sco

Accuracy: 0.810 Recall: 0.810 Precision: 0.810 F1 Score: 0.810
Accuracy: 0.810 Recall: 0.810 Precision: 0.810 F1 Score: 0.810
Accuracy: 0.810 Recall: 0.810 Precision: 0.810 F1 Score: 0.810
Frequency of hamming distance:  {0: 55.67, 1: 2.35, 2: 2.28, 3: 3.06, 4: 8.08, 5: 5.54, 6: 3.19, 7: 2.15, 8: 1.83, 9: 0.98, 10: 1.24, 11: 0.78, 12: 1.24, 13: 1.37, 14: 1.5, 15: 0.91, 16: 0.78, 17: 0.85, 18: 0.46, 19: 3.78, 20: 0.2, 21: 0.39, 22: 0.59, 23: 0.39, 24: 0.26, 26: 0.13}


Overall Stats:
Test percentage: 0.81 Average Accuracy: 0.810 Average Recall: 0.810 Average Precision: 0.810 Average F1 Score: 0.810


Bucket 1 Stats:
Test percentage: 0.81 Average Accuracy: 0.974 Average Recall: 0.974 Average Precision: 0.974 Average F1 Score: 0.974


Bucket 2 Stats:
Test percentage: 0.81 Average Accuracy: 0.833 Average Recall: 0.833 Average Precision: 0.833 Average F1 Score: 0.833


Bucket 3 Stats:
Test percentage: 0.81 Average Accuracy: 0.886 Average Recall: 0.886 Average Precision: 0.886 Average F1 Sco

TypeError: save_consolidated_stats() missing 1 required positional argument: 'n_iter'

In [17]:
save_consolidated_stats(algo_overall_metric, algo_bucket_metric, n_iter)

FileNotFoundError: [Errno 2] No such file or directory: '../results/filter_Adomain_old_new_consolidated_stats_algo_lr_svm_knn_mlp_sklearn_rand_for_dec_tree_ber_nb_xtra_tree_gau_nb_label_prop_label_spread_lda_ridge_cv_n_cent_ridge_n_iter_3_pct_0.81_bucket_16.csv'

In [None]:
algos