In [1]:
import numpy as np
import os
import pandas as pd
import scipy as sc
import warnings
import csv
from itertools import groupby
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import time
from datetime import datetime


from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import ExtraTreeClassifier as XtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import RidgeClassifier


from sklearn.metrics import recall_score as rec, precision_score as pre, f1_score as f1, accuracy_score as acc

In [2]:
datetime.now().strftime("%y%m%d_%H%M%S")

'201202_114258'

In [3]:
#file_name = 'sig_extract/a.fa.hmmpfam2'
file_name = 'sig_extract/Adomain_Substrate.fa.hmmpfam2'

In [4]:
print('the start at ', time.ctime())

the start at  Wed Dec  2 11:43:02 2020


In [5]:
# Hamming Distance between two strings
def hamming(str1, str2):
    ret = 0
    for (a,b) in zip(str1, str2):
        ret += (a!=b)
    return ret

# Normalized Hamming Distance between two strings based on string length
def hamming_frac(str1, str2):
    return hamming(str1, str2)/min(len(str1), len(str2))

#print(hamming('aaaa', 'baab'))
#print(hamming_frac('aaaa', 'baab'))

In [6]:
def remove_multiple_spaces(string, remove_lead_trail=True):
    if remove_lead_trail:
        string = string.strip()
    while '  ' in string:
        string = string.replace('  ', ' ')
    return string

def get_state(state, line, print_state_transitions=False):
    #print(line)
    #print('incoming state ', state)
    newstate = ''
    if state in ['init', 'parsing']:
        if line.startswith('//') or line.startswith('- - - -'):
            newstate = 'recognise'
    elif state == 'recognise':
        if line.startswith('Alignments of top'):
            newstate = 'parsing'

    if print_state_transitions:
        if newstate != '':
            print('Changed state from ',state,' to ', newstate)
        else:
            newstate = state
    
    if newstate == '':
        newstate = state
    return newstate

def parse_hmmsearch_output(lines, hmmfiles):
    align_dict = {}
    align_code_dict = {}
    score_dict = {}
    detail_dict = {}
    line_score_idx = 0
    
    
    line_idx = 0
    line_align_idx = 0
    Id = ''
    state = 'init'
    curr_hmm = ''
    #print(lines)
    
    for line in lines:
        line_idx += 1        
        state = get_state(state, line)
        
        if state == 'recognise':
            if line.startswith('Query sequence'):
                Id = line.split(': ')[1].strip('\n')
                #print('set id to ',Id)
                detail_dict[Id] = {}

        elif state == 'parsing':
            hmmheader = np.asarray([line.startswith(hmmfile) for hmmfile in hmmfiles]).any()
            if hmmheader:
                line_align_idx = line_idx
                curr_hmm = line.split(':')[0]
                #print(curr_hmm)
                split = line.split(' ')
                #print(split)
                score_idx = split.index('score')+1
                from_idx = split.index('from')+1
                to_idx = split.index('to')+1
                detail_dict[Id][curr_hmm] = {'score':float(split[score_idx].strip(',')),'from': int(split[from_idx]),
                                             'to': int(split[to_idx].strip(':')), 'top':'', 'bottom':''}
                #print(detail_dict)
            elif line.startswith(' '):
                #print(line_idx, line_align_idx, line)
                if (line_idx - line_align_idx) % 4 == 1:
                    detail_dict[Id][curr_hmm]['top'] += remove_multiple_spaces(line).strip('*-><')
                elif (line_idx - line_align_idx) % 4 == 3:
                    detail_dict[Id][curr_hmm]['bottom'] += remove_multiple_spaces(line).split()[2]
        
    return detail_dict

def parse_hmmsearch_output_from_file(filename, hmmfile):
    with open(filename, 'r') as file:
        content = file.readlines()
    return parse_hmmsearch_output(content, hmmfile)

In [7]:
detail_dict = parse_hmmsearch_output_from_file(file_name, ['aa-activating-core.198-334', 'aroundLys517'])

In [8]:
def get_best_alignment(mydict):
    #print(mydict)
    ret_dict = {}
    for Id in mydict:
        #print(Id)
        start = False
        score = -1
        best_hmm = ''
        for hmm in mydict[Id]:
            #print(hmm, best_hmm)
            if best_hmm == '' or score < mydict[Id][hmm]['score']:
                best_hmm = hmm
        ret_dict[Id] = mydict[Id][best_hmm].copy()
        ret_dict[Id]['hmm'] = best_hmm
        
    return ret_dict

best_align_dict = get_best_alignment(detail_dict)

In [9]:
def get_hmm_alignment(mydict, hmm):
    ret_dict = {}
    for key in mydict:
        try:
            ret_dict[key] = mydict[key][hmm].copy()
        except:
            print('Could not get',hmm,' for Id',key)
    return ret_dict

a_align_dict = get_hmm_alignment(detail_dict, 'aa-activating-core.198-334')

In [10]:
def removetopindels(indict, print_change=False):
    mydict = indict.copy()
    for Id in mydict:
        top_tmp = ''
        bot_tmp = ''
        idx = mydict[Id]['from']
        idx_list = []
        for a,b in zip(mydict[Id]['top'], mydict[Id]['bottom']):
            if a != '.':
                top_tmp += a
                bot_tmp += b
                if b == '-':
                    idx_list.append(idx-0.5)
                else:
                    idx_list.append(idx)
            if b != '-':
                idx += 1
        if print_change and mydict[Id]['top'] != top_tmp:
            print('Id:',Id,' top changed from ',mydict[Id]['top'], 'to', top_tmp)
        if print_change and mydict[Id]['bottom'] != bot_tmp:
            print('Id:',Id,' bottom changed from ',mydict[Id]['bottom'], 'to', bot_tmp)
        mydict[Id]['top'] = top_tmp
        mydict[Id]['bottom'] = bot_tmp
        assert(len(mydict[Id]['bottom']) == len(idx_list))
        mydict[Id]['idx_list'] = idx_list.copy()
    return mydict

a_align_dict_no_indel = removetopindels(a_align_dict)

In [11]:
def extractCharacters(Id, target, source, source_idx_list, pattern, idxs) :
    assert len(source) == len(source_idx_list)
    try:
        start = target.index(pattern)
    except:
        print('Problem at Id ', Id, ' pattern ', pattern, ' target ', target)
    ret = ''
    pos = []
    for idx in idxs:
        ret += source[start+idx]
        pos.append(source_idx_list[start+idx])
    return ret, pos

In [12]:
#extractCharacters('gig', 'rty', 'ggui', [345,346,349,350], 't', [2])

In [16]:
len('KGVmveHrnvvnlvkwl'+'LqfssAysFDaSvweifgaLLnGgt'+'iTvlnltPsl'+'LrrvlvGGEaL'+'liNaYGPTEtTVcaTi' )

79

In [13]:
def extract_sig(Id, top, bottom, idx_list):
    try:
        s1, p1 = extractCharacters(Id, top, bottom, idx_list, "KGVmveHrnvvnlvkwl", [12, 15, 16])
        s2, p2 = extractCharacters(Id, top, bottom, idx_list, "LqfssAysFDaSvweifgaLLnGgt", [3,8,9,10,11,12,13,14,17])
        s3, p3 = extractCharacters(Id, top, bottom, idx_list, "iTvlnltPsl", [4,5])
        s4, p4 = extractCharacters(Id, top, bottom, idx_list, "LrrvlvGGEaL", [4,5,6,7,8])
        s5, p5 = extractCharacters(Id, top, bottom, idx_list, "liNaYGPTEtTVcaTi", [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15])

        return s1+s2+s3+s4+s5, p1+p2+p3+p4+p5
    except:
        return '', []
def extract_sig_dict(mydict):
    ret_dict = {}
    #ret_dict = mydict.copy()
    for Id in mydict:
        ret_dict[Id] = {}
        ret_dict[Id]['sig'], ret_dict[Id]['pos'] = extract_sig(Id, mydict[Id]['top'], mydict[Id]['bottom'], mydict[Id]['idx_list'])
        #ret_dict[Id]['sig'] = extract_sig(Id, mydict[Id]['top'], mydict[Id]['bottom'])
    return ret_dict

In [14]:
sig_dict = extract_sig_dict(a_align_dict_no_indel)
sig_dict_orig = sig_dict.copy()

In [15]:
nrps_res_file = 'Adomain_Substrate.report'
df_nrps_res = pd.read_csv(nrps_res_file, delimiter='\t')
test_sig = list(df_nrps_res['8A-signature'])
test_id = list(df_nrps_res['#sequence-id'])

In [16]:
# Check if NRPS result matches extracted
unmatched = 0
for i, s in zip(test_id, test_sig):
    sig = sig_dict[i]['sig']
    if s!=sig:
        print(s, sig)
        unmatched += 1
if unmatched:
    print('All signatures from',nrps_res_file,'matched to our generated signatures')
else:
    print(unmatched,' signatures from', nrps_res_file, 'did not match to our generated ones')

0  signatures from Adomain_Substrate.report did not match to our generated ones


In [17]:
#for Id in sig_dict:
    #print(Id,',',sig_dict[Id]['sig'],',',sig_dict[Id]['pos'])

In [18]:
#print(sig_dict['O30409_6|L'])
print(sig_dict)

{'O30408_1|P': {'sig': 'LFTTFDVCYQESSLITAGEHNHYGPSETHVVTTC', 'pos': [142, 145, 146, 162, 166, 167, 168, 169, 170, 171, 172, 175, 210, 211, 237, 238, 239, 240, 241, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275]}, 'O30408_2|F': {'sig': 'SWNLFDAFALTTVFMLGGEMNAYGPTESSVMATY', 'pos': [141, 144, 145, 161, 165, 166, 167, 168, 169, 170, 171, 174, 209, 210, 234, 235, 236, 237, 238, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272]}, 'O30408_3|F': {'sig': 'LVFAFDASVWDGTLITAGSVNGYGPTESTVCATL', 'pos': [142, 145, 146, 162, 166, 167, 168, 169, 170, 171, 172, 175, 210, 211, 231, 232, 233, 234, 235, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266]}, 'O30409_1|N': {'sig': 'YWASFDLTVTSTKLIVGGEFNEYGPTETVVGCMI', 'pos': [142, 145, 146, 162, 166, 167, 168, 169, 170, 171, 172, 175, 208, 209, 231, 232, 233, 234, 235, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270]}, 'O30409_2|Q': {'sig': 'HWMTFDASVWELQMFC

In [19]:
def check_for_gap_sig(my_sig_dict, print_gap_sig = False):
    count = 0
    for Id in my_sig_dict:
        pos = my_sig_dict[Id]['pos']
        pos_frac_sum = sum([p-float(int(p)) for p in pos])
        if pos_frac_sum > 0:
            if print_gap_sig:
                print(Id, ':', my_sig_dict[Id]['sig'])
                print(Id, ':', pos)
            count += 1
    print('There are',count,'signatures with gap among',len(sig_dict),'extracted signatures')

check_for_gap_sig(sig_dict)

There are 24 signatures with gap among 1546 extracted signatures


In [20]:
aa_alias_dict = {'A': 'Ala', 'V': 'Val', 'L': 'Leu', 'I': 'Ile', 'P': 'Pro', 'F': 'Phe', 'W': 'Trp', 'M': 'Met',
'K': 'Lys', 'R': 'Arg', 'H': 'His', 'G': 'Gly', 'S': 'Ser', 'T': 'Thr', 'C': 'Cys', 'Y': 'Tyr',
'N': 'Asn', 'Q': 'Gln', 'D': 'Asp', 'E': 'Glu'}

In [21]:
def get_rev_dict(mydict, lowercase=True):
    ret_dict = {}
    ret_inv_dict = {}
    for key in mydict:
        if lowercase:
            ret_dict[key.lower()] = mydict[key].lower()
            ret_inv_dict[mydict[key].lower()] = key.lower()
        else:
            ret_dict[key] = mydict[key]
            ret_inv_dict[mydict[key]] = key
    return ret_dict, ret_inv_dict

In [22]:
aa_alias_dict_lower, inv_aa_alias_dict_lower = get_rev_dict(aa_alias_dict)

In [23]:
seq_inp_dir='seq_input'
seq_inp_files=os.listdir(seq_inp_dir)

In [24]:
inp_seq_id_dict = {}
seq_id_inp_dict = {}
for seq_inp_filename in seq_inp_files:
    assert seq_inp_filename.endswith('.fa')
    seq_inp_file_abs_path = os.path.abspath(os.path.join(seq_inp_dir, seq_inp_filename))
    with open(seq_inp_file_abs_path, 'r') as f:
        content = f.readlines()
        #print(content)
        #print(content[0], seq_inp_file_abs_path)
        if(len(content)>1):
            #print(content[0].split(':')[1][:-1])
            inp_seq_id_dict[seq_inp_filename.split('.')[0]] = content[0].split(':')[1][:-1] 
            seq_id_inp_dict[ content[0].split(':')[1][:-1] ] = seq_inp_filename.split('.')[0]

In [25]:
inp_seq_id_dict

{'inp_1': 'O30408_1|p',
 'inp_10': 'O31827_0|i',
 'inp_100': 'A9ELY3_0|a',
 'inp_1000': 'F4CRZ3_0|dhb',
 'inp_1001': 'C7XK05_0|a',
 'inp_1002': 'D4RQS6_0|a',
 'inp_1003': 'C2MI38_0|a',
 'inp_1004': 'D9N8D3_0|a',
 'inp_1005': 'E8A4T8_0|dhb',
 'inp_1006': 'E8FJI3_0|dhb',
 'inp_1007': 'D2FTV7_0|a',
 'inp_1008': 'D8HHB2_0|a',
 'inp_1009': 'E3DZM8_0|a',
 'inp_101': 'A9EPP8_0|dhb',
 'inp_1010': 'D6LW22_0|a',
 'inp_1011': 'E6HZ45_0|a',
 'inp_1012': 'F3WF22_0|dhb',
 'inp_1013': 'E5W9D5_0|a',
 'inp_1014': 'E7S8R8_0|a',
 'inp_1015': 'C8KAL4_0|a',
 'inp_1016': 'C2TUM0_0|a',
 'inp_1017': 'E4IX22_0|a',
 'inp_1018': 'E8IIB2_0|dhb',
 'inp_1019': 'F3DBK1_0|dhb',
 'inp_102': 'A9VKV6_0|a',
 'inp_1020': 'D9UBW1_0|dhb',
 'inp_1021': 'E8DS52_0|dhb',
 'inp_1022': 'D4PSG5_0|a',
 'inp_1023': 'B4U3P2_0|a',
 'inp_1024': 'Q5M0A8_0|a',
 'inp_1025': 'E1RZT1_0|dhb',
 'inp_1026': 'Q03DW1_0|a',
 'inp_1027': 'Q251N2_0|a',
 'inp_1028': 'A6M1G2_0|a',
 'inp_1029': 'F0LA69_0|dhb',
 'inp_103': 'B0AQG6_0|a',
 'inp_1030': 'E

In [26]:
pdb_filename = '398987.pdb.modified'

In [27]:
def parse_pdb_output(lines):
    mod_lines = []
    N_dict = {}
    N_dict['loc'] = {}
    N_dict['x'] = {}
    N_dict['y'] = {}
    N_dict['z'] = {}
    for line in lines:
        #print(len(line))
        if line.startswith('ATOM'):
            mod_lines.append(remove_multiple_spaces(line.strip('\n')))
            line_split = line.split()
            if line_split[2] == 'N':
                N_dict['loc'][int(line_split[4])] = inv_aa_alias_dict_lower[line_split[3].lower()].upper()
                N_dict['x'][int(line_split[4])] = float(line_split[5])
                N_dict['y'][int(line_split[4])] = float(line_split[6])
                N_dict['z'][int(line_split[4])] = float(line_split[7])
                #print(line)
    return N_dict

def parse_pdb_output_from_file(filename):
    with open(filename, 'r') as file:
        content = file.readlines()
    return parse_pdb_output(content)

In [28]:
#print(parse_pdb_output_from_file('398987.pdb'))

In [29]:
pdb_content = parse_pdb_output_from_file(pdb_filename)
#pdb_content = parse_pdb_output_from_file('398987.pdb')
'''
N_dict = {}

for line in pdb_content:
    all_good = True
    #print(line)
    line_split = line.split()
    try:
        assert len(line_split) == 11
    except:
        all_good = False
        print('Line<',line,'> does not have required spacing')
    if line_split[2] == 'N':
        N_dict[int(line_split[4])] = inv_aa_alias_dict_lower[line_split[3].lower()].upper()
'''

"\nN_dict = {}\n\nfor line in pdb_content:\n    all_good = True\n    #print(line)\n    line_split = line.split()\n    try:\n        assert len(line_split) == 11\n    except:\n        all_good = False\n        print('Line<',line,'> does not have required spacing')\n    if line_split[2] == 'N':\n        N_dict[int(line_split[4])] = inv_aa_alias_dict_lower[line_split[3].lower()].upper()\n"

In [30]:
def get_seq_from_pos(pos_list, ref_dict, Id, offset=0, print_error=True, convert_to_float=False):
    ret = []
    try:
        for pos in pos_list:
            ret.append(float(ref_dict[int(pos + offset)]) if convert_to_float else ref_dict[int(pos + offset)])
    except:
        if print_error:
            print('some error occured with Id',Id,'list',pos_list)
    return ret if convert_to_float else ''.join(ret)

hamming_frac(get_seq_from_pos(sig_dict['O30409_6|L']['pos'], pdb_content['loc'], 'O30409_6|L'), sig_dict['O30409_6|L']['sig'])

0.0

In [31]:
'''
basedir_raptorX = 'raptorX'
files_basedir_raptorX = os.listdir(basedir_raptorX)
req_files_basedir_raptorX = []
N_dict = {}

for file in files_basedir_raptorX:
    file_abs_path = os.path.abspath(os.path.join(basedir_raptorX, file))
    #print(file, file_abs_path)
    if os.path.isdir(file_abs_path):
        files_subdir = os.listdir(file_abs_path)
        for file_subdir in files_subdir:
            file_subdir_abs_path = os.path.abspath(os.path.join(file_abs_path, file_subdir))
            if os.path.isdir(file_subdir_abs_path):
                #print(os.listdir(file_subdir_abs_path))
                files_subsubdir = os.listdir(file_subdir_abs_path)
                #print(files_subsubdir)
                fasta_files = list(filter(lambda x: x.endswith('.fasta'), files_subsubdir))
                pdb_files = list(filter(lambda x: x.endswith('.pdb'), files_subsubdir))
                if len(fasta_files) > 0 and len(pdb_files) > 0:
                    fasta_file = os.path.join(file_subdir_abs_path, fasta_files[0])
                    #print(fasta_file)
                    with open(fasta_file, 'r') as f:
                        contents = f.readlines()
                        Id_orig = contents[0].strip('>\n')
                        Id = '_'.join(Id_orig.split('_')[:-1]) + '|' + Id_orig.split('_')[-1]
                        #print(Id_orig, Id)
                        N_dict[Id] = {}
                    pdb_file = os.path.join(file_subdir_abs_path, pdb_files[0])
                    #print(fasta_file, pdb_file)
                    N_dict[Id] = parse_pdb_output_from_file(pdb_file)
print(N_dict.keys())
'''

"\nbasedir_raptorX = 'raptorX'\nfiles_basedir_raptorX = os.listdir(basedir_raptorX)\nreq_files_basedir_raptorX = []\nN_dict = {}\n\nfor file in files_basedir_raptorX:\n    file_abs_path = os.path.abspath(os.path.join(basedir_raptorX, file))\n    #print(file, file_abs_path)\n    if os.path.isdir(file_abs_path):\n        files_subdir = os.listdir(file_abs_path)\n        for file_subdir in files_subdir:\n            file_subdir_abs_path = os.path.abspath(os.path.join(file_abs_path, file_subdir))\n            if os.path.isdir(file_subdir_abs_path):\n                #print(os.listdir(file_subdir_abs_path))\n                files_subsubdir = os.listdir(file_subdir_abs_path)\n                #print(files_subsubdir)\n                fasta_files = list(filter(lambda x: x.endswith('.fasta'), files_subsubdir))\n                pdb_files = list(filter(lambda x: x.endswith('.pdb'), files_subsubdir))\n                if len(fasta_files) > 0 and len(pdb_files) > 0:\n                    fasta_file = o

In [32]:
'''
basedir_modeller = 'build3Dmodel_out/20200822'
files_basedir_raptorX = os.listdir(basedir_modeller)
req_files_basedir_raptorX = []
#N_dict = {}
req_files = list( filter(lambda x: x.startswith('inp_') and (x.endswith('.acc') or x.endswith('.ss3') or x.endswith('.ss8') or x.endswith('.diso')) and len(x.split('.')) >= 2, files_basedir_raptorX) )

for req_file in req_files:
    print(req_file)
    #seq_id = pdb_file.split('.')[0]
    #pdb_file_abs_path = os.path.abspath(os.path.join(basedir_modeller, pdb_file))
    #print(seq_id, pdb_file_abs_path)
    #N_dict[seq_id] = parse_pdb_output_from_file(pdb_file_abs_path)
'''

"\nbasedir_modeller = 'build3Dmodel_out/20200822'\nfiles_basedir_raptorX = os.listdir(basedir_modeller)\nreq_files_basedir_raptorX = []\n#N_dict = {}\nreq_files = list( filter(lambda x: x.startswith('inp_') and (x.endswith('.acc') or x.endswith('.ss3') or x.endswith('.ss8') or x.endswith('.diso')) and len(x.split('.')) >= 2, files_basedir_raptorX) )\n\nfor req_file in req_files:\n    print(req_file)\n    #seq_id = pdb_file.split('.')[0]\n    #pdb_file_abs_path = os.path.abspath(os.path.join(basedir_modeller, pdb_file))\n    #print(seq_id, pdb_file_abs_path)\n    #N_dict[seq_id] = parse_pdb_output_from_file(pdb_file_abs_path)\n"

In [33]:
basedir_modeller = 'build3Dmodel_out/20200822'
files_basedir_raptorX = os.listdir(basedir_modeller)
req_files_basedir_raptorX = []
N_dict = {}

sig_unique_list = []
sig_dict_keys = list(sig_dict.keys())
dup_count = 0

req_files = list( filter(lambda x: x.startswith('1amuA_inp_') and (x.endswith('.pdb')) and len(x.split('.')) > 1, files_basedir_raptorX) )
#print(len(req_files))
for req_file in req_files:
    seq_id = inp_seq_id_dict[req_file.split('.')[0][6:]]
    #seq_id = pdb_file.split('.')[0]
    pdb_file_abs_path = os.path.abspath(os.path.join(basedir_modeller, req_file))
    #print(seq_id, pdb_file_abs_path)
    seq_id_split = seq_id.split('|')
    #sig_dict_seq_id = seq_id_split[0] + '|' + (seq_id_split[1].lower() if len(seq_id_split[1]) > 1 else seq_id_split[1].upper())
    filtered_sig_dict_keys = list(filter(lambda x: x.startswith(seq_id_split[0]+'|'), sig_dict_keys))
    try:
        assert len(filtered_sig_dict_keys) == 1
    except:
        print('Found list', filtered_sig_dict_keys,'for id', seq_id_split[0])
        pass
    sig_dict_seq_id = filtered_sig_dict_keys[0]
    #print(seq_id, sig_dict[sig_dict_seq_id]['sig'])
    #'''
    # Removing all duplicate entries
    if sig_dict[sig_dict_seq_id]['sig'] in sig_unique_list:
        #print('Id', seq_id, 'with signature', sig_dict[sig_dict_seq_id]['sig'],'already exists')
        dup_count += 1
    else:
        sig_unique_list.append(sig_dict[sig_dict_seq_id]['sig'])
        N_dict[seq_id] = parse_pdb_output_from_file(pdb_file_abs_path)
    #'''
print('Total', dup_count, 'duplicates signatures found')

Total 876 duplicates signatures found


In [34]:
len(N_dict)

658

In [35]:
#N_dict.keys()

In [36]:
def parse_acc_file(filename):
    with open(filename, 'r') as f:
        content = f.readlines()
        ret_dict = {}
        ret_dict['loc'] = {}
        ret_dict['acc1'] = {}
        ret_dict['acc2'] = {}
        ret_dict['acc3'] = {}
        for line in content[3:]:
            split_line = line.split(' ')
            #print(split_line)
            #print(split_line[-8])
            ret_dict['loc'][int(split_line[-9])] = split_line[-8]
            ret_dict['acc1'][int(split_line[-9])] = split_line[-4]
            ret_dict['acc2'][int(split_line[-9])] = split_line[-3]
            ret_dict['acc3'][int(split_line[-9])] = split_line[-2]        
        return ret_dict

def parse_rap_prop_file(filename, f_type, fields, field_poss, field_str, data_start_idx, pos_idx):
    assert(filename.endswith(f_type))
    assert(len(fields) == len(field_poss))
    assert(len(fields) == len(field_str))
    with open(filename, 'r') as f:
        content = f.readlines()
        ret_dict = {}
        for field in fields:
            ret_dict[field] = {}
        
        for line in content[data_start_idx:]:
            split_line = line.split(' ')
            #print(split_line)
            for i in range(len(fields)):
                #print(i)
                ret_dict[fields[i]][int(split_line[-pos_idx])] = split_line[-field_poss[i]] if field_str[i] else float(split_line[-field_poss[i]])
            #print(ret_dict)
            #assert 1==2
        #print(ret_dict)
        return ret_dict

#print(parse_acc_file('RaptorX_property/out/inp_2.acc'))
#print(parse_rap_prop_file('RaptorX_property/out/inp_1.ss3', 'ss3', ['ss31', 'ss32', 'ss33'], [4, 3, 2], 2, 9))
#print(parse_rap_prop_file('RaptorX_property/out/inp_1.acc', 'acc', ['loc', 'acc1', 'acc2', 'acc3'], [8, 4, 3, 2], 3, 9) == parse_acc_file('RaptorX_property/out/inp_1.acc'))
#print(parse_rap_prop_file('RaptorX_property/out/inp_1.diso', 'diso', ['diso'], [2], 3, 5))
#print(parse_rap_prop_file('RaptorX_property/out/inp_1.ss8', 'ss8', ['ss81', 'ss82', 'ss83', 'ss84', 'ss85', 'ss86', 'ss87', 'ss88'], [9, 8, 7, 6, 5, 4, 3, 2], 2, 14))

In [37]:
#'''
basedir_modeller = 'RaptorX_property/out/'
files_basedir_raptorX = os.listdir(basedir_modeller)
req_files_basedir_raptorX = []
N_prop_dict = {}
req_files = list( filter(lambda x: x.startswith('inp_') and (x.endswith('.acc') or x.endswith('.ss3') or x.endswith('.ss8') or x.endswith('.diso') or x.endswith('.tm2') or x.endswith('.tm8')) and len(x.split('.')) >= 2, files_basedir_raptorX) )

for req_file in req_files:
    N_prop_dict[inp_seq_id_dict[req_file.split('.')[0]]] = {}

for req_file in req_files:
    seq_id = inp_seq_id_dict[req_file.split('.')[0]]
    #print(seq_id, req_file)
    ret_prp_dict = {}
    if req_file.endswith('acc'):
        ret_prp_dict = parse_rap_prop_file(basedir_modeller+req_file, 'acc', ['loc', 'acc1', 'acc2', 'acc3'], [8, 4, 3, 2], [1,0,0,0], 3, 9)
    elif req_file.endswith('ss3'):
        ret_prp_dict = parse_rap_prop_file(basedir_modeller+req_file, 'ss3', ['ss31', 'ss32', 'ss33'], [4, 3, 2], [0,0,0], 2, 9)
    elif req_file.endswith('ss8'):
        ret_prp_dict = parse_rap_prop_file(basedir_modeller+req_file, 'ss8', ['ss81', 'ss82', 'ss83', 'ss84', 'ss85', 'ss86', 'ss87', 'ss88'], [9, 8, 7, 6, 5, 4, 3, 2], [0,0,0,0,0,0,0,0], 2, 14)
    elif req_file.endswith('diso'):
        ret_prp_dict = parse_rap_prop_file(basedir_modeller+req_file, 'diso', ['diso'], [2], [0], 3, 5)
    elif req_file.endswith('tm2'):
        ret_prp_dict = parse_rap_prop_file(basedir_modeller+req_file, 'tm2', ['tm21'], [2], [0], 3, 5)
    elif req_file.endswith('tm8'):
        ret_prp_dict = parse_rap_prop_file(basedir_modeller+req_file, 'tm8', ['tm81', 'tm82', 'tm83', 'tm84', 'tm85', 'tm86', 'tm87', 'tm88', ], [9,8,7,6,5,4,3,2], [0,0,0,0,0,0,0,0], 2, 14)

    for key in ret_prp_dict:
            N_prop_dict[seq_id][key] = ret_prp_dict[key].copy()

#print(N_prop_dict)
#'''

In [38]:
N_prop_dict.keys()

dict_keys(['O30408_1|p', 'O31827_0|i', 'A9ELY3_0|a', 'F4CRZ3_0|dhb', 'C7XK05_0|a', 'D4RQS6_0|a', 'C2MI38_0|a', 'D9N8D3_0|a', 'E8A4T8_0|dhb', 'E8FJI3_0|dhb', 'D2FTV7_0|a', 'D8HHB2_0|a', 'E3DZM8_0|a', 'A9EPP8_0|dhb', 'D6LW22_0|a', 'E6HZ45_0|a', 'F3WF22_0|dhb', 'E5W9D5_0|a', 'E7S8R8_0|a', 'C8KAL4_0|a', 'C2TUM0_0|a', 'E4IX22_0|a', 'E8IIB2_0|dhb', 'F3DBK1_0|dhb', 'A9VKV6_0|a', 'D9UBW1_0|dhb', 'E8DS52_0|dhb', 'D4PSG5_0|a', 'B4U3P2_0|a', 'Q5M0A8_0|a', 'E1RZT1_0|dhb', 'Q03DW1_0|a', 'Q251N2_0|a', 'A6M1G2_0|a', 'F0LA69_0|dhb', 'B0AQG6_0|a', 'E5R9M1_0|a', 'F2H0V4_0|a', 'E2YHB0_0|a', 'C7WZA2_0|a', 'F3V380_0|dhb', 'E6ETF2_0|a', 'F0D2B9_0|a', 'E2KHJ6_0|dhb', 'C7V4A0_0|a', 'E7XVS9_0|dhb', 'B0AQY3_0|dhb', 'E7UAG6_0|dhb', 'A5Z788_0|a', 'F3UU74_0|a', 'D4G345_0|a', 'E6GEA9_0|a', 'E0TWR6_0|a', 'A8AUB6_0|a', 'F0FPI8_0|a', 'C8KU82_0|a', 'E6IZ61_0|a', 'B0D6R6_0|aad', 'F3MGX0_0|dhb', 'E6HQ43_0|a', 'E1LWF9_0|a', 'Q5XP05_0|a', 'F3UM75_0|a', 'E7S2G8_0|a', 'E7X8R4_0|dhb', 'C3BHU7_0|a', 'E5TU15_0|a', 'A5LLN5_0|a',

In [39]:
#N_dict.keys()

In [8]:
# Read Signatures and z scores
csv_name = "labeled_sigs"
z_score_csv_name = "z_score_aa_001"
df_original = pd.read_csv(csv_name, delimiter='\t')
z_scores_df = pd.read_csv(z_score_csv_name, delimiter=' ')

In [9]:
# Prep for hardcoded values
m=26
nrow = 12
D = np.zeros((nrow, m))
D1 = D[0 ,:]
D2 = D[1 ,:]
D3 = D[2 ,:]
D4 = D[3 ,:]
D5 = D[4 ,:]
D6 = D[5 ,:]
D7 = D[6 ,:]
D8 = D[7 ,:]
D9 = D[8 ,:]
D10 = D[9 ,:]
D11 = D[10 ,:]
D12 = D[11 ,:]

In [10]:
# Hard coded values, taken from NRPSPredictor2 github

#1 aa-alpha-helix.aaindex
D1[0]=1.420;D1[1]=0.000;D1[2]=0.700;D1[3]=1.010;D1[4]=1.510;D1[5]=1.130;D1[6]=0.570;D1[7]=1.000;D1[8]=1.080;D1[9]=0.000;D1[10]=1.160;D1[11]=1.210;D1[12]=1.450;D1[13]=0.670;D1[14]=0.000;D1[15]=0.570;D1[16]=1.110;D1[17]=0.980;D1[18]=0.770;D1[19]=0.830;D1[20]=0.000;D1[21]=1.060;D1[22]=1.080;D1[23]=0.000;D1[24]=0.690;D1[25]=0.000;
#2 aa-beta-sheet.aaindex
D2[0]=0.830;D2[1]=0.000;D2[2]=1.190;D2[3]=0.540;D2[4]=0.370;D2[5]=1.380;D2[6]=0.750;D2[7]=0.870;D2[8]=1.600;D2[9]=0.000;D2[10]=0.740;D2[11]=1.300;D2[12]=1.050;D2[13]=0.890;D2[14]=0.000;D2[15]=0.550;D2[16]=1.100;D2[17]=0.930;D2[18]=0.750;D2[19]=1.190;D2[20]=0.000;D2[21]=1.700;D2[22]=1.370;D2[23]=0.000;D2[24]=1.470;D2[25]=0.000;
#3 aa-beta-turn.aaindex
D3[0]=0.740;D3[1]=0.000;D3[2]=0.960;D3[3]=1.520;D3[4]=0.950;D3[5]=0.660;D3[6]=1.560;D3[7]=0.950;D3[8]=0.470;D3[9]=0.000;D3[10]=1.190;D3[11]=0.500;D3[12]=0.600;D3[13]=1.460;D3[14]=0.000;D3[15]=1.560;D3[16]=0.960;D3[17]=1.010;D3[18]=1.430;D3[19]=0.980;D3[20]=0.000;D3[21]=0.590;D3[22]=0.600;D3[23]=0.000;D3[24]=1.140;D3[25]=0.000;
#4 aa-hydrogenbond.aaindex
D4[0]=0.000;D4[1]=0.000;D4[2]=0.000;D4[3]=1.000;D4[4]=1.000;D4[5]=0.000;D4[6]=0.000;D4[7]=1.000;D4[8]=0.000;D4[9]=0.000;D4[10]=2.000;D4[11]=0.000;D4[12]=0.000;D4[13]=2.000;D4[14]=0.000;D4[15]=0.000;D4[16]=2.000;D4[17]=4.000;D4[18]=1.000;D4[19]=1.000;D4[20]=0.000;D4[21]=0.000;D4[22]=1.000;D4[23]=0.000;D4[24]=1.000;D4[25]=0.000;
#5 aa-hydrophobicity-neu1.aaindex
D5[0]=0.060;D5[1]=0.000;D5[2]=-0.560;D5[3]=0.970;D5[4]=0.850;D5[5]=-0.990;D5[6]=0.320;D5[7]=0.150;D5[8]=-1.000;D5[9]=0.000;D5[10]=1.000;D5[11]=-0.830;D5[12]=-0.680;D5[13]=0.700;D5[14]=0.000;D5[15]=0.450;D5[16]=0.710;D5[17]=0.800;D5[18]=0.480;D5[19]=0.380;D5[20]=0.000;D5[21]=-0.750;D5[22]=-0.570;D5[23]=0.000;D5[24]=-0.350;D5[25]=0.000;
#6 aa-hydrophobicity-neu2.aaindex
D6[0]=-0.250;D6[1]=0.000;D6[2]=-0.400;D6[3]=-0.080;D6[4]=-0.100;D6[5]=0.180;D6[6]=-0.320;D6[7]=-0.030;D6[8]=-0.030;D6[9]=0.000;D6[10]=0.320;D6[11]=0.050;D6[12]=-0.010;D6[13]=-0.060;D6[14]=0.000;D6[15]=0.230;D6[16]=-0.020;D6[17]=0.190;D6[18]=-0.150;D6[19]=-0.100;D6[20]=0.000;D6[21]=-0.190;D6[22]=0.310;D6[23]=0.000;D6[24]=0.400;D6[25]=0.000;
#7 aa-hydrophobicity-neu3.aaindex
D7[0]=0.250;D7[1]=0.000;D7[2]=-0.140;D7[3]=0.080;D7[4]=-0.050;D7[5]=0.150;D7[6]=0.280;D7[7]=-0.100;D7[8]=0.100;D7[9]=0.000;D7[10]=0.110;D7[11]=0.010;D7[12]=0.040;D7[13]=0.170;D7[14]=0.000;D7[15]=0.410;D7[16]=0.120;D7[17]=-0.410;D7[18]=0.230;D7[19]=0.290;D7[20]=0.000;D7[21]=0.030;D7[22]=0.340;D7[23]=0.000;D7[24]=-0.020;D7[25]=0.000;
#8 aa-isoelectric.aaindex
D8[0]=6.000;D8[1]=0.000;D8[2]=5.050;D8[3]=2.770;D8[4]=3.220;D8[5]=5.480;D8[6]=5.970;D8[7]=7.590;D8[8]=6.020;D8[9]=0.000;D8[10]=9.740;D8[11]=5.980;D8[12]=5.740;D8[13]=5.410;D8[14]=0.000;D8[15]=6.300;D8[16]=5.650;D8[17]=10.760;D8[18]=5.680;D8[19]=5.660;D8[20]=0.000;D8[21]=5.960;D8[22]=5.890;D8[23]=0.000;D8[24]=5.660;D8[25]=0.000;
#9 aa-polar-grantham.aaindex
D9[0]=8.100;D9[1]=0.000;D9[2]=5.500;D9[3]=13.000;D9[4]=12.300;D9[5]=5.200;D9[6]=9.000;D9[7]=10.400;D9[8]=5.200;D9[9]=0.000;D9[10]=11.300;D9[11]=4.900;D9[12]=5.700;D9[13]=11.600;D9[14]=0.000;D9[15]=8.000;D9[16]=10.500;D9[17]=10.500;D9[18]=9.200;D9[19]=8.600;D9[20]=0.000;D9[21]=5.900;D9[22]=5.400;D9[23]=0.000;D9[24]=6.200;D9[25]=0.000;
#10 aa-polar-radzicka.aaindex
D10[0]=-0.060;D10[1]=0.000;D10[2]=1.360;D10[3]=-0.800;D10[4]=-0.770;D10[5]=1.270;D10[6]=-0.410;D10[7]=0.490;D10[8]=1.310;D10[9]=0.000;D10[10]=-1.180;D10[11]=1.210;D10[12]=1.270;D10[13]=-0.480;D10[14]=0.000;D10[15]=1.100;D10[16]=-0.730;D10[17]=-0.840;D10[18]=-0.500;D10[19]=-0.270;D10[20]=0.000;D10[21]=1.090;D10[22]=0.880;D10[23]=0.000;D10[24]=0.330;D10[25]=0.000;
#11 aa-polar-zimmerman.aaindex
D11[0]=0.000;D11[1]=0.000;D11[2]=1.480;D11[3]=49.700;D11[4]=49.900;D11[5]=0.350;D11[6]=0.000;D11[7]=51.600;D11[8]=0.130;D11[9]=0.000;D11[10]=49.500;D11[11]=0.130;D11[12]=1.430;D11[13]=3.380;D11[14]=0.000;D11[15]=1.580;D11[16]=3.530;D11[17]=52.000;D11[18]=1.670;D11[19]=1.660;D11[20]=0.000;D11[21]=0.130;D11[22]=2.100;D11[23]=0.000;D11[24]=1.610;D11[25]=0.000;
#12 aa-volume.aaindex
D12[0]=90.000;D12[1]=0.000;D12[2]=103.300;D12[3]=117.300;D12[4]=142.200;D12[5]=191.900;D12[6]=64.900;D12[7]=160.000;D12[8]=163.900;D12[9]=0.000;D12[10]=167.300;D12[11]=164.000;D12[12]=167.000;D12[13]=124.700;D12[14]=0.000;D12[15]=122.900;D12[16]=149.400;D12[17]=194.000;D12[18]=95.400;D12[19]=121.500;D12[20]=0.000;D12[21]=139.000;D12[22]=228.200;D12[23]=0.000;D12[24]=197.000;D12[25]=0.000;



In [11]:
# Map Amino acid abbreviations with indices of D[i, :]

aa2ind_map = {}
aa2ind_map['A'] = 0
aa2ind_map['R'] = 17
aa2ind_map['D'] = 3
aa2ind_map['N'] = 13
aa2ind_map['C'] = 2
aa2ind_map['E'] = 4
aa2ind_map['Q'] = 16
aa2ind_map['G'] = 6
aa2ind_map['H'] = 7
aa2ind_map['I'] = 8
aa2ind_map['L'] = 11
aa2ind_map['K'] = 10
aa2ind_map['M'] = 12
aa2ind_map['F'] = 5
aa2ind_map['P'] = 15
aa2ind_map['S'] = 18
aa2ind_map['T'] = 19
aa2ind_map['W'] = 22
aa2ind_map['Y'] = 24
aa2ind_map['V'] = 21

In [12]:
# Dictionary to store 15 dimensional vector values for each amino acid

aa_dict = {}
for i, row in z_scores_df.iterrows():
    aa_dict[row['aa_short']] = [row['z1'], row['z2'], row['z3'], ] # Store z values
    aa_dict[row['aa_short']].extend(D[:, aa2ind_map[row['aa_short']]]) # Store rest of the 12 hard-coded values


In [14]:
aa2ind_map

{'A': 0,
 'R': 17,
 'D': 3,
 'N': 13,
 'C': 2,
 'E': 4,
 'Q': 16,
 'G': 6,
 'H': 7,
 'I': 8,
 'L': 11,
 'K': 10,
 'M': 12,
 'F': 5,
 'P': 15,
 'S': 18,
 'T': 19,
 'W': 22,
 'Y': 24,
 'V': 21}

In [13]:
z_scores_df

Unnamed: 0,idx,aa_short,aa,rf_1,rf_2,rf_3,rf_4,rf_5,rf_6,rf_7,hnmr_1,hnmr_2,hnmr_3,vdw,mw,z1,z2,z3
0,1,A,Ala,60,24,29,9,52,23,37,4A3,3.77,3.3,13.7,89.1,0.07,-1.73,0.09
1,2,V,Val,74,42,43,22,68,43,72,3.94,3.6,3.03,34.1,117.2,-2.69,-2.53,-1.29
2,3,L,Leu,82,52,50,33,73,53,83,4.05,3.71,3.23,44.4,131.2,-4.19,-1.03,-0.98
3,4,I,ile,80,50,49,28,72,53,83,4.02,3.66,3.08,44.4,131.2,-4.44,-1.68,-1.03
4,5,P,Pro,52,20,50,11,57,31,68,4.37,4.11,3.51,30.7,115.1,-1.22,0.88,2.23
5,6,F,Phe,82,50,52,35,66,52,87,4.31,3.98,3.48,56.1,165.2,-4.92,1.3,0.45
6,7,W,Trp,86,54,54,40,-,51,88,4.36,4.05,3.56,75.1,204.2,-4.75,3.65,0.85
7,8,M,Met,77,45,50,29,63,49,79,4.20,3.84,3.3,45.0,149.2,-2.49,-0.27,-0.41
8,9,K,Lys,1,7,2,1,19,7,1,4.05,3.74,3.27,51.1,147.0,2.84,1.41,-3.14
9,10,R,Arg,1,9,6,1,19,7,1,4.06,3.76,3.2,64.9,175.0,2.88,2.52,-3.44


In [45]:
# Create index to value map and value to index map from a list using list index
def get_dict_from_list(mylist):
    assert len(mylist) > 0
    idx_to_value_dict = {}
    value_to_idx_dict = {}
    for i in range(len(mylist)):
        idx_to_value_dict[i] = mylist[i]
        value_to_idx_dict[mylist[i]] = int(i)
    return idx_to_value_dict, value_to_idx_dict

# Get index-value maps for substrates
sub_vocab_set = set(df_original['sub'].tolist())
sub_vocab_list = list(sub_vocab_set)
sub_idx_to_value_dict, sub_value_to_idx_dict = get_dict_from_list(sub_vocab_list)

In [46]:
# Remove the entries with only one substrate occurence
df_cnt_filtered = df_original.groupby('sub').filter(lambda x: len(x) > 1)
print("Reduced data size from ",len(df_original)," to ", len(df_cnt_filtered), " due to count filtering")

Reduced data size from  454  to  429  due to count filtering


In [47]:
# Re-Create substrate index-value maps after removing the substrates with count 1

sub_vocab_set = set(df_cnt_filtered['sub'].tolist())
sub_vocab_list = list(sub_vocab_set)
sub_idx_to_value_dict, sub_value_to_idx_dict = get_dict_from_list(sub_vocab_list)

In [48]:
# Given a signature of amino acids(34 aa long), construct the 34x15=510 dimentional representation
def get_encoding(signature):
    ret = []
    for i in signature:
        ret.extend(aa_dict[i])
    return np.asarray(ret)


In [49]:
def list_concat_zip3(l1, l2, l3):
    assert len(l1) == len(l2)
    assert len(l1) == len(l3)
    ret = []
    for idx in range(len(l1)):
        ret.append(l1[idx])
        ret.append(l2[idx])
        ret.append(l3[idx])
    return np.asarray(ret)

In [50]:
#list_concat_zip3([1, 2, 3, 4], [5, 6, 7, 8], [10, 11, 12, 13])

In [51]:
'''
sig_dict_old = sig_dict.copy()
sig_dict = {}
for key in sig_dict_old:
    #print(key.split('|')[0])
    sig_dict[key.split('|')[0]] = sig_dict_old[key].copy()
'''

"\nsig_dict_old = sig_dict.copy()\nsig_dict = {}\nfor key in sig_dict_old:\n    #print(key.split('|')[0])\n    sig_dict[key.split('|')[0]] = sig_dict_old[key].copy()\n"

In [52]:
sig_dict_old = sig_dict.copy()

#sig_dict = {}
for key in sig_dict_old:
    sig_dict[key.split('|')[0]+'|'+key.split('|')[1].lower()] = sig_dict_old[key].copy()
    #sig_dict[key.split('|')[0]] = sig_dict_old[key].copy()


In [53]:
len(sig_dict.keys())

2666

In [54]:
def get_sub_3_lower(sub_in):
    sub_in = sub_in.lower()
    if len(sub_in) == 1:
        return aa_alias_dict_lower[sub_in]
    elif sub_in.endswith('orn'):
        return 'orn'
    elif sub_in.endswith('ala'):
        return 'ala'
    return sub_in

In [55]:
sub_set = set()
for key in sig_dict_orig:
    sub_set.add(key.split('|')[1].lower())

In [56]:
#sig_dict.keys()

In [57]:
grsA_edge_filename = 'GrsA_filtered_edges_first_100.csv'
grsA_edge_list = []

with open(grsA_edge_filename, 'r') as grsA_edge_file:
    grsA_edge_content = grsA_edge_file.readlines()
    for line in grsA_edge_content:
        line_list = line.strip('\n').split(',')
        grsA_edge_list.append([int(idx) for idx in line_list])

In [58]:
grsA_edge_list[:20]

[[7, 6],
 [26, 25],
 [28, 27],
 [9, 8],
 [8, 7],
 [6, 5],
 [32, 31],
 [2, 1],
 [18, 17],
 [27, 26],
 [10, 9],
 [29, 28],
 [30, 29],
 [21, 20],
 [13, 12],
 [15, 14],
 [20, 19],
 [22, 21],
 [31, 30],
 [5, 4]]

In [59]:
def get_dist(a, b):
    return np.sqrt(np.sum(np.square(a-b)))

In [60]:
def get_edge_dist_based_grsA(xyz_content):
    sig_len = 34
    assert len(np.asarray(xyz_content).reshape(-1)) == sig_len*3
    xyz_content_reshaped = np.asarray(xyz_content).reshape(sig_len, 3)
    ret = []
    for edge in grsA_edge_list:
        #print('dist b2n', xyz_content_reshaped[edge[0]], xyz_content_reshaped[edge[1]])
        ret.append(get_dist(xyz_content_reshaped[edge[0]], xyz_content_reshaped[edge[1]]))
    
    return ret

In [61]:
#get_edge_dist_based_grsA((list(range(34*3))))

In [62]:
dup_dict = {}
unique_sig_id_list = []
unique_sig_id_dict = {}
unique_sig_id_no_sub_list = []
dup_count = 0

for key in sig_dict_old:
    #print(key)
    sig_now = sig_dict_old[key]['sig']
    #print(sig_now)
    try:
        dup_dict[sig_now] += 1
        dup_count += 1
        assert(unique_sig_id_dict[sig_now].split('|')[1] == key.split('|')[1])
    except:
        dup_dict[sig_now] = 1
        unique_sig_id_dict[sig_now] = key
        unique_sig_id_list.append(key)
        unique_sig_id_no_sub_list.append(key.split('|')[0])

print(len(dup_dict), dup_count)

670 876


In [63]:
#unique_sig_id_no_sub_list

In [64]:
# Format the data
sig_len = 34
raw_data = []
data_np = []
label_np = []
plot_np = []

raw_data_dict = {}
nrps_data_dict = {}
raptorx_3d_dict = {}
transformed_3d_dict = {}
raptorx_prop_dict = {}
raptorx_3d_angle_dict = {}
label_dict = {}

j=0
key_filtering = False
key_list = ['O30409_6|L', 'O68006_3|L', 'P39846_2|T', 'B2IXJ7_2|T']
#key_list = ['O30409_6|L', 'P39846_2|T']
field_list = ['acc1', 'acc2', 'acc3', 'diso', 'ss31', 'ss32', 'ss33', 'ss81', 'ss82', 'ss83', 'ss84', 'ss85', 'ss86', 'ss87', 'ss88']
assert not key_filtering or len(key_list) > 0
xyz_idx_filtering = False
xyz_idx_list = [5, 6, 9, 12, 14, 16, 21, 29, 30]
assert not xyz_idx_filtering or len(xyz_idx_list) > 0
xyz_raw_data_mean_normalization = True
xyz_raw_data_var_normalization = False

#assert 1==3

#for i, row in df_cnt_filtered.iterrows():
    #print((row['sub']))
    #assert len(row['sig']) == sig_len
    #raw_data.append(row['sig'])
    #data_np.append(get_encoding(row['sig']))
    #label_np.append(sub_value_to_idx_dict[row['sub']])
    #j+=1

for key in N_dict:
    #print(key, sig_dict[key]['sig'])
    #print(get_seq_from_pos(sig_dict[key]['pos'], N_dict[key]['x'], key, convert_to_float=True))
    #full_key_with_sub = list(filter(lambda x:x.startswith(key), sig_dict_orig.keys()))[0]
    full_key_with_sub = key
    key_without_sub = key.split('|')[0]
    #print(key, full_key_with_sub)
    sub_N_dict = full_key_with_sub.split('|')[1]
    sub = get_sub_3_lower(sub_N_dict)
    #print(key, sub, sub in sub_value_to_idx_dict)
    #key = full_key_with_sub.split('|')[0]+'|'+(sub_N_dict.upper() if len(sub_N_dict) == 1 else 'Orn' if sub_N_dict == 'orn' else sub_N_dict)
    key = full_key_with_sub.split('|')[0]+'|'+sub_N_dict.lower()
    #print(key, full_key_with_sub, sub_N_dict, key_without_sub)
    
    #if sub in sub_value_to_idx_dict and key_without_sub in unique_sig_id_no_sub_list:
    if sub in sub_value_to_idx_dict:# N_dict already filtered
        #assert 1==2
        field_data = []
        for field in field_list:
            #print(field)
            field_data.extend(get_seq_from_pos(sig_dict[key]['pos'], N_prop_dict[full_key_with_sub][field], key, convert_to_float=True))
        
        #print(key_without_sub)
        #assert 1==2
        #raw_data.append(get_seq_from_pos(sig_dict[key]['pos'], N_dict[key]['loc'], key))
        x_data = get_seq_from_pos(sig_dict[key]['pos'], N_dict[full_key_with_sub]['x'], key, convert_to_float=True)
        y_data = get_seq_from_pos(sig_dict[key]['pos'], N_dict[full_key_with_sub]['y'], key, convert_to_float=True)
        z_data = get_seq_from_pos(sig_dict[key]['pos'], N_dict[full_key_with_sub]['z'], key, convert_to_float=True)

        if xyz_idx_filtering:
            x_data = np.asarray(x_data)[xyz_idx_list]
            y_data = np.asarray(y_data)[xyz_idx_list]
            z_data = np.asarray(z_data)[xyz_idx_list]

        if xyz_raw_data_mean_normalization:
            x_data -= np.mean(x_data)
            y_data -= np.mean(y_data)
            z_data -= np.mean(z_data)
        if xyz_raw_data_var_normalization:
            x_data /= np.std(x_data)
            y_data /= np.std(y_data)
            z_data /= np.std(z_data)
        #data_np.append(np.asarray(list(get_encoding(get_seq_from_pos(sig_dict[key]['pos'], N_dict[key]['loc'], key)))+x_data+y_data+z_data))
        #data_np.append(np.asarray(x_data+y_data+z_data))
        if not key_filtering or (key_filtering and key in key_list):
            #print(key)
            #print(key, np.mean(x_data), np.mean(y_data), np.mean(z_data))
            coords = list_concat_zip3(x_data, y_data, z_data)
            #print(coords)
            #print(len(x_data))

            try:
            #if True:
                a, b, c = sc.linalg.svd(coords.reshape(len(x_data), 3), full_matrices=True)
                transformed_coords = np.matmul(a, np.pad(np.diag(b), ((0,a.shape[0]-b.shape[0]), (0,0)), mode='constant'))
                for idx_2 in range(transformed_coords.shape[1]):
                    transformed_coords[:, idx_2] *= np.sign(transformed_coords[0, idx_2])
                diff_coord = np.asarray(transformed_coords[1:]-transformed_coords[:-1])
                #print(transformed_coords.shape)
                #print(diff_coord.shape)
                dist_norm = np.linalg.norm(diff_coord, axis=1)
                #if (dist_norm<1e-17).any():
                    #print(key)
                    #print(dist_norm)
                    #print(np.max(dist_norm, 1e-17))
                #print(np.expand_dims(dist_norm, 1).shape, diff_coord.shape)
                #diff_coord =  diff_coord/np.expand_dims(dist_norm, 1)
                #print(diff_coord)
                cos_ar = []
                for i in range(len(diff_coord)-1):
                    #print(diff_coord[i], diff_coord[i+1], 'qwerty')
                    if dist_norm[i]>0 and dist_norm[i+1]>0:
                        cos_ar.append(-sum(diff_coord[i]*diff_coord[i+1])/dist_norm[i]/dist_norm[i+1])
                    else:
                        cos_ar.append(-10)
                    #cos_ar.append(-sum(diff_coord[i]*diff_coord[i+1])/dist_norm[i]/dist_norm[i+1])
                    #tyuh = 1
                #print(len(cos_ar))
                #assert 1==2
                edge_lens = get_edge_dist_based_grsA(transformed_coords)
                
                #plot_np.append(transformed_coords.copy())

                #raw_data.append(get_seq_from_pos(sig_dict[key]['pos'], N_dict[full_key_with_sub]['loc'], key))
                #data_np.append(transformed_coords.copy().reshape(-1).tolist()+dist_norm.tolist()+cos_ar+
                #               list(get_encoding(get_seq_from_pos(sig_dict[key]['pos'], N_dict[full_key_with_sub]['loc'], key))))
                #data_np.append(transformed_coords.copy().reshape(-1).tolist()+dist_norm.tolist()+cos_ar+edge_lens)
                #data_np.append(dist_norm.tolist()+cos_ar+edge_lens)
                #print((data_np[0]))
                #label_np.append(sub_value_to_idx_dict[sub])
                j += 1
            #else:
            except:
                print('Something wrong with key '+key+'. Investigate.')
            #assert 1==2
            
            
            nrps_data_dict[key_without_sub] = list(get_encoding(get_seq_from_pos(sig_dict[key]['pos'], N_dict[full_key_with_sub]['loc'], key)))
            transformed_3d_dict[key_without_sub] = transformed_coords.copy().reshape(-1).tolist()
            raptorx_prop_dict[key_without_sub] = field_data
            raptorx_3d_angle_dict[key_without_sub] = dist_norm.tolist()+cos_ar+edge_lens
            label_dict[key_without_sub] = sub_value_to_idx_dict[sub]
            raw_data_dict[key_without_sub] = get_seq_from_pos(sig_dict[key]['pos'], N_dict[full_key_with_sub]['loc'], key)
            #assert 1==2
    elif key_without_sub in unique_sig_id_no_sub_list:
        print('Rejected id:',key_without_sub,'sub',sub)
    else:
        print('Rejected id:',key_without_sub,'sub',sub)


Rejected id: Q1PSF4_1 sub hyv-d
Rejected id: D1FVF0_1 sub hyv-d
Rejected id: B6D9A8_1 sub hyv-d
Rejected id: Q00869_1 sub hyv-d


In [65]:
#sub_value_to_idx_dict

In [66]:
len(N_dict)

658

In [67]:
len(unique_sig_id_no_sub_list)

682

In [68]:
len(nrps_data_dict)

654

In [69]:
assert(len(nrps_data_dict) == len(transformed_3d_dict))
assert(len(nrps_data_dict) == len(raptorx_prop_dict))
assert(len(nrps_data_dict) == len(raptorx_3d_angle_dict))
assert(len(nrps_data_dict) == len(label_dict))
assert(len(unique_sig_id_no_sub_list) > len(label_dict))
#assert 1==2

In [70]:

data_np = np.asarray(data_np)
plot_np = np.asarray(plot_np)
data_np_orig = data_np.copy()
label_np = np.asarray(label_np).astype('int')
raw_data = np.array(raw_data)

In [71]:
def save_pct_id(pct, max_iter):
    pct_fname = 'pct_ids/size_'+str(len(label_dict))+'_test_pct_'+str(pct)+'_iter_'+str(max_iter)+'_date_'+time.strftime('%Y_%m_%d_%H%M%S'+'.romel')
    pct_fname = 'pct_ids/size_'+str(len(label_dict))+'_test_pct_'+str(pct)+'_iter_'+str(max_iter)+'_date_'+time.strftime('%Y_%m_%d_%H%M'+'.romel')
    #pct_fname = 'pct_ids/size'
    print('Saving in file', pct_fname)
    tot_ids = list(label_dict.keys())
    with open(pct_fname, 'w', newline='\n') as f:
        f.writelines('Format: all ids on first line, then train id, test id for max_iter iterations\n')
        f.writelines(' '.join(tot_ids))
        f.writelines('\n')
        for it in range(max_iter):
            test_elig = np.random.random(size=(len(tot_ids))) <= (pct/100)
            test_ids = []
            train_ids = []
            for i in range(len(tot_ids)):
                if test_elig[i]:
                    test_ids.append(tot_ids[i])
                else:
                    train_ids.append(tot_ids[i])
            f.writelines(' '.join(train_ids))
            f.writelines('\n')
            f.writelines(' '.join(test_ids))
            f.writelines('\n')
            #print(len(train_ids), len(train_ids)/len(tot_ids))

In [72]:
save_new_pct = False

In [73]:
if save_new_pct:
    for pct in list(range(5, 100, 5)):
        for it in [200]:
            save_pct_id(pct, it)

In [74]:
pct_iter_id_dict = {}

pct_list = list(range(5, 100, 5))
max_it = 40

pct_f_name_pre = 'pct_ids/size_654_test_pct_'
pct_f_name_suf = '_iter_200_date_2020_10_18_2338.romel'

for pct in pct_list:
    pct_iter_id_dict[str(pct)] = {}
    pct_iter_id_dict[str(pct)]['train'] = []
    pct_iter_id_dict[str(pct)]['test'] = []
    pct_f_name = pct_f_name_pre + str(pct) + pct_f_name_suf
    
    with open(pct_f_name, 'r') as f:
        content = f.readlines()
        tot_id = content[1].strip('\n').split(' ')
        for i in range(2, (max_it+1)*2, 2):
            train_id = content[i].strip('\n').split(' ')
            test_id = content[i+1].strip('\n').split(' ')
            assert(len(tot_id) == len(train_id) + len(test_id))
            
            #print(int(i/2)-1)
            pct_iter_id_dict[str(pct)]['train'].append(train_id)
            pct_iter_id_dict[str(pct)]['test'].append(test_id)
        #assert 1==2

In [75]:
#assert 1==2

In [76]:
#sub_idx_to_value_dict
#str(int(np.random.random()*50000))

In [77]:
#d = data_np[0].reshape(34, 3)
#ref = data_np[0].reshape(34, 3)
#print(sc.linalg.orth(np.matmul(np.linalg.pinv(d), ref)))
#print(np.matmul(d, sc.linalg.orth(np.matmul(np.linalg.pinv(d), ref))))
#print(d.shape)

In [78]:
#np.linalg.pinv(np.asarray([3.804, 1.785, 22.271]).reshape(3, 1))

In [79]:
#data_np[0]

In [80]:
def get_hamming_distance(str1, str2):
    return sum(i != j for i, j in zip(str1, str2))

# For all points in test_data, choose the minimum hamming distance from all of train data, and return the distance list
def get_hamming_distance_bucket_info(test_data, train_data):
    dist_list = []
    for test_data_pt in test_data:
        dist_list.append(min([get_hamming_distance(test_data_pt, train_data_pt) for train_data_pt in train_data]))
    return np.array(dist_list)

def round_dec(num, dec=2):
    return float(round(num* 10.**dec))/(10**dec)

In [81]:
#clf_dict={}
#clf_dict['xtra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')

In [82]:
def print_stats(metric_list, metric_list_bucket, n_bucket):
    print(f"\n\nOverall Stats:")

    for item in metric_list:
        #print(f"Test percentage: {item['pct']:.2f} Average Accuracy: {item['acc']:.3f} Average Recall: {item['rec']:.3f} Average Precision: {item['pre']:.3f} Average F1 Score: {item['f1']:.3f}")
        print(f"Test percentage: {item['pct']:.2f} Average Accuracy: {item['acc']:.3f} ")

    for bucket in range(n_bucket):
        print(f"\n\nBucket {bucket+1} Stats:")
        for item in metric_list_bucket:
            #print(f"Test percentage: {item['pct']:.2f} Average Accuracy: {item['acc'][bucket]:.3f} Average Recall: {item['rec'][bucket]:.3f} Average Precision: {item['pre'][bucket]:.3f} Average F1 Score: {item['f1'][bucket]:.3f}")
            print(f"Test percentage: {item['pct']:.2f} Average Accuracy: {item['acc'][bucket]:.3f}")
                  


In [83]:
def save_stats(metric_list, metric_list_bucket, n_bucket, pct_list, csv_name='stats'):
    overall_dict = {}
    for item in metric_list:
        overall_dict[item['pct']] = item['acc']
    
    with open(csv_name + '.csv', 'w', newline='') as csvfile:
        fieldnames = ['pct', 'overall_acc'] + ['Bucket '+str(item) for item in range(n_bucket)]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for item in metric_list_bucket:
            temp_dict = {'pct': item['pct'], 'overall_acc': round_dec(overall_dict[item['pct']])}
            for bucket in range(n_bucket):
                temp_dict['Bucket ' + str(bucket)] = round_dec(item['acc'][bucket])
            writer.writerow(temp_dict)

In [84]:
# n_iter signifies number of iterations
# test_percentage_list is the list of percentages of test data with respect to total data
# Code will iterate for each percentage for n_iter iterations
def train_and_validate(data_whole, clf_type, n_iter=10, test_percentage_list = [5, 10, 25, 40, 50], print_stat=True, save_stat=True):
    print(f'\nUsing {clf_type} classifier')
    n=len(label_np)
    avg = 'micro'
    # metric_list contains overall accuracy, precision, recall and f1 score for all data
    # metric_list_bucket contains hammning distance bucket-wise accuracy, precision, recall and f1 score
    metric_list = []
    metric_list_bucket = []
    n_bucket = 16
    n_bucket = 34
    eps = 1e-8
    #test_percentage_list = [round_dec(len(data_np_test)/(len(data_np_test)+len(data_np_nrps)))]
    #n_iter = 5
    
    for test_percentage in test_percentage_list:
        assert test_percentage>0 and test_percentage<100
        acc_sum = 0
        
        print(f'Test data percentage wrt total data: {test_percentage}')
        
        n_iter_ar = np.zeros(n_bucket)
        acc_sum_ar = np.zeros(n_bucket)
        bucket_pct_ar = np.zeros(n_bucket)
        dist_buckets_all = []
        test_data_tot_len = 0
        
        
        for it in range(n_iter):
            # Create filter for random split
#             print(data_whole[str(test_percentage)][it].keys())
#             'train_data', 'test_data', 'train_label', 'test_label', 'train_data_raw', 'test_data_raw'
#             test_elig = np.random.random(size=(n)) <= (test_percentage/100)
#             test_data = data_np[test_elig]
#             test_label = label_np[test_elig]
#             train_data = data_np[(test_elig-1).astype('bool')]
#             train_label = label_np[(test_elig-1).astype('bool')]
#             test_data_tot_len += len(test_data)

#             raw_train_data = raw_data[(test_elig-1).astype('bool')]
#             raw_test_data = raw_data[test_elig]
            test_data = data_whole[str(test_percentage)][it]['test_data']
            test_label = data_whole[str(test_percentage)][it]['test_label']
            train_data = data_whole[str(test_percentage)][it]['train_data']
            train_label = data_whole[str(test_percentage)][it]['train_label']
            test_data_tot_len += len(test_data)

            raw_train_data = data_whole[str(test_percentage)][it]['train_data_raw']
            raw_test_data = data_whole[str(test_percentage)][it]['test_data_raw']
            
            dist_buckets = get_hamming_distance_bucket_info(raw_test_data, raw_train_data)
            dist_buckets_all.extend(list(dist_buckets))
            #print(dist_buckets)
            #print(test_label)
            #assert 1==2

            clf = clf_dict[clf_type]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                clf.fit(train_data, train_label)
            test_predicted = clf.predict(test_data)
            #assert 1==2
            
            accuracy = acc(test_label, test_predicted)
            precision = pre(test_label, test_predicted, average=avg)
            print()
            
            acc_sum += accuracy
            print(f'Accuracy: {accuracy:.3f}')
            
            for bucket in range(n_bucket):
                b_filter = dist_buckets == bucket
#                 print(b_filter)
#                 print(len(b_filter), len(test_label))
                filtered_test_label = test_label[b_filter]
                filtered_test_predicted = test_predicted[b_filter]
                n_iter_ar[bucket] += len(filtered_test_label)
                if len(filtered_test_label):
                    accuracy = acc(filtered_test_label, filtered_test_predicted)
                    acc_sum_ar[bucket] += accuracy*len(filtered_test_label)
                    
                    bucket_pct_ar[bucket] += len(filtered_test_label)
                    
        #print(acc_sum_ar)
        metric_list.append({'pct':test_percentage, 'acc':acc_sum/n_iter, 'num':test_data_tot_len})
        metric_list_bucket.append({'pct':test_percentage, 'acc':acc_sum_ar/(bucket_pct_ar+eps), 'num':bucket_pct_ar.astype(np.int32), 'bkt_pct':bucket_pct_ar/test_data_tot_len})
        freq_stat = {value: round_dec(len(list(freq))*100./len(dist_buckets_all), 2) for value, freq in groupby(sorted(dist_buckets_all))}
        print(f"Frequency of hamming distance: ", freq_stat)
        
    if print_stat:
        print_stats(metric_list, metric_list_bucket, n_bucket)
    if save_stat:
        save_stats(metric_list, metric_list_bucket, n_bucket, test_percentage_list, csv_name='consol_stats__'+clf_type+'_iter_'+str(n_iter)+'_pct_'+'_'.join(map(str, test_percentage_list))+datetime.now().strftime("_%y%m%d_%H%M%S"))
    return {'overall':metric_list, 'bucket':metric_list_bucket}

In [85]:
def round_and_str(a, b, n_dec=2):
    return str(round_dec(a, dec=n_dec)) + ' ('+ str(round_dec(b, dec=n_dec)) +')'

In [86]:
round_and_str(0.544275, 0.35436, n_dec=3)

'0.544 (0.354)'

In [87]:
def save_consolidated_stats_new(metric_struct, metric_bucket_struct, data_to_include, n_iter, round_and_to_str=False):
    fields = ['Exact', 'Inexact', 'B1-5', 'B6-10', 'B11-15', 'B16-20', 'B21-']
    fields = ['Exact', 'Inexact', 'B6+', 'B11+', 'B16+', 'B21+', 'B26+', 'B31+']
    algos = list(metric_struct.keys())
    if algos == []:
        return
    pct_list = [item['pct'] for item in metric_struct[algos[0]]]
    n_bucket = len(metric_bucket_struct[algos[0]][0]['acc'])
    bucket_pct = [ str(round(item, 3)) for item in metric_bucket_struct[algos[0]][0]['bkt_pct'] ]
    overall_struct = {}
    bucket_struct = {}
    #print(metric_struct)
    #print(metric_bucket_struct)
    #print('****************************')
    for algo in algos:
        temp = {}
        for item in metric_struct[algo]:
            temp[item['pct']] = str(round_dec(item['acc']))
        overall_struct[algo] = temp
        temp = {}
        for item in metric_bucket_struct[algo]:
            #temp[item['pct']] = [round_dec(it) for it in item['acc']]
            temp[item['pct']] = []
            if round_and_to_str:
                temp[item['pct']].append(round_and_str(item['acc'][0], item['bkt_pct'][0]))
                temp[item['pct']].append(round_and_str(np.sum(item['acc'][1:]*item['num'][1:])/np.sum(item['num'][1:]), np.sum(item['bkt_pct'][1:])))
                temp[item['pct']].append(round_and_str(np.sum(item['acc'][6:]*item['num'][6:])/np.sum(item['num'][6:]), np.sum(item['bkt_pct'][6:])))
                temp[item['pct']].append(round_and_str(np.sum(item['acc'][11:]*item['num'][11:])/np.sum(item['num'][11:]), np.sum(item['bkt_pct'][11:])))
                temp[item['pct']].append(round_and_str(np.sum(item['acc'][16:]*item['num'][16:])/np.sum(item['num'][16:]), np.sum(item['bkt_pct'][16:])))
                temp[item['pct']].append(round_and_str(np.sum(item['acc'][21:]*item['num'][21:])/np.sum(item['num'][21:]), np.sum(item['bkt_pct'][21:])))
                temp[item['pct']].append(round_and_str(np.sum(item['acc'][26:]*item['num'][26:])/np.sum(item['num'][26:]), np.sum(item['bkt_pct'][26:])))
                temp[item['pct']].append(round_and_str(np.sum(item['acc'][31:]*item['num'][31:])/np.sum(item['num'][31:]), np.sum(item['bkt_pct'][31:])))
            else:
                temp[item['pct']].append(item['acc'][0])
                temp[item['pct']].append(np.sum(item['acc'][1:]*item['num'][1:])/np.sum(item['num'][1:]))
                temp[item['pct']].append(np.sum(item['acc'][6:]*item['num'][6:])/np.sum(item['num'][6:]))
                temp[item['pct']].append(np.sum(item['acc'][11:]*item['num'][11:])/np.sum(item['num'][11:]))
                temp[item['pct']].append(np.sum(item['acc'][16:]*item['num'][16:])/np.sum(item['num'][16:]))
                temp[item['pct']].append(np.sum(item['acc'][21:]*item['num'][21:])/np.sum(item['num'][21:]))
                temp[item['pct']].append(np.sum(item['acc'][26:]*item['num'][26:])/np.sum(item['num'][26:]))
                temp[item['pct']].append(np.sum(item['acc'][31:]*item['num'][31:])/np.sum(item['num'][31:]))
            #print(item)
            #print('\n________________')
            
            #print(temp[item['pct']])
            #print('\n================')
        
        bucket_struct[algo] = temp
    #print(overall_struct)
    #print(bucket_struct)
    #print([][0])
    for pct in pct_list:
        with open('_'.join(data_to_include)+'_pct_'+str(pct)+'_n_iter_'+str(n_iter)+'_bucket_'+str(n_bucket)+'_algo_'+'_'.join(map(str, algos))+datetime.now().strftime("_%y%m%d_%H%M%S")+ '.csv', 'w', newline='') as csvfile:
            #Bucket_fields = ['Bucket '+str(item)+'('+bucket_pct[item]+')' for item in range(n_bucket)]
            #fieldnames = ['Algo', 'Overall'] + Bucket_fields
            fieldnames = ['Algo', 'Overall'] + fields
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for algo in algos:
                temp = {'Algo': algo, 'Overall':overall_struct[algo][pct]}
                temp.update(dict(zip(fields, bucket_struct[algo][pct])))
                writer.writerow(temp)

In [88]:
'N_Rptrx_3d_xyz_consolidated_stats_algo_'

'N_Rptrx_3d_xyz_consolidated_stats_algo_'

In [89]:
#save_consolidated_stats_new(algo_overall_metric, algo_bucket_metric, n_iter)

In [90]:
'''
#clf_dict={}
#clf_dict['xtra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')
clf_dict={}
clf_dict['2g4'] = ExtraTreesClassifier(n_estimators=16, random_state=0, criterion='gini')
clf_dict['2e4'] = ExtraTreesClassifier(n_estimators=16, random_state=0, criterion='entropy')
clf_dict['2g8'] = ExtraTreesClassifier(n_estimators=256, random_state=0, criterion='gini')
clf_dict['2e8'] = ExtraTreesClassifier(n_estimators=256, random_state=0, criterion='entropy')
clf_dict['2g9'] = ExtraTreesClassifier(n_estimators=512, random_state=0, criterion='gini')
clf_dict['2e9'] = ExtraTreesClassifier(n_estimators=512, random_state=0, criterion='entropy')
clf_dict['g10'] = ExtraTreesClassifier(n_estimators=1024, random_state=0, criterion='gini')
clf_dict['e10'] = ExtraTreesClassifier(n_estimators=1024, random_state=0, criterion='entropy')
clf_dict['g11'] = ExtraTreesClassifier(n_estimators=2048, random_state=0, criterion='gini')
clf_dict['e11'] = ExtraTreesClassifier(n_estimators=2048, random_state=0, criterion='entropy')
clf_dict['g12'] = ExtraTreesClassifier(n_estimators=4096, random_state=0, criterion='gini')
clf_dict['e12'] = ExtraTreesClassifier(n_estimators=4096, random_state=0, criterion='entropy')
#clf_dict['g13'] = ExtraTreesClassifier(n_estimators=8192, random_state=0, criterion='gini')
#clf_dict['e13'] = ExtraTreesClassifier(n_estimators=8192, random_state=0, criterion='entropy')
#clf_dict['g14'] = ExtraTreesClassifier(n_estimators=16384, random_state=0, criterion='gini')
#clf_dict['e14'] = ExtraTreesClassifier(n_estimators=16384, random_state=0, criterion='entropy')
#clf_dict['g16'] = ExtraTreesClassifier(n_estimators=65536, random_state=0, criterion='gini')
#clf_dict['e16'] = ExtraTreesClassifier(n_estimators=65536, random_state=0, criterion='entropy')
'''

"\n#clf_dict={}\n#clf_dict['xtra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')\nclf_dict={}\nclf_dict['2g4'] = ExtraTreesClassifier(n_estimators=16, random_state=0, criterion='gini')\nclf_dict['2e4'] = ExtraTreesClassifier(n_estimators=16, random_state=0, criterion='entropy')\nclf_dict['2g8'] = ExtraTreesClassifier(n_estimators=256, random_state=0, criterion='gini')\nclf_dict['2e8'] = ExtraTreesClassifier(n_estimators=256, random_state=0, criterion='entropy')\nclf_dict['2g9'] = ExtraTreesClassifier(n_estimators=512, random_state=0, criterion='gini')\nclf_dict['2e9'] = ExtraTreesClassifier(n_estimators=512, random_state=0, criterion='entropy')\nclf_dict['g10'] = ExtraTreesClassifier(n_estimators=1024, random_state=0, criterion='gini')\nclf_dict['e10'] = ExtraTreesClassifier(n_estimators=1024, random_state=0, criterion='entropy')\nclf_dict['g11'] = ExtraTreesClassifier(n_estimators=2048, random_state=0, criterion='gini')\nclf_dict['e11'] = ExtraTreesCl

In [91]:
# nrps_data_dict[key_without_sub] = list(get_encoding(get_seq_from_pos(sig_dict[key]['pos'], N_dict[full_key_with_sub]['loc'], key)))
# transformed_3d_dict[key_without_sub] = transformed_coords.copy().reshape(-1).tolist()
# raptorx_prop_dict[key_without_sub] = field_data
# raptorx_3d_angle_dict[key_without_sub] = dist_norm.tolist()+cos_ar+edge_lens
# label_dict[key_without_sub] = sub_value_to_idx_dict[sub]
# raw_data_dict[key_without_sub]

In [92]:
#nrps_data_dict.keys()

In [93]:
def get_val_from_dict(ids, data_to_include):
    ret = []
    #print(len(ids))
    for idx in ids:
        temp = []
        if 'NRPS properties' in data_to_include:
            #print(nrps_data_dict[idx])
            temp.extend(nrps_data_dict[idx])
        if '3d coordinates' in data_to_include:
            temp.extend(transformed_3d_dict[idx])
        if 'Property features' in data_to_include:
            temp.extend(raptorx_prop_dict[idx])
        if '3d Distance and Angle' in data_to_include:
            temp.extend(raptorx_3d_angle_dict[idx])
        if 'label' in data_to_include:
            temp = label_dict[idx]
        if 'raw' in data_to_include:
            temp.extend(raw_data_dict[idx])
        ret.append(temp)
    return ret

def get_train_test_from_pct_dict(data_to_include=['NRPS properties'], test_pct_list=[50], max_iter=1):
    ret_dict = {}
    for pct in test_pct_list:
        assert str(pct) in pct_iter_id_dict.keys()
        ret_dict[str(pct)] = []
        for it in range(max_iter):
            train_ids = pct_iter_id_dict[str(pct)]['train'][it]
            test_ids = pct_iter_id_dict[str(pct)]['test'][it]
            
            temp = {}
            temp['train_data'] = np.asarray(get_val_from_dict(train_ids, data_to_include))
            temp['test_data'] = np.asarray(get_val_from_dict(test_ids, data_to_include))
            temp['train_label'] = np.asarray(get_val_from_dict(train_ids, ['label']))
            temp['test_label'] = np.asarray(get_val_from_dict(test_ids, ['label']))
            temp['train_data_raw'] = get_val_from_dict(train_ids, ['raw'])
            temp['test_data_raw'] = get_val_from_dict(test_ids, ['raw'])
            
            ret_dict[str(pct)].append(temp)
    return ret_dict

In [94]:
#a = get_train_test_from_pct_dict(data_to_include=['nrps', '3d'])

In [95]:
#a['50'][0]['train_label']

In [96]:
assert 1==2

AssertionError: 

In [97]:
# Build classifier dictionary using sklearn multiclass classifiers

clf_dict={}
# clf_dict['extra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')
clf_dict['xtra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')
# clf_dict['gmd10'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini', max_depth=10)
# clf_dict['gmd30'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini', max_depth=30)
# clf_dict['gmd50'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini', max_depth=50)
# clf_dict['gmd70'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini', max_depth=70)
# clf_dict['gmd100'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini', max_depth=100)
# clf_dict['emd10'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', max_depth=10)
# clf_dict['emd30'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', max_depth=30)
# clf_dict['emd50'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', max_depth=50)
# clf_dict['emd70'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', max_depth=70)
# clf_dict['emd100'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', max_depth=100)
# clf_dict['ne2_1'] = ExtraTreesClassifier(n_estimators=2, random_state=0, criterion='gini')
# clf_dict['ne2_2'] = ExtraTreesClassifier(n_estimators=4, random_state=0, criterion='gini')
# clf_dict['ne2_3'] = ExtraTreesClassifier(n_estimators=8, random_state=0, criterion='gini')
# clf_dict['ne2_4'] = ExtraTreesClassifier(n_estimators=16, random_state=0, criterion='gini')
# clf_dict['ne2_5'] = ExtraTreesClassifier(n_estimators=32, random_state=0, criterion='gini')
# clf_dict['ne2_6'] = ExtraTreesClassifier(n_estimators=64, random_state=0, criterion='gini')
# clf_dict['ne2_7'] = ExtraTreesClassifier(n_estimators=128, random_state=0, criterion='gini')
# clf_dict['ne2_8'] = ExtraTreesClassifier(n_estimators=256, random_state=0, criterion='gini')
# clf_dict['ne2_9'] = ExtraTreesClassifier(n_estimators=512, random_state=0, criterion='gini')
# clf_dict['ne2_10'] = ExtraTreesClassifier(n_estimators=1024, random_state=0, criterion='gini')
# clf_dict['e'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy')
# clf_dict['b1'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.01)
# clf_dict['b5'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.05)
# clf_dict['b10'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.1)
# clf_dict['b20'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.2)
# clf_dict['b30'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.3)
# clf_dict['b40'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.4)
# clf_dict['b50'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.5)
# clf_dict['b60'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.6)
# clf_dict['b70'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.7)
# clf_dict['b80'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.8)
# clf_dict['b90'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=0.9)
# clf_dict['b'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', bootstrap=True, max_samples=None)


# clf_dict['10im'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', min_impurity_decrease=0.1)
# clf_dict['5im'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', min_impurity_decrease=0.05)
# clf_dict['2im'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', min_impurity_decrease=0.02)
# clf_dict['1im'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', min_impurity_decrease=0.01)
# clf_dict['1i5'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', min_impurity_decrease=0.005)
# clf_dict['1i2'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', min_impurity_decrease=0.002)
# clf_dict['1i1'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='entropy', min_impurity_decrease=0.001)

# clf_dict['lr'] = LogisticRegression(random_state=0, max_iter=400, multi_class='multinomial', solver='newton-cg')
# clf_dict['svm'] = make_pipeline(StandardScaler(), LinearSVC(random_state=0, multi_class='crammer_singer', tol=1e-9, max_iter=2000))
# clf_dict['knn'] = KNeighborsClassifier(weights='distance')
# clf_dict['mlp_sklearn'] = MLPClassifier(random_state=1, max_iter=400, early_stopping=False, )
# clf_dict['rand_for'] = RandomForestClassifier(max_depth=4, criterion='entropy')
# clf_dict['dec_tree'] = DecisionTreeClassifier(random_state=0, criterion='entropy')
# clf_dict['ber_nb'] = BernoulliNB()
# clf_dict['xtra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')
# clf_dict['gau_nb'] = GaussianNB()
# clf_dict['label_prop'] = LabelPropagation(kernel='knn')
# clf_dict['label_spread'] = LabelSpreading(kernel='knn')
# clf_dict['lda'] = LinearDiscriminantAnalysis()
# clf_dict['ridge_cv'] = RidgeClassifierCV()
# clf_dict['n_cent'] = NearestCentroid()
# clf_dict['ridge'] = RidgeClassifier()

In [98]:
from itertools import combinations 

f_list = ['3d coordinates', 'NRPS properties', '3d Distance and Angle', 'Property features']
for i in range(1,len(f_list)+1):
    c = [list(it) for it in list(combinations(f_list, i))]
    print(c)

[['3d coordinates'], ['NRPS properties'], ['3d Distance and Angle'], ['Property features']]
[['3d coordinates', 'NRPS properties'], ['3d coordinates', '3d Distance and Angle'], ['3d coordinates', 'Property features'], ['NRPS properties', '3d Distance and Angle'], ['NRPS properties', 'Property features'], ['3d Distance and Angle', 'Property features']]
[['3d coordinates', 'NRPS properties', '3d Distance and Angle'], ['3d coordinates', 'NRPS properties', 'Property features'], ['3d coordinates', '3d Distance and Angle', 'Property features'], ['NRPS properties', '3d Distance and Angle', 'Property features']]
[['3d coordinates', 'NRPS properties', '3d Distance and Angle', 'Property features']]


In [99]:
assert 1==2

AssertionError: 

In [100]:
#'''
#clf_dict={}
#clf_dict['xtra_tree'] = ExtraTreesClassifier(n_estimators=100, random_state=0, criterion='gini')
algo_overall_metric = {}
algo_bucket_metric = {}
n_iter = 20
test_pct_list = [10, 30, 50, 70]
test_pct_list = [30]
data_to_include = ['3d coordinates', 'NRPS properties', '3d Distance and Angle', 'Property features']
data_to_include = ['Property features', 'NRPS properties']
data_whole = get_train_test_from_pct_dict(data_to_include, test_pct_list, n_iter)
#test_pct_list = [20]

for clf in list(clf_dict.keys()):
#for clf in ['lr', 'svm']:
#for clf in ['lr']:
    ret = train_and_validate(data_whole, clf, n_iter=n_iter, test_percentage_list = test_pct_list, save_stat=False)
    algo_overall_metric[clf] = ret['overall']
    algo_bucket_metric[clf] = ret['bucket']

#print([][0])
save_consolidated_stats_new(algo_overall_metric, algo_bucket_metric, data_to_include, n_iter)
#'''


Using xtra_tree classifier
Test data percentage wrt total data: 30
Accuracy: 0.702
Accuracy: 0.693
Accuracy: 0.701
Accuracy: 0.705
Accuracy: 0.714
Accuracy: 0.706
Accuracy: 0.710
Accuracy: 0.695
Accuracy: 0.706
Accuracy: 0.658
Accuracy: 0.677
Accuracy: 0.670
Accuracy: 0.704
Accuracy: 0.698
Accuracy: 0.671
Accuracy: 0.675
Accuracy: 0.678
Accuracy: 0.680
Accuracy: 0.654
Accuracy: 0.772
Frequency of hamming distance:  {1: 19.69, 2: 14.05, 3: 11.64, 4: 6.43, 5: 6.86, 6: 3.3, 7: 4.52, 8: 3.71, 9: 3.35, 10: 3.51, 11: 3.89, 12: 4.32, 13: 2.9, 14: 3.33, 15: 2.52, 16: 1.85, 17: 0.89, 18: 0.91, 19: 0.38, 20: 0.28, 21: 0.41, 22: 0.61, 23: 0.33, 24: 0.33}


Overall Stats:
Test percentage: 30.00 Average Accuracy: 0.693 


Bucket 1 Stats:
Test percentage: 30.00 Average Accuracy: 0.000


Bucket 2 Stats:
Test percentage: 30.00 Average Accuracy: 0.921


Bucket 3 Stats:
Test percentage: 30.00 Average Accuracy: 0.899


Bucket 4 Stats:
Test percentage: 30.00 Average Accuracy: 0.878


Bucket 5 Stats:
Test

  temp[item['pct']].append(np.sum(item['acc'][26:]*item['num'][26:])/np.sum(item['num'][26:]))
  temp[item['pct']].append(np.sum(item['acc'][31:]*item['num'][31:])/np.sum(item['num'][31:]))


In [None]:
#save_consolidated_stats(algo_overall_metric, algo_bucket_metric, n_iter)

In [None]:
#algo_overall_metric

In [None]:
#algo_bucket_metric

In [None]:
#print([][0])

In [None]:
#np.sum(algo_bucket_metric['xtra_tree'][0]['bkt_pct'])

In [None]:
import sklearn
print(sklearn.__version__)

In [None]:
print('the end at ', time.ctime())