In [2]:
import pandas as pd

In [3]:
df_JCM4529 = pd.read_csv("JCM4529_anotado.metadata.CBPs.tsv", sep='\t')
df_NRRL_B12102 = pd.read_csv("NRRL_B-12102_anotado.metadata.CBPs.tsv", sep='\t')
df_SID8161 = pd.read_csv("SID8161_anotado.metadata.CBPs.tsv", sep='\t')
df_NRRL_B2410 = pd.read_csv("NRRL_B-2410_anotado.metadata.CBPs.tsv", sep='\t')

data_frames = [df_JCM4529 , df_NRRL_B12102, df_SID8161, df_NRRL_B2410]

In [4]:
Doms_JCM4529=df_JCM4529['Domain architecture']
Doms_NRRL_B12102=df_NRRL_B12102['Domain architecture']

In [5]:
#Transforming the strings of domain architectures in lists
def domain_list(domain_architecture):
    final_list = []
    p_domain = ""
    for i in domain_architecture:
        if i == '|':
            final_list.append(p_domain)
            p_domain = ''
        elif (i != ' ') and (i != '>'):
            p_domain = p_domain+i
            
    return final_list

#Extracting the domains of a given type from a dataframe
def proteins_of_type(protein_type, cbps):
    architecture=[]
    for i in range(len(cbps)):
        if cbps['Core Biosynthetic Protein type'][i]==protein_type:
            architecture.append(domain_list(cbps['Domain architecture'][i]))
                   
    return architecture

In [6]:
#From a pair of lists we get the first entry k in the shorter list matching some entry j in the larger list. 
#If such a k exists, the list [[k,j]] (with the smallest j) is returned. Otherwise an empty list is returned. 
def first_matching_pairs(short_list, long_list):
    coordinates = []
    i = len(coordinates)
    k = 0
    while k < len(short_list) and i == 0:
        for j in range(len(long_list)):
            if short_list[k] == long_list[j]:
                coordinates.append([k,j])
        k = k+1
        i = len(coordinates)
    return coordinates

#From two lists it returns the list of all possible [k,j] such that the k-th entry of the short list and
#and the j-th entry of the long list are possible first entries of the largest common subsequence of 
#both lists. 
def departing_coordinates(short_list, long_list):
    d_coordinates = []
    
    for i in range(len(short_list)):
        if len(d_coordinates) == 0:
            d_coordinates = first_matching_pairs(short_list, long_list)
        
        else:
            n_coordinates = first_matching_pairs(short_list[d_coordinates[0][0]+1:len(short_list)], long_list[0:d_coordinates[0][1]])
            n_coordinates_adj = []
            for coordinate in n_coordinates:
                n_coordinates_adj.append([d_coordinates[0][0]+coordinate[0]+1,coordinate[1]])
            
            d_coordinates = n_coordinates_adj+d_coordinates
            
    return d_coordinates
    
    


In [7]:
#It returns the list(s) of [i,j], where the list of short_list[i]=long_list[j] is the largest_common_subsequence.  
def largest_common_subsequence(short_list, long_list):
    
    approximations = []
    i = 0
    k = 0
    while (i < (len(short_list))) and (k == 0):
        if len(approximations) == 0:
            new_coordinates = departing_coordinates(short_list ,long_list)
            n_approximations = []
            for j in new_coordinates:
                n_approximations.append([j])
                
            if len(n_approximations) == 0:
                k = 1
            approximations = n_approximations
            
        else:
            n_approximations = []
            for j in range(len(approximations)):
                departure = approximations[j][-1]
                further_coordinates = departing_coordinates(short_list[departure[0]+1:len(short_list)],long_list[departure[1]+1:len(long_list)])
                if len(further_coordinates) > 0:
                    n_approximations_j = []
                    for coordinates in further_coordinates:
                        n_approximations_j.append(approximations[j]+[[departure[0]+coordinates[0]+1,departure[1]+coordinates[1]+1]])
                    
                    n_approximations = n_approximations + n_approximations_j
            
            if len(n_approximations) == 0:
                k = 1
            else:
                approximations = n_approximations
            
        i = i+1
        
        
    return approximations

In [8]:
#
def domain_selection(short_list, long_list):
    limits = largest_common_subsequence(short_list, long_list)[0]
    left_limits = limits[0]
    right_limits = limits[-1]
    left_extension = left_limits[1]-left_limits[0]
    right_extension = right_limits[1]+(len(short_list)-right_limits[0])
    
    return [short_list, long_list[left_extension:right_extension]]

In [9]:
def ordering_lists(list_0, list_1):
    if len(list_1)<len(list_0):
        return [[list_1, list_0],0]
    else:
        return [[list_0, list_1],1]

In [17]:
print(fixed_t1pks_string )

['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP']


In [18]:
#Proof 1 on T1PKS 
#Fixed domain string 

fixed_t1pks_string = domain_list(df_JCM4529['Domain architecture'][48])
print(fixed_t1pks_string )


for data_frame in [df_NRRL_B12102 , df_SID8161 , df_NRRL_B2410]:
    compared_list = proteins_of_type('T1PKS', data_frame)
    list_selected_domains = []
    for list_ in compared_list:
        order = ordering_lists(fixed_t1pks_string, list_)
        selection = domain_selection(order[0][0], order[0][1])
        if order[1] == 0:
            list_selected_domains.append([selection[1], selection[0]])
        else: 
            list_selected_domains.append(selection)
            
    for pair in list_selected_domains:
        print(pair)

['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP']
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'KR', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'KR']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'KR', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'DH']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'KR', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'A

In [21]:
#Proof 2 on T1PKS 
#Fixed domain string 

fixed_t1pks_string = domain_list(df_JCM4529['Domain architecture'][61])
print("Fixed domain string")
print(fixed_t1pks_string )

print("Domain selection of fixed domain string vs. each domain string of the same type")
for data_frame in [df_NRRL_B12102 , df_SID8161 , df_NRRL_B2410]:
    compared_list = proteins_of_type('T1PKS', data_frame)
    list_selected_domains = []
    for list_ in compared_list:
        order = ordering_lists(fixed_t1pks_string, list_)
        selection = domain_selection(order[0][0], order[0][1])
        if order[1] == 0:
            list_selected_domains.append([selection[1], selection[0]])
        else: 
            list_selected_domains.append(selection)
            
    for pair in list_selected_domains:
        print(pair)

Fixed domain string
['Docking', 'KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR', 'T/ACP', 'KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'ER', 'KR', 'T/ACP', 'KS', 'KS_C']
Domain selection of fixed domain string vs. each domain string of the same type
[['Docking', 'KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR', 'T/ACP', 'KS', 'KS_C', 'KS_Ce'], ['Docking', 'KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'KR', 'T/ACP', 'KS', 'KS_C', 'KS_Ce']]
[['Docking', 'KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR'], ['Docking', 'KS', 'KS_C', 'KS_Ce', 'AT', 'KR']]
[['Docking', 'KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR', 'T/ACP', 'KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'ER', 'KR'], ['Docking', 'KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'KR', 'T/ACP', 'KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'KR']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR', 'T/ACP'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'T/ACP']]
[['KS', 'KS_C', 'KS_Ce', 'AT', 'PKS_DE', 'KR', 'T/ACP', 'KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'ER', 'KR'], ['KS', 'KS_C', 'KS_Ce', 'AT', 'DH', 'KR']]
[['Docking', 'KS'

In [22]:
#Proof 1 on NRPS
#Fixed domain string 

fixed_t1pks_string = domain_list(df_JCM4529['Domain architecture'][46])
print("Fixed domain string")
print(fixed_t1pks_string )

print("Domain selection of fixed domain string vs. each domain string of the same type")
for data_frame in [df_NRRL_B12102 , df_SID8161 , df_NRRL_B2410]:
    compared_list = proteins_of_type('NRPS', data_frame)
    list_selected_domains = []
    for list_ in compared_list:
        order = ordering_lists(fixed_t1pks_string, list_)
        selection = domain_selection(order[0][0], order[0][1])
        if order[1] == 0:
            list_selected_domains.append([selection[1], selection[0]])
        else: 
            list_selected_domains.append(selection)
            
    for pair in list_selected_domains:
        print(pair)

Fixed domain string
['C', 'A', 'A_C', 'T/ACP', 'C', 'A', 'KR']
Domain selection of fixed domain string vs. each domain string of the same type


IndexError: list index out of range

In [23]:
largest_common_subsequence(domain_list(df_NRRL_B12102['Domain architecture'][15]),domain_list(df_JCM4529['Domain architecture'][46]))

[[[1, 0]], [[1, 4]]]