# Overview
In this notebook, groups of inparalogs are identified using tools from ETE3. Each group is then given a name, and the resulting table is saved for use in subsequent analyses.

In [1]:
# loading libraries
import os, glob, ete3, networkx as nx, itertools, tqdm
import pandas as pd
from ete3 import Tree

# Defining functions

In [2]:
def create_edges_monophyly_assemblies_at_most_many_to_one(tree, leaves_graph, target_assemblies):
    """Link the leaves of every qualifying monophyletic clade in ``leaves_graph``.

    Clades are taken from ``tree.get_monophyletic(values=target_assemblies,
    target_attr='assembly')``. A clade qualifies when:

    * the set of assembly prefixes of its leaf names (the text before the
      first ``'.'``) is exactly the set of ``target_assemblies``, and
    * at most one target assembly occurs more than once among its leaves
      (i.e. the relationship is at most one-to-many).

    For a qualifying clade, an edge is added between every pair of its leaf
    names. Nothing is returned; ``leaves_graph`` is modified in place.
    """
    wanted = set(target_assemblies)
    for clade in tree.get_monophyletic(values = target_assemblies, target_attr = 'assembly'):
        leaf_names = clade.get_leaf_names()
        prefixes = [name.split('.')[0] for name in leaf_names]
        # the clade must contain all target assemblies and nothing else
        if set(prefixes) != wanted:
            continue
        # number of leaves contributed by each target assembly
        per_assembly = {assembly: prefixes.count(assembly) for assembly in target_assemblies}
        # at most one assembly may be represented by more than one leaf
        n_multi_copy = sum(1 for n_leaves in per_assembly.values() if n_leaves > 1)
        if n_multi_copy > 1:
            continue
        # connect all the pairwise combinations of the clade's leaves
        for node_a, node_b in itertools.combinations(leaf_names, 2):
            leaves_graph.add_edge(node_a, node_b)

def getting_assembly_monophyletic_groups(tree, target_assemblies):
    """Group the leaves of ``tree`` that form monophyletic clades of ``target_assemblies``.

    Every leaf is tagged with an ``assembly`` feature equal to the part of its
    name before the first ``'.'``. The tree is then rerooted on each node
    visited by a postorder traversal, and for each rooting
    ``create_edges_monophyly_assemblies_at_most_many_to_one`` links the leaves
    of the qualifying clades in a graph keyed by leaf name.

    Parameters
    ----------
    tree : ete3 tree
        Modified in place: leaves gain the ``assembly`` feature and the tree
        is left rerooted on the last node visited.
    target_assemblies : list of str
        Assembly prefixes a clade must be composed of.

    Returns
    -------
    list of list of str
        Leaf names of every connected component with more than one member.
    """
    # Graph keyed by leaf *name*. Leaves are not registered up front: an
    # isolated node can only ever form a single-member component, and those
    # are discarded below, so only the edges matter.
    tree_graph = nx.Graph()
    # tag each leaf with its assembly (prefix of the leaf name)
    for leaf in tree:
        leaf.add_features(assembly = leaf.name.split('.')[0])
    # NOTE(review): set_outgroup restructures the tree while traverse() is
    # still iterating over it -- confirm every intended rooting is visited.
    for node in tree.traverse('postorder'):
        # the root itself cannot be used as outgroup
        if node != tree:
            tree.set_outgroup(node)
        # collect qualifying clades under the current rooting
        create_edges_monophyly_assemblies_at_most_many_to_one(tree = tree, leaves_graph = tree_graph, target_assemblies = target_assemblies)
    # keep only the components that actually group several leaves
    return [list(group) for group in nx.connected_components(tree_graph) if len(group) > 1]

In [3]:
def getting_monophyletic_groups(tree_path):
    """Compute, for one tree file, the groups of leaves found for each species code.

    Relies on the notebook-level list ``species_codes``, which must be defined
    before this function is called.

    Parameters
    ----------
    tree_path : str
        Path of a tree file readable by ``ete3.Tree``.

    Returns
    -------
    dict
        ``{tree_file_name: {species_code: [[leaf_name, ...], ...]}}``. A species
        with no group maps to an empty list.
    """
    # file name of the tree (last path component); works with any path separator
    tree_name = os.path.basename(tree_path)
    # loading tree
    t = Tree(tree_path)
    leaf_names = t.get_leaf_names()
    # dictionary to allocate groups, one entry per species code
    dixio = {}
    if len(leaf_names) > 4:
        # NOTE(review): the same tree object is passed for every species and
        # getting_assembly_monophyletic_groups reroots it in place, so each
        # species starts from the rooting left by the previous one -- confirm
        # this is intended.
        for species_code in species_codes:
            dixio[species_code] = getting_assembly_monophyletic_groups(tree = t, target_assemblies = [species_code])
    else:
        # this can be done only because only trees for one-species trees were saved into the folder:
        # the whole tree is reported as a single group for the species it contains
        # species prefix = text before the first '.', as used for the 'assembly' feature
        species_in_leaves = {name.split('.')[0] for name in leaf_names}
        for species_code in species_codes:
            if species_code in species_in_leaves:
                # fresh list per species so the entries never alias each other
                dixio[species_code] = [list(leaf_names)]
            else:
                dixio[species_code] = []
    return {tree_name: dixio}

# Getting the inparalogs

In [4]:
# Species codes derived from the proteome file names: for a file named
# '<first>_<second>....faa' the code is the title-cased first letter of
# <first> followed by the first three characters of <second>.
species_codes = [
    name_parts[0].title()[0] + name_parts[1][0:3]
    for name_parts in (
        faa_path.rpartition('/')[2].split('.')[0].split('_')
        for faa_path in glob.glob('../data/platyhelminthes_dataset/*.faa')
    )
]

In [10]:
# Accumulator: tree file name -> {species code -> list of leaf-name groups}
tree_monophyletics = dict()

In [11]:
# Process every consensus tree (*.contree) and plain tree (*.tree) found one
# directory level below the phylogenetic_trees folder, merging the per-tree
# results into tree_monophyletics.
tree_paths = glob.glob('../results/phylogenetic_trees/*/*.contree') + glob.glob('../results/phylogenetic_trees/*/*.tree')
for tree_path in tqdm.tqdm(tree_paths):
    per_tree_groups = getting_monophyletic_groups(tree_path)
    tree_monophyletics.update(per_tree_groups)

100%|██████████| 27190/27190 [11:08<00:00, 40.66it/s]  


In [12]:
# Load the tab-separated table of gene code correspondences
# (columns used later: 'Original Name' and 'New Name')
original2new = pd.read_csv(
    '../results/misc/gene_code_correspondance.tsv',
    sep = '\t',
)

In [13]:
# Preview the first five rows of the correspondence table
original2new.head(n = 5)

Unnamed: 0,Original Name,New Name
0,TsM_000000100,Tsol.1
1,TsM_000000200,Tsol.2
2,TsM_000000300,Tsol.3
3,TsM_000000400,Tsol.4
4,TsM_000000500,Tsol.5


In [14]:
# Lookup from the new gene code to the original gene name, built column-wise
# (no per-row iteration); on duplicate 'New Name' values the last row wins.
new2original_dict = dict(zip(original2new['New Name'], original2new['Original Name']))

In [15]:
# Dictionary to map species codes to species names
code2species = {
    'Tsol': 'T. solium',
    'Mlig': 'M. lignano',
    'Treg': 'T. regenti',
    'Hmic': 'H. microstoma',
    'Tsag': 'T. saginata',
    'Hdim': 'H. diminuta',
    'Fhep': 'F. hepatica',
    'Smat': 'S. mattheei',
    'Scur': 'S. curassoni',
    'Emul': 'E. multilocularis',
    'Sman': 'S. mansoni',
    'Tasi': 'T. asiatica',
    'Smar': 'S. margrebowiei',
    'Csin': 'C. sinensis',
    'Egra': 'E. granulosus',
    'Mcor': 'M. corti',
    'Shae': 'S. haematobium',
    'Sjap': 'S. japonicum',
    'Ecan': 'E. canadensis',
    'Smed': 'S. mediterranea',
    'Oviv': 'O. viverrini'
}

# One single-row DataFrame per group of genes; reset on every run of the cell
table_data = []

# tree_monophyletics maps tree file name -> {species code -> list of gene groups}
for key, value in tqdm.tqdm(tree_monophyletics.items()):
    # family code = tree file name without its last extension
    family_code = key.rpartition('.')[0]
    for species, genes in value.items():
        # species codes missing from code2species are reported as 'Unknown'
        species_name = code2species.get(species, 'Unknown')
        # groups are numbered from 1 within each (family, species) pair;
        # a species without groups simply contributes no rows
        for idx, gene_list in enumerate(genes, start=1):
            # translate back to the original gene names; a gene absent from
            # the correspondence table raises a KeyError that names it
            gene_codes = ', '.join(new2original_dict[x] for x in gene_list)
            monophyletic_group_code = f"{family_code}_{species.upper()}_G{idx}"
            table_data.append(pd.DataFrame.from_dict({'Genes': [gene_codes], 'monophyletic_group_code': [monophyletic_group_code], 'Species': [species_name]}))

100%|██████████| 27190/27190 [00:08<00:00, 3043.80it/s]


In [16]:
# Stack the single-row frames into one table. Each of them carries index
# label 0, so the index is rebuilt to give every row a unique label.
inparalogs_table = pd.concat(table_data, ignore_index = True)

In [17]:
# Save the group composition table as a tab-separated file, without the index column
inparalogs_table.to_csv(
    '../results/misc/inparalogs_group_composition_final.tsv',
    sep = '\t',
    index = False,
)