# Overview
In this notebook tables presented in this paper are either created or referenced (in case they were created in a previous notebook, during analysis of data).

## Preliminary work: classify analyzed proteins
Many of the following tables use information for each protein analyzed in this paper and the classifications performed. Recapitulating this into a python dictionary in order to make table creation easier.

In [3]:
import glob
from Bio import SeqIO
import pandas as pd
import tqdm

In [4]:
# create a dictionary of proteins for each species
# get empty dictionaries for all proteins of each species

fasta_files = glob.glob('../data/platyhelminthes_dataset/*')
# Dictionary to hold everything
species_dict = {}
# Define the categories for the innermost dictionary
categories = ["SignalP", "SecretomeP", "TMHMM", "TargetP", "Secretome", "Transmembranome", "Homologs", "Singleton", "Inparalog"]
# Process each FASTA file
for fasta_file in fasta_files:
    # Extract species name from the file path
    parts = fasta_file.split('.')
    species_name = parts[-6].rpartition('/')[2].split('_')
    species = f"{species_name[0].capitalize()[0]}. {species_name[1]}"
    
    # Initialize the species dictionary if not already done
    if species not in species_dict:
        species_dict[species] = {}
    
    # Parse the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Each record ID becomes a key in the species dictionary
        species_dict[species][record.id] = {key: {} for key in categories}

In [5]:
new2old_dict = {row['Original Name']: row['New Name'] for index,row in pd.read_csv('../results/misc/gene_code_correspondance.tsv', sep = '\t').iterrows()}

In [6]:
secretome_and_transmembranome_predictions = pd.read_csv('../results/secretome_and_transmembranome_predictions/secretome_and_transmembranome_predictions.tsv', sep ='\t' ).applymap(lambda x: new2old_dict.get(x, x))

In [7]:
secretome_and_transmembranome_predictions.head()

Unnamed: 0,ID,SecretomeP,SignalP,TargetP,TMHMM,Secretome,Transmembranome,Mitochondrial
0,Smed.18673,Y,N,N,N,Y,N,N
1,Smed.18733,Y,N,N,N,Y,N,N
2,Smed.1254,Y,N,N,N,Y,N,N
3,Smed.21065,Y,N,N,N,Y,N,N
4,Smed.11902,Y,N,N,N,Y,N,N


In [8]:
[x.split('/')[-1].split('.')[-6].split('_')[0].capitalize()[0]+'. '+x.split('/')[-1].split('.')[-6].split('_')[1] for x 
 in glob.glob('../data/platyhelminthes_dataset/*')]

['T. solium',
 'M. lignano',
 'T. regenti',
 'H. microstoma',
 'T. saginata',
 'H. diminuta',
 'F. hepatica',
 'S. mattheei',
 'S. curassoni',
 'E. multilocularis',
 'S. mansoni',
 'T. asiatica',
 'S. margrebowiei',
 'C. sinensis',
 'E. granulosus',
 'M. corti',
 'S. haematobium',
 'S. japonicum',
 'E. canadensis',
 'S. mediterranea',
 'O. viverrini']

In [9]:
# get the SignalP, SecretomeP, TMHMM, TargetP and classification into Secretome, Transmembranome or neither for each protein

import pandas as pd
import glob
from Bio import SeqIO
from concurrent.futures import ThreadPoolExecutor

# Assuming FASTA files are structured and named in a specific way that allows species identification
fasta_files = glob.glob('../data/platyhelminthes_dataset/*')

# Function to extract species name from FASTA file path
def get_species_from_fasta_filename(filename):
    species_name = filename.split('/')[-1].split('.')[-6].split('_')[0].capitalize()[0]+'. '+filename.split('/')[-1].split('.')[-6].split('_')[1]
    return species_name

# Dictionary to hold species, and their protein information
species_dict = {}
for fasta_file in fasta_files:
    species = get_species_from_fasta_filename(fasta_file)
    if species not in species_dict:
        species_dict[species] = {}
    for record in SeqIO.parse(fasta_file, 'fasta'):
        species_dict[species][record.id] = {key: {} for key in ["SignalP", "SecretomeP", "TMHMM", "TargetP", "Secretome", "Transmembranome", "Homologs", "Singleton", "Inparalog"]}

def get_species_from_protein_id(protein_id):
    # Search for the species in the dictionary by protein ID
    for species, proteins in species_dict.items():
        if protein_id in proteins:
            return species
    return None

def update_protein_info(row):
    protein_id = row['ID']
    species = get_species_from_protein_id(protein_id)
    if species:
        for category in ['SecretomeP', 'SignalP', 'TargetP', 'TMHMM', 'Secretome', 'Transmembranome', 'Mitochondrial']:  # Skip the 'ID' column
            species_dict[species][protein_id][category] = row[category]

# Use ThreadPoolExecutor to update the dictionary
with ThreadPoolExecutor(max_workers=32) as executor:
    executor.map(update_protein_info, [row for index, row in secretome_and_transmembranome_predictions.iterrows()])


In [10]:
secretome_and_transmembranome_predictions.query("ID.str.contains('Tsol.1')")

Unnamed: 0,ID,SecretomeP,SignalP,TargetP,TMHMM,Secretome,Transmembranome,Mitochondrial
285305,Tsol.1333,Y,N,N,N,Y,N,N
285310,Tsol.1894,Y,N,N,N,Y,N,N
285311,Tsol.11358,Y,N,N,Y,N,Y,N
285314,Tsol.10360,Y,Y,N,N,Y,N,N
285318,Tsol.12227,Y,N,N,Y,N,Y,N
...,...,...,...,...,...,...,...,...
297546,Tsol.12133,N,N,N,Y,N,Y,N
297547,Tsol.10652,N,Y,N,Y,N,Y,N
297558,Tsol.11696,N,N,N,N,N,N,N
297559,Tsol.1392,N,N,N,N,N,N,N


In [11]:
# get status as homolog or singleton, as well as inparalog status

In [12]:
inparalogs_list = [new2old_dict.get(gene) for genes in pd.read_csv('../results/misc/inparalogs_group_composition_final.tsv', sep ='\t').Genes.to_list() for gene in genes.split(', ')]

In [13]:
# Path to the dataset of homolog groups
homolog_files = glob.glob('../results/homolog_groups/*')
# Dictionary to track group size and proteins
group_info = {}
# Populate group information
for file in homolog_files:
    proteins = list(SeqIO.parse(file, 'fasta'))
    group_size = len(proteins)
    for protein in proteins:
        group_info[protein.id] = {'size': group_size, 'group_file': file}
# Update species_dict based on homolog group size and inparalog status
for species, proteins in tqdm.tqdm(species_dict.items()):
    for protein_id in proteins:
        if protein_id in group_info:
            if group_info[protein_id]['size'] == 1:
                species_dict[species][protein_id]['Homologs'] = 'N'
                species_dict[species][protein_id]['Singleton'] = 'Y'
            else:
                species_dict[species][protein_id]['Homologs'] = 'Y'
                species_dict[species][protein_id]['Singleton'] = 'N'
        # Inparalog status update
        species_dict[species][protein_id]['Inparalog'] = 'Y' if protein_id in inparalogs_list else 'N'

100%|██████████| 21/21 [09:35<00:00, 27.40s/it]


Creating a dictionary with correspondance between species and genome codes (needed to retrieve assembly data from WormBase ParaSite)

In [14]:
code2assembly = {'CSIN':  'CSKR.v2',
                 'ECAN':  'ECANG7',
                 'EGRA':  'EGRAN001',
                 'EMUL':  'EMULTI002',
                 'FHEP':  'Fasciola_10x_pilon',
                 'HDIM':  'H.diminuta_WMSil1',
                 'HMIC':  'HMN_v3',
                 'MLIG':  'Mlig_3_7',
                 'MCOR':  'M_corti_Specht_Voge_0011_upd',
                 'OVIV':  'OpiViv1.0',
                 'SCUR':  'tdSchCurr1.1',
                 'SHAE':  'UoM_Shae.V3',
                 'SJAP':  'ASM636876v1',
                 'SMAN':  'SM_V10',
                 'SMAR':  'tdSchMarg1.1',
                 'SMAT':  'tdSchMatt1.1',
                 'SMED':  'SmedGD_c1.3',
                 'TASI':  'Taenia_asiatica_TASYD01_v1',
                 'TSAG':  'ASM169307v2',
                 'TSOL':  'Tsolium_Mexico_v1',
                 'TREG':  'tdTriRege1.1'}

In [15]:
assemblies = list(code2assembly.values())

In [16]:
platy_data = pd.read_csv('../data/species_Platyhelminthes--___.csv').query("Assembly in @assemblies")

In [17]:
platy_data.shape

(21, 10)

Now some InterPro annotations

In [19]:
interpro_header = ['Protein accession',
                   'Sequence MD5 digest',
                   'Sequence length',
                   'Analysis',
                   'Signature accession', 
                   'Signature description', 
                   'Start location',
                   'Stop location',
                   'Score',
                   'Status',
                   'Date', 
                   'InterPro annotations - accession',
                   'InterPro annotations - description', 
                   'GO annotations',
                   'Pathways annotations']

In [20]:
interpro_table = pd.concat([pd.read_csv(tsv_file, names = interpro_header, sep = '\t') for tsv_file in glob.glob('../results/functional_mapping/interpro_annotation/*')])

In [21]:
interpro_table.head()

Unnamed: 0,Protein accession,Sequence MD5 digest,Sequence length,Analysis,Signature accession,Signature description,Start location,Stop location,Score,Status,Date,InterPro annotations - accession,InterPro annotations - description,GO annotations,Pathways annotations
0,maker-E.canG7_contigs_5369-pred_gff_augustus_m...,8783dc2c9fe014f0bd2933b3bba33279,115,Pfam,PF00102,Protein-tyrosine phosphatase,1,113,1.6e-19,T,15-04-2024,IPR000242,"Tyrosine-specific protein phosphatase, PTPase ...",,
1,maker-E.canG7_contigs_5369-pred_gff_augustus_m...,8783dc2c9fe014f0bd2933b3bba33279,115,SUPERFAMILY,SSF52799,(Phosphotyrosine protein) phosphatases II,1,113,3.68e-25,T,15-04-2024,IPR029021,Protein-tyrosine phosphatase-like,,
2,maker-E.canG7_contigs_5369-pred_gff_augustus_m...,8783dc2c9fe014f0bd2933b3bba33279,115,Gene3D,G3DSA:3.90.190.10,Protein tyrosine phosphatase superfamily,1,114,1.1000000000000001e-27,T,15-04-2024,IPR029021,Protein-tyrosine phosphatase-like,,
3,maker-E.canG7_contigs_5369-pred_gff_augustus_m...,8783dc2c9fe014f0bd2933b3bba33279,115,PANTHER,PTHR19134,RECEPTOR-TYPE TYROSINE-PROTEIN PHOSPHATASE,1,113,1.8e-21,T,15-04-2024,-,-,,
4,maker-E.canG7_contigs_5369-pred_gff_augustus_m...,8783dc2c9fe014f0bd2933b3bba33279,115,ProSiteProfiles,PS50055,PTP type protein phosphatase domain profile.,1,115,14.818786,T,15-04-2024,IPR000242,"Tyrosine-specific protein phosphatase, PTPase ...",,


In [22]:
code2interpro = {row['Protein accession']: row['InterPro annotations - accession']+': '+row['InterPro annotations - description'] for index,row in tqdm.tqdm(interpro_table.iterrows())}

605473it [00:49, 12199.91it/s]


## Table 1: Number of inparalogs and singletons identified in each species

In [23]:
import os
if not os.path.exists('../results/tables'):
    os.mkdir('../results/tables')

In [24]:
# Assuming species_dict is structured as {species: {protein_id: {categories...}}}
summary_data = []

for species, proteins in species_dict.items():
    total_proteins = len(proteins)
    homologs_count = sum(1 for details in proteins.values() if details.get('Homologs') == 'Y')
    singletons_count = sum(1 for details in proteins.values() if details.get('Singleton') == 'Y')
    inparalogs_count = sum(1 for details in proteins.values() if details.get('Inparalog') == 'Y')
    
    summary_data.append({
        'Species': species,
        'Total Proteins': total_proteins,
        'Homologs': homologs_count,
        'Singletons': singletons_count,
        'Inparalogs': inparalogs_count
    })

table_one = pd.DataFrame(summary_data)

In [25]:
table_one.to_csv('../results/tables/table_1.tsv', sep = '\t', index = False)

In [26]:
table_one

Unnamed: 0,Species,Total Proteins,Homologs,Singletons,Inparalogs
0,T. solium,12356,7741,4615,154
1,M. lignano,49013,41292,7721,38828
2,T. regenti,14478,10466,4012,3259
3,H. microstoma,10139,8120,2019,1004
4,T. saginata,13160,10842,2318,673
5,H. diminuta,15165,9937,5228,3023
6,F. hepatica,9731,4666,5065,247
7,S. mattheei,10770,8108,2662,478
8,S. curassoni,10084,8804,1280,587
9,E. multilocularis,10663,9538,1125,515


## Supplementary Table 1
This table was created manually, parsing data from WormBase ParaSite.

## Supplementary Table 2: Summary of results for secreted and transmembrane protein-coding genes in each species analyzed. Mollusca and Annelida phyla were included.

First getting species2n50 and species2ncontig dictionaries

In [28]:
# Dictionaries
species2n50 = {}
species2ncontig = {}

for index,entry in platy_data.iterrows():
    genus, species = entry["Species Name"].split(' ')
    key = f"{genus[0]}. {species}"
    species2n50[key] = int(entry["N50"].replace(",", ""))
    species2ncontig[key] = entry["Number of Scaffolds"]

In [30]:
# Load the table with column names Genes and Annot
file_path = '/home/mauricio/mauricio_PROYECTOS/PEIPER_PLATYS/platyhelminthes_inparalogs/results/functional_mapping/platyhelminthes_dataset.longest_isoforms.all.all.diamondCOGs.emapper.goatools_format.annotations'
annotations_df = pd.read_csv(file_path, sep='\t', names=['Genes', 'Annot'])

# Update species_dict based on annotations
for index, row in annotations_df.iterrows():
    gene_id = row['Genes']
    go_annotation = 'Y' if pd.notna(row['Annot']) and row['Annot'].strip() != '' else 'N'
    
    # Assuming gene_id is directly accessible as a key in species_dict
    for species, proteins in species_dict.items():
        if gene_id in proteins:
            proteins[gene_id]['GOAnnotation'] = go_annotation

In [32]:
# Initialize an empty list to store summary data for each species
summary_data = []

# Iterate through each species in the dictionary
for species, proteins in species_dict.items():
    total_proteins = len(proteins)
    secreted_genes = sum(1 for details in proteins.values() if details.get('Secretome') == 'Y')
    transmembrane_genes = sum(1 for details in proteins.values() if details.get('Transmembranome') == 'Y')
    go_annotated_genes = sum(1 for details in proteins.values() if details.get('GOAnnotation') == 'Y')
    
    # Count the total number of Homologs and Inparalogs
    total_homologs = sum(1 for details in proteins.values() if details.get('Homologs') == 'Y')
    total_inparalogs = sum(1 for details in proteins.values() if details.get('Inparalog') == 'Y')
    
    # Count those Homologs and Inparalogs that also have GO annotations
    homologs_with_go = sum(1 for details in proteins.values() if details.get('Homologs') == 'Y' and details.get('GOAnnotation') == 'Y')
    inparalogs_with_go = sum(1 for details in proteins.values() if details.get('Inparalog') == 'Y' and details.get('GOAnnotation') == 'Y')
    unannotated_inparalogs = sum(1 for details in proteins.values() if details.get('Inparalog') == 'Y' and details.get('GOAnnotation') == 'N')

    # Calculating percentages
    secreted_percent = (secreted_genes / total_proteins * 100) if total_proteins > 0 else 0
    transmembrane_percent = (transmembrane_genes / total_proteins * 100) if total_proteins > 0 else 0
    go_annotated_percent = (go_annotated_genes / total_proteins * 100) if total_proteins > 0 else 0
    homologs_with_go_percent = (homologs_with_go / total_homologs * 100) if total_homologs > 0 else 0
    inparalogs_with_go_percent = (inparalogs_with_go / total_inparalogs * 100) if total_inparalogs > 0 else 0
    unannotated_inparalogs_percent = (unannotated_inparalogs / total_inparalogs * 100) if total_inparalogs > 0 else 0

    # Append data for this species to the list
    summary_data.append({
        'Species': species,
        'Total number of scaffolds': species2ncontig.get(species),
        'N50': species2n50.get(species),
        'Total number of proteins': total_proteins,
        'Secreted protein-coding genes (%)': secreted_percent,
        'Transmembrane protein-coding genes (%)': transmembrane_percent,
        'Genes with GO annotation (%)': go_annotated_percent,
        'Homologs with GO annotation (%)': homologs_with_go_percent,
        'Inparalogs with GO annotation (%)': inparalogs_with_go_percent,
        'Unannotated inparalogs (%)': unannotated_inparalogs_percent
    })

# Create a DataFrame from the summary data
supp_table_two = pd.DataFrame(summary_data)

In [33]:
supp_table_two.to_csv('../results/tables/supplementary_table_2.tsv', sep ='\t', index = False)

In [34]:
supp_table_two

Unnamed: 0,Species,Total number of scaffolds,N50,Total number of proteins,Secreted protein-coding genes (%),Transmembrane protein-coding genes (%),Genes with GO annotation (%),Homologs with GO annotation (%),Inparalogs with GO annotation (%),Unannotated inparalogs (%)
0,T. solium,11237,67829,12356,7.389123,17.651343,71.819359,80.299703,74.025974,0.0
1,M. lignano,5269,244885,49013,8.387571,22.573603,71.072981,73.82786,73.362007,0.0
2,T. regenti,573,125840967,14478,8.281531,20.251416,69.836994,78.081406,62.964099,0.0
3,H. microstoma,7,25815276,10139,6.973074,19.666634,71.841404,78.522167,55.179283,0.0
4,T. saginata,3626,585232,13160,10.471125,19.24772,58.168693,62.506918,42.793462,0.0
5,H. diminuta,719,2331359,15165,8.578965,15.522585,55.351137,61.406863,20.410189,0.0
6,F. hepatica,2816,1901411,9731,7.470969,18.816155,74.966602,93.120446,75.303644,0.0
7,S. mattheei,98,48524730,10770,5.561746,21.058496,77.437326,81.857425,76.569038,0.0
8,S. curassoni,370,46058712,10084,5.474018,21.410155,79.125347,82.451159,83.816014,0.0
9,E. multilocularis,1217,13762452,10663,7.849573,20.003751,69.90528,73.631789,66.019417,0.0


## Supplementary Table 3: GO terms enrichment analysis results
This table has already being produced at the **functional_enrichment_analysis.ipynb** notebook.

## Supplementary Table 4: 
The raw table for this Supp. Table was created in the **molecular_evolution_analyses.ipynb** notebook, as well as in the notebook defining inparalogs and computing InterPro annotation

In [46]:
import pandas as pd

In [47]:
# load raw table

In [48]:
codeml_results = pd.read_csv('../results/molecular_evolution_analyses/codeml_results/codeml_results.tsv', sep ='\t').drop('index', axis = 1)

In [49]:
df = codeml_results
# Create a dictionary for LnL values when Model is M1
lnl_dict = df[df['Model'] == 'M1'].set_index('Family')['LnL'].to_dict()

# Subset the DataFrame to keep only Model M2 and rename the LnL column to LnL_2
m2_df = df[df['Model'] == 'M2'].copy()
m2_df.rename(columns={'LnL': 'LnL_2'}, inplace=True)

# Assign a new column for LnL_1 using the dictionary
m2_df['LnL_1'] = m2_df['Family'].map(lnl_dict)

In [50]:
supp_table_4 = m2_df

In [51]:
# add Genes

In [52]:
mgc2genes = {row['monophyletic_group_code']: row['Genes'] for index,row in pd.read_csv('../results/misc/inparalogs_group_composition_final.tsv', sep = '\t').iterrows()}
mgc2species = {row['monophyletic_group_code']: row['Species'] for index,row in pd.read_csv('../results/misc/inparalogs_group_composition_final.tsv', sep = '\t').iterrows()}

In [53]:
# add Genes and Species
supp_table_4 = supp_table_4.assign(Genes = lambda df: [mgc2genes[row['Family']] for index,row in df.iterrows()],
                                   Species = lambda df: [mgc2species[row['Family']] for index,row in df.iterrows()])

In [54]:
supp_table_4 = supp_table_4[['Family', 'Genes', 'Species', 'LnL_1', 'LnL_2', 'Likelihood_ratio', 'Num. sites (BEB pp > 0.95)', 'Num. sites (BEB pp > 0.99)',
              'dN', 'dS', 'dN/dS', 'Significativo']]

In [55]:
# set table in correct order

In [56]:
supp_table_4

Unnamed: 0,Family,Genes,Species,LnL_1,LnL_2,Likelihood_ratio,Num. sites (BEB pp > 0.95),Num. sites (BEB pp > 0.99),dN,dS,dN/dS,Significativo
0,F79292_TASI_G1,"TASs00700g11196m00001, TASs00797g11394m00001, ...",T. asiatica,-561.882472,-561.882470,0.000004,0.0,0.0,0.0000,0.0000,0.5359,-
3,F8315_TASI_G1,"TASs00257g09507m00001, TASs00257g09505m00001",T. asiatica,-75.424929,-75.424872,0.000114,0.0,0.0,0.0000,0.0000,0.0000,-
4,F31918_MLIG_G1,"BOX15_Mlig003887g1, BOX15_Mlig003887g3, BOX15_...",M. lignano,-1319.672893,-1319.539216,0.267354,0.0,0.0,0.0000,0.0000,0.7685,-
6,F23474_MLIG_G1,"BOX15_Mlig006866g1, BOX15_Mlig006866g2, BOX15_...",M. lignano,-874.963685,-874.963684,0.000002,0.0,0.0,0.0000,0.0000,0.3769,-
8,F53749_MLIG_G1,"BOX15_Mlig031608g3, BOX15_Mlig031608g2, BOX15_...",M. lignano,-1140.362246,-1140.362241,0.000010,0.0,0.0,0.0058,0.0259,0.2219,-
...,...,...,...,...,...,...,...,...,...,...,...,...
39782,F129217_MLIG_G1,"BOX15_Mlig022362g1, BOX15_Mlig026567g1, BOX15_...",M. lignano,-1717.806086,-1711.561286,12.489600,74.0,34.0,9.3191,0.7274,12.8117,***
39784,F93693_MLIG_G1,"BOX15_Mlig018049g7, BOX15_Mlig018049g1",M. lignano,-1966.452456,-1966.014225,0.876462,0.0,0.0,0.0000,0.0000,124.1390,-
39787,F114209_MLIG_G1,"BOX15_Mlig011283g1, BOX15_Mlig011283g6, BOX15_...",M. lignano,-213.669497,-213.669409,0.000176,0.0,0.0,0.0000,0.0734,0.0000,-
39789,F80846_MLIG_G1,"BOX15_Mlig033116g1, BOX15_Mlig033116g2, BOX15_...",M. lignano,-1722.066589,-1718.970376,6.192426,0.0,0.0,0.0241,0.0079,3.0504,***


In [57]:
# Function to get InterPro annotation for a gene
def get_interpro_annotation(gene):
    return code2interpro.get(gene, "")

# Apply the function to each row to get InterPro annotations
supp_table_4['InterPro Annotation'] = supp_table_4['Genes'].apply(lambda x: '\n'.join([get_interpro_annotation(g) for g in x.split(', ')]))


In [58]:
# rename and save tsv
supp_table_4.rename({'Family': 'Inparalogs cluster code #DB',
                     'LnL_1': 'lnL (M1a = nearly neutral)',
                     'LnL_2': 'lnL (M2a = selection allowed)',
                     'dN/dS': 'Global dN/dS',
                     'Likelihood_ratio': 'LRT',
                     'Significativo': 'Significance (LRT > 6)',
                     'Num. sites (BEB pp > 0.95)': '# sites (BEB pp > 0.95)',
                     'Num. sites (BEB pp > 0.95)': '# sites (BEB pp > 0.95)', 
                     'InterPro Annotation': 'InterPro Annotations'}, axis = 1).to_csv('../results/tables/supplementary_table_4.tsv', sep = '\t', index = False)

## Supplementary Table 5
This table has already being produced (see notebooks/scripts in this folder).

## Supplementary Table 6
This table has already being produced at the **renaming_set_filtering_and_homology_inferences.ipynb** notebook.