# Overview
For Supp. Table 6, all cophenetic distances for _S. mansoni_ inparalog groups must be calculated. This script performs that calculation using ape (an R package), taking into account only cases where a phylogenetic tree was inferred (i.e. groups with at least five sequences).

In [12]:
# import libraries
import ete3
import glob
import pandas as pd

In [13]:
# Load the inparalog group composition table, then keep only the S. mansoni rows
_all_inparalog_groups = pd.read_csv('../results/misc/inparalogs_group_composition_final.tsv', sep = '\t')
inparalogs_smansoni = _all_inparalog_groups.query("Species == 'S. mansoni'")

In [14]:
# Lookup from the WormBase gene name ('Original Name') to the internal code
# ('New Name'), built by pairing the two columns of the correspondence table
_gene_code_table = pd.read_csv('../results/misc/gene_code_correspondance.tsv', sep = '\t')
original2code = dict(zip(_gene_code_table['Original Name'], _gene_code_table['New Name']))

In [15]:
# get families with inparalogous genes for S. mansoni

In [16]:
# Family identifier = the part of each monophyletic group code before its first '_'
smansoni_inparalogous_families = [
    group_code.partition('_')[0]
    for group_code in inparalogs_smansoni.monophyletic_group_code.to_list()
]

In [17]:
# get inparalogous genes for S. mansoni in each family, using internal code

In [18]:
# family -> list of inparalog groups; each group is a list of internal gene codes.
# Genes missing from original2code are stored as None (dict.get default).
family2inparalogs = {}
for _, record in inparalogs_smansoni.iterrows():
    gene_names = record['Genes'].split(', ')
    family_id = record['monophyletic_group_code'].split('_')[0]
    coded_group = [original2code.get(gene_name) for gene_name in gene_names]
    # create the family entry on first sight, then add this group to it
    family2inparalogs.setdefault(family_id, []).append(coded_group)


In [19]:
import rpy2

In [20]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [29]:
# Collect the codeml M0 output files whose path contains 'SMAN'.
# glob.glob returns matches in arbitrary order, so the paths are sorted to keep
# the row order of the table built from them reproducible between runs.
codeml_out_sman = sorted(
    path
    for path in glob.glob('../results/molecular_evolution_analyses/codeml_results/*/*/M0*/out')
    if 'SMAN' in path
)

In [31]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import StrVector

# Switch on rpy2's automatic pandas DataFrame <-> R data.frame conversion
pandas2ri.activate()

# Import every required R package, binding each handle to a Python name
# identical to the package name (imports happen in the order listed)
base, utils, treeio, magrittr, corrr, dplyr, ape = (
    rpackages.importr(r_package)
    for r_package in ('base', 'utils', 'treeio', 'magrittr', 'corrr', 'dplyr', 'ape')
)

# Define the R helper inside the embedded R session; later ro.r(...) snippets
# call it by name.
ro.r('''
library(treeio)
library(magrittr)
library(corrr)
library(dplyr)
library(ape)

# Pairwise cophenetic distances for the tree stored in a codeml mlc result,
# using the dS values reported by PAML as branch lengths.
#
# data_codeml_mlc: object returned by treeio::read.codeml_mlc().
# Returns the long-format table produced by corrr::stretch() with its value
# column renamed to `cophenetic dS`; duplicated pairs and NA entries are
# dropped (remove.dups = TRUE, na.rm = TRUE).
calculador_cophenetic_dS <- function(data_codeml_mlc) {
  # Tree contained in the mlc file
  codeml_tree <- treeio::get.tree(data_codeml_mlc)

  # Per-node values parsed from the mlc file (joined to the tree on `node`)
  node_stats <- get.data(data_codeml_mlc)

  # Replace the branch lengths with dS and compute the cophenetic distances
  distance_matrix <- codeml_tree %>%
    treeio::as.treedata() %>%
    as_tibble() %>%
    dplyr::full_join(x = ., y = node_stats, by = c('node')) %>%
    dplyr::select(parent, node, dS, label) %>%
    dplyr::rename(branch.length = 'dS') %>%
    ape::as.phylo() %>%
    ape::cophenetic.phylo(x = .)

  # Reshape the distances into long format and name the value column.
  # TRUE is spelled out because T is an ordinary, reassignable variable.
  distance_matrix %>%
    corrr::as_cordf() %>%
    corrr::stretch(x = ., remove.dups = TRUE, na.rm = TRUE) %>%
    dplyr::rename(`cophenetic dS` = 'r')
}
''')

  values, tz_parsed = conversion.datetime_to_datetime64(data)


<rpy2.robjects.functions.SignatureTranslatedFunction object at 0x7f4e14334bc8> [RTYPES.CLOSXP]
R classes: ('function',)

In [32]:
# One cophenetic-dS table per codeml output file is accumulated here
cophenetic_ds_rows = []

# R code run for every file: parse the mlc output whose path is held in the R
# variable `codeml_out`, then compute its cophenetic dS table
_r_per_file_snippet = '''
    data_codeml_mlc = treeio::read.codeml_mlc(codeml_out)
    cophenetic_dS = calculador_cophenetic_dS(data_codeml_mlc)
    '''

for codeml_out in codeml_out_sman:
    # hand the current path to R, run the snippet, and fetch the result back
    ro.r.assign("codeml_out", codeml_out)
    ro.r(_r_per_file_snippet)
    cophenetic_dS = ro.r('cophenetic_dS')
    cophenetic_ds_rows += [cophenetic_dS]

In [33]:
# Stack the per-file tables into a single table.
# An empty list would make pd.concat fail with a generic message, so fail early
# with an explicit one (same exception type, ValueError).
if not cophenetic_ds_rows:
    raise ValueError('No cophenetic dS tables were computed: cophenetic_ds_rows is empty')
# ignore_index=True renumbers the rows so the stacked table does not carry the
# repeated row labels of the individual per-file tables.
cophenetic_ds_table = pd.concat(cophenetic_ds_rows, ignore_index=True)

In [36]:
# Reverse lookup: internal code -> original gene name
code2original = dict(zip(original2code.values(), original2code.keys()))

In [38]:
# Replace every cell that is an internal gene code with its original name;
# cells that are not keys of code2original (e.g. the distances) are kept as-is.
def _code_to_original(value):
    return code2original.get(value, value)

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when it exists and fall back to applymap on older versions.
if hasattr(cophenetic_ds_table, 'map'):
    cophenetic_ds_table = cophenetic_ds_table.map(_code_to_original)
else:
    cophenetic_ds_table = cophenetic_ds_table.applymap(_code_to_original)

In [39]:
# Write the final table as a tab-separated file, without the row index
cophenetic_ds_table.to_csv(
    path_or_buf='../results/misc/cophenetic_ds_smansoni.tsv',
    sep='\t',
    index=False,
)