In [174]:
import os
import numpy as np
from ete3 import Tree

class TreeEncoder:

    def encode_tree(self, tree_str):
        """
        Encode the tree structure into a format suitable for input into the neural network.
        """
        # Check if the tree is already encoded
        csv_file = tree_str[:-4] + '.csv'
        if os.path.exists(csv_file):
            return csv_file
        # Call the external script to get CDV encoding
        cmd = f"python -m CDV_full_tree -t {tree_str} > {tree_str[:-4]}.csv"
        os.system(cmd)

    def encode_all_trees(self, trees_directory):
        """
        Encode all the trees in the given directory.
        """
        tree_files = [os.path.join(trees_directory, file) for file in os.listdir(trees_directory) if file.endswith('.nwk')]
        for tree_file in tree_files:
            print(tree_file)
            self.encode_tree(tree_file)


In [181]:
trees_directory = "trees/"
encoder = TreeEncoder()
encoded_trees = encoder.encode_all_trees(trees_directory)

trees/bd.nwk
trees/bisse.nwk
trees/bisseness.nwk
trees/classe.nwk
trees/geosse.nwk
trees/musse.nwk


In [194]:
import pandas as pd

# Load parameter values as a dataframe
tree_files = [os.path.join(trees_directory, file) for file in os.listdir(trees_directory) if file.endswith('.nwk')]

def process_params_musse(param_file):
    # Reads the parameter file and saves the values in a csv file, one tree per row.
    '''
    Example:
    num_states 3
    lambda1  0.581637403143104
    lambda2  0.639469627428334
    lambda3  0.694642654564232
    mu1  0.076379950507544
    mu2  0.339707092673052
    mu3  0.246268530318048
    q12  0.191004574298859
    q13  0.143894099444151
    q21  0.154783235490322
    q23  0.62774416487664
    q31  0.460761926881969
    q32  0.606042030081153
    num_states 3
    ....
    '''
    # If the parameter file is already in the csv format, return
    if param_file.endswith('.csv'):
        return
    with open(param_file, 'r') as f:
        lines = f.readlines()
    param_values = []
    for line in lines:
        if line.startswith('num_states'):
            param_values.append([])
        else:
            param_values[-1].append(line.split()[1])
    param_df = pd.DataFrame(param_values)
    param_df.to_csv(param_file + '.csv', sep='\t', header=False, index=False)
        
        
    

# Create a dataframe to store the parameter values

process_params_musse(param_file=trees_directory + 'musse.params')

cutoff = 250  # Number of trees to process

param_train = pd.read_csv(trees_directory + 'musse.params.csv', nrows=cutoff, header=None, sep='\t', index_col=0)
param_test = pd.read_csv(trees_directory + 'musse.params.csv', sep='\t', header=None, index_col=0)

print(param_train.shape)
print(param_test.shape)

column_names = ['lambda1', 'lambda2', 'lambda3', 'mu1', 'mu2', 'mu3', 'q12', 'q13', 'q21', 'q23', 'q31', 'q32']

def rename_columns(df, names):
    df = df.rename(columns={i: names[int(i)-1] for i in df.columns})
    return df

param_train = rename_columns(param_train, column_names)
param_test = rename_columns(param_test, column_names)

print(param_train.head())
print(param_test.head())

# loading tree encodings/representations
# encoding has the following structure: 1 value of tree height, 500 values for tip states ('1' or '2')
# 1 value for tree height and 500 values for internal node heights
# + 2 values for nb of tips of each type (to be removed) and 1 value of rescaling (removed, but stocked for rescaling predicted values back to the original scale)

encoding_train = pd.read_csv(trees_directory + 'musse.csv', sep="\t", header=None, nrows=cutoff, index_col=0)
encoding_test = pd.read_csv(trees_directory + 'musse.csv', sep="\t", header=None, index_col=0)

print(encoding_train.shape)
print(encoding_test.shape)

print(encoding_train.head())


return
encoded_tree_files = [os.path.join(trees_directory, file) for file in os.listdir(trees_directory) if file.endswith('.csv')]
encoded_trees = pd.DataFrame()
for encoded_tree_file in encoded_tree_files:
    encoded_tree = pd.read_csv(encoded_tree_file, sep="\t", header=None, index_col=0, skiprows=1)  # Skip the header row
    encoded_trees = pd.concat([encoded_trees, encoded_tree], ignore_index=True)

# make sure there is correspondance between indexes of dataframe with parameter values and encodings
print(param_df.shape)
print(encoded_trees.shape)
param_df.index = encoded_trees.index

# split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_trees, param_df, test_size=0.2, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


# part of the relative path for writing down the output files
chemin = "trained_models/musse/"

# the suffix of output files
expname='_245_longest_absolute_error'

(236, 11)
(236, 11)
           lambda1   lambda2   lambda3       mu1       mu2       mu3  \
0                                                                      
0.581637  0.639470  0.694643  0.076380  0.339707  0.246269  0.191005   
0.171136  0.455613  0.551331  0.150403  0.076921  0.364527  0.860820   
0.587373  0.688031  0.478535  0.045761  0.677784  0.630738  0.375229   
0.267792  0.690626  0.640356  0.712357  0.443546  0.298779  0.660996   
0.454680  0.469158  0.566001  0.275795  0.631396  0.581336  0.173862   

               q12       q13       q21       q23       q31  
0                                                           
0.581637  0.143894  0.154783  0.627744  0.460762  0.606042  
0.171136  0.827794  0.251758  0.249944  0.578592  0.771593  
0.587373  0.631197  0.194841  0.177321  0.117500  0.329491  
0.267792  0.771743  0.765116  0.413921  0.564865  0.321370  
0.454680  0.273638  0.850283  0.287332  0.651840  0.670395  
           lambda1   lambda2   lambda3       mu1

SyntaxError: 'return' outside function (2037089225.py, line 81)

In [164]:
#correctly reshape parameters (rescaling) and encodings (remove nb of each type + rescale factor):

### TRAINING SET: PARAMETER VALUES
# norm the parameters
y_train_norm = y_train.copy()
y_train_norm['lambda1'] = y_train['lambda1'] / y_train['lambda1'].max()
y_train_norm['lambda2'] = y_train['lambda2'] / y_train['lambda2'].max()
y_train_norm['lambda3'] = y_train['lambda3'] / y_train['lambda3'].max()
y_train_norm['mu1'] = y_train['mu1'] / y_train['mu1'].max()
y_train_norm['mu2'] = y_train['mu2'] / y_train['mu2'].max()
y_train_norm['mu3'] = y_train['mu3'] / y_train['mu3'].max()
y_train_norm['q12'] = y_train['q12'] / y_train['q12'].max()
y_train_norm['q13'] = y_train['q13'] / y_train['q13'].max()
y_train_norm['q21'] = y_train['q21'] / y_train['q21'].max()
y_train_norm['q23'] = y_train['q23'] / y_train['q23'].max()
y_train_norm['q31'] = y_train['q31'] / y_train['q31'].max()
y_train_norm['q32'] = y_train['q32'] / y_train['q32'].max()

# save the normed parameters
y_train_norm.to_csv(chemin + 'y_train_norm' + expname + '.csv')

### TESTING SET: PARAMETER VALUES
# norm the parameters
y_test_norm = y_test.copy()
y_test_norm['lambda1'] = y_test['lambda1'] / y_test['lambda1'].max()
y_test_norm['lambda2'] = y_test['lambda2'] / y_test['lambda2'].max()
y_test_norm['lambda3'] = y_test['lambda3'] / y_test['lambda3'].max()
y_test_norm['mu1'] = y_test['mu1'] / y_test['mu1'].max()
y_test_norm['mu2'] = y_test['mu2'] / y_test['mu2'].max()
y_test_norm['mu3'] = y_test['mu3'] / y_test['mu3'].max()
y_test_norm['q12'] = y_test['q12'] / y_test['q12'].max()
y_test_norm['q13'] = y_test['q13'] / y_test['q13'].max()
y_test_norm['q21'] = y_test['q21'] / y_test['q21'].max()
y_test_norm['q23'] = y_test['q23'] / y_test['q23'].max()
y_test_norm['q31'] = y_test['q31'] / y_test['q31'].max()
y_test_norm['q32'] = y_test['q32'] / y_test['q32'].max()

# save the normed parameters
y_test_norm.to_csv(chemin + 'y_test_norm' + expname + '.csv')

# remove irrelevant columns: count of each type of tip and normalization factor
#### X_train = X_train.drop(columns=[1003, 1004, 1005], axis=1, inplace=True)

# save the encodings
X_train.to_csv(chemin + 'X_train' + expname + '.csv')
X_test.to_csv(chemin + 'X_test' + expname + '.csv')


In [167]:
# Read the data
X_train = pd.read_csv(chemin + 'X_train' + expname + '.csv', index_col=0)
X_test = pd.read_csv(chemin + 'X_test' + expname + '.csv', index_col=0)
y_train_norm = pd.read_csv(chemin + 'y_train_norm' + expname + '.csv', index_col=0)
y_test_norm = pd.read_csv(chemin + 'y_test_norm' + expname + '.csv', index_col=0)


#Choice of the parameters to predict

predict_all = True
if not predict_all:
    target1 = 'lambda1'
    target2 = 'lambda2'
    target3 = 'lambda3'
    target4 = 'mu1'
    target5 = 'mu2'
    target6 = 'mu3'
    target7 = 'q12'
    target8 = 'q13'
    target9 = 'q21'
    target10 = 'q23'
    target11 = 'q31'
    target12 = 'q32'
    
    targets = pd.DataFrame(y_train_norm[target1, target2, target3, target4, target5, target6, target7, target8, target9, target10, target11, target12])
    targets_test = pd.DataFrame(y_test_norm[target1, target2, target3, target4, target5, target6, target7, target8, target9, target10, target11, target12])
    
else:
    
    targets = y_train_norm
    targets_test = y_test_norm
    
features = X_train
features_test = X_test

# split in train and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, targets, test_size=0.2, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)




(156, 502)
(156, 12)
(40, 502)
(40, 12)


In [168]:
    """
    #Add the known sampling fraction as 3*2 matrix into the representation (both train and test sets)
    add_target = "sampling_frac"
    added_targets = pd.DataFrame(param_train[add_target])
    features['1003'] = added_targets
    features['1004'] = added_targets
    features['1005'] = added_targets
    features['1006'] = added_targets
    features['1007'] = added_targets
    features['1008'] = added_targets

    added_targets2 = pd.DataFrame(param_test[add_target])
    features_test['1003'] = added_targets2
    features_test['1004'] = added_targets2
    features_test['1005'] = added_targets2
    features_test['1006'] = added_targets2
    features_test['1007'] = added_targets2
    features_test['1008'] = added_targets2
    """

'\n#Add the known sampling fraction as 3*2 matrix into the representation (both train and test sets)\nadd_target = "sampling_frac"\nadded_targets = pd.DataFrame(param_train[add_target])\nfeatures[\'1003\'] = added_targets\nfeatures[\'1004\'] = added_targets\nfeatures[\'1005\'] = added_targets\nfeatures[\'1006\'] = added_targets\nfeatures[\'1007\'] = added_targets\nfeatures[\'1008\'] = added_targets\n\nadded_targets2 = pd.DataFrame(param_test[add_target])\nfeatures_test[\'1003\'] = added_targets2\nfeatures_test[\'1004\'] = added_targets2\nfeatures_test[\'1005\'] = added_targets2\nfeatures_test[\'1006\'] = added_targets2\nfeatures_test[\'1007\'] = added_targets2\nfeatures_test[\'1008\'] = added_targets2\n'

In [170]:
# explore the data
print(features.shape)
print(targets.shape)
print(features_test.shape)

(196, 502)
(196, 12)
(49, 502)
