In [1]:
# Load trees from .csv files in trees/ directory. Each file its called <model>.csv. I want to load all of them and
# create a NN to predict the model given the tree. The model is a string (name of the file) and the trees are stored as csv (CDV format)

import os
import numpy as np
from ete3 import Tree

class TreeEncoder:

    def encode_tree(self, tree_str):
        """
        Encode the tree structure into a format suitable for input into the neural network.
        """
        # Check if the tree is already encoded
        csv_file = tree_str[:-4] + '.csv'
        if os.path.exists(csv_file):
            return csv_file
        # Call the external script to get CDV encoding
        cmd = f"python -m CDV_full_tree -t {tree_str} -f {csv_file}" # > {tree_str[:-4]}.csv"
        os.system(cmd)
        

    def encode_all_trees(self, trees_directory):
        """
        Encode all the trees in the given directory.
        """
        tree_files = [os.path.join(trees_directory, file) for file in os.listdir(trees_directory) if file.endswith('.nwk')]
        for tree_file in tree_files:
            print(tree_file)
            self.encode_tree(tree_file)

In [3]:
import sys

trees_directory = "trees/"

encoder = TreeEncoder()
encoder.encode_all_trees(trees_directory)

trees/bd.nwk
trees/bisse.nwk
trees/bisseness.nwk
trees/classe.nwk
trees/geosse.nwk
trees/musse_500k.nwk
trees/musse_90k.nwk
trees/quasse.nwk


In [31]:
import pandas as pd

trees = []
target = []

models = ["bd",
          #"bisse",
          #"bisseness",
          #"classe",
          "geosse"
          #"musse",
          #"quasse"
          ]

for model_name in models:
    encoding = pd.read_csv(trees_directory + model_name + '.csv', sep='\t', header=None, skiprows=0, index_col=0)
    trees.append(encoding)
    target.append([model_name] * len(encoding))
    
assert len(trees) == len(target)

print("Number of trees per model:")
for model, tree in zip(models, trees):
    print(f"{model}: {len(tree)}")
    
# Concatenate all the trees and targets
trees = pd.concat(trees)
target = np.concatenate(target)

print("Number of trees in total:", len(trees))
print("Number of targets in total:", len(target))

trees

Number of trees per model:
bd: 268
geosse: 68
Number of trees in total: 336
Number of targets in total: 336


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,993,994,995,996,997,998,999,1000,1001,1002
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13.975176,13.041968,13.591479,13.794207,13.001776,13.213374,12.199373,13.930781,11.732213,11.134086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.599156
1,8.012841,6.149481,5.708792,6.178136,7.791805,4.613272,3.817847,7.059666,7.898457,7.967581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.770129
2,18.280376,15.809396,15.670503,17.693003,17.110394,16.115005,17.440034,15.827314,17.977755,15.295922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.591183
3,17.996226,17.783270,17.744065,17.712306,17.127398,16.715255,16.365283,17.615009,17.236252,16.256012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.990354
4,5.910599,2.840898,4.631535,5.868371,4.553909,5.910599,0.000000,5.384971,4.906543,1.557961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,7.489382,4.045324,6.926766,7.320288,6.419889,7.428212,3.178314,6.567476,5.811917,6.575390,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.360962
63,7.238689,5.652068,6.180483,6.923859,7.196220,5.657487,7.238689,4.913195,5.806563,6.089097,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.086283
64,4.619219,3.115959,2.650620,2.269411,4.119396,4.421123,4.619219,0.742048,3.067237,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319923
65,4.728483,2.941984,4.089736,4.415814,2.107386,4.358293,2.081737,0.094746,0.000000,3.158536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438402


In [32]:
target

array(['bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd',
       'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'bd', 'b

In [33]:
# Save the data
trees.to_csv("trees.csv")
np.save("target.npy", target)

In [34]:
# Load the data
trees = pd.read_csv("trees.csv", index_col=0)
target = np.load("target.npy")

In [45]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(trees, target, test_size=0.2, random_state=42)

print(X_train)
print(y_train)
print(X_test)
print(y_test)

# Train an advanced model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Predictions:", y_pred)
print("True values:", y_test)


             1          2          3          4          5          6  \
0                                                                       
60   18.844285  17.222779  16.318785  17.901790  15.057214  16.401298   
227  13.640405  12.881331  12.651231  12.387678  13.264857  12.013968   
53    7.405947   0.000000   5.805164   7.175290   3.046024   6.964809   
49    5.500219   4.644110   4.281078   4.530771   5.209118   3.054051   
17   10.359671   8.861143   9.815362   8.551274   9.793306   8.425731   
..         ...        ...        ...        ...        ...        ...   
188  11.847936  10.054769   9.883503  10.650791  11.790603  10.567386   
71   13.484161  12.660974  11.922294  11.877828  10.920426  10.487894   
106   9.298918   7.225102   7.802530   9.203167   6.997003   9.177184   
1     9.112086   3.462969   7.991544   8.630310   8.852308   8.923661   
102  18.660933  18.274817  18.049140  17.416633  17.059121  18.293719   

             7          8          9         10  .

In [43]:
# Test the model


ValueError: stat: path too long for Windows

In [37]:
# Evaluate the model
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [None]:
# Save the model
import pickle

with open("model.pkl", "wb") as f:
    pickle.dump(clf, f)