In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import time

In [4]:
# List the GPUs TensorFlow can see; an empty list means no GPU was detected.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)


[]


# Data loading // Get Dataset


In [5]:
# Files holding the dataset: a .npy array with the network inputs and a .csv
# table with the parameter values used as labels.
path_to_input = "Data/BD_simu/small_CBLV_data1.npy"
path_to_label = "Data/BD_simu/small_params_clear1.csv"

In [6]:
# Columns of the label table that the network has to regress.
params_to_infer = ["R_nought", "infectious_time"]
# Size of the output layer: derived from the list above so the two can never
# get out of sync when a parameter is added or removed.
nb_estimates = len(params_to_infer)

# Fractions of the dataset used for training and validation. The test set is
# whatever remains; since 0.8 + 0.2 = 1, that is only the rounding remainder.
PROP_TRAIN = 0.8
PROP_VAL = 0.2

# Swap the last two axes of the stored array so that each sample matches the
# (SIZE, 2) input declared for the networks below.
# NOTE(review): presumably the file is stored as (samples, 2, SIZE) - confirm.
all_inputs = np.transpose(np.load(path_to_input), axes=(0, 2, 1))
# Keep only the columns to infer, as a (samples, nb_estimates) NumPy array.
all_labels = pd.read_csv(path_to_label)[params_to_infer].to_numpy()

In [13]:
# Split the dataset into train / validation / test subsets.
#
# The samples are shuffled first (the original cell only carried a reminder to
# do so), so the split does not depend on the order in which the samples were
# written to disk. A fixed seed keeps the split reproducible, and indexing
# through a permutation leaves `all_inputs` / `all_labels` untouched, so the
# cell can be re-run safely.
SPLIT_SEED = 0
num_samples = all_inputs.shape[0]
shuffled_idx = np.random.default_rng(SPLIT_SEED).permutation(num_samples)

num_train, num_val = int(num_samples*PROP_TRAIN), int(num_samples*PROP_VAL)
# The test set receives every sample not assigned to train or validation.
train_idx = shuffled_idx[:num_train]
val_idx = shuffled_idx[num_train:num_train + num_val]
test_idx = shuffled_idx[num_train + num_val:]

x_train, y_train = all_inputs[train_idx], all_labels[train_idx]
x_val, y_val = all_inputs[val_idx], all_labels[val_idx]
x_test, y_test = all_inputs[test_idx], all_labels[test_idx]

In [14]:
# Report how many samples ended up in the training and validation subsets.
train_size = len(x_train)
val_size = len(x_val)
print(f"There is {train_size} training data and {val_size} validation data")

There is 26648 training data and 6662 validation data


The input is a CBLV (Compact Bijective Ladderized Vector) array of shape (maximum tree size of the dataset, 2).


# Design CNN model for parameter inference

The CNN consists of one input layer (of 400 and 1002 input nodes for trees with 50-199 and 200-500 tips,
respectively). This input is then reshaped into a matrix of size of 201* 2 and 501*2, for small and large trees,
respectively, with entries corresponding to tips and internal nodes separated into separate rows (and one extra column
with one entry in each row corresponding to the sampling probability). Then, there are two 1D convolutional layers
of 50 kernels each, of size 3 and 10, respectively, followed by max pooling of size 10 and another 1D convolutional
layer of 80 kernels of size 10. After the last convolutional layer, there is a GlobalAveragePooling1D layer and an FFNN.
![image.png](attachment:image.png)

The FFNN has 4 sequential hidden layers organized in a funnel shape with 64-32-16-8 neurons and 1 output layer of size 2-4, depending on the number of parameters to be estimated. The neurons of the output layer have linear activation, while the others have exponential linear (ELU) activation.
![image-2.png](attachment:image-2.png)

To prevent overfitting during training, we use: [1] the early stopping algorithm evaluating MAPE on a validation set; and [2] dropout that we set to 0.5 in the feed-forward part of both NNs

In [15]:
# Length of the network input for the small-tree and large-tree settings.
size_sm_tree = 201
size_lg_tree = 501
# This notebook uses the small-tree setting.
SIZE = size_sm_tree

In [18]:
# Parameter-inference CNN, variant without activations on the convolutions:
# three Conv1D layers with one max-pooling step, global average pooling, then a
# 64-32-16-8 funnel of ELU dense layers (each followed by dropout 0.5) and a
# linear output with one unit per inferred parameter.
dense_funnel = []
for units in (64, 32, 16, 8):
    dense_funnel.append(layers.Dense(units, activation="elu"))
    dense_funnel.append(layers.Dropout(.5))

CNN_infer = keras.Sequential(
    [
        keras.Input(shape=(SIZE, 2)),
        layers.Conv1D(filters=50, kernel_size=3),
        layers.Conv1D(filters=50, kernel_size=10),
        layers.MaxPooling1D(pool_size=10),
        layers.Conv1D(filters=80, kernel_size=10),
        layers.GlobalAveragePooling1D(),
        *dense_funnel,
        layers.Dense(nb_estimates, activation="linear"),
    ]
)

CNN_infer.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_6 (Conv1D)           (None, 199, 50)           350       
                                                                 
 conv1d_7 (Conv1D)           (None, 190, 50)           25050     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 19, 50)           0         
 1D)                                                             
                                                                 
 conv1d_8 (Conv1D)           (None, 10, 80)            40080     
                                                                 
 global_average_pooling1d_2   (None, 80)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_10 (Dense)            (None, 64)               

In [26]:
# Parameter-inference CNN, variant with a ReLU after every convolution.
# Note: normally each convolution would be followed by batch normalization and
# then ReLU; batch normalization is left out here.
relu_funnel = []
for units in (64, 32, 16, 8):
    relu_funnel.append(layers.Dense(units, activation="elu"))
    relu_funnel.append(layers.Dropout(.5))

CNN_infer2 = keras.Sequential(
    [
        keras.Input(shape=(SIZE, 2)),
        layers.Conv1D(filters=50, kernel_size=3),
        layers.Activation("relu"),
        layers.Conv1D(filters=50, kernel_size=10),
        layers.Activation("relu"),
        layers.MaxPooling1D(pool_size=10),
        layers.Conv1D(filters=80, kernel_size=10),
        layers.Activation("relu"),
        layers.GlobalAveragePooling1D(),
        *relu_funnel,
        layers.Dense(nb_estimates, activation="linear"),
    ]
)
CNN_infer2.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_12 (Conv1D)          (None, 199, 50)           350       
                                                                 
 activation_3 (Activation)   (None, 199, 50)           0         
                                                                 
 conv1d_13 (Conv1D)          (None, 190, 50)           25050     
                                                                 
 activation_4 (Activation)   (None, 190, 50)           0         
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 19, 50)           0         
 1D)                                                             
                                                                 
 conv1d_14 (Conv1D)          (None, 10, 80)            40080     
                                                      

In [27]:
# Smoke test: push one random sample of the expected input shape through the
# network and display the shape of what comes out.
dummy_batch = tf.random.normal((1, SIZE, 2))
CNN_infer2(dummy_batch).shape

TensorShape([1, 2])

# Design CNN For Model selection

In [81]:
# Model-selection CNN (currently disabled: every statement below is commented out).
# For small trees, the output is of size 2 selecting between BD and BDEI

# nb_models = 2
# CNN_select = keras.Sequential(
#     [
#         keras.Input(shape=(SIZE, 2)),
#         layers.Conv1D(filters=50, kernel_size=3),  ## !! normally batch normalization then relu after conv
#         layers.Conv1D(filters=50, kernel_size=10),
#         layers.MaxPooling1D(pool_size=10),
#         layers.Conv1D(filters=80, kernel_size=10),
#         layers.GlobalAveragePooling1D(),
#         layers.Dense(64, activation="elu"),
#         layers.Dropout(.5),
#         layers.Dense(32, activation="elu"),
#         layers.Dropout(.5),
#         layers.Dense(16, activation="elu"),
#         layers.Dropout(.5),
#         layers.Dense(8, activation="elu"),
#         layers.Dropout(.5),
#         layers.Dense(nb_models, activation=keras.activations.softmax)
#     ]
# )
# CNN_select.summary()

# CNN_select.compile(
#     optimizer="adam",
#     loss=tf.keras.losses.CategoricalCrossentropy(), #labels are expected to be in a one-hot representation
# )

# Training 

In [28]:
# Mini-batch size (the author also noted 8000 as an alternative) and the
# maximum number of training epochs.
batch_size, num_epochs = 80, 1000

In the screenshot below, we see that at a certain point the validation loss increases while the training loss keeps decreasing. As explained in the article by Voznica et al., we should perform early stopping by evaluating the MAPE on the validation set.

![image.png](attachment:image.png)

In [None]:
# Early stopping on the validation loss. The loss is MAPE, so `min_delta=2`
# means an epoch only counts as an improvement if the validation MAPE drops by
# at least 2 (percentage points). `restore_best_weights=True` rolls the model
# back to the best epoch: without it, the network keeps the weights reached
# `patience` epochs after the best one, i.e. after it has started overfitting.
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=2,
    patience=250,
    verbose=1,
    restore_best_weights=True,
)
CNN_infer2.compile(
    optimizer="adam",
    loss=tf.keras.losses.MeanAbsolutePercentageError(), #loss used for parameter inference
)

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during a long training run.
start = time.perf_counter()
history = CNN_infer2.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=batch_size,
    epochs=num_epochs,
    callbacks=[callback],
)
end_train = time.perf_counter() - start
print(f" Training with {train_size} training data took {end_train/60} minutes")

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000

In [None]:
# Per-epoch losses recorded by `fit`; training may have stopped early, so the
# number of epochs is taken from the history rather than from `num_epochs`.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
num_real_epochs = len(train_loss)

# Learning curve: training loss in red, validation loss in green.
epoch_axis = np.arange(num_real_epochs)
fig, ax = plt.subplots()
ax.plot(epoch_axis, train_loss, '-r', label="Training loss")
ax.plot(epoch_axis, val_loss, '-g', label="Validation loss")
ax.set_xlabel('epoch')
ax.legend()
ax.set_title("Learning curve for parameters inference")
plt.show()

In [None]:
# Save the model 
# File the trained network is written to, so later cells can reload it.
path_to_NN = "BD_small_CNN_2.h5"
CNN_infer2.save(filepath=path_to_NN)

## Testing on test data

In [None]:
# Reload the network from the file written by the save cell.
mod = keras.models.load_model(filepath=path_to_NN)

In [24]:
# Run the reloaded network on the held-out samples and show the predictions
# next to the true parameter values.
test_predictions = mod(x_test).numpy()
print(f"The prediction is {test_predictions} \n The true value is {y_test}")

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.04309717, -0.191522  ]], dtype=float32)>

# Test with Zurich Dataset
Be aware that it is normal if the prediction is wrong. Indeed, the network was trained on trees from the BD model, whereas the Zurich tree comes from the BDSS model. This part only serves to show how to handle this dataset as was done in the article.

In [102]:
# Load the encoded Zurich tree and transpose it to the layout the network takes.
# NOTE(review): presumably stored as (2, SIZE), giving (SIZE, 2) after .T - confirm.
Zurich_input = np.load("Zurich_test/CBLV_vector.npy").T

# Add a leading batch axis and run the reloaded network on this single tree.
Zurich_out = mod(Zurich_input[None, :, :])

# The first line of the file is expected to look like "<label>:<value>".
with open("Zurich_test/resc_factor.txt") as f:
    _, resc_factor = f.readline().split(':')
    resc_factor = float(resc_factor)

# Only time-valued parameters are mapped back to the original time scale.
# R_nought is dimensionless, so multiplying it by the rescale factor (as the
# previous version did for every output) gives a meaningless value.
# float32 matches the dtype of the network output.
time_params = {"infectious_time"}
output_scale = np.array(
    [resc_factor if name in time_params else 1.0 for name in params_to_infer],
    dtype=np.float32,
)

print(f"The prediction for Zurich data is {Zurich_out*output_scale}.")

(201, 2)
tf.Tensor([[-552.3214    -48.685516]], shape=(1, 2), dtype=float32)


# Load pre-trained model


![image.png](attachment:image.png)

In [51]:
#pre_train_model = keras.models.load_model("pre_trained_model/BD_SMALL_CNN.h5")
# pre_train_model.summary()

# CNN_model.load_weights("pre_trained_model/BD_SMALL_CNN.h5")

In [74]:
# NOTE(review): `CNN_model` is not assigned in any cell visible in this notebook
# (its only other mention is a commented-out `load_weights` call), so this cell
# presumably relies on leftover kernel state and would raise NameError after
# Restart & Run All - confirm which model was intended.
CNN_model(Zurich_input[None, :,:])

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-552.33154 ,  -48.698875]], dtype=float32)>

In [103]:
import json

# Rebuild the pre-trained network from its JSON architecture description.
# The whole file is read (the previous version kept only its first line), so
# the description also loads correctly when the JSON spans several lines.
with open("pre_trained_model/BD_SMALL_CNN.json", 'r') as f:
    phylodeep_json = f.read()

phylodeep_config = tf.keras.models.model_from_json(phylodeep_json)

phylodeep_config.summary()
# The JSON only describes the layers; the trained weights come from the .h5 file.
phylodeep_config.load_weights("pre_trained_model/BD_SMALL_CNN.h5")