In [None]:
import pkg_resources
# following versions need to be installed
pkg_resources.require('ete3==3.1.1', 'pandas==0.23.4', 'numpy==1.18.5', 'scipy==1.1.0', 'scikit-learn==0.19.1',
                      'tensorflow==1.13.1', 'joblib==0.13.2', 'h5py==2.10.0', 'Keras==2.3.1', 'matplotlib==3.1.3')


import warnings
warnings.filterwarnings('ignore')
import pandas as pd

import tensorflow as tf
import keras
import numpy as np

from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Activation, Dense

from keras.utils.training_utils import multi_gpu_model
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.layers import Dense, Dropout
from keras.layers.core import Dense, Dropout, Activation, Flatten


In [None]:
######### loading data #########
# Two kinds of input are read: tables of parameter values and tables of tree
# representations (here: full tree representations).

cutoff = 1000000  # number of training rows to read (all: 1M examples for training)

# names given to the parameter columns once the tables are loaded
column_names = ['lambda1', 'lambda2', 'turnover', 'sampling_frac', 'tree_size', 'mu1', 'mu2', 'net_rate1', 'net_rate2', 'q01', 'q10', 'lambda2_ratio', 'q01_ratio']

# parameter tables: tab-separated, first line skipped, first column used as index;
# only the training table is capped at `cutoff` rows
param_train = pd.read_csv(
    '..',
    sep='\t',
    header=None,
    skiprows=1,
    index_col=0,
    nrows=cutoff,
)
param_test = pd.read_csv(
    '..',
    sep='\t',
    header=None,
    skiprows=1,
    index_col=0,
)


def rename_columns(df, names):
    """Return a renamed copy of ``df`` with positional column labels replaced by names.

    Each column label ``i`` is converted with ``int(i)`` and mapped to
    ``names[int(i) - 1]``, i.e. label 1 receives the first entry of ``names``.
    The input frame itself is left untouched.
    """
    label_to_name = {}
    for label in df.columns:
        label_to_name[label] = names[int(label) - 1]
    return df.rename(columns=label_to_name)

# give the parameter columns their readable names
param_train = rename_columns(param_train, column_names)
param_test = rename_columns(param_test, column_names)

# Tree encodings/representations. Per the layout described for these files:
#   - 1 value of tree height followed by 500 values for tip states ('1' or '2')
#   - 1 value of tree height followed by 500 values for internal node heights
#   - 2 values for the number of tips of each type (removed later) and 1 rescaling
#     value (removed later, but kept aside to bring predictions back to the original scale)
encoding_train = pd.read_csv(
    '../',
    sep="\t",
    header=None,
    index_col=0,
    nrows=cutoff,
)
encoding_test = pd.read_csv(
    '../',
    sep="\t",
    header=None,
    index_col=0,
)

# the encodings take over the index of the parameter tables so rows can be matched
encoding_train.index = param_train.index
encoding_test.index = param_test.index

# sub-directory (relative path fragment) used when writing output files
chemin = 'full_tree/all_samp_input/'

# suffix appended to output file names
expname = '_1000000_longest_absolute_error'


In [None]:
# Sanity check: display the test-set parameter table (after column renaming).
param_test

In [None]:
# Sanity check: print the (rows, columns) shape of the test encodings and the
# training parameter table.
print(encoding_test.shape)
print(param_train)

In [None]:
# Sanity check: display the test-set encodings (still including columns 1003-1005 at this point).
encoding_test

In [None]:
# Rescale the target parameters and strip the bookkeeping columns from the encodings.

# (new column, source column) pairs: every source column is multiplied by the
# per-tree rescaling factor stored in encoding column 1005
rescaled_columns = [
    ('net_rat1_rescaled', 'net_rate1'),
    ('net_rat2_rescaled', 'net_rate2'),
    ('lambda1_rescaled', 'lambda1'),
    ('lambda2_rescaled', 'lambda2'),
    ('q01_rescaled', 'q01'),
]

# training set first, then testing set
for params, encoding in ((param_train, encoding_train), (param_test, encoding_test)):
    # rescaling factor
    params['norm_factor'] = encoding[1005]
    # rescaled target values
    for new_col, src_col in rescaled_columns:
        params[new_col] = params[src_col] * params['norm_factor']

# remove the columns that are not network input: count of each type of tip
# (1003, 1004) and the normalization factor (1005)
for encoding in (encoding_train, encoding_test):
    encoding.drop(columns=[1003, 1004, 1005], axis=1, inplace=True)


In [None]:
# Sanity check: shape of the training encodings after dropping columns 1003-1005.
encoding_train.shape

In [None]:
# Parameters the network is trained to predict
target_1 = "turnover"
target_2 = "lambda1_rescaled"
target_3 = "lambda2_rescaled"
target_4 = "q01_rescaled"
target_columns = [target_1, target_2, target_3, target_4]

# target tables for the training and testing sets
targets = pd.DataFrame(param_train[target_columns])
targets_test = pd.DataFrame(param_test[target_columns])

# network inputs: the tree encodings (same objects, no copy is made)
features, features_test = encoding_train, encoding_test

# size of the validation set, as a row count and as fractions of the training rows
valid_set_nb = 10000
n_examples = features.shape[0]
valid_frac = valid_set_nb / n_examples
train_size_frac = (n_examples - valid_set_nb) / n_examples

In [None]:
# Append the known sampling fraction to the representation as six extra columns
# (a 3*2 block), for both the training and the testing set.
add_target = "sampling_frac"

# labels of the six appended columns (strings, unlike the integer encoding labels)
sampling_labels = [str(position) for position in range(1003, 1009)]

added_targets = pd.DataFrame(param_train[add_target])
for label in sampling_labels:
    features[label] = added_targets

added_targets2 = pd.DataFrame(param_test[add_target])
for label in sampling_labels:
    features_test[label] = added_targets2

In [None]:
# Column positions used to rearrange the feature matrix: the block that starts at
# position 501 is placed first, followed by the block that starts at position 0.
# Each block is extended with three of the six sampling-fraction positions.
samp = np.array([1002, 1003, 1004])   # sampling-fraction positions joined to the first block
samp2 = np.array([1005, 1006, 1007])  # sampling-fraction positions joined to the second block

tips_coor = np.concatenate((np.arange(0, 501), samp))
branches_coor = np.concatenate((np.arange(501, 1002), samp2))

# final order: branches block, then tips block
order_corr = np.concatenate((branches_coor, tips_coor))

In [None]:
# Sanity check: display the positions selected for the branches block
# (501..1001 followed by 1005, 1006, 1007).
branches_coor

In [None]:
# rearranging (for full tree BiSSE): reorder the columns by position using order_corr.
# NOTE: both names are rebound to the reordered frames, so running this cell a
# second time would apply the permutation again to already-reordered data.
features = features.iloc[:, order_corr]
features_test = features_test.iloc[:, order_corr]

In [None]:
# Standardization of the input features is only done for summary statistics;
# the snippet below is kept disabled for the full tree representation.
"""
scale = StandardScaler()
features = scale.fit_transform(features)
features_test = scale.transform(features_test)
"""

# short aliases used by the training and evaluation cells
X, Y = features, targets
X_test, Y_test = features_test, targets_test

In [None]:
# Sanity check: display the reordered test features.
X_test

In [None]:
# Sanity check: the number of columns printed here is the input width used by build_model.
print(features.shape)

In [None]:
#Creation of the Network Model: model definition

def build_model(dropout_rate=0.5):
    """Build the (uncompiled) convolutional network and print its summary.

    The input width is read from the notebook-level ``X`` (``X.shape[1]``) and
    the flat input is reshaped to ``(504, 2)`` before the convolutional part.

    Previously every ``keras.layers.Dropout(0.5)`` was instantiated but never
    passed to ``model.add``, so the network contained no dropout at all. The
    layers are now actually inserted, at the positions where the discarded
    statements stood (before each Dense layer, including the output layer).

    Parameters
    ----------
    dropout_rate : float or None, optional
        Rate of every dropout layer in the feed-forward part (default 0.5).
        Pass ``0`` or ``None`` to build the network without dropout, which
        reproduces the architecture the earlier code effectively produced.

    Returns
    -------
    keras.models.Sequential
        The assembled model; compiling and fitting are left to the caller.

    Notes
    -----
    NOTE(review): the Reshape layer presumably fills the (504, 2) array in
    row-major order, i.e. each of the 504 steps receives two *consecutive*
    input columns. Confirm that this matches the intended pairing of the
    tips block and the branches block produced by the column reordering.
    """
    use_dropout = bool(dropout_rate)

    model = Sequential()

    # the flat input (1008 columns for this representation) is reshaped into 504*2
    model.add(keras.layers.Reshape((504, 2), input_shape=(X.shape[1],)))
    # convolutional part
    model.add(Conv1D(filters = 50, kernel_size=(5), input_shape= (504, 2), activation='elu'))
    model.add(Conv1D(filters = 50, kernel_size=(10), activation='elu'))
    model.add(MaxPooling1D(10))
    model.add(Conv1D(filters = 80, kernel_size=(10), activation='elu'))
    # collapse the feature maps into one vector for the feed-forward part
    model.add(GlobalAveragePooling1D())
    # FFNN part: optional dropout in front of every Dense layer
    for units in (64, 32, 16, 8):
        if use_dropout:
            model.add(keras.layers.Dropout(dropout_rate))
        model.add(Dense(units, activation='elu'))
    if use_dropout:
        model.add(keras.layers.Dropout(dropout_rate))
    # output layer with 4 output neurons = nb of target parameters
    model.add(Dense(4, activation='elu'))
    # show the model structure
    model.summary()
    return model

In [None]:
# Build and compile the network

from keras import losses

# instantiate the architecture defined by build_model
estimator = build_model()

# Adam optimizer; the loss is the mean absolute error and the mean absolute
# percentage error (MAPE) is tracked as an additional metric
estimator.compile(
    optimizer='Adam',
    loss='mae',
    metrics=[losses.mean_absolute_percentage_error],
)

# early stopping on the validation loss (patience of 100 epochs) to limit overfitting
early_stop = keras.callbacks.EarlyStopping(patience=100, monitor='val_loss')

# progress display: one dot per completed epoch
class PrintD(keras.callbacks.Callback):
    """Callback printing a dot after each epoch, with a line break every 100 epochs."""

    def on_epoch_end(self, epoch, logs):
        # epochs 0, 100, 200, ... start a new line before their dot
        line_break = '\n' if epoch % 100 == 0 else ''
        print(line_break + '.', end='')

# upper bound on the number of EPOCHS, i.e. full passes over the training
# dataset; early stopping can end the training before it is reached
EPOCHS = 10000

# train the network, holding out a fraction `valid_frac` of the rows for validation
history = estimator.fit(
    X,
    Y,
    epochs=EPOCHS,
    batch_size=8000,
    validation_split=valid_frac,
    callbacks=[early_stop, PrintD()],
    verbose=1,
)



In [None]:
import seaborn as sns
#import statsmodel.formula.api as smf
import matplotlib.pyplot as plt

# Predict on the test set and line the predictions up with the targets
predicted_test = pd.DataFrame(estimator.predict(X_test))
predicted_test.columns = Y_test.columns  # same column names as the targets
predicted_test.index = Y_test.index      # same row labels as the targets

# names of the predicted parameters; used to select columns and to label plots
elts = list(Y_test.columns)

# one three-column frame per parameter (difference, target, prediction),
# all sharing the index of the first one, glued together side by side
per_target_frames = []
for elt in elts:
    sub_df = pd.DataFrame({'predicted_minus_target_' + elt: predicted_test[elt] - Y_test[elt], 'target_'+elt: Y_test[elt], 'predicted_'+elt: predicted_test[elt]})
    if per_target_frames:
        sub_df.index = per_target_frames[0].index
    per_target_frames.append(sub_df)

df = pd.concat(per_target_frames, axis=1)

# quick diagnostic plot (seaborn): predicted against target
def target_vs_predicted(target_name, predicted_name, param_name, file_name_beg):
    """Scatter the predictions against the targets, with a regression line and the diagonal.

    Columns ``target_name`` and ``predicted_name`` are read from the
    notebook-level table ``df``. ``param_name`` only appears in the title.
    ``file_name_beg`` is accepted but not used. The figure is shown and
    nothing is returned.
    """
    sns.set_style('white')
    sns.set_context('talk')

    scatter_style = {'s':0.1, 'color':'grey'}
    line_style = { 'color':'green', 'linewidth':2}
    sns.regplot(x=target_name, y=predicted_name, data=df, ci=95, n_boot=500,
                scatter_kws=scatter_style, line_kws=line_style)

    plt.title(param_name + ': target vs predicted test dataset')
    plt.xlabel('target')
    plt.ylabel('predicted')

    # red y = x reference line spanning the range of the targets
    diagonal = [min(df[target_name]), max(df[target_name])]
    plt.plot(diagonal, diagonal, linewidth=2, color='red')
    plt.show()

    
# one target-vs-predicted figure per predicted parameter
for elt in elts:
    target_vs_predicted('target_'+elt, 'predicted_'+elt, elt, file_name_beg=elt)

In [None]:
# table with statistics on errors: one row per predicted parameter, one column
# per error measure (filled in further down by get_mae_rmse)
errors_index = elts
errors_columns = ['MAE', 'RMSE', 'RME']
errors = pd.DataFrame(index=errors_index, columns=errors_columns)

def get_mae_rmse(name_var, data=None):
    """Compute error statistics between predicted and target values of one parameter.

    Parameters
    ----------
    name_var : str
        Parameter name; columns ``'predicted_' + name_var`` and
        ``'target_' + name_var`` are read from ``data``.
    data : pandas.DataFrame, optional
        Table holding both columns. Defaults to the notebook-level ``df``,
        so existing one-argument calls keep working.

    Returns
    -------
    tuple of float
        ``(mae, rmse, rme)``: mean absolute error, root mean squared error
        and mean relative error, the latter being ``|target - predicted|``
        divided by ``|target|``. A target equal to zero yields an infinite
        (or NaN) relative error.
    """
    if data is None:
        data = df  # table assembled earlier in the notebook

    predicted_vals = np.asarray(data['predicted_' + name_var], dtype=float)
    target_vals = np.asarray(data['target_' + name_var], dtype=float)

    diffs_abs = np.abs(target_vals - predicted_vals)
    # divide by the magnitude of the target so the relative error stays
    # non-negative even when a target is negative
    diffs_rel = diffs_abs / np.abs(target_vals)

    # vectorised means (the builtin sum() used before looped over every element)
    mae = np.mean(diffs_abs)
    rmse = np.sqrt(np.mean(diffs_abs ** 2))
    rme = np.mean(diffs_rel)
    return mae, rmse, rme
    

# fill one row of the errors table per predicted parameter with the
# (MAE, RMSE, RME) values returned by get_mae_rmse
for elt in errors_index:
    errors.loc[elt] = np.array(get_mae_rmse(elt))

print(errors)

In [None]:
#plot the differences between predicted and target as a function of the target: shows structural bias

def predicted_minus_target_vs_target(pr_m_tar_name, target_name, param_name, file_name_beg) : 
    """Plot (predicted - target) against the target, with a regression line and a zero line.

    Columns ``pr_m_tar_name`` (the predicted-minus-target difference) and
    ``target_name`` are read from the notebook-level table ``df``.
    ``param_name`` only appears in the title. ``file_name_beg`` is accepted
    but not used. The figure is shown and nothing is returned.

    The title and y-axis label previously read "target - predicted", which
    is the opposite sign of the plotted column; they now state the quantity
    actually drawn.
    """
    sns.set_style('white')
    sns.set_context('talk')
    sns.regplot(x=target_name, y=pr_m_tar_name, data=df, ci=95, n_boot=500, 
                scatter_kws={'s':0.1, 'color':'grey'}, line_kws={ 'color':'green', 'linewidth':2})
    plt.title(param_name + ': target vs (predicted-target) test dataset')
    plt.xlabel('target')
    plt.ylabel('predicted - target')
    innerlimit = min(df[target_name])
    
    outerlimit = max(df[target_name])
    
    # red horizontal reference at zero difference, spanning the range of the targets
    plt.plot([innerlimit, outerlimit], [0, 0], linewidth=2, color='red')
    plt.show()
    return None


# one difference-vs-target figure per predicted parameter
for elt in elts:
    predicted_minus_target_vs_target('predicted_minus_target_'+elt, 'target_'+elt, elt, file_name_beg=elt)


In [None]:
#measure of correlation: predicted vs target
from scipy.stats import spearmanr, pearsonr

from sklearn.metrics import r2_score

# Pearson correlation coefficient (first element of pearsonr's result) between
# target and predicted values, one per predicted parameter
pearson_cors = [pearsonr(Y_test[elt], predicted_test[elt])[0] for elt in elts]

print("Global pearson correlation between predicted and effective parameter: ", Y_test.columns, pearson_cors)


In [None]:
###save the model, weights (and scaler for sumstats only)

from keras.models import model_from_json

# save model architecture as JSON.
# The trained network is held in `estimator`; no notebook-level variable called
# `model` exists (that name is local to build_model), so the earlier
# `model.to_json()` / `model.save_weights()` calls raised a NameError.
model_trial_1000 = estimator.to_json()
with open('../Model/' + chemin + 'model_all' + expname + '.json','w') as json_file:
    json_file.write(model_trial_1000)

# save weights
estimator.save_weights('../Model/' + chemin + 'model_all_weights' + expname +'.h5')
print('model saved!')

# The two snippets below are kept for reference as real comments: the former
# triple-quoted strings were bare expression statements, and the last one was
# echoed as the output of this cell.

# --- load the model ---
# json_file = open('../Model/' + chemin + 'model_all' + expname + '.json', 'r')
# loaded_file = json_file.read()
# json_file.close()
# loaded_model = model_from_json(loaded_file)
# # load weights
# loaded_model.load_weights('../Model/' + chemin + 'model_all_weights' + expname + '.h5')
# print('model loaded!')

# --- save the scaler when there is one (FFNN-SS) ---
# from sklearn.externals import joblib
#
# scale_filename = '../../Model/' + chemin + 'all_standardscaler' + expname + '.pkl'
# joblib.dump(scale, scale_filename)
#
# print('scale saved!')
# # load scaler:
# # scale = joblib.load(scale_filename)


In [None]:
#### save the predicted and the target dataframes as CSV files (with a header row)
# File names are built by plain concatenation, so `expname` directly precedes
# 'target_all.csv' / 'predicted_all.csv' inside the '../Data/' + chemin directory.

Y_test.to_csv('../Data/' + chemin + expname + 'target_all.csv', header=True)

predicted_test.to_csv('../Data/' + chemin + expname + 'predicted_all.csv', header=True)

