In [None]:
import pkg_resources
# following versions need to be installed
pkg_resources.require('ete3==3.1.1', 'pandas==0.23.4', 'numpy==1.18.5', 'scipy==1.1.0', 'scikit-learn==0.19.1',
                      'tensorflow==1.13.1', 'joblib==0.13.2', 'h5py==2.10.0', 'Keras==2.3.1', 'matplotlib==3.1.3')

import warnings
warnings.filterwarnings('ignore')
import pandas as pd

import tensorflow as tf
import keras
import numpy as np

from keras.wrappers.scikit_learn import KerasRegressor

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from keras.models import Sequential, Model

from sklearn.model_selection import GridSearchCV

from keras.layers import Activation, Dense

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from keras.utils.training_utils import multi_gpu_model

from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

from keras.layers import Dense, Dropout

from keras.layers.core import Dense, Dropout, Activation, Flatten


In [None]:
#cutoff=1000000 #all

# Read the simulation parameters (regression targets) for the train and test sets.
param_train = pd.read_csv('../../../results_eBDM_vs3_200_500_tips/csv_for_train.csv')
param_test = pd.read_csv('../../../results_eBDM_vs3_200_500_tips/csv_parameters_for_test.csv')

# Derived parameter: diversification rate = birth rate - extinction rate.
param_train['diversification_rate'] = param_train['birth_rate'] - param_train['extinction_rate']
param_test['diversification_rate'] = param_test['birth_rate'] - param_test['extinction_rate']

# Reshape parameters: drop the first column (the index column written into the CSV).
# `axis` is passed by keyword: the positional form is rejected by recent pandas
# releases and the rest of this notebook already uses the keyword.
param_train = param_train.drop(param_train.columns[0], axis=1)
param_test = param_test.drop(param_test.columns[0], axis=1)

# Read the summary-statistics encodings (tab-separated, no header row).
encoding_train = pd.read_csv('../../../results_eBDM_vs3_200_500_tips/TARGET_MEDIAN_sumstats_ecology_vs3_forced_200_500_tips_train_01_1/sumstats_data.csv', sep="\t", header=None)
encoding_test = pd.read_csv('../../../results_eBDM_vs3_200_500_tips/TARGET_MEDIAN_sumstats_ecology_vs3_forced_200_500_tips_test_01_1/sumstats_data.csv', sep="\t", header=None)

# Sub-directory and experiment suffix used when building output file names below.
chemin = 'sumstats/all_samp_input/'
expname = '_ffnn_ss_mae'


In [None]:
# Move the rescaling factor from the encodings to the parameter tables,
# rescale the rate targets with it, and trim the encodings.

def _attach_rescaled_targets(params, encoding):
    """Add the scaling factor and rescaled rates to `params`; return the trimmed encoding.

    `params` is modified in place: it gains a 'scaling_factor' column (the last
    column of `encoding`) and one '<rate>_resc' column per rate, equal to the rate
    multiplied by the scaling factor. The returned frame is `encoding` without its
    last column (the scaling factor) and without its first column.
    """
    params['scaling_factor'] = encoding.iloc[:, -1]
    for rate in ('birth_rate', 'extinction_rate', 'diversification_rate'):
        params[rate + '_resc'] = params[rate] * params['scaling_factor']
    without_factor = encoding.drop(encoding.columns[-1], axis=1)
    return without_factor.drop(without_factor.columns[0], axis=1)


### TRAINING SET
encoding_train = _attach_rescaled_targets(param_train, encoding_train)

### TESTING SET
encoding_test = _attach_rescaled_targets(param_test, encoding_test)

In [None]:
# check: shape of the test feature table, then the content of the training parameter table
print(encoding_test.shape)
print(param_train)

In [None]:
# check: summary statistics (pandas `describe`) of the training parameter table
param_train.describe()

In [None]:
# Names of the parameter columns the network is trained to predict
target_1, target_2, target_3, target_4 = (
    "turnover_rate",
    "birth_rate_resc",
    "extinction_rate_resc",
    "diversification_rate_resc",
)
target_columns = [target_1, target_2, target_3, target_4]

targets = pd.DataFrame(param_train[target_columns])
targets_test = pd.DataFrame(param_test[target_columns])

# `features` / `features_test` are new names for the encoding tables (same objects, no copy)
features = encoding_train
features_test = encoding_test

# how large is the validation set, as a row count and as fractions of the training table
valid_set_nb = 10000
n_train_rows = features.shape[0]
valid_frac = valid_set_nb / n_train_rows
train_size_frac = (n_train_rows - valid_set_nb) / n_train_rows

In [None]:
# Add the known sampling fraction to the representation (both train and test sets)
# as an extra feature column labelled '399'.
add_target = "sampling_frac"
added_targets = pd.DataFrame(param_train[add_target])
# assign the column itself (a Series) rather than a one-column DataFrame
features['399'] = added_targets[add_target]

add_target_2 = "sampling_frac"
# use the test-set variable defined just above (it was previously left unused)
added_targets_2 = pd.DataFrame(param_test[add_target_2])
features_test['399'] = added_targets_2[add_target_2]


In [None]:
# check: shapes of the training feature table and of the training targets
print(features.shape)
print(targets.shape)

In [None]:
# Standardization of the input features with a standard scaler.
# The scaler is fitted on the TRAINING features only; the test features are then
# transformed with those same training statistics. Re-fitting on the test set
# would scale train and test inputs differently and would leave `scale` (which is
# saved to disk further down) holding test-set statistics instead of the ones the
# network was trained with.
scale = StandardScaler()
features = scale.fit_transform(features)
features_test = scale.transform(features_test)

# Conventional names used by the training / evaluation cells
X = features
Y = targets

Y_test = targets_test
X_test = features_test

In [None]:
# check: shape of the training features after standardization
print(features.shape)

In [None]:
#Creation of the Network Model: model definition

def build_model(input_dim=98, dropout_rate=0.0):
    """Build the feed-forward regression network (not compiled).

    Architecture: Dense layers of 64, 32, 16 and 8 units with ELU activation,
    followed by a linear Dense layer with 4 outputs.

    Parameters
    ----------
    input_dim : int, default 98
        Number of input features expected by the first layer.
    dropout_rate : float, default 0.0
        When greater than 0, a Dropout layer with this rate is added after each
        hidden layer. The previous version instantiated ``Dropout(0.5)`` objects
        without ever adding them to the model, so no dropout was applied; the
        default of 0.0 keeps that effective architecture. Pass e.g. 0.5 to
        really enable dropout.

    Returns
    -------
    keras.models.Sequential
        The model; its summary is printed as a side effect.
    """
    model = Sequential()
    hidden_units = (64, 32, 16, 8)
    for position, units in enumerate(hidden_units):
        if position == 0:
            # only the first layer declares the input size
            model.add(Dense(units, input_dim=input_dim, activation='elu'))
        else:
            model.add(Dense(units, activation='elu'))
        if dropout_rate > 0:
            # Dropout must be added to the model to have any effect
            model.add(keras.layers.Dropout(dropout_rate))
    model.add(Dense(4, activation='linear'))
    model.summary()
    return model

In [None]:
# Build and compile the network

from keras import losses

# model creation; `estimator` is a second name for the same model object
model = build_model()
estimator = model

# Adam optimizer, loss measure: mean absolute error, metrics measured: MAPE
estimator.compile(
    optimizer='Adam',
    loss='mae',
    metrics=[losses.mean_absolute_percentage_error],
)

# early stopping to avoid overfitting: monitors 'val_loss' with a patience of 100 epochs
early_stop = keras.callbacks.EarlyStopping(patience=100, monitor='val_loss')

#display training progress for each completed epoch.
class PrintDot(keras.callbacks.Callback):
    """Keras callback that prints one dot per finished epoch.

    A line break is printed first whenever the epoch number is a multiple of 100,
    so the dots are laid out in rows of 100.
    """

    def on_epoch_end(self, epoch, logs):
        starts_new_row = (epoch % 100 == 0)
        if starts_new_row:
            print('')
        print('.', end='')

# Upper bound on the number of EPOCHS, i.e. full passes over the training dataset
# (training can end earlier through the early-stopping callback)
EPOCHS = 10000

# Train the network; the fraction `valid_frac` of the data is held out for validation
history = estimator.fit(
    X,
    Y,
    epochs=EPOCHS,
    batch_size=8000,
    validation_split=valid_frac,
    callbacks=[early_stop, PrintDot()],
    verbose=1,
)



In [None]:
import seaborn as sns
#import statsmodel.formula.api as smf
import matplotlib.pyplot as plt

# Predict the test set and gather, for every parameter, the target, the prediction
# and their difference in one table `df`
predicted_test = pd.DataFrame(estimator.predict(X_test))
predicted_test.columns = Y_test.columns  # same column names as the targets
predicted_test.index = Y_test.index      # same row labels as the targets

# parameter names, used for column subsetting and for naming the plots
elts = list(Y_test.columns)

per_param_frames = []
for elt in elts:
    sub_df = pd.DataFrame({'predicted_minus_target_' + elt: predicted_test[elt] - Y_test[elt],
                           'target_' + elt: Y_test[elt],
                           'predicted_' + elt: predicted_test[elt]})
    if per_param_frames:
        # every later block takes the row labels of the first one
        sub_df.index = per_param_frames[0].index
    per_param_frames.append(sub_df)

# place the per-parameter blocks side by side
df = pd.concat(per_param_frames, axis=1)

# fast plotting for analysis (with seaborn):
def target_vs_predicted2(target_name, predicted_name, param_name, file_name_beg) :
    """Show a target-vs-predicted regression plot for one parameter.

    Reads the columns `target_name` (x axis) and `predicted_name` (y axis) from the
    module-level table `df`, draws a seaborn regression plot (grey points, green
    fit line, 95% interval from 500 bootstrap resamples) and overlays in red the
    identity line spanning the range of the target values.

    `param_name` is used in the plot title. `file_name_beg` is accepted but not
    used by this function. Returns None.
    """
    sns.set_style('white')
    sns.set_context('talk')

    point_style = {'s': 0.1, 'color': 'grey'}
    fit_style = {'color': 'green', 'linewidth': 2}
    sns.regplot(x=target_name, y=predicted_name, data=df, ci=95, n_boot=500,
                scatter_kws=point_style, line_kws=fit_style)

    plt.title(param_name + ': target vs predicted test dataset')
    plt.xlabel('target')
    plt.ylabel('predicted')

    # identity line (predicted == target) over the range of the target values
    target_column = df[target_name]
    diagonal = [min(target_column), max(target_column)]
    plt.plot(diagonal, diagonal, linewidth=2, color='red')

    plt.show()
    return None

    
# one target-vs-predicted plot per predicted parameter
for elt in elts:
    target_vs_predicted2('target_'+elt, 'predicted_'+elt, elt, file_name_beg=elt)

In [None]:
# table with statistics on errors: one row per predicted parameter,
# one column per error measure (filled in below)
errors_index = elts
errors_columns = ['MAE', 'RMSE', 'RME']
errors = pd.DataFrame(index=errors_index, columns=errors_columns)

def get_mae_rmse(name_var, data=None):
    """Compute error statistics between predicted and target values of one parameter.

    Parameters
    ----------
    name_var : str
        Parameter name; the columns 'predicted_<name_var>' and 'target_<name_var>'
        are read from `data`.
    data : pandas.DataFrame, optional
        Table holding the two columns. Defaults to the module-level `df`.

    Returns
    -------
    tuple (mae, rmse, rme)
        mae  : mean absolute error,
        rmse : root mean squared error,
        rme  : mean relative error, i.e. mean of |target - predicted| / |target|.

    Notes
    -----
    The relative error is divided by the absolute value of the target, so negative
    targets contribute a positive relative error. A target equal to 0 still yields
    an infinite (or NaN) relative error.
    """
    if data is None:
        data = df
    # work on plain float arrays so NaN handling is the same for all three measures
    predicted_vals = np.asarray(data['predicted_' + name_var], dtype=float)
    target_vals = np.asarray(data['target_' + name_var], dtype=float)
    diffs_abs = np.abs(target_vals - predicted_vals)
    diffs_rel = diffs_abs / np.abs(target_vals)
    mae = np.mean(diffs_abs)
    rmse = np.sqrt(np.mean(diffs_abs ** 2))
    rme = np.mean(diffs_rel)
    return mae, rmse, rme
    

# fill one row (MAE, RMSE, RME) of the error table per predicted parameter
for elt in errors_index:
    errors.loc[elt] = np.array(get_mae_rmse(elt))

print(errors)

In [None]:
### Save the model architecture, the model weights and the scaler

from keras.models import model_from_json

# all three files go to the same directory
model_dir = '../../Model/' + chemin

# architecture, serialized as JSON
model_trial_1000 = model.to_json()
json_path = model_dir + 'model_all' + expname + '.json'
with open(json_path, 'w') as json_file:
    json_file.write(model_trial_1000)

# weights, stored in an .h5 file
weights_path = model_dir + 'model_all_weights' + expname + '.h5'
model.save_weights(weights_path)
print('model saved!')

# scaler object `scale`, pickled with joblib
from sklearn.externals import joblib

scale_filename = model_dir + 'all_standardscaler' + expname + '.pkl'
joblib.dump(scale, scale_filename)

print('scale saved!')
#load scaler
#scale = joblib.load(scale_filename)



In [None]:
#### Save the target and the predicted dataframes

output_prefix = '../../Data/' + chemin + expname

Y_test.to_csv(output_prefix + 'target_all.csv', header=True)

# attach the scaling factor and undo the rescaling of the predicted rates:
# each '<rate>_resc' prediction is divided by the scaling factor
predicted_test['scaling_factor'] = param_test['scaling_factor']
for rate_name in ('birth_rate', 'extinction_rate', 'diversification_rate'):
    predicted_test[rate_name] = predicted_test[rate_name + '_resc'] / predicted_test['scaling_factor']

predicted_test.to_csv(output_prefix + 'predicted_all.csv', header=True)



In [None]:
"""
##for CI predict

predicted_CI = pd.DataFrame(estimator.predict(features_CI))

predicted_CI.columns = Y_test.columns

predicted_CI.to_csv('../../Data/' + chemin + 'predicted_CI_all' + expname + '.csv', header=True)
"""
