In [46]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
import itertools as it
import pickle

from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import KFold

from functools import partial

#  model
from src.models.my_models.kerasModels.dnn import DNN, DNN_th

#  data transformations
from src.data.data_utils import augment_data, fill_random_2d, transform_x

# scores and metrics
from sklearn.metrics import r2_score, make_scorer
from src.metrics.metrics import (exponential_mae, exponential_mape, exponential_mse, exponential_r2, 
exponential_mae_per_class, exponential_mape_per_class, exponential_mse_per_class, exponential_r2_per_class, r2_per_class, r2)

%matplotlib qt
tqdm = partial(tqdm, position=0, leave=True)



In [47]:

# Paths
datasets_dir = Path('../../data/datasets/dataset_hplc_multi/')
reports_dir_test = Path('../../reports/cross_val')
reports_dir_train = Path('../../reports/cross_val_train')
dir_model = Path('../../model')
reports_dir_test.mkdir(parents=True, exist_ok=True)
reports_dir_train.mkdir(parents=True, exist_ok=True)
dir_model.mkdir(parents=True, exist_ok=True)

pigments_threshold = [0.00248, 0.05878, 0.003  , 0.00518, 0.003  , 0.01302, 0.0036 , 0.00968, 0.001  , 0.0018 , 0.00844, 0.00242, 0.001  ]
log_pigments_threshold = list(np.log(pigments_threshold))

metrics = {"mae": exponential_mae, "mse": exponential_mse, "r2": exponential_r2, "mape": exponential_mape,
           "mae_per_class": exponential_mae_per_class, "mse_per_class": exponential_mse_per_class, 
           "r2_per_class": exponential_r2_per_class, "mape_per_class": exponential_mape_per_class,
           "r2_log": r2_score, "r2_log_per_class": r2_per_class}


pigments = ['chlide_a[mg*m^3]', 'chla[mg*m^3]', 'chlb[mg*m^3]', 'chlc1+c2[mg*m^3]',
       'fucox[mg*m^3]', "19'hxfcx[mg*m^3]", "19'btfcx[mg*m^3]",
       'diadino[mg*m^3]', 'allox[mg*m^3]', 'diatox[mg*m^3]', 'zeaxan[mg*m^3]',
       'beta_car[mg*m^3]', 'peridinin[mg*m^3]']

n = 50

def conf_dnn_random(n, seed=1):
    confs = []
    np.random.seed(seed)
    for i in range(n):
        layers_num = np.random.randint(low=2, high=5)
        hl_dim = np.random.choice([13, 16, 32, 64], size=(layers_num - 1))
        hl_dim = np.hstack([hl_dim, [13]])
        conf = {'layers_num': layers_num, 
                'hl_dim': hl_dim,
                'dropout': np.random.choice([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]), 
                'lr': np.random.choice([0.01, 0.001]),
                'patience': np.random.choice([5, 10, 30, 50, 100, 1000]),
                'loss': np.random.choice(['mae', 'mse', 'mapee']),
                'epochs':np.random.choice([100, 200, 300, 500, 800, 1000, 1500, 3000]),
                # 'epochs':3000,
                'batch': np.random.choice([8, 16, 32])}               
        confs.append(conf)
    return confs

def conf_dnn_random_th(n, seed=1):
    confs = []
    np.random.seed(seed)
    for i in range(n):
        layers_num = np.random.randint(low=2, high=5)
        hl_dim = np.random.choice([13, 16, 32, 64, 128], size=(layers_num - 1))
        hl_dim = np.hstack([hl_dim, [13]])
        conf = {'layers_num': layers_num, 
                'hl_dim': hl_dim,
                'dropout': np.random.choice([0.0, 0.1, 0.2, 0.3, 0.4, 0.5]), 
                'lr': np.random.choice([0.01, 0.001]),
                'patience': np.random.choice([5, 10, 30, 50, 100, 1000]),
                'loss': np.random.choice(['mae', 'mse', 'mapee']),               
                'epochs':np.random.choice([100, 200, 300, 500, 800, 1000, 1500, 3000]),
                # 'epochs':3000,
                'batch': np.random.choice([8, 16, 32]),
                'threshold':log_pigments_threshold}               
        confs.append(conf)
    return confs

model_confs = conf_dnn_random(n)
model_confs_th = conf_dnn_random_th(n)

## Nested Cross validation. 


In [48]:
# Assuming log predictions
def nested_cv(x, y, model_class, models_confs_pool, train_conf, score, score_aux=None, outer_splits=5, inner_splits=3, seed=1):
    kfold_outer = KFold(n_splits=outer_splits, shuffle=True, random_state=seed)
    train_result = {}
    test_result = {}
    i = 0
    for train_index, test_index in tqdm(kfold_outer.split(x, y), total=outer_splits):
        X_train, X_test = x.iloc[train_index].copy(), x.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()
        X_train, y_train = augment_data(X_train, y_train, replicate=5)
        X_train.loc[:, ['lat', 'lon']] = fill_random_2d(X_train.loc[:, ['lat', 'lon']].values, 0.1)
        #X_train.loc[:, :] = fill_random_2d(X_train.loc[:, :].values, 0.1)

        kfold_inner = KFold(n_splits=inner_splits,  shuffle=True, random_state=seed)
        inner_metrics = []
        for train_index_inner, test_index_inner in tqdm(kfold_inner.split(X_train, y_train), total=inner_splits):
            X_train_inner, X_test_inner = X_train.iloc[train_index_inner].copy(), X_train.iloc[test_index_inner].copy()
            y_train_inner, y_test_inner = y_train.iloc[train_index_inner].copy(), y_train.iloc[test_index_inner].copy()
        
            X_train_inner, X_test_inner = transform_x(X_train_inner, X_test_inner, 'pca_scaled')

            models = [model_class(**conf) for conf in models_confs_pool]
            [model_.fit(x=X_train_inner, y=y_train_inner, **train_conf) for model_ in models]
        
            # Predict test and compute metrics
            py = [mod.predict(X_test_inner) for mod in models]
            inner_metrics.append([score(y_test_inner, py_) for py_ in py])
        best_inner_model_index = np.array(inner_metrics).mean(axis=0).argmax()
        best_conf = models_confs_pool[best_inner_model_index]
        best_model = model_class(**best_conf)

        X_train, X_test = transform_x(X_train, X_test, 'pca_scaled')
        best_model.fit(x=X_train, y=y_train, **train_conf)


        # Evaluate the model on the outer training set
        aux_res_train = {}
        py = best_model.predict(X_train)
        if score_aux is not None:
            aux_res_train = {key: s(y_train, py) for key, s in score_aux.items()}
        train_result[i] = {**best_conf, "score": score(y_train, py), **aux_res_train}

        # Evaluate the model on the outer test set
        aux_res_test = {}
        py = best_model.predict(X_test)
        if score_aux is not None:
            aux_res_test.update({key: s(y_test, py) for key, s in score_aux.items()})
        test_result[i] = { **best_conf, "score": score(y_test, py), **aux_res_test}
        i = i+1
    return train_result, test_result
        



#### Prepare the nested cross validation attributes

In [49]:
x = pd.read_csv(datasets_dir/'log_rrs_lat_lon_month_season_depth_loc.csv')
y = pd.read_csv(datasets_dir/'log_pigments.csv')

# only med and black sea
y = y[x['med and black sea'].astype(bool)]
x = x[x['med and black sea'].astype(bool)]
x = x.drop(columns=['med', 'black sea', 'med and black sea'])


In [51]:
x = x.drop(columns=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
# x = x.drop(columns=['lat', 'lon'])
x = x.drop(columns=[ 'depth'])
x = x.drop(columns=['summer', 'autumn', 'spring', 'winter'])

In [52]:
# m = DNN(**model_confs[1])
# m.model.predict(x)
# m.model.summary()
train_conf = {}

In [7]:
ncv_train, ncv_test  = nested_cv(x, y, DNN, model_confs, {}, r2, score_aux=metrics)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:28:01<00:00, 1760.57s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:25:31<00:00, 1710.61s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:25:04<00:00, 1701.40s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:47:20<00:00, 2146.69s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:56:49<00:00, 2336.62s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████

In [53]:
ncv_train_th, ncv_test_th = nested_cv(x, y, DNN_th, model_confs_th, train_conf, r2, score_aux=metrics)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:04:58<00:00, 1299.41s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:05:45<00:00, 1315.21s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:05:23<00:00, 1307.81s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:27:36<00:00, 1752.23s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [1:55:35<00:00, 2311.81s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████

In [54]:
attribs_train = list(ncv_train[0].keys())
met_names_train = list(metrics.keys()) 

attribs_test = list(ncv_test[0].keys())
met_names_test = list(metrics.keys()) 


In [55]:
mets_train = {attrib : [fold[attrib] for fold_num, fold in ncv_train.items()] for attrib in attribs_train}
mets_train = {key: np.mean(value, axis=0)  if key in met_names_train else value for key, value in mets_train.items()}

mets_test = {attrib : [fold[attrib] for fold_num, fold in ncv_test.items()] for attrib in attribs_test}
mets_test = {key: np.mean(value, axis=0)  if key in met_names_test else value for key, value in mets_test.items()}

In [56]:
mets_train_th = {attrib : [fold[attrib] for fold_num, fold in ncv_train_th.items()] for attrib in attribs_train}
mets_train_th = {key: np.mean(value, axis=0)  if key in met_names_train else value for key, value in mets_train_th.items()}

mets_test_th = {attrib : [fold[attrib] for fold_num, fold in ncv_test_th.items()] for attrib in attribs_test}
mets_test_th = {key: np.mean(value, axis=0)  if key in met_names_test else value for key, value in mets_test_th.items()}

### Legacy training

In [None]:
x_mini = x.drop(columns=["400", "620", "665", "681", "708", "778", "865"])

In [None]:
ncv_train_mini, ncv_test_mini       = nested_cv(x_mini, y, DNN, models_confs, train_conf, r2_score, score_aux=metrics)

In [None]:
ncv_train_th_mini, ncv_test_th_mini = nested_cv(x_mini, y, DNN_th, models_confs_th, train_conf, r2_score, score_aux=metrics)

In [None]:
mets_train_mini = {attrib : [fold[attrib] for fold_num, fold in ncv_train_mini.items()] for attrib in attribs_train}
mets_train_mini = {key: np.mean(value, axis=0)  if key in met_names_train else value for key, value in mets_train_mini.items()}

In [None]:
mets_test_mini = {attrib : [fold[attrib] for fold_num, fold in ncv_test_mini.items()] for attrib in attribs_test}
mets_test_mini = {key: np.mean(value, axis=0)  if key in met_names_test else value for key, value in mets_test_mini.items()}

In [None]:
mets_train_th_mini = {attrib : [fold[attrib] for fold_num, fold in ncv_train_th_mini.items()] for attrib in attribs_train}
mets_train_th_mini = {key: np.mean(value, axis=0)  if key in met_names_train else value for key, value in mets_train_th_mini.items()}

In [None]:
mets_test_th_mini = {attrib : [fold[attrib] for fold_num, fold in ncv_test_th_mini.items()] for attrib in attribs_test}
mets_test_th_mini = {key: np.mean(value, axis=0)  if key in met_names_test else value for key, value in mets_test_th_mini.items()}

### Test Metrics

In [57]:
pd.DataFrame({'R2':[mets_test['r2_log'], mets_test_th['r2_log']], 
              'MAPE':[mets_test['mape'], mets_test_th['mape']],
              'MAE':[mets_test['mae'], mets_test_th['mae']],
              'MSE':[mets_test['mse'], mets_test_th['mse']]
             }, index=['DNN', 'DNN th'])

Unnamed: 0,R2,MAPE,MAE,MSE
DNN,0.746472,0.545161,0.060534,0.132449
DNN th,0.748104,0.462292,0.069799,0.231377


In [58]:
pd.DataFrame([mets_test['r2_log_per_class'], mets_test_th['r2_log_per_class']], columns=pigments,  index=['DNN', 'DNN th'])

Unnamed: 0,chlide_a[mg*m^3],chla[mg*m^3],chlb[mg*m^3],chlc1+c2[mg*m^3],fucox[mg*m^3],19'hxfcx[mg*m^3],19'btfcx[mg*m^3],diadino[mg*m^3],allox[mg*m^3],diatox[mg*m^3],zeaxan[mg*m^3],beta_car[mg*m^3],peridinin[mg*m^3]
DNN,0.73443,0.875034,0.760904,0.892554,0.886139,0.741763,0.459652,0.859996,0.676695,0.717622,0.399745,0.841517,0.858079
DNN th,0.765414,0.897032,0.76768,0.918961,0.8894,0.750678,0.440322,0.881059,0.720951,0.731495,0.296729,0.874202,0.791427


### Training Metrics

In [16]:
pd.DataFrame({'R2':[mets_train['r2_log'], mets_train_th['r2_log']], 
              'MAPE':[mets_train['mape'], mets_train_th['mape']],
              'MAE':[mets_train['mae'], mets_train_th['mae']],
              'MSE':[mets_train['mse'], mets_train_th['mse']]
             },  index=['DNN', 'DNN th'])

Unnamed: 0,R2,MAPE,MAE,MSE
DNN,0.899365,0.302443,0.036412,0.048137
DNN th,0.788092,0.388616,0.057615,0.116107


In [17]:
pd.DataFrame([mets_train['r2_log_per_class'], mets_train_th['r2_log_per_class']], columns=pigments,  index=['DNN', 'DNN th'])

Unnamed: 0,chlide_a[mg*m^3],chla[mg*m^3],chlb[mg*m^3],chlc1+c2[mg*m^3],fucox[mg*m^3],19'hxfcx[mg*m^3],19'btfcx[mg*m^3],diadino[mg*m^3],allox[mg*m^3],diatox[mg*m^3],zeaxan[mg*m^3],beta_car[mg*m^3],peridinin[mg*m^3]
DNN,0.887038,0.957271,0.881837,0.961243,0.94947,0.902516,0.819896,0.952067,0.89715,0.897497,0.715788,0.943024,0.926943
DNN th,0.811305,0.902875,0.798533,0.90812,0.896615,0.78774,0.516106,0.895633,0.779827,0.794108,0.445697,0.898754,0.809888


### Save Nested CV results

In [18]:
with open(reports_dir_train / 'dnn.pkl', 'wb') as f:
    pickle.dump(mets_train, f)


with open(reports_dir_test / 'dnn.pkl', 'wb') as f:
    pickle.dump(mets_test, f)

with open(reports_dir_train / 'dnn_th.pkl', 'wb') as f:
    pickle.dump(mets_train_th, f)

with open(reports_dir_test / 'dnn_th.pkl', 'wb') as f:
    pickle.dump(mets_test_th, f)

In [None]:
with open(reports_dir_train / 'dnn_legacy.pkl', 'wb') as f:
    pickle.dump(mets_train_mini, f)


with open(reports_dir_test / 'dnn_legacy.pkl', 'wb') as f:
    pickle.dump(mets_test_mini, f)

with open(reports_dir_train / 'dnn_th_legacy.pkl', 'wb') as f:
    pickle.dump(mets_train_th_mini, f)

with open(reports_dir_test / 'dnn_th_legacy.pkl', 'wb') as f:
    pickle.dump(mets_test_th_mini, f)

### Load Nested CV results

In [19]:
with open(reports_dir_train / 'dnn.pkl', 'rb') as f:
    mets_train = pickle.load(f)

with open(reports_dir_test / 'dnn.pkl', 'rb') as f:
    mets_test = pickle.load(f)

with open(reports_dir_train / 'dnn_th.pkl', 'rb') as f:
    mets_train_th = pickle.load(f)

with open(reports_dir_test / 'dnn_th.pkl', 'rb') as f:
    mets_test_th = pickle.load(f)

In [None]:
with open(reports_dir_train / 'dnn_legacy.pkl', 'rb') as f:
    mets_train_mini = pickle.load(f)

with open(reports_dir_test / 'dnn_legacy.pkl', 'rb') as f:
    mets_test_mini = pickle.load(f)

with open(reports_dir_train / 'dnn_th_legacy.pkl', 'rb') as f:
    mets_train_th_mini = pickle.load(f)

with open(reports_dir_test / 'dnn_th_legacy.pkl', 'rb') as f:
    mets_test_th_mini = pickle.load(f)

### Train final model

In [20]:
with open(reports_dir_test / 'dnn.pkl', 'rb') as f:
    mets_test = pickle.load(f)

with open(reports_dir_test / 'dnn_th.pkl', 'rb') as f:
    mets_test_th = pickle.load(f)

In [None]:
with open(reports_dir_test / 'dnn_legacy.pkl', 'rb') as f:
    mets_test_mini = pickle.load(f)

with open(reports_dir_test / 'dnn_th_legacy.pkl', 'rb') as f:
    mets_test_th_mini = pickle.load(f)

In [21]:
hp = ['layers_num', 'hl_dim', 'lr', 'patience', 'loss', 'epochs', 'batch'] 

In [22]:
print("Best Hyperparameters per Fold in outer loop")
pd.DataFrame([mets_test[hp_name] for hp_name in hp], 
             columns=["Fold 1", "Fold 2", "Fold 3", "Fold 4", "Fold 5"], 
             index=hp)

Best Hyperparameters per Fold in outer loop


Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
layers_num,3,3,3,3,3
hl_dim,"[16, 64, 13]","[16, 64, 13]","[16, 64, 13]","[16, 64, 13]","[16, 64, 13]"
lr,0.01,0.01,0.01,0.01,0.01
patience,100,100,100,100,100
loss,mse,mse,mse,mse,mse
epochs,1500,1500,1500,1500,1500
batch,32,32,32,32,32


In [23]:
print("Best Hyperparameters per Fold in outer loop")
pd.DataFrame([mets_test_th[hp_name] for hp_name in hp], 
             columns=["Fold 1", "Fold 2", "Fold 3", "Fold 4", "Fold 5"], 
             index=hp)

Best Hyperparameters per Fold in outer loop


Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
layers_num,2,2,2,2,2
hl_dim,"[64, 13]","[64, 13]","[64, 13]","[64, 13]","[64, 13]"
lr,0.01,0.01,0.01,0.01,0.01
patience,100,100,100,100,100
loss,mse,mse,mse,mse,mse
epochs,300,300,300,300,300
batch,32,32,32,32,32


In [None]:
print("Best Hyperparameters per Fold in outer loop (legacy model)")
pd.DataFrame([mets_test_mini[hp_name] for hp_name in hp], 
             columns=["Fold 1", "Fold 2", "Fold 3", "Fold 4", "Fold 5"], 
             index=hp)

In [None]:
print("Best Hyperparameters per Fold in outer loop (legacy model)")
pd.DataFrame([mets_test_th_mini[hp_name] for hp_name in hp], 
             columns=["Fold 1", "Fold 2", "Fold 3", "Fold 4", "Fold 5"], 
             index=hp)