In [173]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle 

from pathlib import Path
from tqdm import tqdm

from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.inspection import permutation_importance

from functools import partial

#  model
from sklearn.ensemble import RandomForestRegressor

#  data transformations
from src.data.data_utils import augment_data, fill_random_2d, fill_value_2d, transform_x

# scores and metrics
from sklearn.metrics import r2_score, make_scorer
from src.metrics.metrics import (exponential_mae, exponential_mape, exponential_mse, exponential_r2, 
exponential_mae_per_class, exponential_mape_per_class, exponential_mse_per_class, exponential_r2_per_class, r2_per_class, r2)

%matplotlib qt
tqdm = partial(tqdm, position=0, leave=True)

In [174]:

# Paths: input dataset directory plus output locations for CV reports and fitted models.
datasets_dir = Path('../../data/datasets/dataset_hplc_multi/')
reports_dir_test = Path('../../reports/cross_val')
reports_dir_train = Path('../../reports/cross_val_train')
dir_model = Path('../../model')

# Create every output directory up front; directories that already exist are left as they are.
for _out_dir in (reports_dir_test, reports_dir_train, dir_model):
    _out_dir.mkdir(parents=True, exist_ok=True)

# params
# 13 values, the same count as `pigments` below — presumably one threshold per pigment in
# that order (TODO confirm against the data-preparation code).
pigments_threshold = [
    0.00248, 0.05878, 0.003, 0.00518, 0.003, 0.01302, 0.0036,
    0.00968, 0.001, 0.0018, 0.00844, 0.00242, 0.001,
]
# Natural log of every threshold, kept as a plain Python list.
log_pigments_threshold = [log_thr for log_thr in np.log(pigments_threshold)]

# Auxiliary metrics, each called as metric(y_true, y_pred) by the nested CV.
metrics = dict(
    mae=exponential_mae,
    mse=exponential_mse,
    r2=exponential_r2,
    mape=exponential_mape,
    mae_per_class=exponential_mae_per_class,
    mse_per_class=exponential_mse_per_class,
    r2_per_class=exponential_r2_per_class,
    mape_per_class=exponential_mape_per_class,
    r2_log=r2_score,
    r2_log_per_class=r2_per_class,
)

# Target column names; also used as column labels of the per-pigment result tables.
pigments = [
    'chlide_a[mg*m^3]',
    'chla[mg*m^3]',
    'chlb[mg*m^3]',
    'chlc1+c2[mg*m^3]',
    'fucox[mg*m^3]',
    "19'hxfcx[mg*m^3]",
    "19'btfcx[mg*m^3]",
    'diadino[mg*m^3]',
    'allox[mg*m^3]',
    'diatox[mg*m^3]',
    'zeaxan[mg*m^3]',
    'beta_car[mg*m^3]',
    'peridinin[mg*m^3]',
]

def _rf_search_space(bootstrap):
    """Return the random-forest search space for one value of ``bootstrap``.

    ``max_samples`` (the per-tree subsample fraction) is only searched when
    bootstrapping is enabled. Keys are inserted in the order the rest of the
    notebook displays them.
    """
    space = {
        'n_estimators': np.arange(20, 100, 5),
        'max_depth': [None, 5, 6, 7, 8, 9, 10],
        'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
        'min_samples_leaf': [1, 2, 3, 4, 5, 6, 10],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [bootstrap],
        # alternative that was tried: ['squared_error', 'absolute_error']
        'criterion': ['squared_error', 'friedman_mse'],
    }
    if bootstrap:
        # Subsample fraction for training each tree
        space['max_samples'] = [None, 0.5, 0.6, 0.75, 0.9]
    # Complexity parameter for Minimal Cost-Complexity Pruning
    space['ccp_alpha'] = [0.0, 0.01, 0.1, 0.2]
    return space


# Two sub-spaces: bootstrapped forests (with max_samples) and non-bootstrapped forests.
param_dist = [_rf_search_space(True), _rf_search_space(False)]


## Define and run nested CV 

In [175]:

def _aux_scores(score_aux, y_true, y_pred):
    """Apply every metric in ``score_aux`` to one set of predictions ({} when it is None)."""
    if score_aux is None:
        return {}
    return {name: metric(y_true, y_pred) for name, metric in score_aux.items()}


def ncv_random_forest(x, y, param_d, score, score_aux=None, outer_splits=5, inner_splits=3, seed=1, n_it=10):
    """Run nested cross-validation for a ``RandomForestRegressor``.

    Outer loop: shuffled ``KFold`` with ``outer_splits`` folds. For each fold the
    training part is passed through ``augment_data(..., replicate=9)`` and
    ``fill_value_2d(..., val=np.nan, percent=0.1)`` is applied to its ``lat`` and
    ``lon`` columns; the test part is left untouched.
    Inner loop: ``RandomizedSearchCV`` with ``n_it`` candidates drawn from
    ``param_d`` and a shuffled ``KFold`` with ``inner_splits`` folds.

    Parameters
    ----------
    x, y : pandas objects indexed positionally with ``.iloc``; ``x`` must
        contain ``lat`` and ``lon`` columns.
    param_d : dict or list of dict, forwarded as ``param_distributions``.
    score : scorer callable ``score(estimator, X, y)`` used for the search,
        for the reported ``"score"`` and for the permutation importance.
    score_aux : optional dict ``name -> metric(y_true, y_pred)`` of extra
        metrics reported for both the training and the test part.
    outer_splits, inner_splits : number of outer / inner folds.
    seed : random state of both ``KFold`` splitters, the search and the forest.
    n_it : number of sampled hyperparameter candidates per outer fold.

    Returns
    -------
    (train_result, test_result) : two dicts keyed by outer fold index
        (0 .. outer_splits - 1). Every value is a dict holding the selected
        hyperparameters, ``"score"``, the auxiliary metrics and either
        ``"mean decrease impurity"`` (train) or
        ``"permutation importance mean"`` / ``"permutation importance std"`` (test).
    """
    kfold_outer = KFold(n_splits=outer_splits, shuffle=True, random_state=seed)
    train_result = {}
    test_result = {}
    for fold, (train_idx, test_idx) in enumerate(tqdm(kfold_outer.split(x), total=outer_splits)):
        X_train, X_test = x.iloc[train_idx].copy(), x.iloc[test_idx].copy()
        y_train, y_test = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()

        # Augmentation and lat/lon masking are applied to the training part only.
        # NOTE(review): the inner folds are drawn from the *augmented* set; if augment_data
        # emits near-copies of each sample, copies of one sample can land in both the inner
        # training and validation folds and bias the search — confirm this is acceptable.
        X_train, y_train = augment_data(X_train, y_train, replicate=9)
        # X_train.loc[:, ['lat', 'lon']] = fill_random_2d(X_train.loc[:, ['lat', 'lon']].values, 0.1)
        X_train.loc[:, ['lat']] = fill_value_2d(X_train.loc[:, ['lat']].values, val=np.nan, percent=0.1)
        X_train.loc[:, ['lon']] = fill_value_2d(X_train.loc[:, ['lon']].values, val=np.nan, percent=0.1)
        # X_train.loc[:, :] = fill_random_2d(X_train.loc[:, :].values, 0.1)

        kfold_inner = KFold(n_splits=inner_splits, shuffle=True, random_state=seed)

        # RandomizedSearchCV for hyperparameter tuning
        randomized_search = RandomizedSearchCV(
            estimator=RandomForestRegressor(random_state=seed),
            param_distributions=param_d,
            n_iter=n_it,
            scoring=score,
            cv=kfold_inner,
            # cv=inner_splits,
            random_state=seed,
            refit=True,
            n_jobs=-1,
        )
        randomized_search.fit(X_train, y_train)

        best_params = randomized_search.best_params_
        # With refit=True the search has already fitted a clone of the base estimator
        # (same random_state) with best_params on the whole outer training set, so it is
        # reused instead of training an identical forest a second time.
        best_model = randomized_search.best_estimator_

        # Evaluate the model on the outer training set
        train_pred = best_model.predict(X_train)
        train_result[fold] = {
            **best_params,
            "score": score(best_model, X_train, y_train),
            **_aux_scores(score_aux, y_train, train_pred),
            "mean decrease impurity": best_model.feature_importances_,
        }

        # Evaluate the model on the outer test set
        test_pred = best_model.predict(X_test)
        aux_res_test = _aux_scores(score_aux, y_test, test_pred)
        # NOTE(review): random_state is fixed at 42 here rather than following `seed`;
        # kept as is so existing results stay comparable — confirm whether it should follow `seed`.
        pi = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2, scoring=score)
        test_result[fold] = {
            **best_params,
            "score": score(best_model, X_test, y_test),
            **aux_res_test,
            "permutation importance mean": pi.importances_mean,
            "permutation importance std": pi.importances_std,
        }
    return train_result, test_result


def custom_score(y_true, y_pred, target='chla[mg*m^3]'):
    """R2 of a single target column, usable as a ``make_scorer`` score function.

    Parameters
    ----------
    y_true : pandas.DataFrame of true targets containing the column ``target``.
    y_pred : predictions, either a DataFrame with a ``target`` column or an
        array whose columns follow the column order of ``y_true``. The array
        case is what a scorer passes: ``estimator.predict`` returns a plain
        ndarray, which cannot be indexed by column name.
    target : name of the column to score (default: chlorophyll-a).

    Returns
    -------
    The R2 score of ``target``.
    """
    true_col = y_true[target]
    if hasattr(y_pred, 'columns'):
        pred_col = y_pred[target]
    else:
        # Map the column name to its position so array predictions can be sliced.
        col_idx = list(y_true.columns).index(target)
        pred_col = np.asarray(y_pred)[:, col_idx]
    return np.min(r2_score(true_col, pred_col))  # ,  multioutput='raw_values')) # + r2_score(y_true, y_pred)
    

In [176]:
# Predictors (x) and targets (y) are stored as two CSV files in the dataset directory.
x = pd.read_csv(datasets_dir / 'log_rrs_lat_lon_month_season_depth_loc.csv')
y = pd.read_csv(datasets_dir / 'log_pigments.csv')

# only med and black sea: the same boolean row mask is applied to both frames,
# then the location indicator columns are removed from the predictors.
in_region = x['med and black sea'].astype(bool)
y = y[in_region]
x = x[in_region].drop(columns=['med', 'black sea', 'med and black sea'])

# Number of hyperparameter candidates sampled per outer fold (passed as n_it).
n = 300

# Wrap the score function for use in RandomizedSearchCV
# custom_scorer = make_scorer(custom_score, greater_is_better=True)
custom_scorer = make_scorer(r2, greater_is_better=True)


In [177]:
# Predictor columns that are not used by the models trained below.
# Note: the cell rebinds `x`, so running it a second time raises a KeyError.
month_cols = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
season_cols = ['summer', 'autumn', 'spring', 'winter']
# x = x.drop(columns=['lat', 'lon'])
x = (
    x
    .drop(columns=month_cols)
    .drop(columns=['depth'])
    .drop(columns=season_cols)
)


In [178]:
# Disabled code kept as a string literal: nothing inside the quotes is executed,
# the cell only echoes the text. It names a different dataset (pigments -> PFT columns).
'''data = pd.read_csv('../Pierre/data/pigments_export.csv')
pft_names = ['Bacillariophyceae', 'Bolidophyceae', 'Chlorarachnida', 'Chrysophyceae',
       'Coscinodiscophyceae', 'Cryptophyta', 'Dictyochophyceae',
       'Dinoflagellata', 'Haptophyta', 'Mediophyceae', 'MOCH',
       'Other.Photosynthetic.Eukaryotes', 'Pelagophyceae', 'Radiolaria',
       'Non.Phototrophic.Eukaryotes', 'Synechococcus', 'Prochlorococcus',
       'Non.Phototrophic.Prokaryotes', 'Other.Photosynthetic.Prokaryotes',
       'Cyanobium']

pig_names = ['Chl.C2', 'Peridinine', 'BF.19', 'Fucoxanthin',
       'Prasinoxanthin', 'HF.19', 'Diadinoxanthin', 'Alloxanthin',
       'Zeaxanthin', 'Chl.b', 'Chl.a', 'BB.Carotene']

x_raw = data[pig_names].copy()
y_raw = data[pft_names].copy()'''

"data = pd.read_csv('../Pierre/data/pigments_export.csv')\npft_names = ['Bacillariophyceae', 'Bolidophyceae', 'Chlorarachnida', 'Chrysophyceae',\n       'Coscinodiscophyceae', 'Cryptophyta', 'Dictyochophyceae',\n       'Dinoflagellata', 'Haptophyta', 'Mediophyceae', 'MOCH',\n       'Other.Photosynthetic.Eukaryotes', 'Pelagophyceae', 'Radiolaria',\n       'Non.Phototrophic.Eukaryotes', 'Synechococcus', 'Prochlorococcus',\n       'Non.Phototrophic.Prokaryotes', 'Other.Photosynthetic.Prokaryotes',\n       'Cyanobium']\n\npig_names = ['Chl.C2', 'Peridinine', 'BF.19', 'Fucoxanthin',\n       'Prasinoxanthin', 'HF.19', 'Diadinoxanthin', 'Alloxanthin',\n       'Zeaxanthin', 'Chl.b', 'Chl.a', 'BB.Carotene']\n\nx_raw = data[pig_names].copy()\ny_raw = data[pft_names].copy()"

In [179]:
# Disabled code kept as a string literal: nothing inside the quotes is executed,
# so the `x` / `y` assignments written in it do not affect the frames used below.
'''
# substitute 0s by lowest value and apply logs
x_raw_aux = x_raw.copy()
y_raw_aux = y_raw.copy()

x_zer_ind = x_raw_aux == 0
y_zer_ind = y_raw_aux == 0

x_raw_aux[x_zer_ind] = 1
y_raw_aux[y_zer_ind] = 1

x_raw = pd.DataFrame(np.maximum(x_raw.values, x_raw_aux.quantile(0.01)), columns=x_raw.columns)
y_raw = pd.DataFrame(np.maximum(y_raw.values, y_raw_aux.quantile(0.01)), columns=y_raw.columns)
x = np.log(x_raw)
y = np.log(y_raw)
'''

'\n# substitute 0s by lowest value and apply logs\nx_raw_aux = x_raw.copy()\ny_raw_aux = y_raw.copy()\n\nx_zer_ind = x_raw_aux == 0\ny_zer_ind = y_raw_aux == 0\n\nx_raw_aux[x_zer_ind] = 1\ny_raw_aux[y_zer_ind] = 1\n\nx_raw = pd.DataFrame(np.maximum(x_raw.values, x_raw_aux.quantile(0.01)), columns=x_raw.columns)\ny_raw = pd.DataFrame(np.maximum(y_raw.values, y_raw_aux.quantile(0.01)), columns=y_raw.columns)\nx = np.log(x_raw)\ny = np.log(y_raw)\n'

In [180]:
# Nested CV on the full predictor set: 5 outer folds, 3 inner folds, n sampled candidates per fold.
ncv_train, ncv_test = ncv_random_forest(
    x,
    y,
    param_dist,
    custom_scorer,
    score_aux=metrics,
    outer_splits=5,
    inner_splits=3,
    n_it=n,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:28<00:00, 29.69s/it]


In [181]:

# Attribute names are collected over *all* outer folds (in order of first appearance) instead
# of fold 0 only: a fold whose best configuration uses bootstrap=False has no 'max_samples'
# entry, and reading the keys from such a fold alone would drop that attribute for every fold.
attribs_train = list(dict.fromkeys(key for fold in ncv_train.values() for key in fold))
# Entries that are averaged over the folds; all other attributes are kept as per-fold lists.
met_names_train = list(metrics.keys()) + ['mean decrease impurity']

attribs_test = list(dict.fromkeys(key for fold in ncv_test.values() for key in fold))
met_names_test = list(metrics.keys()) + ['permutation importance mean', 'permutation importance std']

In [182]:
# One entry per outer fold, in fold order. `fold.get` leaves a None placeholder when a fold
# lacks the attribute (e.g. 'max_samples' when bootstrap=False was selected), so position k
# always belongs to fold k instead of the list silently shrinking and shifting.
mets_train = {attrib: [fold.get(attrib) for fold in ncv_train.values()] for attrib in attribs_train}
# Metrics and importances are averaged over the folds; hyperparameters and "score" stay per-fold lists.
mets_train = {key: np.mean(value, axis=0) if key in met_names_train else value for key, value in mets_train.items()}

In [183]:
# One entry per outer fold, in fold order. `fold.get` leaves a None placeholder when a fold
# lacks the attribute (e.g. 'max_samples' when bootstrap=False was selected), so position k
# always belongs to fold k instead of the list silently shrinking and shifting.
mets_test = {attrib: [fold.get(attrib) for fold in ncv_test.values()] for attrib in attribs_test}
# Metrics and importances are averaged over the folds; hyperparameters and "score" stay per-fold lists.
mets_test = {key: np.mean(value, axis=0) if key in met_names_test else value for key, value in mets_test.items()}

### Legacy training

In [184]:
# Reduced ("legacy") predictor set: the full set without the columns listed here.
legacy_dropped_cols = ["400", "620", "510", "665", "681", "708", "778", "865"]
x_mini = x.drop(columns=legacy_dropped_cols)

In [185]:
# Nested CV on the reduced predictor set; the fold counts fall back to the function defaults.
ncv_train_mini, ncv_test_mini = ncv_random_forest(
    x_mini,
    y,
    param_dist,
    custom_scorer,
    score_aux=metrics,
    n_it=n,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:13<00:00, 26.77s/it]


In [186]:
# One entry per outer fold, in fold order. `fold.get` leaves a None placeholder when a fold
# lacks the attribute (e.g. 'max_samples' when bootstrap=False was selected), so position k
# always belongs to fold k instead of the list silently shrinking and shifting.
mets_train_mini = {attrib: [fold.get(attrib) for fold in ncv_train_mini.values()] for attrib in attribs_train}
# Metrics and importances are averaged over the folds; hyperparameters and "score" stay per-fold lists.
mets_train_mini = {key: np.mean(value, axis=0) if key in met_names_train else value for key, value in mets_train_mini.items()}

In [187]:

# One entry per outer fold, in fold order. `fold.get` leaves a None placeholder when a fold
# lacks the attribute (e.g. 'max_samples' when bootstrap=False was selected), so position k
# always belongs to fold k instead of the list silently shrinking and shifting.
mets_test_mini = {attrib: [fold.get(attrib) for fold in ncv_test_mini.values()] for attrib in attribs_test}
# Metrics and importances are averaged over the folds; hyperparameters and "score" stay per-fold lists.
mets_test_mini = {key: np.mean(value, axis=0) if key in met_names_test else value for key, value in mets_test_mini.items()}

### Test Metrics

In [195]:
# Fold-averaged test metrics, one row per model. Note that the 'R2' column is read from the
# 'r2_log' entry, while the other columns come from the 'mape', 'mae' and 'mse' entries.
summary_columns = [('R2', 'r2_log'), ('MAPE', 'mape'), ('MAE', 'mae'), ('MSE', 'mse')]
pd.DataFrame(
    {label: [mets_test[key], mets_test_mini[key]] for label, key in summary_columns},
    index=['RF', 'RF legacy'],
)

Unnamed: 0,R2,MAPE,MAE,MSE
RF,0.812763,0.396068,0.053998,0.132813
RF legacy,0.816636,0.37893,0.057079,0.168724


#### Per Pigment

In [196]:
# Fold-averaged test 'r2_log_per_class' values: one row per model, one column per pigment.
r2_rows_test = [mets_test['r2_log_per_class'], mets_test_mini['r2_log_per_class']]
pd.DataFrame(r2_rows_test, index=['RF', 'RF legacy'], columns=pigments)

Unnamed: 0,chlide_a[mg*m^3],chla[mg*m^3],chlb[mg*m^3],chlc1+c2[mg*m^3],fucox[mg*m^3],19'hxfcx[mg*m^3],19'btfcx[mg*m^3],diadino[mg*m^3],allox[mg*m^3],diatox[mg*m^3],zeaxan[mg*m^3],beta_car[mg*m^3],peridinin[mg*m^3]
RF,0.739014,0.923096,0.795347,0.929336,0.902531,0.849332,0.620303,0.91154,0.766646,0.795963,0.538785,0.89816,0.895871
RF legacy,0.74411,0.907482,0.762974,0.917761,0.893908,0.846093,0.635585,0.912958,0.763842,0.826238,0.611076,0.887163,0.907083


### Training Metrics

In [197]:
# Fold-averaged training metrics, one row per model. Note that the 'R2' column is read from
# the 'r2_log' entry, while the other columns come from the 'mape', 'mae' and 'mse' entries.
summary_columns = [('R2', 'r2_log'), ('MAPE', 'mape'), ('MAE', 'mae'), ('MSE', 'mse')]
pd.DataFrame(
    {label: [mets_train[key], mets_train_mini[key]] for label, key in summary_columns},
    index=['RF', 'RF legacy'],
)

Unnamed: 0,R2,MAPE,MAE,MSE
RF,0.958841,0.167345,0.016563,0.006416
RF legacy,0.949352,0.189221,0.021015,0.012821


In [198]:
# Fold-averaged training 'r2_log_per_class' values: one row per model, one column per pigment.
r2_rows_train = [mets_train['r2_log_per_class'], mets_train_mini['r2_log_per_class']]
pd.DataFrame(r2_rows_train, index=['RF', 'RF legacy'], columns=pigments)

Unnamed: 0,chlide_a[mg*m^3],chla[mg*m^3],chlb[mg*m^3],chlc1+c2[mg*m^3],fucox[mg*m^3],19'hxfcx[mg*m^3],19'btfcx[mg*m^3],diadino[mg*m^3],allox[mg*m^3],diatox[mg*m^3],zeaxan[mg*m^3],beta_car[mg*m^3],peridinin[mg*m^3]
RF,0.962218,0.982634,0.945012,0.984486,0.977863,0.966433,0.918839,0.983236,0.953751,0.958395,0.881392,0.978109,0.972561
RF legacy,0.950117,0.977332,0.933874,0.979611,0.971797,0.957919,0.895918,0.97902,0.941523,0.948369,0.865611,0.972072,0.968411


## Save nested CV results

In [202]:
# (directory, file name, payload) for every aggregated result, written in this order.
# The file names are kept exactly as they are because the loading cells read the same names.
cv_outputs = [
    (reports_dir_train, 'ranfom_forest.pkl', mets_train),
    (reports_dir_test, 'ranfom_forest.pkl', mets_test),
    (reports_dir_train, 'ranfom_forest_legacy.pkl', mets_train_mini),
    (reports_dir_test, 'ranfom_forest_legacy.pkl', mets_test_mini),
]
for out_dir, file_name, payload in cv_outputs:
    with open(out_dir / file_name, 'wb') as f:
        pickle.dump(payload, f)

## Load nested CV results


In [203]:
def _read_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# Restore the aggregated nested-CV results written by the saving cell of this notebook.
mets_train_mini = _read_pickle(reports_dir_train / 'ranfom_forest_legacy.pkl')
mets_test_mini = _read_pickle(reports_dir_test / 'ranfom_forest_legacy.pkl')
mets_train = _read_pickle(reports_dir_train / 'ranfom_forest.pkl')
mets_test = _read_pickle(reports_dir_test / 'ranfom_forest.pkl')

## Show Feature importance

In [204]:
# Permutation importance on the outer test folds (mean and std averaged over the folds)
# for the model trained on all predictor columns of `x`.
feature_names = x.columns
forest_importances = pd.Series(mets_test["permutation importance mean"], index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot(kind="bar", yerr=mets_test["permutation importance std"], ax=ax)
ax.set(title="Feature importances using permutation on full model", ylabel="Mean r2 decrease")
fig.tight_layout()
plt.show()

In [205]:
# Permutation importance on the outer test folds (mean and std averaged over the folds)
# for the legacy model, i.e. the one trained on the reduced predictor set `x_mini`.
feature_names = x_mini.columns

forest_importances = pd.Series(mets_test_mini["permutation importance mean"], index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=mets_test_mini["permutation importance std"], ax=ax)
# The title names the legacy model so this figure can be told apart from the
# full-feature figure, which is drawn with otherwise identical labels.
ax.set_title("Feature importances using permutation on legacy model")
ax.set_ylabel("Mean r2 decrease")
fig.tight_layout()
plt.show()

In [32]:

# Impurity-based importances (feature_importances_) of the full-feature model, averaged over
# the outer training folds. The index is taken from `x` explicitly: the `feature_names` left
# behind by the legacy plotting cell holds the reduced column set and does not match the
# length of these importances.
feature_names = x.columns
forest_importances = pd.Series(mets_train["mean decrease impurity"], index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
# Labels describe what is plotted: mean decrease in impurity, not permutation importance.
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
plt.show()

### Train final model

In [206]:
# Re-read the aggregated test-fold results (legacy model first, then the full model).
with (reports_dir_test / 'ranfom_forest_legacy.pkl').open('rb') as f:
    mets_test_mini = pickle.load(f)

with (reports_dir_test / 'ranfom_forest.pkl').open('rb') as f:
    mets_test = pickle.load(f)

In [208]:
#  Select best hyperparameters

print("Hyperparameters are:\n")
hp = list(param_dist[0].keys())
hp

Hyperparameters are:



['n_estimators',
 'max_depth',
 'min_samples_split',
 'min_samples_leaf',
 'max_features',
 'bootstrap',
 'criterion',
 'max_samples',
 'ccp_alpha']

In [209]:
print("Best Hyperparameters per Fold in outer loop")
fold_labels = ["Fold 1", "Fold 2", "Fold 3", "Fold 4", "Fold 5"]
# A hyperparameter that is absent from the stored results is shown as a row of None.
hp_rows = [mets_test.get(hp_name, [None] * 5) for hp_name in hp]
pd.DataFrame(hp_rows, columns=fold_labels, index=hp)

Best Hyperparameters per Fold in outer loop


Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
n_estimators,60,60,55,60,55
max_depth,10,10,9,10,9
min_samples_split,9,9,8,9,8
min_samples_leaf,2,2,2,2,2
max_features,,,sqrt,,sqrt
bootstrap,True,True,False,True,False
criterion,squared_error,squared_error,squared_error,squared_error,squared_error
max_samples,,,,,
ccp_alpha,0.0,0.0,0.0,0.0,0.0


In [210]:
print("Best Hyperparameters per Fold in outer loop (legacy model)")
# Same guard as the table for the full model: a hyperparameter that is absent from the stored
# results is shown as a row of None instead of raising a KeyError.
pd.DataFrame([mets_test_mini.get(hp_name, [None] * 5) for hp_name in hp],
             columns=["Fold 1", "Fold 2", "Fold 3", "Fold 4", "Fold 5"],
             index=hp)

Best Hyperparameters per Fold in outer loop (legacy model)


Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
n_estimators,60,60,55,60,60
max_depth,10,10,9,10,10
min_samples_split,9,9,8,9,9
min_samples_leaf,2,2,2,2,2
max_features,,,sqrt,,
bootstrap,True,True,False,True,True
criterion,squared_error,squared_error,squared_error,squared_error,squared_error
max_samples,,,,,
ccp_alpha,0.0,0.0,0.0,0.0,0.0


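The selected hyperparameters are very similar across folds but not identical: folds 1, 2 and 4 agree exactly, while folds 3 and 5 select `n_estimators=55`, `max_depth=9`, `min_samples_split=8`, `max_features='sqrt'` and `bootstrap=False`. The legacy model shows the same pattern, with only fold 3 deviating. The configuration chosen by the majority of folds (identical to fold 1) is used below.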

In [211]:
# Final hyperparameters: the values selected in the first outer fold (index 0) of the full model.
best_hp = {}
for hp_name in hp:
    best_hp[hp_name] = mets_test[hp_name][0]
best_hp

{'n_estimators': 60,
 'max_depth': 10,
 'min_samples_split': 9,
 'min_samples_leaf': 2,
 'max_features': None,
 'bootstrap': True,
 'criterion': 'squared_error',
 'max_samples': None,
 'ccp_alpha': 0.0}

Train final model

In [212]:
# Final training set: all available samples, augmented like the CV training folds.
x_train, y_train = augment_data(x, y, replicate=9)
# NOTE(review): the nested CV applied fill_value_2d(val=np.nan, percent=0.1) to lat and lon,
# whereas this cell uses fill_random_2d; confirm the difference is intended, since the
# reported CV metrics were obtained with the fill_value_2d variant.
x_train.loc[:, ['lat', 'lon']] = fill_random_2d(x_train.loc[:, ['lat', 'lon']].values, 0.1)

# Fixed random_state (the same value as the nested-CV default, seed=1) so that the forest
# built from a given training set is reproducible; without it every run saves a different model.
best_model = RandomForestRegressor(random_state=1, **best_hp)
best_model.fit(x_train, y_train)

In [213]:
# Legacy variant: same augmented training set without the columns excluded from `x_mini`.
x_train_mini = x_train.drop(columns=["400", "620", "510", "665", "681", "708", "778", "865"])

# Fixed random_state (the same value as the nested-CV default, seed=1) so that the forest
# built from a given training set is reproducible; without it every run saves a different model.
best_model_legacy = RandomForestRegressor(random_state=1, **best_hp)
best_model_legacy.fit(x_train_mini, y_train)

In [214]:
# R2 of the final model on `x` / `y`, the same frames its training set was built from,
# so this is an in-sample figure and not an estimate of generalisation.
y_fitted = best_model.predict(x)
r2_score(y.values, y_fitted)

0.972939839477626

In [215]:
# Column names the final model recorded when it was fitted (scikit-learn's feature_names_in_),
# shown to check which predictors, and in which order, the saved model expects.
best_model.feature_names_in_

array(['400', '412', '442', '490', '510', '560', '620', '665', '673',
       '681', '708', '778', '865', 'lat', 'lon'], dtype=object)

### Save models

In [216]:

# Persist both fitted models in the model directory (full model first, then legacy).
for file_name, fitted_model in (('rf.pkl', best_model), ('rf_legacy.pkl', best_model_legacy)):
    with open(dir_model / file_name, 'wb') as f:
        pickle.dump(fitted_model, f)
