# XGBoost: Tara Chile
In this section we conduct some experiments in order to prepare the XGB study on the Chilean data

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import xgboost as xgb

from glob import glob
import os
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [None]:
input_sat_dir = '../01_data/02_satellite_data_processed'

desired_files = [
'matrix_tara_chile_adj_grids_25_all.tsv'
]

predictor_files = sorted([f for f in glob(os.path.join(input_sat_dir, 'matrix_tara_chile_adj_grids_*.tsv')) 
                          if os.path.basename(f) in desired_files])


input_kmeans_dir = '../03_results/out_genomic_clusters'
target_vars_filename = 'kmeans_results_ch.tsv'
target_vars_path = os.path.join(input_kmeans_dir, target_vars_filename)

target_vars = pd.read_csv(target_vars_path, sep='\t', index_col=0)
target_vars = target_vars.map(lambda x: f"C{x}")
#target_vars.head()

desired_clusters = {'5', '6', '7', '8'} # only consider this number of clusters
columns_to_use = [col for col in target_vars.columns if col.startswith('clr_') and col.split('_')[-1] in desired_clusters] # only consider clr-abundance clusters



results_df = pd.DataFrame(index=[os.path.basename(file) for file in predictor_files], columns=columns_to_use)

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    #recall = recall_score(y_true, y_pred, average='macro')
    #precision = precision_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    #roc_auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    return (accuracy, f1)

Here we range over some selections of hyper-pareters for the XGB method, and use 

In [None]:
n_splits = 8
n_repeats = 9

rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

le = LabelEncoder()

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_macro': make_scorer(f1_score, average='macro')
}

for file in predictor_files:
    file_name = os.path.basename(file)
    idx = f"{file_name}_s{n_splits}_r{n_repeats}"
    df = pd.read_csv(file, sep='\t', index_col=0)

    aligned_predictor = df.loc[df.index.intersection(target_vars.index)] # satellite

    for target_column in columns_to_use:
        n_clusters = int(target_column[-1])
        X = aligned_predictor
        y = target_vars.loc[aligned_predictor.index, target_column]

        non_nan_indices = y.dropna().index
        X = X.loc[non_nan_indices]
        y = y.loc[non_nan_indices]

        y_encoded = le.fit_transform(y)

        unique, counts = np.unique(y_encoded, return_counts=True)
        min_samples = n_splits

        X_resampled = X.copy()
        y_resampled = y_encoded.copy()

        for cls, count in zip(unique, counts):
            if count < min_samples:
                diff = min_samples - count
                cls_indices = np.where(y_encoded == cls)[0]
                indices_to_duplicate = np.random.choice(cls_indices, diff, replace=True)
                X_resampled = np.concatenate([X_resampled, X.iloc[indices_to_duplicate]], axis=0)
                y_resampled = np.concatenate([y_resampled, y_encoded[indices_to_duplicate]], axis=0)

        model = xgb.XGBClassifier(eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = n_clusters,
                                    learning_rate =0.2,
                                    n_estimators=10,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    )

        #cv_results = cross_validate(model, X, y_encoded, cv=rskf, scoring=scoring, return_train_score=False)
        cv_results = cross_validate(model, X_resampled, y_resampled, cv=rskf, scoring=scoring, return_train_score=False)

        avg_accuracy = np.mean(cv_results['test_accuracy'])
        avg_f1_macro = np.mean(cv_results['test_f1_macro'])

        results_df.at[idx, target_column] = (avg_accuracy, avg_f1_macro)
                    
#print(results_df)


#results_df.to_csv('../03_results/out_predictions/predictions_kmeans.tsv', sep='\t')


In [None]:
results_df

This seems to show that the best column to try to predict is `clr_M0_all_kmeans_5`

In [None]:
labels = target_vars['clr_M0_all_kmeans_5']

In [None]:
file = f"../01_data/02_satellite_data_processed/{desired_files[0]}"
df = pd.read_csv(file, sep='\t', index_col=0)

In [None]:
aligned_predictor = df.loc[df.index.intersection(target_vars.index)] # satellite

In [None]:
lbs = labels.loc[aligned_predictor.index]
lbs = lbs.map(lambda x: int(f"{x[1:]}"))

In [None]:
aligned_predictor = aligned_predictor.drop(columns = ['IOP.aph_44','bbp_unc_443'])

In [None]:
target = 'labels'
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    #Cross-val to get optimal n_estimators
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        print(cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target])
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(dtrain[target].values, dtrain_predictions))
    print(f"AUC Score (Train): {roc_auc_score(dtrain[target], dtrain_predprob, multi_class = 'ovo')}")
    xgb.plot_importance(alg)
    plt.show()

In [None]:
full_data = aligned_predictor.copy()
full_data[target] = lbs

In [None]:
preds = aligned_predictor.columns

In [None]:
train, test = train_test_split(full_data, test_size= 0.3)
pred_train, lbs_train =  train[preds], train[target]

In [None]:
model_1 = xgb.XGBClassifier(use_label_encoder=False,
                                    booster = 'gbtree',
                                    eval_metric='merror',
                                    seed = 29,
                                    objective= 'multi:softmax',
                                    num_class = 5,
                                    learning_rate =0.01,
                                    n_estimators=10000,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    )
modelfit(model_1, train, preds)

In [None]:
model_1.n_estimators

In [None]:
param_test1 = {
 'max_depth':range(3,20,1),
 'min_child_weight':range(1,30,1)
}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(use_label_encoder=False,
                                    eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = 5,
                                    learning_rate =0.01,
                                    n_estimators=1,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    ), 
                                    param_grid = param_test1, scoring='roc_auc_ovo',n_jobs=1, cv=5)
gsearch1.fit(pred_train,lbs_train)

In [None]:
gsearch1.best_params_, gsearch1.best_score_

In [None]:
param_test3 = {
 'gamma':[i/100.0 for i in range(0,50)]
}
gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier(use_label_encoder=False,
                                    eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = 5,
                                    learning_rate =0.01,
                                    n_estimators=1,
                                    max_depth=3,
                                    min_child_weight=7,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    ), 
 param_grid = param_test3, scoring='roc_auc_ovo',n_jobs=1, cv=5)
gsearch3.fit(pred_train,lbs_train)

In [None]:
gsearch3.best_params_, gsearch3.best_score_

In [None]:
param_test4 = {
 'subsample':[i/10.0 for i in range(3,10)],
 'colsample_bytree':[i/10.0 for i in range(3,10)]
}
gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier(use_label_encoder=False,
                                    eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = 5,
                                    learning_rate =0.01,
                                    n_estimators=1,
                                    max_depth=3,
                                    min_child_weight=7,
                                    gamma=0.07,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    ), 
 param_grid = param_test4, scoring='roc_auc_ovo',n_jobs=1, cv=5)
gsearch4.fit(pred_train,lbs_train)

In [None]:

gsearch4.best_params_, gsearch4.best_score_

In [None]:
param_test5 = {
 'subsample':[i/100.0 for i in range(80,100,5)],
 'colsample_bytree':[i/100.0 for i in range(40,60,5)]
}
gsearch5 = GridSearchCV(estimator = xgb.XGBClassifier(use_label_encoder=False,
                                    eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = 5,
                                    learning_rate =0.01,
                                    n_estimators=1,
                                    max_depth=3,
                                    min_child_weight=7,
                                    gamma=0.07
                                    ), 
 param_grid = param_test5, scoring='roc_auc_ovo',n_jobs=1, cv=5)
gsearch5.fit(pred_train,lbs_train)

In [None]:

gsearch5.best_params_, gsearch5.best_score_

In [None]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = xgb.XGBClassifier(use_label_encoder=False,
                                    eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = 5,
                                    learning_rate =0.01,
                                    n_estimators=1,
                                    max_depth=3,
                                    min_child_weight=7,
                                    gamma=0.2,
                                    subsample=0.9,
                                    colsample_bytree=0.55
                                    ), 
 param_grid = param_test6, scoring='roc_auc_ovo',n_jobs=1, cv=5)
gsearch6.fit(pred_train,lbs_train)


In [None]:
gsearch6.best_params_, gsearch6.best_score_

In [None]:
param_test7 = {
 'learning_rate':[0.1, 0.01,0.001,0.0001]
}
gsearch7 = GridSearchCV(estimator = xgb.XGBClassifier(use_label_encoder=False,
                                    eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = 5,
                                    learning_rate =0.01,
                                    n_estimators=1,
                                    max_depth=3,
                                    min_child_weight=4,
                                    gamma=0.37,
                                    subsample=0.9,
                                    colsample_bytree=0.2,
                                    reg_alpha = 1e-05
                                    ), 
 param_grid = param_test7, scoring='roc_auc_ovo',n_jobs=1, cv=5)
gsearch7.fit(pred_train,lbs_train)

In [None]:

gsearch7.best_params_, gsearch6.best_score_

Finalmente eso nos deja con el siguiente estimador:


In [None]:
final_model = xgb.XGBClassifier(use_label_encoder=False,
                                    eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = 5,
                                    learning_rate =0.1,
                                    n_estimators=1,
                                    max_depth=3,
                                    min_child_weight=4,
                                    gamma=0.37,
                                    subsample=0.9,
                                    colsample_bytree=0.2,
                                    reg_alpha = 1e-05
                                    )

In [None]:
final_model.fit(pred_train,lbs_train)

In [None]:
xgb.plot_importance(final_model)

In [None]:
y_pred = final_model.predict(test[preds])

In [None]:

dtest_predprob = final_model.predict_proba(test[preds])
roc_auc_score(test[target], dtest_predprob, multi_class = 'ovr')

## Cleaned up version:

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import xgboost as xgb

from glob import glob
import os
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [None]:
input_sat_dir = '../01_data/02_satellite_data_processed'

desired_files = [
'matrix_tara_chile_adj_grids_25_all.tsv'
]

predictor_files = sorted([f for f in glob(os.path.join(input_sat_dir, 'matrix_tara_chile_adj_grids_*.tsv')) 
                          if os.path.basename(f) in desired_files])


input_kmeans_dir = '../03_results/out_genomic_clusters'
target_vars_filename = 'kmeans_results_ch.tsv'
target_vars_path = os.path.join(input_kmeans_dir, target_vars_filename)

target_vars = pd.read_csv(target_vars_path, sep='\t', index_col=0)
target_vars = target_vars.map(lambda x: f"C{x}")
#target_vars.head()

desired_clusters = {'5', '6', '7', '8'} # only consider this number of clusters
columns_to_use = [col for col in target_vars.columns if col.startswith('clr_') and col.split('_')[-1] in desired_clusters] # only consider clr-abundance clusters



results_df = pd.DataFrame(index=[os.path.basename(file) for file in predictor_files], columns=columns_to_use)

In [None]:
labels = target_vars['clr_M0_all_kmeans_5']
file = f"../01_data/02_satellite_data_processed/{desired_files[0]}"
df = pd.read_csv(file, sep='\t', index_col=0)
aligned_predictor = df.loc[df.index.intersection(target_vars.index)] # satellite
lbs = labels.loc[aligned_predictor.index]
lbs = lbs.map(lambda x: int(f"{x[1:]}"))
aligned_predictor = aligned_predictor.drop(columns = ['IOP.aph_44','bbp_unc_443'])
target = 'labels'

In [None]:
full_data = aligned_predictor.copy()
full_data[target] = lbs

In [None]:
preds = aligned_predictor.columns

In [None]:
train, test = train_test_split(full_data, test_size= 0.3)
pred_train, lbs_train =  train[preds], train[target]

In [None]:

# Definir el modelo
model = xgb.XGBClassifier(eval_metric='mlogloss')

# Definir los hiperparámetros a evaluar
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [5,8,10,12,15],
    'subsample': [0.3, 0.5, 0.8, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6],
    'reg_alpha':[5*1e-2, 0.1, 1, 10]
}

# Configurar la búsqueda en cuadrícula
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1_macro', verbose=1)

# Entrenar y buscar los mejores parámetros
grid_search.fit(pred_train, lbs_train)

print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor puntaje:", grid_search.best_score_)


Tiempo: 49 min

Mejores hiperparámetros: `{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10, 'reg_alpha': 1e-05, 'subsample': 0.8}`.

Mejor puntaje (accuracy): 0.7822656342513322

Refinement:

In [None]:
# Definir los hiperparámetros a evaluar
param_grid2 = {
    'max_depth': [3, 4],
    'learning_rate': [0.005, 0.01, 0.05],
    'n_estimators': [5, 10 , 15],
    'subsample': [0.6, 0.8, 0.9],
    'colsample_bytree': [0.4, 0.5, 0.6],
    'reg_alpha':[1e-5]
}

# Configurar la búsqueda en cuadrícula
grid_search2 = GridSearchCV(estimator=model, param_grid=param_grid2, cv=5, scoring='roc_auc_ovr', verbose=1)

# Entrenar y buscar los mejores parámetros
grid_search2.fit(pred_train, lbs_train)

print("Mejores hiperparámetros:", grid_search2.best_params_)
print("Mejor puntaje:", grid_search2.best_score_)

Mejores hiperparámetros: `{'colsample_bytree': 0.6, 'learning_rate': 0.005, 'max_depth': 3, 'n_estimators': 5, 'reg_alpha': 1e-05, 'subsample': 0.8}`

Mejor puntaje: 0.7853041164911875

In [None]:
#third and last refinement
# Definir los hiperparámetros a evaluar
param_grid3 = {
    'max_depth': [3],
    'learning_rate': [0.005,  0.006, 0.004],
    'n_estimators': [5,6, 7, 8],
    'subsample': [0.75, 0.8, 0.85],
    'colsample_bytree': [0.65, 0.55, 0.6],
    'reg_alpha':[1e-5]
}

# Configurar la búsqueda en cuadrícula
grid_search3 = GridSearchCV(estimator=model, param_grid=param_grid3, cv=5, scoring='roc_auc_ovr', verbose=1)

# Entrenar y buscar los mejores parámetros
grid_search3.fit(pred_train, lbs_train)

print("Mejores hiperparámetros:", grid_search3.best_params_)
print("Mejor puntaje:", grid_search3.best_score_)

In [None]:
# min_child_weight
# Definir los hiperparámetros a evaluar
param_grid3 = {
    'min_child_weight': range(10)
}

better_model = xgb.XGBClassifier(eval_metric='mlogloss', colsample_bytree = 0.55,  learning_rate= 0.005,  max_depth= 3, n_estimators= 8, reg_alpha= 1e-05, subsample= 0.75)

# Configurar la búsqueda en cuadrícula
grid_search3 = GridSearchCV(estimator=better_model, param_grid=param_grid3, cv=5, scoring='roc_auc_ovr', verbose=1)

# Entrenar y buscar los mejores parámetros
grid_search3.fit(pred_train, lbs_train)

print("Mejores hiperparámetros:", grid_search3.best_params_)
print("Mejor puntaje:", grid_search3.best_score_)

## Model tweaking

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import xgboost as xgb

from glob import glob
import os
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [None]:
input_sat_dir = '../01_data/02_satellite_data_processed'

desired_files = [
'matrix_tara_chile_adj_grids_25_all.tsv'
]

predictor_files = sorted([f for f in glob(os.path.join(input_sat_dir, 'matrix_tara_chile_adj_grids_*.tsv')) 
                          if os.path.basename(f) in desired_files])


input_kmeans_dir = '../03_results/out_genomic_clusters'
target_vars_filename = 'kmeans_results_ch.tsv'
target_vars_path = os.path.join(input_kmeans_dir, target_vars_filename)

target_vars = pd.read_csv(target_vars_path, sep='\t', index_col=0)
target_vars = target_vars.map(lambda x: f"C{x}")
#target_vars.head()

desired_clusters = {'5', '6', '7', '8'} # only consider this number of clusters
columns_to_use = [col for col in target_vars.columns if col.startswith('clr_') and col.split('_')[-1] in desired_clusters] # only consider clr-abundance clusters



results_df = pd.DataFrame(index=[os.path.basename(file) for file in predictor_files], columns=columns_to_use)

In [None]:
labels = target_vars['clr_M0_all_kmeans_5']
file = f"../01_data/02_satellite_data_processed/{desired_files[0]}"
df = pd.read_csv(file, sep='\t', index_col=0)
aligned_predictor = df.loc[df.index.intersection(target_vars.index)] # satellite
lbs = labels.loc[aligned_predictor.index]
lbs = lbs.map(lambda x: int(f"{x[1:]}"))
aligned_predictor = aligned_predictor.drop(columns = ['IOP.aph_44','bbp_unc_443'])
target = 'labels'

In [None]:
full_data = aligned_predictor.copy()
full_data[target] = lbs

In [None]:
preds = aligned_predictor.columns

In [None]:
train, test = train_test_split(full_data, test_size= 0.1)
pred_train, lbs_train =  train[preds], train[target]

In [None]:
better_model = xgb.XGBClassifier(eval_metric='mlogloss', colsample_bytree = 0.55,  learning_rate= 0.001,  max_depth= 3, n_estimators= 5, reg_alpha= 1e-02, subsample= 0.5, min_child_weight = 7 )
better_model.fit(pred_train,lbs_train)
xgb.plot_importance(better_model)

In [None]:
def metric_report(model, test):
    y_pred = model.predict(test[preds])
    dtest_predprob = better_model.predict_proba(test[preds])
    print(f"ROC AUC Score: {roc_auc_score(test[target], dtest_predprob, multi_class = 'ovr')}")
    print(f"f1 score: {f1_score(y_pred,test[target],average='macro')}")
    print(f"Acc score: {accuracy_score(y_pred,test[target])}")

In [None]:
metric_report(better_model,test)


In [None]:
naive_model = xgb.XGBClassifier(eval_metric='mlogloss')
naive_model.fit(pred_train,lbs_train)

In [None]:
metric_report(naive_model,test)

In [None]:
def get_xgb_imp(xgb):
    imp_vals = xgb.get_booster().get_fscore()
    feats_imp = pd.DataFrame(imp_vals,index=np.arange(2)).T
    feats_imp.iloc[:,0]= feats_imp.index    
    feats_imp.columns=['feature','importance']
    feats_imp.sort_values('importance',inplace=True,ascending=False)
    #feats_imp.reset_index(drop=True,inplace=True)
    return feats_imp.set_index('feature')

In [None]:
naive_imps = get_xgb_imp(naive_model)

In [None]:
important_feats = list(naive_imps[naive_imps['importance']>= 100].index)


In [None]:
better_model = xgb.XGBClassifier(eval_metric='mlogloss', colsample_bytree = 0.55,  learning_rate= 0.001,  max_depth= 3, n_estimators= 5, reg_alpha= 1e-02, subsample= 0.5, min_child_weight = 7)
better_model.fit(train.loc[:,important_feats],lbs_train)

In [None]:
metric_report(better_model,test)

In [None]:
y_pred = better_model.predict(test[important_feats])

In [None]:
y_pred

In [None]:
f1_score(test[target],y_pred,average='macro')

In [None]:
accuracy_score(y_pred,test[target])

# MD-based clustering.

In [None]:
import pandas as pd
import os

In [None]:
# Read metadata and clusters 
md_path = '../01_data/01_biological_data/metadata_chile.tsv'
md_df = pd.read_csv(md_path, sep = "\t")
cl_path = '../03_results/out_genomic_clusters/kmeans_results_ch.tsv'
cl_df = pd.read_csv(cl_path, sep = "\t")

#Export to get datased to plot in 3D
cols_to_get = cl_df.columns.to_list() + ['lat_cast','lon_cast', 'Depth [m]']
file = pd.merge(md_df, cl_df, on='Samples')[cols_to_get]
file.to_csv(path_or_buf='../03_results/clusters_with_coords.tsv', sep= '\t')
# Prepare df for the study
md_df.set_index('Samples', inplace=True)
cl_df.set_index('Samples', inplace=True)
s1 = md_df['Nitrate [uM]']
s2 = md_df['Nitrates [uM]']
nitrates = 0.5*(s1+s2)

md_df['nitrates [uM]'] = nitrates 

md_df = md_df[['Temperature [ºC]','Oxygen [ml/l]','nitrates [uM]', 'Depth level']]


In [None]:
md_df['Depth level'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Generate plots for the main metadata feats ordered from smaller to larger value.
xvals = {'all': 148, 'SRF': 27, 'MES': 38, 'EPI': 81}
spaces = {'Temperature [ºC]':5,'Oxygen [ml/l]':7,'nitrates [uM]':5}
for col in md_df.columns:
    if col != 'Depth level':
        space = spaces[col]
        xval = xvals['all']
        data = md_df[col].sort_values()
        fig, ax = plt.subplots(figsize = (25,15))
        ax.set_title(f'{col} values in TARA Chile (all)', fontsize = 35, pad = 20)
        ax.set_xticks([])
        plt.yticks(fontsize = 15)
        plt.axhline(data.mean(), linestyle='--', color = 'r', label = 'Mean')
        plt.axhline(data.median(), linestyle='--', color = 'g', label = 'Median')
        ax.set_ylabel(f'{col}', fontsize = 25,labelpad=20)
        ax.set_xlabel('Sample', fontsize = 25,labelpad=20)
        ax.legend(title="MTCs", fontsize = 15, title_fontsize = 18)
        plt.text(xval, data.min(), f'Min: {data.min():.3f}\nMax: {data.max():.3f}\nMean: {data.mean():.3f}\nMedian: {data.median():.3f}', fontsize = 20, bbox = dict(facecolor = 'green', alpha = 0.2, ec = 'black'))
        plt.scatter(data.index,data)
        sns.set(style='darkgrid')
        path = f'../03_results/out_ch_data_analysis/{col[:-space]}_all'
        plt.savefig(path)
        for depth in md_df['Depth level'].unique():
            #plot by depth level
            xval = xvals[depth]
            data = (md_df[md_df['Depth level'] == depth])[col].sort_values()
            fig, ax = plt.subplots(figsize = (25,15))
            ax.set_title(f'{col} values in TARA Chile ({depth})', fontsize = 35, pad = 20)
            ax.set_xticks([])
            plt.yticks(fontsize = 15)
            plt.axhline(data.mean(), linestyle='--', color = 'r', label = 'Mean')
            plt.axhline(data.median(), linestyle='--', color = 'g', label = 'Median')
            ax.set_ylabel(f'{col}', fontsize = 25,labelpad=20)
            ax.set_xlabel('Sample', fontsize = 25,labelpad=20)
            ax.legend(title="MTCs", fontsize = 15, title_fontsize = 18)
            plt.text(xval, data.min(), f'Min: {data.min():.3f}\nMax: {data.max():.3f}\nMean: {data.mean():.3f}\nMedian: {data.median():.3f}', fontsize = 20, bbox = dict(facecolor = 'green', alpha = 0.2, ec = 'black'))
            plt.scatter(data.index,data)
            sns.set(style='darkgrid')
            path = f'../03_results/out_ch_data_analysis/{col[:-4]}_{depth}'
            plt.savefig(path)

In [None]:
# Metadata quantile-based binning
n_bins = [3,4,5,6,7,8] 
feats = ['Temperature [ºC]','Oxygen [ml/l]','nitrates [uM]']
layers = ['all','SRF','EPI','MES']
for n in n_bins:
    q = 1/n
    for layer in layers:
        if layer == 'all':
            for feat in feats:
                clean_feat = feat.split(" ", 1)[0]
                binning = f"{clean_feat}_{n}_{layer}"
                data = md_df[feat] 
                ratios = [k*q for k in range(1,n)]
                k_list = list(range(len(ratios)))
                k_list.reverse()
                quantiles = data.quantile(ratios).to_list()
                quantiles.reverse()
                quantiles = zip(k_list, quantiles)
                md_df[binning] = len(k_list)
                for k, quant in quantiles:
                    for ind in data.index:
                        val = data[ind]
                        if val<= quant:
                            md_df.at[ind,binning] = int(k)
                print(f"{binning}: {md_df[binning].value_counts()}")
        else:
            for feat in feats:
                clean_feat = feat.split(" ", 1)[0]
                binning = f"{clean_feat}_{n}_{layer}"
                data = md_df[md_df['Depth level'] == layer][feat]
                ratios = [k*q for k in range(1,n)]
                k_list = list(range(len(ratios)))
                k_list.reverse()
                quantiles = data.quantile(ratios).to_list()
                quantiles.reverse()
                quantiles = zip(k_list, quantiles)
                md_df.loc[md_df['Depth level'] == layer,binning] = len(k_list)
                for k, quant in quantiles:
                    for ind in data.index:
                        val = data[ind]
                        if val<= quant:
                            md_df.at[ind,binning] = int(k)
                print(f"{binning}: {md_df[binning].value_counts()}")

In [None]:
md_df

In [None]:
md_df
for layer in layers:
    if layer == 'all':
        data = md_df[[col for col in md_df.columns if layer in col]]
    else:
        data = md_df[md_df['Depth level'] == layer][[col for col in md_df.columns if layer in col]]
    data.to_csv(path_or_buf= f'../03_results/metadata_based_clusters/metadata_clusters_{layer}.tsv', sep= '\t')

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [None]:
predictors = pd.read_csv('../01_data/02_satellite_data_processed/matrix_tara_chile_adj_grids_25_all.tsv',sep = '\t').set_index('Samples')

cluster_dir = '../03_results/metadata_based_clusters'
desired_clusters = {'5', '6', '7', '8'}

feats = ['Temperature [ºC]','Oxygen [ml/l]','nitrates [uM]']
layers = ['all'
#          ,'SRF','EPI','MES'
          ]
columns_to_use = []
for feat in feats:
    clean_feat = feat.split(" ", 1)[0]
    for n in desired_clusters:
        columns_to_use.append(clean_feat+'_'+n)

results_df = pd.DataFrame(index=layers, columns=columns_to_use)

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    #recall = recall_score(y_true, y_pred, average='macro')
    #precision = precision_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    #roc_auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    return (accuracy, f1)

n_splits = 8
n_repeats = 9

rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

le = LabelEncoder()

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_macro': make_scorer(f1_score, average='macro')
}

for target_vars_filename in [f for f in os.listdir(cluster_dir) if not f.split('_')[-1] == 'metrics.tsv']:
    target_vars_path = os.path.join(cluster_dir, target_vars_filename)
    target_vars = pd.read_csv(target_vars_path, sep='\t', index_col=0)
    aligned_predictor = predictors.loc[predictors.index.intersection(target_vars.index)]
    layer = target_vars_filename[-7:-4]
    for col in columns_to_use:
        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1_macro': make_scorer(f1_score, average='macro')
        }

        n_clusters = int(col.split('_')[-1])
        feat = col.split('_')[0]
        target_column = f"{feat}_{n_clusters}_{layer}"
        X = aligned_predictor
        y = target_vars.loc[aligned_predictor.index, target_column]
        non_nan_indices = y.dropna().index
        X = X.loc[non_nan_indices]
        y = y.loc[non_nan_indices]
        
        y_encoded = le.fit_transform(y)
        unique, counts = np.unique(y_encoded, return_counts=True)
        min_samples = n_splits

        X_resampled = X.copy()
        y_resampled = y_encoded.copy()

        for cls, count in zip(unique, counts):
            if count < min_samples:
                diff = min_samples - count
                cls_indices = np.where(y_encoded == cls)[0]
                indices_to_duplicate = np.random.choice(cls_indices, diff, replace=True)
                X_resampled = np.concatenate([X_resampled, X.iloc[indices_to_duplicate]], axis=0)
                y_resampled = np.concatenate([y_resampled, y_encoded[indices_to_duplicate]], axis=0)

        model = xgb.XGBClassifier(eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = n_clusters,
                                    learning_rate =0.2,
                                    n_estimators=10,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    )

        #cv_results = cross_validate(model, X, y_encoded, cv=rskf, scoring=scoring, return_train_score=False)
        cv_results = cross_validate(model, X_resampled, y_resampled, cv=rskf, scoring=scoring, return_train_score=False)

        avg_accuracy = np.mean(cv_results['test_accuracy'])
        avg_f1_macro = np.mean(cv_results['test_f1_macro'])

        results_df.at[layer, col] = f"({avg_accuracy}, {avg_f1_macro})"

In [None]:
results_df[[col for col in results_df.columns if '8' in col]]

In [None]:
#results_df.to_csv(path_or_buf='../03_results/metadata_based_clusters/metadata_cluster_metrics_splitted.tsv', sep = '\t')

In [None]:
md_df

# Mixup: Metadata +  Biodata

In this section we will try another approach to the NASA-based bio-prediction concept. Now we will mix the metadata with the biodata, and based on that we will cluster with k-means (in contrast to the only bio-based k-means done previously), and then we will try to predict those clusters using the NASA data.



In [1]:
import pandas as pd
import os
from tqdm import tqdm

In [2]:
# Read metadata and clusters 
md_path = '../01_data/01_biological_data/metadata_chile.tsv'
md_df = pd.read_csv(md_path, sep = "\t")
cl_path = '../03_results/out_genomic_clusters/kmeans_results_ch.tsv'
cl_df = pd.read_csv(cl_path, sep = "\t")

#Export to get datased to plot in 3D
cols_to_get = cl_df.columns.to_list() + ['lat_cast','lon_cast', 'Depth [m]']
file = pd.merge(md_df, cl_df, on='Samples')[cols_to_get]
file.to_csv(path_or_buf='../03_results/clusters_with_coords.tsv', sep= '\t')
# Prepare df for the study
md_df.set_index('Samples', inplace=True)
cl_df.set_index('Samples', inplace=True)
s1 = md_df['Nitrate [uM]']
s2 = md_df['Nitrates [uM]']
nitrates = 0.5*(s1+s2)

md_df['nitrates [uM]'] = nitrates 

md_df.drop(columns=['Nitrate [uM]','Nitrates [uM]'], inplace=True)


In [3]:
md_df.columns

Index(['SAMEA ID', 'Leg', 'Station', 'Station ID', 'Depth ID', 'lat_cast',
       'lon_cast', 'datetime', 'Depth [m]', 'Temperature [ºC]',
       'Salinity [PSU]', 'Density [kg/m3]', 'Oxygen [ml/l]', 'Oxygen [%]',
       'Fluorescence [mg/m3]', 'Orthophosphate [uM]', 'Silicic-acid [uM]',
       'Nitrite [uM]', 'NP ratio', 'year', 'month', 'day', 'hour', 'minute',
       'second', 'instrument', 'original file', 'Depth level', 'Oxygen level',
       'Biogeographical units', 'Freshwater inputs', 'Oxy_depth',
       'Distance from coast (km)', 'Latitude Bin', 'nitrates [uM]'],
      dtype='object')

Now we create the dataframes based on which the clusters will be made,taking care in eliminating unnecessary columns. For that, we firstly drop all the non-important technical data.

In [55]:
bio_path = '../01_data/01_biological_data'
path_list = [path for path in os.listdir(bio_path) if 'Matrix_chile' in path and '_all.tsv' in path]
df_list = []
for path in path_list:
    full_path = f"{bio_path}/{path}"
    bio_df = pd.read_csv(full_path, sep = '\t').set_index('Samples')
    full_df = md_df.join(bio_df)
    final_df = full_df.drop(columns=['SAMEA ID','Leg', 'Station', 'Station ID', 'Depth ID', 'lat_cast',
       'lon_cast', 'datetime', 'Depth [m]', 'instrument','original file', 'year', 'month', 'day', 'hour', 'minute',
       'second'])
    df_list.append(final_df)
    print(final_df.shape)
    

(159, 874)
(159, 37862)
(159, 10253)
(159, 75)
(159, 81)


Then, we parse the dataframes looking for the columns with no variability, and drop those.

In [53]:
trivial_keys = {}
for k in range(len(df_list)):
    df = df_list[k]
    ordered = df.nunique().sort_values().copy(deep = True)
    for key in tqdm(ordered.index):
        if ordered[key] > 1:
            break
    first_non_triv_key = key
    first_non_triv_ind = ordered.index.get_loc(first_non_triv_key)
    triv_keys = ordered.index[:first_non_triv_ind]
    trivial_keys[path_list[k]] = triv_keys
    print(f"Number of columns dropped from {path_list[k]}: {len(trivial_keys[path_list[k]])}")

  5%|▌         | 48/874 [00:00<00:00, 43129.09it/s]


Number of columns dropped from Matrix_chile_GEN_guidi_all.tsv: 48


 10%|▉         | 3680/37862 [00:00<00:00, 111802.74it/s]


Number of columns dropped from Matrix_chile_GEN_M0_all.tsv: 3680


  5%|▍         | 487/10253 [00:00<00:00, 174927.30it/s]


Number of columns dropped from Matrix_chile_GEN_M1_all.tsv: 487


  7%|▋         | 5/75 [00:00<00:00, 1572.08it/s]


Number of columns dropped from Matrix_chile_GEN_salazar_all.tsv: 5


  1%|          | 1/81 [00:00<00:00, 2896.62it/s]

Number of columns dropped from Matrix_chile_GEN_stress_all.tsv: 1





In [54]:
for k in range(len(df_list)):
    matrix_path = path_list[k]
    df = df_list[k]
    print(f"Old {matrix_path} shape: {df.shape}")
    df_list[k].drop(columns = trivial_keys[path_list[k]], inplace = True)
    print(f"New {matrix_path} shape: {df.shape}")

Old Matrix_chile_GEN_guidi_all.tsv shape: (159, 874)
New Matrix_chile_GEN_guidi_all.tsv shape: (159, 826)
Old Matrix_chile_GEN_M0_all.tsv shape: (159, 37862)
New Matrix_chile_GEN_M0_all.tsv shape: (159, 34182)
Old Matrix_chile_GEN_M1_all.tsv shape: (159, 10253)
New Matrix_chile_GEN_M1_all.tsv shape: (159, 9766)
Old Matrix_chile_GEN_salazar_all.tsv shape: (159, 75)
New Matrix_chile_GEN_salazar_all.tsv shape: (159, 70)
Old Matrix_chile_GEN_stress_all.tsv shape: (159, 81)
New Matrix_chile_GEN_stress_all.tsv shape: (159, 80)


In [None]:
k = 4
df = df_list[k]
print(f"File: {path_list[k]}")
trivial_keys[path_list[k]] = []
for key in tqdm(df.nunique().index):
    if df.nunique()[key] == 1:
        trivial_keys[path_list[k]].append(key) 
print(f"Number of columns dropped from {path_list[k]}: {len(trivial_keys[path_list[k]])}")

In [56]:
k = 3
df = df_list[k]
ordered = df.nunique().sort_values().copy(deep = True)