
The idea of this notebook is to get a uniformized and sef-contained pipeline in which we will get the best (most predictable) clustering for the data.

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import xgboost as xgb

from glob import glob
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [3]:
# CLR implementation
def clr_(data, eps=1e-6):
    """
    Perform centered log-ratio (clr) normalization on a dataset.

    Parameters:
    data (pandas.DataFrame): A DataFrame with samples as rows and components as columns.

    Returns:
    pandas.DataFrame: A clr-normalized DataFrame.
    """
    if (data < 0).any().any():
        raise ValueError("Data should be strictly positive for clr normalization.")

    # Add small amount to cells with a value of 0
    if (data <= 0).any().any():
        data = data.replace(0, eps)

    # Calculate the geometric mean of each row
    gm = np.exp(data.apply(np.log).mean(axis=1))

    # Perform clr transformation
    clr_data = data.apply(np.log).subtract(np.log(gm), axis=0)

    return clr_data

def perform_kmeans_clustering(matrix, matrix_type_subsample, n_clusters_list, clr=False):
    suffix = 'clr_' if clr else ''
    # Perform K-Means for different 'n'
    for n_clusters in n_clusters_list:
        kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=50)
        kmeans.fit(matrix)
        
        cluster_labels = kmeans.labels_
        
        # Calculate evaluation metrics
        inertia = kmeans.inertia_
        silhouette_avg = silhouette_score(matrix, cluster_labels)
        davies_bouldin = davies_bouldin_score(matrix, cluster_labels)
        calinski_harabasz = calinski_harabasz_score(matrix, cluster_labels)
        
        all_metrics_results.append({
            'matrix': f"{suffix}{matrix_type_subsample}",
            'n_clusters': n_clusters,
            'inertia': inertia,
            'silhouette_score': silhouette_avg,
            'davies_bouldin_score': davies_bouldin,
            'calinski_harabasz_score': calinski_harabasz
        })
        
        col_name = f"{suffix}{matrix_type_subsample}_kmeans_{n_clusters}" # Create a DataFrame for the cluster labels with appropriate column names
        results = pd.DataFrame({col_name: cluster_labels}, index=matrix.index)
        
        if col_name not in clustering_results_dict:
            clustering_results_dict[col_name] = results
        else:
            clustering_results_dict[col_name] = pd.concat([clustering_results_dict[col_name], results], axis=1)


# Bio data k-means
## Data generation and saving

In [4]:
md_dir = '../01_data/01_biological_data/tara_ch'
md_filename = 'metadata_chile.tsv'
md_path = os.path.join(md_dir,md_filename)
md = pd.read_csv(md_path, sep='\t', index_col=0)
output_dir = md_dir 

In [5]:
depth_list = ['SRF','EPI','MES']
for depth in depth_list:
    md_clean = md[md['Depth level'] == depth]
    # Read matrices of interest and sort them alphabetically
    files = os.listdir(md_dir)
    matrix_files = sorted([f for f in files if f.startswith('Matrix_chile_GEN_') and f.endswith('_all.tsv')])
    for name in matrix_files:
        print(f"filtering {name}")
        file_path = os.path.join(md_dir, name)
        matrix = pd.read_csv(file_path, sep='\t', index_col=0)
        clean_matrix = md_clean.join(matrix).drop(md_clean.columns,axis = 1)
        output_filename = '_'.join(name.split('_')[:-1]) + f'_{depth.lower()}.tsv'
        clean_matrix.to_csv(os.path.join(output_dir, output_filename), sep='\t', index=True)

filtering Matrix_chile_GEN_M0_all.tsv
filtering Matrix_chile_GEN_M1_all.tsv
filtering Matrix_chile_GEN_guidi_all.tsv
filtering Matrix_chile_GEN_salazar_all.tsv
filtering Matrix_chile_GEN_stress_all.tsv
filtering Matrix_chile_GEN_M0_all.tsv
filtering Matrix_chile_GEN_M1_all.tsv
filtering Matrix_chile_GEN_guidi_all.tsv
filtering Matrix_chile_GEN_salazar_all.tsv
filtering Matrix_chile_GEN_stress_all.tsv
filtering Matrix_chile_GEN_M0_all.tsv
filtering Matrix_chile_GEN_M1_all.tsv
filtering Matrix_chile_GEN_guidi_all.tsv
filtering Matrix_chile_GEN_salazar_all.tsv
filtering Matrix_chile_GEN_stress_all.tsv


## Clustering

In [6]:
input_dir = '../01_data/01_biological_data/tara_ch'
os.makedirs(output_dir, exist_ok=True)

# Read matrices of interest and sort them alphabetically
files = os.listdir(input_dir)


# Perform K-Means for different n-clusters for each matrix
for layer in depth_list:
    all_metrics_results = []
    clustering_results_dict = {}
    output_dir = f'../03_results/out_genomic_clusters/clusters_ch/bio_clusters/bio_clusters_{layer.lower()}'
    matrix_files = sorted([f for f in files if f.startswith('Matrix_chile_GEN_') and f.endswith(f'_{layer.lower()}.tsv')])
    n_clusters_list = [3, 4, 5, 6, 7, 8]
    for matrix_file in matrix_files:
        print(f"performing k-means to {matrix_file}")
        file_path = os.path.join(input_dir, matrix_file)
        matrix = pd.read_csv(file_path, sep='\t', index_col=0)
        base_filename = os.path.splitext(os.path.basename(file_path))[0]
        matrix_type_subsample = "_".join(base_filename.split('_')[3:])
        
        perform_kmeans_clustering(matrix, matrix_type_subsample, n_clusters_list, clr=False)
        # CLR normalized matrix clustering
        clr_matrix = clr_(matrix)
        perform_kmeans_clustering(clr_matrix, matrix_type_subsample, n_clusters_list, clr=True)



    combined_clustering_results = pd.concat(clustering_results_dict.values(), axis=1)
    #combined_clustering_results = combined_clustering_results.sort_index(axis=1)

    # Results of the kmeans
    output_filename = f'kmeans_results_ch_{layer}.tsv'
    combined_clustering_results.to_csv(os.path.join(output_dir, output_filename), sep='\t', index=True)

    # Results of the metrics of the kmeans clustering
    metrics_df = pd.DataFrame(all_metrics_results)
    metrics_output_filename = f'kmeans_metrics_ch_{layer}.tsv'
    metrics_df.to_csv(os.path.join(output_dir, metrics_output_filename), sep='\t', index=False)

    # Plot metrics
    unique_matrices = metrics_df['matrix'].unique()
    for matrix_type_subsample in unique_matrices:
        matrix_metrics_df = metrics_df[metrics_df['matrix'] == matrix_type_subsample]
        
        fig, ax1 = plt.subplots(figsize=(10, 6))

        ax1.set_xlabel('Number of Clusters')
        ax1.set_ylabel('Inertia', color='tab:blue')
        ax1.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['inertia'], color='tab:blue', label='Inertia')
        ax1.tick_params(axis='y', labelcolor='tab:blue')

        ax2 = ax1.twinx()
        ax2.set_ylabel('Silhouette Score', color='tab:orange')
        ax2.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['silhouette_score'], color='tab:orange', label='Silhouette Score')
        ax2.tick_params(axis='y', labelcolor='tab:orange')
        ax2.axhline(y=0.25, color='tab:orange', linestyle='--', linewidth=1, label='Silhouette Score Threshold (0.25)')

        ax3 = ax1.twinx()
        ax3.spines['right'].set_position(('outward', 60))
        ax3.set_ylabel('Davies-Bouldin Score', color='tab:green')
        ax3.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['davies_bouldin_score'], color='tab:green', label='Davies-Bouldin Score')
        ax3.tick_params(axis='y', labelcolor='tab:green')
        ax3.axhline(y=1.50, color='tab:green', linestyle='--', linewidth=1, label='Davies-Bouldin Score Threshold (1.50)')

        ax4 = ax1.twinx()
        ax4.spines['right'].set_position(('outward', 120))
        ax4.set_ylabel('Calinski-Harabasz Score', color='tab:red')
        ax4.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['calinski_harabasz_score'], color='tab:red', label='Calinski-Harabasz Score')
        ax4.tick_params(axis='y', labelcolor='tab:red')

        ax1.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
        
        fig.tight_layout()
        plt.title(f'Evaluation Metrics for {matrix_type_subsample}')

        # Save the plot
        plot_filename = f'kmeans_metrics_ch_{matrix_type_subsample}.pdf'
        plt.savefig(os.path.join(output_dir, plot_filename), bbox_inches='tight')
        plt.close()




performing k-means to Matrix_chile_GEN_M0_srf.tsv
performing k-means to Matrix_chile_GEN_M1_srf.tsv
performing k-means to Matrix_chile_GEN_guidi_srf.tsv
performing k-means to Matrix_chile_GEN_salazar_srf.tsv
performing k-means to Matrix_chile_GEN_stress_srf.tsv
performing k-means to Matrix_chile_GEN_M0_epi.tsv
performing k-means to Matrix_chile_GEN_M1_epi.tsv
performing k-means to Matrix_chile_GEN_guidi_epi.tsv
performing k-means to Matrix_chile_GEN_salazar_epi.tsv
performing k-means to Matrix_chile_GEN_stress_epi.tsv
performing k-means to Matrix_chile_GEN_M0_mes.tsv
performing k-means to Matrix_chile_GEN_M1_mes.tsv
performing k-means to Matrix_chile_GEN_guidi_mes.tsv
performing k-means to Matrix_chile_GEN_salazar_mes.tsv
performing k-means to Matrix_chile_GEN_stress_mes.tsv


## XGB

In [7]:
input_kmeans_dir = '..\\03_results\\out_genomic_clusters\\clusters_ch\\bio_clusters'
df_list = []
for fold in os.listdir(input_kmeans_dir):
    folder = f"{input_kmeans_dir}/{fold}"
    layer = folder.split('_')[-1]
    if layer == 'all':
        layer = ''
    else:
        layer = '_'+layer
    path = f"{folder}/kmeans_results_ch{layer.upper()}.tsv"
    df = pd.read_csv(path, sep = '\t')
    
    df_list.append(df.set_index('Samples'))

all_clusters = pd.concat(df_list, axis=1)

In [8]:
input_sat_dir = '../01_data/02_satellite_data_processed'

desired_files = [
'matrix_tara_chile_adj_grids_25_all.tsv'
]

predictor_files = sorted([f for f in glob(os.path.join(input_sat_dir, 'matrix_tara_chile_adj_grids_*.tsv')) 
                          if os.path.basename(f) in desired_files])

target_vars = all_clusters
target_vars = target_vars.map(lambda x: f"C{x}")

desired_clusters = {'5', '6', '7', '8'} # only consider this number of clusters # only consider clr-abundance clusters

df_cols = []
for col in df_list[0].columns:
    bits = col.split('_')
    if bits[0]=='clr' and bits[-1] in desired_clusters:
        new_col = '_'.join([b for b in bits if b not in ['all','srf','mes','epi']])
        df_cols.append(new_col)

results_df_bd = pd.DataFrame(columns = df_cols)

In [11]:
predictors = pd.read_csv('../01_data/02_satellite_data_processed/matrix_tara_chile_adj_grids_25_all.tsv',sep = '\t').set_index('Samples')

cluster_dir = input_kmeans_dir

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    #recall = recall_score(y_true, y_pred, average='macro')
    #precision = precision_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    #roc_auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    return (accuracy, f1)

n_splits = 8
n_repeats = 9

rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

le = LabelEncoder()

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_macro': make_scorer(f1_score, average='macro')
}

for fold in os.listdir(input_kmeans_dir):
    folder = f"{input_kmeans_dir}/{fold}"
    layerr = folder.split('_')[-1]
    if layerr == 'all':
        layer = ''
    else:
        layer = '_'+layerr
    
    path = f"{folder}/kmeans_results_ch{layer.upper()}.tsv"
    target_vars = pd.read_csv(path, sep = '\t', index_col= 0)
    columns_to_use = [col for col in target_vars.columns if col.startswith('clr_') and col.split('_')[-1] in desired_clusters]
    aligned_predictor = predictors.loc[predictors.index.intersection(target_vars.index)]
    for col in columns_to_use:
        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1_macro': make_scorer(f1_score, average='macro')
        }
        bits = col.split('_')
        score_col = '_'.join([b for b in bits if b not in ['all','srf','mes','epi']])
        n_clusters = int(col.split('_')[-1])
        X = aligned_predictor
        y = target_vars.loc[aligned_predictor.index, col]
        non_nan_indices = y.dropna().index
        X = X.loc[non_nan_indices]
        y = y.loc[non_nan_indices]
        
        y_encoded = le.fit_transform(y)
        unique, counts = np.unique(y_encoded, return_counts=True)
        min_samples = n_splits

        X_resampled = X.copy()
        y_resampled = y_encoded.copy()

        for cls, count in zip(unique, counts):
            if count < min_samples:
                diff = min_samples - count
                cls_indices = np.where(y_encoded == cls)[0]
                indices_to_duplicate = np.random.choice(cls_indices, diff, replace=True)
                X_resampled = np.concatenate([X_resampled, X.iloc[indices_to_duplicate]], axis=0)
                y_resampled = np.concatenate([y_resampled, y_encoded[indices_to_duplicate]], axis=0)

        model = xgb.XGBClassifier(eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = n_clusters,
                                    learning_rate =0.2,
                                    n_estimators=10,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    )

        #cv_results = cross_validate(model, X, y_encoded, cv=rskf, scoring=scoring, return_train_score=False)
        cv_results = cross_validate(model, X_resampled, y_resampled, cv=rskf, scoring=scoring, return_train_score=False)

        avg_accuracy = np.mean(cv_results['test_accuracy'])
        avg_f1_macro = np.mean(cv_results['test_f1_macro'])

        results_df_bd.at[layer, score_col] = avg_f1_macro

In [12]:
i,j = np.where(results_df_bd == results_df_bd.max().max())

In [13]:
results_df_bd.to_csv(path_or_buf= '../03_results/out_predictions/predictions_kmeans_ch_bio.tsv',sep = '\t')

In [14]:
results_df_bd.iloc[i,j]

Unnamed: 0,clr_M1_kmeans_5
_srf,0.941944


# Metadata-based bins

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

## Data preparation and Clustering

In [None]:
# Read metadata and clusters 
md_path = '../01_data/01_biological_data/tara_ch/metadata_chile.tsv'
md_df = pd.read_csv(md_path, sep = "\t")

# Prepare df for the study
md_df.set_index('Samples', inplace=True)
s1 = md_df['Nitrate [uM]']
s2 = md_df['Nitrates [uM]']
nitrates = 0.5*(s1+s2)

md_df['nitrates [uM]'] = nitrates 

md_df = md_df[['Temperature [ºC]','Oxygen [ml/l]','nitrates [uM]', 'Depth level']]


In [None]:
# Metadata quantile-based binning
n_bins = [3,4,5,6,7,8] 
feats = ['Temperature [ºC]','Oxygen [ml/l]','nitrates [uM]']
layers = ['all','SRF','EPI','MES']
for n in n_bins:
    q = 1/n
    for layer in layers:
        if layer == 'all':
            for feat in feats:
                clean_feat = feat.split(" ", 1)[0]
                binning = f"{clean_feat}_{n}_{layer}"
                data = md_df[feat] 
                ratios = [k*q for k in range(1,n)]
                k_list = list(range(len(ratios)))
                k_list.reverse()
                quantiles = data.quantile(ratios).to_list()
                quantiles.reverse()
                quantiles = zip(k_list, quantiles)
                md_df[binning] = len(k_list)
                for k, quant in quantiles:
                    for ind in data.index:
                        val = data[ind]
                        if val<= quant:
                            md_df.at[ind,binning] = int(k)
        else:
            for feat in feats:
                clean_feat = feat.split(" ", 1)[0]
                binning = f"{clean_feat}_{n}_{layer}"
                data = md_df[md_df['Depth level'] == layer][feat]
                ratios = [k*q for k in range(1,n)]
                k_list = list(range(len(ratios)))
                k_list.reverse()
                quantiles = data.quantile(ratios).to_list()
                quantiles.reverse()
                quantiles = zip(k_list, quantiles)
                md_df.loc[md_df['Depth level'] == layer,binning] = len(k_list)
                for k, quant in quantiles:
                    for ind in data.index:
                        val = data[ind]
                        if val<= quant:
                            md_df.at[ind,binning] = int(k)

In [None]:
for layer in layers:
    if layer == 'all':
        data = md_df[[col for col in md_df.columns if layer in col]]
    else:
        data = md_df[md_df['Depth level'] == layer][[col for col in md_df.columns if layer in col]]
    data.to_csv(path_or_buf= f'../03_results/clusters_ch/metadata_based_clusters/metadata_clusters_{layer}.tsv', sep= '\t')

## XGB

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [15]:
predictors = pd.read_csv('../01_data/02_satellite_data_processed/matrix_tara_chile_adj_grids_25_all.tsv',sep = '\t').set_index('Samples')

cluster_dir = '../03_results/out_genomic_clusters/clusters_ch/metadata_based_clusters'
desired_clusters = {'5', '6', '7', '8'}

feats = ['Temperature [ºC]','Oxygen [ml/l]','nitrates [uM]']
layers = ['all''SRF','EPI','MES']
columns_to_use = []
for feat in feats:
    clean_feat = feat.split(" ", 1)[0]
    for n in desired_clusters:
        columns_to_use.append(clean_feat+'_'+n)

results_df_md = pd.DataFrame(index=layers, columns=columns_to_use)

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    #roc_auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    return (accuracy, f1)

n_splits = 8
n_repeats = 9

rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

le = LabelEncoder()

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_macro': make_scorer(f1_score, average='macro')
}

for target_vars_filename in [f for f in os.listdir(cluster_dir) if not f.split('_')[-1] == 'metrics.tsv']:
    target_vars_path = os.path.join(cluster_dir, target_vars_filename)
    target_vars = pd.read_csv(target_vars_path, sep='\t', index_col=0)
    aligned_predictor = predictors.loc[predictors.index.intersection(target_vars.index)]
    layer = target_vars_filename[-7:-4]
    for col in columns_to_use:
        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1_macro': make_scorer(f1_score, average='macro')
        }

        n_clusters = int(col.split('_')[-1])
        feat = col.split('_')[0]
        target_column = f"{feat}_{n_clusters}_{layer}"
        X = aligned_predictor
        y = target_vars.loc[aligned_predictor.index, target_column]
        non_nan_indices = y.dropna().index
        X = X.loc[non_nan_indices]
        y = y.loc[non_nan_indices]
        
        y_encoded = le.fit_transform(y)
        unique, counts = np.unique(y_encoded, return_counts=True)
        min_samples = n_splits

        X_resampled = X.copy()
        y_resampled = y_encoded.copy()

        for cls, count in zip(unique, counts):
            if count < min_samples:
                diff = min_samples - count
                cls_indices = np.where(y_encoded == cls)[0]
                indices_to_duplicate = np.random.choice(cls_indices, diff, replace=True)
                X_resampled = np.concatenate([X_resampled, X.iloc[indices_to_duplicate]], axis=0)
                y_resampled = np.concatenate([y_resampled, y_encoded[indices_to_duplicate]], axis=0)

        model = xgb.XGBClassifier(eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = n_clusters,
                                    learning_rate =0.2,
                                    n_estimators=10,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    )

        #cv_results = cross_validate(model, X, y_encoded, cv=rskf, scoring=scoring, return_train_score=False)
        cv_results = cross_validate(model, X_resampled, y_resampled, cv=rskf, scoring=scoring, return_train_score=False)

        avg_accuracy = np.mean(cv_results['test_accuracy'])
        avg_f1_macro = np.mean(cv_results['test_f1_macro'])

        results_df_md.at[layer, col] = avg_f1_macro

In [16]:
i,j = np.where(results_df_md == results_df_md.max().max())

In [17]:
results_df_md.iloc[i,j]

Unnamed: 0,Temperature_7
SRF,0.901124


In [18]:
results_df_md.to_csv(path_or_buf='../03_results/out_predictions/predictions_mdbins.tsv',sep = '\t')

# Meta-bio 

In [None]:
import pandas as pd
import os
from tqdm import tqdm

In [None]:
# Read metadata and clusters 
md_path = '../01_data/01_biological_data/tara_ch/metadata_chile.tsv'
md_df = pd.read_csv(md_path, sep = "\t")

# Prepare df for the study
md_df.set_index('Samples', inplace=True)
usable_cols = [i for i in md_df.columns if md_df.dtypes.loc[i] in [float,int]]
cols = [c for c in usable_cols if md_df.min()[c]>0]
md_df = md_df[cols]
s1 = md_df['Nitrate [uM]']
s2 = md_df['Nitrates [uM]']
nitrates = 0.5*(s1+s2)

md_df['nitrates [uM]'] = nitrates 

md_df.drop(columns=['Nitrate [uM]','Nitrates [uM]'], inplace=True)

In [None]:
md_df.columns

Now we create the dataframes based on which the clusters will be made, taking care in eliminating unnecessary columns. For that, we firstly drop all the non-important technical data.

In [None]:
bio_path = '../01_data/01_biological_data/tara_ch'
path_list = [path for path in os.listdir(bio_path) if 'Matrix_chile' in path and 'srf.tsv' in path and path != 'metadata_clile.tsv']
df_list = []
for path in path_list:
    full_path = f"{bio_path}/{path}"
    bio_df = pd.read_csv(full_path, sep = '\t').set_index('Samples')
    full_df = pd.concat([md_df.loc[md_df.index.intersection(bio_df.index)], bio_df], axis =1)
    df = full_df.drop(columns=[c for c in md_df.columns if c in ['Leg', 'Station', 'Station ID', 'Depth ID', 'lat_cast',
       'lon_cast', 'datetime', 'Depth [m]', 'instrument','original file', 'year', 'month', 'day', 'hour', 'minute',
       'second']])
    ordered = df.nunique().sort_values().copy(deep = True)
    for key in tqdm(ordered.index):
        if ordered[key] > 1:
            break
    first_non_triv_key = key
    first_non_triv_ind = ordered.index.get_loc(first_non_triv_key)
    triv_keys = ordered.index[:first_non_triv_ind]
    df.drop(columns = triv_keys, inplace = True)
    break

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# CLR implementation
def clr_(data, eps=1e-6):
    """
    Perform centered log-ratio (clr) normalization on a dataset.

    Parameters:
    data (pandas.DataFrame): A DataFrame with samples as rows and components as columns.

    Returns:
    pandas.DataFrame: A clr-normalized DataFrame.
    """
    if (data < 0).any().any():
        raise ValueError("Data should be strictly positive for clr normalization.")

    # Add small amount to cells with a value of 0
    if (data <= 0).any().any():
        data = data.replace(0, eps)

    # Calculate the geometric mean of each row
    gm = np.exp(data.apply(np.log).mean(axis=1))

    # Perform clr transformation
    clr_data = data.apply(np.log).subtract(np.log(gm), axis=0)

    return clr_data


all_metrics_results = []
clustering_results_dict = {}

def perform_kmeans_clustering(matrix, matrix_type_subsample, n_clusters_list, clr=False):
    suffix = 'clr_' if clr else ''
    # Perform K-Means for different 'n'
    for n_clusters in n_clusters_list:
        kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=50)
        kmeans.fit(matrix)
        
        cluster_labels = kmeans.labels_
        
        # Calculate evaluation metrics
        inertia = kmeans.inertia_
        silhouette_avg = silhouette_score(matrix, cluster_labels)
        davies_bouldin = davies_bouldin_score(matrix, cluster_labels)
        calinski_harabasz = calinski_harabasz_score(matrix, cluster_labels)
        
        all_metrics_results.append({
            'matrix': f"{suffix}{matrix_type_subsample}",
            'n_clusters': n_clusters,
            'inertia': inertia,
            'silhouette_score': silhouette_avg,
            'davies_bouldin_score': davies_bouldin,
            'calinski_harabasz_score': calinski_harabasz
        })
        
        col_name = f"{suffix}{matrix_type_subsample}_kmeans_{n_clusters}" # Create a DataFrame for the cluster labels with appropriate column names
        results = pd.DataFrame({col_name: cluster_labels}, index=matrix.index)
        
        if col_name not in clustering_results_dict:
            clustering_results_dict[col_name] = results
        else:
            clustering_results_dict[col_name] = pd.concat([clustering_results_dict[col_name], results], axis=1)



In [None]:
path_list

In [None]:
# Perform K-Means for different n-clusters for each matrix

layers = ['all','srf','mes','epi']
n_clusters_list = [3, 4, 5, 6, 7, 8]
bio_path = '../01_data/01_biological_data/tara_ch'
for layer in layers:
    output_dir = f'../03_results/out_genomic_clusters/clusters_ch/metabio_clusters/{layer}'
    all_metrics_results = []
    clustering_results_dict = {}
    path_list = [path for path in os.listdir(bio_path) if 'Matrix_chile' in path and f'{layer}.tsv' in path and path != 'metadata_clile.tsv']
    for name in path_list:
        full_path = f"{bio_path}/{name}"
        bio_df = pd.read_csv(full_path, sep = '\t').set_index('Samples')
        full_df = pd.concat([md_df.loc[md_df.index.intersection(bio_df.index)], bio_df], axis =1)
        df = full_df.drop(columns=[c for c in md_df.columns if c in ['Leg', 'Station', 'Station ID', 'Depth ID', 'lat_cast',
        'lon_cast', 'datetime', 'Depth [m]', 'instrument','original file', 'year', 'month', 'day', 'hour', 'minute',
        'second']])
        ordered = df.nunique().sort_values().copy(deep = True)
        for key in tqdm(ordered.index):
            if ordered[key] > 1:
                break
        first_non_triv_key = key
        first_non_triv_ind = ordered.index.get_loc(first_non_triv_key)
        triv_keys = ordered.index[:first_non_triv_ind]
        df.drop(columns = triv_keys, inplace = True)
        matrix = df.copy(deep = True).dropna(axis = 0) 
        matrix_type_subsample = name[17:-4]
        perform_kmeans_clustering(matrix, matrix_type_subsample, n_clusters_list, clr=False)
        # CLR normalized matrix clustering
        clr_matrix = clr_(matrix)
        perform_kmeans_clustering(clr_matrix, matrix_type_subsample, n_clusters_list, clr=True)


    combined_clustering_results = pd.concat(clustering_results_dict.values(), axis=1)

    # Results of the kmeans
    output_filename = f'kmeans_results_metabio_ch_{layer}.tsv'
    combined_clustering_results.to_csv(os.path.join(output_dir, output_filename), sep='\t', index=True)

    # Results of the metrics of the kmeans clustering
    metrics_df = pd.DataFrame(all_metrics_results)
    metrics_output_filename = f'kmeans_metrics_metabio_ch_{layer}.tsv'
    metrics_df.to_csv(os.path.join(output_dir, metrics_output_filename), sep='\t', index=False)

    # Plot metrics
    unique_matrices = metrics_df['matrix'].unique()
    print('Plotting metrics.')
    for matrix_type_subsample in tqdm(unique_matrices):
        print(matrix_type_subsample)
        matrix_metrics_df = metrics_df[metrics_df['matrix'] == matrix_type_subsample]
        
        fig, ax1 = plt.subplots(figsize=(10, 6))

        ax1.set_xlabel('Number of Clusters')
        ax1.set_ylabel('Inertia', color='tab:blue')
        ax1.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['inertia'], color='tab:blue', label='Inertia')
        ax1.tick_params(axis='y', labelcolor='tab:blue')

        ax2 = ax1.twinx()
        ax2.set_ylabel('Silhouette Score', color='tab:orange')
        ax2.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['silhouette_score'], color='tab:orange', label='Silhouette Score')
        ax2.tick_params(axis='y', labelcolor='tab:orange')
        ax2.axhline(y=0.25, color='tab:orange', linestyle='--', linewidth=1, label='Silhouette Score Threshold (0.25)')

        ax3 = ax1.twinx()
        ax3.spines['right'].set_position(('outward', 60))
        ax3.set_ylabel('Davies-Bouldin Score', color='tab:green')
        ax3.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['davies_bouldin_score'], color='tab:green', label='Davies-Bouldin Score')
        ax3.tick_params(axis='y', labelcolor='tab:green')
        ax3.axhline(y=1.50, color='tab:green', linestyle='--', linewidth=1, label='Davies-Bouldin Score Threshold (1.50)')

        ax4 = ax1.twinx()
        ax4.spines['right'].set_position(('outward', 120))
        ax4.set_ylabel('Calinski-Harabasz Score', color='tab:red')
        ax4.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['calinski_harabasz_score'], color='tab:red', label='Calinski-Harabasz Score')
        ax4.tick_params(axis='y', labelcolor='tab:red')

        ax1.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
        
        fig.tight_layout()
        plt.title(f'Evaluation Metrics for {matrix_type_subsample}')

        # Save the plot
        plot_filename = f'kmeans_metrics_{matrix_type_subsample}_metabio_ch.pdf'
        plt.savefig(os.path.join(output_dir, plot_filename), bbox_inches='tight')
        plt.close()
        

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [19]:
predictors = pd.read_csv('../01_data/02_satellite_data_processed/matrix_tara_chile_adj_grids_25_all.tsv',sep = '\t').set_index('Samples')

desired_clusters = {'5', '6', '7', '8'}

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    #recall = recall_score(y_true, y_pred, average='macro')
    #precision = precision_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    #roc_auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
    return (accuracy, f1)

n_splits = 8
n_repeats = 9

rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

le = LabelEncoder()

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_macro': make_scorer(f1_score, average='macro')
}
layers = ['all','srf','epi','mes']
matrix_types = ['M0','M1','stress','guidi','salazar']
df_columns = []
for n in desired_clusters:
    for type in matrix_types:
        df_columns.append(f"clr_{type}_{n}")
results_df_mb = pd.DataFrame(index = layers, columns = df_columns)

for layer in tqdm(layers):
    cluster_dir = f'../03_results/out_genomic_clusters/clusters_ch/metabio_clusters/{layer}'
    target_vars_filename = f'kmeans_results_metabio_ch_{layer}.tsv'
    target_vars_path = os.path.join(cluster_dir, target_vars_filename)
    print(f"Reading {target_vars_path}")
    #DataFrame with the clusters
    target_vars = pd.read_csv(target_vars_path, sep='\t', index_col=0)

    columns_to_use = [col for col in target_vars.columns if col.startswith('clr_') and col.split('_')[-1] in desired_clusters] # only consider clr-abundance clusters

    aligned_predictor = predictors.loc[predictors.index.intersection(target_vars.index)]
    for col in columns_to_use:
        bits = col.split('_')
        mat_type = bits[1]
        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1_macro': make_scorer(f1_score, average='macro')
        }

        n_clusters = int(col.split('_')[-1])
        col_tag = f"clr_{mat_type}_{n_clusters}"
        X = aligned_predictor
        y = target_vars.loc[aligned_predictor.index, col]
        non_nan_indices = y.dropna().index
        X = X.loc[non_nan_indices]
        y = y.loc[non_nan_indices]
        
        y_encoded = le.fit_transform(y)
        unique, counts = np.unique(y_encoded, return_counts=True)
        min_samples = n_splits

        X_resampled = X.copy()
        y_resampled = y_encoded.copy()

        for cls, count in zip(unique, counts):
            if count < min_samples:
                diff = min_samples - count
                cls_indices = np.where(y_encoded == cls)[0]
                indices_to_duplicate = np.random.choice(cls_indices, diff, replace=True)
                X_resampled = np.concatenate([X_resampled, X.iloc[indices_to_duplicate]], axis=0)
                y_resampled = np.concatenate([y_resampled, y_encoded[indices_to_duplicate]], axis=0)

        model = xgb.XGBClassifier(eval_metric='merror', 
                                    seed = 29,
                                    objective= 'multi: softmax',
                                    num_class = n_clusters,
                                    learning_rate =0.2,
                                    n_estimators=10,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8
                                    )

        #cv_results = cross_validate(model, X, y_encoded, cv=rskf, scoring=scoring, return_train_score=False)
        cv_results = cross_validate(model, X_resampled, y_resampled, cv=rskf, scoring=scoring, return_train_score=False)

        avg_accuracy = np.mean(cv_results['test_accuracy'])
        avg_f1_macro = np.mean(cv_results['test_f1_macro'])

        results_df_mb.at[layer, col_tag] = f"({avg_accuracy}, {avg_f1_macro})"

NameError: name 'tqdm' is not defined

In [None]:
results_df_mb

In [None]:
i,j = np.where(results_df_mb == results_df_mb.max().max())

In [None]:
results_df_mb.iloc[i,j]

In [None]:
results_df_mb.to_csv(path_or_buf = '../03_results/out_predictions/predictions_kmeans_metabio.tsv',sep = '\t')