In [None]:
import xarray as xr
import pandas as pd
import geopandas as gpd
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import numpy as np
import os
import re

## Comparativa de técnicas

This notebook aims to compare two clustering techniques applied to `SRF`-level samples:
* Take the existing clusterings, keep only the `SRF` samples, and map them.
* Filter the `SRF` samples first, cluster them, and then map them.

The column and file names indicate the technique using the following nomenclature:
* `_cf`: clustered filtered.
* `_fc`: filtered clustered.

For this, we will use the `plot_clusters_on_map` function from the `clustering_projection_map` module; a version of it is defined in the cell below.

In [None]:
# Folder that receives the PDF maps; plot_clusters_on_map reads this notebook-level variable when it saves a figure.
output_dir = '../03_results/out_genomic_clusters/map_projections_ch'

In [None]:

def plot_clusters_on_map(merged_data, cluster_column, out_dir=None):
    """Draw a 2x5 grid of maps for one clustering and save it as a PDF.

    The first panel shows the samples of each cluster with its own marker
    shape. Each of the other nine panels shows one environmental variable as
    a colour scale, keeping the per-cluster marker shape. The figure is saved
    as ``clusters_<cluster_column>.pdf``.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Must contain ``cluster_column``, ``lon_cast``, ``lat_cast`` and the
        nine environmental columns listed in ``env_vars`` below.
    cluster_column : str
        Name of the column holding the cluster label of each sample.
    out_dir : str, optional
        Folder the PDF is written to. When omitted, the notebook-level
        ``output_dir`` variable is used. The folder is created if missing.

    Raises
    ------
    ValueError
        If no row has a label in ``cluster_column``.
    """
    # Samples without a label for this clustering cannot be drawn.
    filtered_data = merged_data[merged_data[cluster_column].notna()]
    if filtered_data.empty:
        raise ValueError(f"No labelled samples found in column '{cluster_column}'.")

    # Single source of truth for the environmental panels (titles == column names).
    env_vars = [
        'Temperature [ºC]', 'Salinity [PSU]', 'Oxygen [%]',
        'Fluorescence [mg/m3]', 'Orthophosphate [uM]', 'Silicic-acid [uM]',
        'Nitrite [uM]', 'Nitrates [uM]', 'NP ratio',
    ]
    plot_titles = [f'Clusters Projection: {cluster_column}'] + env_vars
    data_columns = [cluster_column] + env_vars

    # Sorted so the marker given to a cluster does not depend on row order.
    unique_clusters = sorted(filtered_data[cluster_column].unique())
    marker_styles = ['o', 's', '^', 'v', '<', '>', 'd', 'p', 'h', 'H', '*', 'x', '+', 'D']
    # Cycle through the shapes when there are more clusters than markers.
    cluster_marker_map = {
        cluster_id: marker_styles[position % len(marker_styles)]
        for position, cluster_id in enumerate(unique_clusters)
    }

    # One colour normalisation per variable, shared by every cluster in that panel.
    norms = {
        var: Normalize(vmin=filtered_data[var].min(), vmax=filtered_data[var].max())
        for var in env_vars
    }

    fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(20, 18), subplot_kw={'projection': ccrs.PlateCarree()})
    try:
        for idx, (ax, title, data_column) in enumerate(zip(axes.flatten(), plot_titles, data_columns)):
            # Map window passed to cartopy as [x0, x1, y0, y1].
            ax.set_extent([-80, -67, -55, -17])

            ax.add_feature(cfeature.LAND)
            ax.add_feature(cfeature.OCEAN)
            ax.add_feature(cfeature.BORDERS)

            ax.set_title(title)

            # Drop samples lacking a value for the variable shown in this panel.
            plot_data = filtered_data[filtered_data[data_column].notna()]

            if idx == 0:
                # Cluster panel: default colour cycle, one scatter per cluster.
                for cluster_id in unique_clusters:
                    cluster_points = plot_data[plot_data[cluster_column] == cluster_id]
                    ax.scatter(
                        cluster_points['lon_cast'],
                        cluster_points['lat_cast'],
                        label=f'Cluster {cluster_id}',
                        s=35,
                        marker=cluster_marker_map[cluster_id],
                        transform=ccrs.PlateCarree()
                    )
                ax.legend(loc='upper left')
            else:
                norm = norms[data_column]
                for cluster_id in unique_clusters:
                    cluster_points = plot_data[plot_data[cluster_column] == cluster_id]
                    sc = ax.scatter(
                        cluster_points['lon_cast'],
                        cluster_points['lat_cast'],
                        c=cluster_points[data_column],
                        s=35,
                        cmap='viridis',
                        marker=cluster_marker_map[cluster_id],
                        edgecolors='black',
                        norm=norm,
                        transform=ccrs.PlateCarree()
                    )
                # Every scatter shares `norm`, so the last one represents the whole panel.
                cbar = fig.colorbar(sc, ax=ax, orientation='vertical', shrink=0.5)
                cbar.set_label(data_column)
                # Proxy handles: the legend explains marker shapes, not colours.
                handles = [
                    plt.Line2D([], [], color='black', marker=cluster_marker_map[cluster_id],
                               linestyle='', markersize=8, label=f'Cluster {cluster_id}')
                    for cluster_id in unique_clusters
                ]
                ax.legend(handles=handles, loc='upper left')

        fig.tight_layout()

        target_dir = output_dir if out_dir is None else out_dir
        os.makedirs(target_dir, exist_ok=True)
        output_path = os.path.join(target_dir, f'clusters_{cluster_column}.pdf')
        fig.savefig(output_path, format='pdf', bbox_inches='tight')
    finally:
        # Always release the figure, even if a panel failed to draw.
        plt.close(fig)

### 1.- Cluster -> Filter

#### Data generation.
The code below takes the existing clustering results and saves a file containing only the samples labelled 'SRF'. If a file named `kmeans_results_ch_srf_clustered_filtered.tsv` already exists in the results folder, there is no need to run this code.

In [None]:
# k-means labels computed on the full (unfiltered) sample set.
input_dir = '../03_results/out_genomic_clusters'
filename = 'kmeans_results_ch.tsv'
file_path = os.path.join(input_dir, filename)

# Make sure the folder for the map projections exists.
output_dir = '../03_results/out_genomic_clusters/map_projections_ch'
os.makedirs(output_dir, exist_ok=True)


In [None]:
# From here on, results go to the clustering results folder.
output_dir = '../03_results/out_genomic_clusters'
os.makedirs(output_dir, exist_ok=True)

# Alphabetically sorted list of the abundance matrices in the data folder.
input_dir = '../01_data/01_biological_data'
files = os.listdir(input_dir)
matrix_files = sorted(
    f for f in files
    if f.startswith('Matrix_chile_GEN_') and f.endswith('.tsv')
)

In [None]:
# Restrict the existing clustering (read from `file_path`) to the surface ('SRF')
# samples and tag every clustering column with the '_cf' (clustered -> filtered)
# suffix. Nothing here depends on the individual matrix files, so it runs once
# instead of once per entry of `matrix_files`.
clstr_mtrx = pd.read_csv(file_path, sep='\t', index_col=0)
meta_mtrx = pd.read_csv('../01_data/01_biological_data/metadata_chile.tsv', sep='\t', index_col=0)
meta_mtrx = meta_mtrx[meta_mtrx['Depth level'] == 'SRF']

# Left join on the sample index keeps only SRF samples; the metadata columns
# served as the filter and are dropped again afterwards.
dirty_df = meta_mtrx.join(clstr_mtrx)
clean_df = dirty_df.drop(columns=meta_mtrx.columns).add_suffix('_cf')

output_filename = 'kmeans_results_ch_srf_clustered_filtered.tsv'
clean_df.to_csv(os.path.join(output_dir, output_filename), sep='\t', index=True)

#### Plotting

In [None]:
# SRF-only labels produced by the cluster -> filter step.
input_dir = '../03_results/out_genomic_clusters'
filename = 'kmeans_results_ch_srf_clustered_filtered.tsv'
file_path = os.path.join(input_dir, filename)

# Sample metadata (coordinates and environmental variables).
env_data_dir = '../01_data/01_biological_data'
env_filename = 'metadata_chile.tsv'
md_path = os.path.join(env_data_dir, env_filename)

# Destination of the map figures.
output_dir = '../03_results/out_genomic_clusters/map_projections_ch'
os.makedirs(output_dir, exist_ok=True)

# Cast every label column to pandas' nullable integer dtype.
clusters = pd.read_csv(file_path, sep='\t', index_col=0).astype('Int64')
md = pd.read_csv(md_path, sep='\t', index_col=0)

In [None]:
# Sample coordinates plus the environmental variables drawn by plot_clusters_on_map.
# NOTE(review): col_selection is not referenced by any other cell visible here — confirm
# whether it was meant to subset `md` before the join.
col_selection = ['lat_cast','lon_cast','Temperature [ºC]',
       'Salinity [PSU]', 'Oxygen [%]',
       'Fluorescence [mg/m3]', 'Orthophosphate [uM]', 'Silicic-acid [uM]',
       'Nitrite [uM]', 'Nitrates [uM]', 'NP ratio']

In [None]:
# Attach the metadata columns to each clustered sample (DataFrame.join matches on the index).
merged_data = clusters.join(md)

In [None]:
# One map figure per clustering column.
for column in clusters.columns:
    plot_clusters_on_map(merged_data, column)

### 2.- Filter -> Cluster

#### Data collection
We start by collecting the bio data, filtering only the SRF samples, and saving the resulting matrices.

In [None]:
# Load the sample metadata, indexed by its first column.
md_dir = '../01_data/01_biological_data'
md_filename = 'metadata_chile.tsv'
md_path = os.path.join(md_dir,md_filename)
md = pd.read_csv(md_path, sep='\t', index_col=0)

# The filtered matrices are written into the same folder as the metadata.
output_dir = md_dir 


In [None]:
# Metadata rows whose depth level is 'SRF'.
md_clean = md.loc[md['Depth level'] == 'SRF']

In [None]:
# List the raw matrices of interest, sorted alphabetically. Files ending in
# '_srf.tsv' are skipped: they are the outputs that the filtering loop writes
# into this same folder, so a re-run would otherwise treat them as inputs.
files = os.listdir(md_dir)
matrix_files = sorted(
    f for f in files
    if f.startswith('Matrix_chile_GEN_')
    and f.endswith('.tsv')
    and not f.endswith('_srf.tsv')
)

In [None]:
# Display the list of matrix files selected above.
matrix_files

In [None]:
# Write an SRF-only copy of every matrix.
for name in matrix_files:
    print(f"filtering {name}")
    file_path = os.path.join(md_dir, name)
    matrix = pd.read_csv(file_path, sep='\t', index_col=0)
    # Joining onto the SRF metadata keeps only SRF samples; the metadata
    # columns are removed again so just the matrix columns remain.
    clean_matrix = md_clean.join(matrix).drop(columns=md_clean.columns)
    # NOTE(review): the slice removes the last 8 characters of the name
    # (presumably a 4-character tag plus '.tsv') — confirm against the file names.
    output_filename = f"{name[:-8]}_srf.tsv"
    clean_matrix.to_csv(os.path.join(output_dir, output_filename), sep='\t', index=True)

#### Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [None]:
# CLR implementation
def clr_(data, eps=1e-6):
    """
    Perform centered log-ratio (clr) normalization on a dataset.

    Each value is log-transformed and the mean of the logs of its row (the log
    of the row's geometric mean) is subtracted, so every output row sums to
    zero.

    Parameters:
    data (pandas.DataFrame): A DataFrame with samples as rows and components as
        columns. Values must be non-negative; the input is not modified.
    eps (float): Pseudo-value that replaces exact zeros before taking logs.

    Returns:
    pandas.DataFrame: A clr-normalized DataFrame with the same index and columns.

    Raises:
    ValueError: If any value is negative.
    """
    if (data < 0).any().any():
        raise ValueError("Data must be non-negative for clr normalization.")

    # log(0) is undefined, so exact zeros are replaced by a small pseudo-value.
    if (data == 0).any().any():
        data = data.replace(0, eps)

    # Take the logs once; the mean of a row's logs is the log of its geometric
    # mean, so no exp/log round-trip is needed.
    log_data = np.log(data)
    return log_data.sub(log_data.mean(axis=1), axis=0)

# Notebook-level accumulators filled by perform_kmeans_clustering.
# One dict of evaluation metrics per (matrix, n_clusters) run:
all_metrics_results = []
# Label column name -> DataFrame holding that run's cluster labels:
clustering_results_dict = {}

def perform_kmeans_clustering(matrix, matrix_type_subsample, n_clusters_list, clr=False):
    """Run K-Means for several cluster counts and record labels and metrics.

    For each value in ``n_clusters_list`` a KMeans model (``random_state=0``,
    ``n_init=50``) is fitted on ``matrix``. Its inertia, silhouette,
    Davies-Bouldin and Calinski-Harabasz scores are appended to the
    notebook-level ``all_metrics_results`` list, and its labels are stored in
    ``clustering_results_dict`` under ``<run name>_kmeans_<n>``.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Samples as rows; its index is reused for the label table.
    matrix_type_subsample : str
        Identifier of the matrix, used to build the result names.
    n_clusters_list : iterable of int
        Cluster counts to evaluate.
    clr : bool, optional
        When True the result names are prefixed with ``clr_``.

    Returns
    -------
    None
        Results are stored in the two notebook-level accumulators.
    """
    prefix = 'clr_' if clr else ''
    run_name = f"{prefix}{matrix_type_subsample}"

    for k in n_clusters_list:
        model = KMeans(n_clusters=k, random_state=0, n_init=50)
        model.fit(matrix)
        labels = model.labels_

        # Evaluation metrics of this run.
        all_metrics_results.append({
            'matrix': run_name,
            'n_clusters': k,
            'inertia': model.inertia_,
            'silhouette_score': silhouette_score(matrix, labels),
            'davies_bouldin_score': davies_bouldin_score(matrix, labels),
            'calinski_harabasz_score': calinski_harabasz_score(matrix, labels)
        })

        # One-column table of labels, aligned with the samples of `matrix`.
        col_name = f"{run_name}_kmeans_{k}"
        labels_df = pd.DataFrame({col_name: labels}, index=matrix.index)

        previous = clustering_results_dict.get(col_name)
        if previous is None:
            clustering_results_dict[col_name] = labels_df
        else:
            # A run with this name already exists: append the new column next to it.
            clustering_results_dict[col_name] = pd.concat([previous, labels_df], axis=1)


In [None]:
input_dir = '../01_data/01_biological_data'
output_dir = '../03_results/out_genomic_clusters'
os.makedirs(output_dir, exist_ok=True)

# Read the SRF-filtered matrices of interest and sort them alphabetically
files = os.listdir(input_dir)
matrix_files = sorted([f for f in files if f.startswith('Matrix_chile_GEN_') and f.endswith('_srf.tsv')])

# Empty the accumulators in place so re-running this cell does not append
# duplicate metric rows or duplicate label columns from a previous run.
all_metrics_results.clear()
clustering_results_dict.clear()

# Perform K-Means for different n-clusters for each matrix
n_clusters_list = [3, 4, 5, 6, 7, 8]
for matrix_file in matrix_files:
    print(f"performing k-means to {matrix_file}")
    file_path = os.path.join(input_dir, matrix_file)
    matrix = pd.read_csv(file_path, sep='\t', index_col=0)
    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    # Drop the first three '_'-separated tokens of the file name (the
    # 'Matrix_chile_GEN' prefix selected above) to get the matrix identifier.
    matrix_type_subsample = "_".join(base_filename.split('_')[3:])

    # Clustering on the matrix as read from disk
    perform_kmeans_clustering(matrix, matrix_type_subsample, n_clusters_list, clr=False)
    # CLR normalized matrix clustering
    clr_matrix = clr_(matrix)
    perform_kmeans_clustering(clr_matrix, matrix_type_subsample, n_clusters_list, clr=True)



# Put the label columns of every run side by side. The column order is the
# insertion order of the dictionary; no sort_index(axis=1) is applied.
label_tables = list(clustering_results_dict.values())
combined_clustering_results = pd.concat(label_tables, axis=1)

# Cluster labels of every run, keeping the sample index.
output_filename = 'kmeans_results_ch_fc.tsv'
combined_clustering_results.to_csv(
    os.path.join(output_dir, output_filename),
    sep='\t',
    index=True,
)

# Evaluation metrics of every run, one row per (matrix, n_clusters).
metrics_output_filename = 'kmeans_metrics_ch_fc.tsv'
metrics_df = pd.DataFrame(all_metrics_results)
metrics_df.to_csv(
    os.path.join(output_dir, metrics_output_filename),
    sep='\t',
    index=False,
)

# Plot metrics: one PDF per matrix with the four metrics against the number of
# clusters, each metric on its own y-axis.
unique_matrices = metrics_df['matrix'].unique()
for matrix_type_subsample in unique_matrices:
    # Metric rows of the current matrix only.
    matrix_metrics_df = metrics_df[metrics_df['matrix'] == matrix_type_subsample]
    
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Left axis: inertia.
    ax1.set_xlabel('Number of Clusters')
    ax1.set_ylabel('Inertia', color='tab:blue')
    ax1.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['inertia'], color='tab:blue', label='Inertia')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    # First right axis: silhouette score, with a dashed reference line at 0.25.
    ax2 = ax1.twinx()
    ax2.set_ylabel('Silhouette Score', color='tab:orange')
    ax2.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['silhouette_score'], color='tab:orange', label='Silhouette Score')
    ax2.tick_params(axis='y', labelcolor='tab:orange')
    ax2.axhline(y=0.25, color='tab:orange', linestyle='--', linewidth=1, label='Silhouette Score Threshold (0.25)')

    # Second right axis, moved 60 points outward so it does not overlap the first:
    # Davies-Bouldin score, with a dashed reference line at 1.50.
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.set_ylabel('Davies-Bouldin Score', color='tab:green')
    ax3.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['davies_bouldin_score'], color='tab:green', label='Davies-Bouldin Score')
    ax3.tick_params(axis='y', labelcolor='tab:green')
    ax3.axhline(y=1.50, color='tab:green', linestyle='--', linewidth=1, label='Davies-Bouldin Score Threshold (1.50)')

    # Third right axis, moved 120 points outward: Calinski-Harabasz score.
    ax4 = ax1.twinx()
    ax4.spines['right'].set_position(('outward', 120))
    ax4.set_ylabel('Calinski-Harabasz Score', color='tab:red')
    ax4.plot(matrix_metrics_df['n_clusters'], matrix_metrics_df['calinski_harabasz_score'], color='tab:red', label='Calinski-Harabasz Score')
    ax4.tick_params(axis='y', labelcolor='tab:red')

    # Cluster counts are integers, so only integer ticks are allowed on the x-axis.
    ax1.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
    
    fig.tight_layout()
    # NOTE(review): no legend is drawn, so the `label=` values above are not shown — confirm whether one is wanted.
    plt.title(f'Evaluation Metrics for {matrix_type_subsample}')

    # Save the plot
    plot_filename = f'kmeans_metrics_{matrix_type_subsample}_ch_fc.pdf'
    plt.savefig(os.path.join(output_dir, plot_filename), bbox_inches='tight')
    # Close the figure before the next iteration creates a new one.
    plt.close()




#### Plotting

In [None]:
# Labels produced by the filter -> cluster step.
input_dir = '../03_results/out_genomic_clusters'
filename = 'kmeans_results_ch_fc.tsv'
file_path = os.path.join(input_dir, filename)

# Sample metadata (coordinates and environmental variables).
env_data_dir = '../01_data/01_biological_data'
env_filename = 'metadata_chile.tsv'
md_path = os.path.join(env_data_dir, env_filename)

# Destination of the map figures.
output_dir = '../03_results/out_genomic_clusters/map_projections_ch'
os.makedirs(output_dir, exist_ok=True)

# Tag every clustering column with '_fc' (filtered -> clustered) and cast the
# labels to pandas' nullable integer dtype.
clusters = (
    pd.read_csv(file_path, sep='\t', index_col=0)
    .add_suffix('_fc')
    .astype('Int64')
)
md = pd.read_csv(md_path, sep='\t', index_col=0)


In [None]:
# Sample coordinates plus the environmental variables drawn by plot_clusters_on_map.
# NOTE(review): col_selection is not referenced by any other cell visible here — confirm
# whether it was meant to subset `md` before the join.
col_selection = ['lat_cast','lon_cast','Temperature [ºC]',
       'Salinity [PSU]', 'Oxygen [%]',
       'Fluorescence [mg/m3]', 'Orthophosphate [uM]', 'Silicic-acid [uM]',
       'Nitrite [uM]', 'Nitrates [uM]', 'NP ratio']

In [None]:
# Attach the metadata columns to each clustered sample (DataFrame.join matches on the index).
merged_data = clusters.join(md)

In [None]:
# One map figure per clustering column.
for column in clusters.columns:
    plot_clusters_on_map(merged_data, column)