In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import xgboost as xgb

from glob import glob
import os

from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

In [2]:
# --- Input locations --------------------------------------------------------
input_sat_dir = '../../01_data/02_satellite_data_processed'
input_kmeans_dir = '../../03_results/out_genomic_clusters'
target_vars_filename = 'kmeans_results.tsv'

# --- Predictor matrices -----------------------------------------------------
# Collect every file matching the pattern and order the paths alphabetically
# so that later loops visit them in a stable order.
predictor_pattern = os.path.join(input_sat_dir, 'matrix_tara_world_adj_grids_*.tsv')
predictor_files = glob(predictor_pattern)
predictor_files.sort()

# --- Target table -----------------------------------------------------------
# The first column of the TSV becomes the index. Every remaining value is
# turned into a string prefixed with "C" (e.g. 2 -> "C2").
target_vars_path = os.path.join(input_kmeans_dir, target_vars_filename)
target_vars = (
    pd.read_csv(target_vars_path, sep='\t', index_col=0)
    .map(lambda label: f"C{label}")
)
target_vars.head()

Unnamed: 0_level_0,M0_srf_kmeans_3,M0_srf_kmeans_4,M0_srf_kmeans_5,M0_srf_kmeans_6,M0_srf_kmeans_7,M0_srf_kmeans_8,clr_M0_srf_kmeans_3,clr_M0_srf_kmeans_4,clr_M0_srf_kmeans_5,clr_M0_srf_kmeans_6,...,stress_srf_kmeans_5,stress_srf_kmeans_6,stress_srf_kmeans_7,stress_srf_kmeans_8,clr_stress_srf_kmeans_3,clr_stress_srf_kmeans_4,clr_stress_srf_kmeans_5,clr_stress_srf_kmeans_6,clr_stress_srf_kmeans_7,clr_stress_srf_kmeans_8
Samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSC001,C2,C1,C3,C5,C3,C3,C1,C0,C0,C2,...,C2,C2,C0,C2,C0,C2,C1,C4,C5,C5
TSC003,C1,C2,C1,C0,C0,C6,C1,C3,C0,C2,...,C2,C2,C0,C1,C0,C2,C1,C4,C2,C5
TSC005,C2,C0,C3,C1,C1,C0,C1,C3,C0,C2,...,C1,C2,C0,C2,C0,C2,C1,C4,C2,C5
TSC008,C2,C0,C3,C1,C3,C3,C1,C0,C0,C2,...,C1,C2,C0,C2,C2,C2,C1,C4,C2,C3
TSC013,C1,C2,C4,C3,C4,C6,C1,C0,C0,C2,...,C0,C0,C1,C7,C2,C2,C1,C4,C2,C3


In [11]:
# Empty results table: one row per predictor file (identified by its base
# name) and one column per column of `target_vars`. Cells start out as NaN.
results_df = pd.DataFrame(
    index=list(map(os.path.basename, predictor_files)),
    columns=target_vars.columns,
)

def calculate_metrics(y_true, y_pred):
    """Score a set of predictions with accuracy and macro-averaged F1.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to the scikit-learn metrics.
    y_pred
        Predicted labels, forwarded unchanged to the scikit-learn metrics.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` where ``accuracy`` is ``accuracy_score(y_true, y_pred)``
        and ``f1`` is ``f1_score(y_true, y_pred, average='macro')``.
    """
    # Only accuracy and macro-F1 are reported; recall, precision and ROC-AUC
    # are not computed by this helper.
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    )

# ---------------------------------------------------------------------------
# Repeated stratified cross-validation of an XGBoost classifier for every
# (predictor file, target column) pair. The mean accuracy and mean macro-F1
# over all folds are stored as a tuple in `results_df`, which is then printed
# and written to disk.
# ---------------------------------------------------------------------------

# Cross-validation layout: n_splits folds, repeated n_repeats times, with a
# fixed seed so the fold assignment is the same on every run.
n_splits = 5
n_repeats = 10
rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

# Re-fitted for every target column to turn the string labels into integers.
le = LabelEncoder()

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_macro': make_scorer(f1_score, average='macro')
}

# Every class is padded (by duplicating its own rows) until it has at least
# this many rows. Loop-invariant, so it is set once here.
min_samples = n_splits

# Dedicated seeded generator for the padding step. Without it the duplicated
# rows (and therefore the scores) would change from run to run even though
# the CV splitter itself is seeded.
oversample_rng = np.random.default_rng(0)

# Make sure the destination directory exists *before* the expensive loop, so
# a missing folder cannot make the final write fail and discard the results.
output_path = '../../03_results/out_predictions/predictions_kmeans.tsv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for file in predictor_files:
    # The file's base name is the row identifier in `results_df`.
    file_name = os.path.basename(file)

    # Load the predictor matrix (first column is the index).
    df = pd.read_csv(file, sep='\t', index_col=0)

    # Keep only the rows whose index label also appears in `target_vars`.
    aligned_predictor = df.loc[df.index.intersection(target_vars.index)]

    for target_column in target_vars.columns:
        X = aligned_predictor
        y = target_vars.loc[aligned_predictor.index, target_column]

        y_encoded = le.fit_transform(y)

        unique, counts = np.unique(y_encoded, return_counts=True)

        X_resampled = X.copy()
        y_resampled = y_encoded.copy()

        # Pad classes with fewer than `min_samples` rows by re-drawing rows of
        # that same class (with replacement) and appending them.
        # NOTE(review): the duplicates are added before the CV split, so a
        # copy of a test row can end up in the matching training fold; scores
        # for padded classes may therefore be optimistic.
        for cls, count in zip(unique, counts):
            if count < min_samples:
                diff = min_samples - count
                cls_indices = np.where(y_encoded == cls)[0]
                indices_to_duplicate = oversample_rng.choice(cls_indices, diff, replace=True)
                # np.concatenate turns the feature table into a plain ndarray.
                X_resampled = np.concatenate([X_resampled, X.iloc[indices_to_duplicate]], axis=0)
                y_resampled = np.concatenate([y_resampled, y_encoded[indices_to_duplicate]], axis=0)

        # NOTE(review): `use_label_encoder` is reportedly deprecated in recent
        # XGBoost releases -- confirm against the installed version.
        model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

        cv_results = cross_validate(model, X_resampled, y_resampled, cv=rskf, scoring=scoring, return_train_score=False)

        # Average each metric over all n_splits * n_repeats test folds.
        avg_accuracy = np.mean(cv_results['test_accuracy'])
        avg_f1_macro = np.mean(cv_results['test_f1_macro'])

        # Each cell holds the pair (mean accuracy, mean macro-F1).
        results_df.at[file_name, target_column] = (avg_accuracy, avg_f1_macro)

print(results_df)

results_df.to_csv(output_path, sep='\t')

                                                             M0_srf_kmeans_3  \
matrix_tara_world_adj_grids_01.tsv   (0.6987499999999999, 0.561773058029351)   
matrix_tara_world_adj_grids_09.tsv  (0.7402500000000001, 0.6101268460878099)   
matrix_tara_world_adj_grids_25.tsv  (0.7284999999999998, 0.6005016888432557)   
matrix_tara_world_adj_grids_49.tsv  (0.7309166666666665, 0.6111580455310432)   

                                                             M0_srf_kmeans_4  \
matrix_tara_world_adj_grids_01.tsv  (0.5227499999999999, 0.4735826673326673)   
matrix_tara_world_adj_grids_09.tsv  (0.5075833333333333, 0.4579822816072816)   
matrix_tara_world_adj_grids_25.tsv   (0.5113333333333334, 0.464628031282443)   
matrix_tara_world_adj_grids_49.tsv             (0.52225, 0.4751258186258187)   

                                                               M0_srf_kmeans_5  \
matrix_tara_world_adj_grids_01.tsv   (0.43674999999999997, 0.3614751744987039)   
matrix_tara_world_adj_grids_09.tsv

# Debug