# Imports

In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import pandas as pd
import numpy as np
import dask.dataframe as dd
from sklearn.decomposition import TruncatedSVD

from src.data_preprocessing import DataPreprocessing
from src.SML_models import SML
from src.PCI_Genomics_modeling import ModelTrainer

from src.SVDHandler import SVDHandler
from src.PCOAHandler import PCoAHandler
from sklearn.linear_model import LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


# Import data

In [2]:
# Preprocessing helper; it is also used to read the tab-delimited input tables
preproc = DataPreprocessing()

# OTU tables that are small enough to be read directly, one per marker
otu_v1v2, otu_v3v4, otu_v4, otu_37F = map(
    preproc.import_data,
    (
        "../Data/OTUtable_euksV1V2.txt",
        "../Data/OTUtable_bactV3V4_filtered.txt",
        "../Data/OTUtable_euksV4.txt",
        "../Data/OTUtable_foram37F.txt",
    ),
)

# The V9 OTU file is very large: read it lazily with dask and keep only the rows
# whose total-abundance column exceeds 1000 before it is materialised later on.
v9_data_file = dd.read_csv("../Data/OTUtable_euksV9.txt", delimiter='\t', decimal=',')
# Columns whose name contains "Total_"; the first match is used as the abundance column
total_col_v9 = [col for col in v9_data_file if "Total_" in col]
abundant_rows = v9_data_file[total_col_v9[0]] > 1000
v9_data_file = v9_data_file[abundant_rows]

# Sample metadata shared by all markers
metadata = preproc.import_data("../Data/metadata.txt")



# Data preprocessing

In [3]:
# Filtering thresholds passed to the preprocessing pipeline
filter_R = 10000
filter_C = 100

# Column layout of the matched tables as it is sliced below: the trailing 7 columns
# are metadata, and the first 3 of those 7 are the environmental variables.
N_META = 7
OTU_COLS = slice(None, -N_META)
META_COLS = slice(-N_META, None)
ENV_COLS = slice(-N_META, -4)

# Full preprocessing pipeline (filtering, TMM normalization, matching) per marker.
# V9 is materialised from dask here and is given 1000 in place of filter_C.
v1v2_df_filtered, v1v2_df_processed = preproc.preprocess_data(otu_v1v2, metadata, filter_R, filter_C, "TMM")
v3v4_df_filtered, v3v4_df_processed = preproc.preprocess_data(otu_v3v4, metadata, filter_R, filter_C, "TMM")
v4_df_filtered, v4_df_processed = preproc.preprocess_data(otu_v4, metadata, filter_R, filter_C, "TMM")
f_df_filtered, f_df_processed = preproc.preprocess_data(otu_37F, metadata, filter_R, filter_C, "TMM")
v9_df_filtered, v9_df_processed = preproc.preprocess_data(v9_data_file.compute(), metadata, filter_R, 1000, "TMM")

# After matching the OTU data with the metadata, split each table into:
#   *_otu_data_norm  - TMM-normalised OTU columns
#   *_otu_data_fil   - filtered (not normalised) OTU columns
#   *_metadata_proc  - metadata columns
#   *_env_var        - environmental columns only

# V1V2
v1v2_otu_data_norm = v1v2_df_processed.iloc[:, OTU_COLS]
v1v2_otu_data_fil = v1v2_df_filtered.iloc[:, OTU_COLS]
v1v2_metadata_proc = v1v2_df_processed.iloc[:, META_COLS]
v1v2_env_var = v1v2_df_processed.iloc[:, ENV_COLS]

# V3V4
v3v4_otu_data_norm = v3v4_df_processed.iloc[:, OTU_COLS]
v3v4_otu_data_fil = v3v4_df_filtered.iloc[:, OTU_COLS]
v3v4_metadata_proc = v3v4_df_processed.iloc[:, META_COLS]
v3v4_env_var = v3v4_df_processed.iloc[:, ENV_COLS]

# V4
v4_otu_data_norm = v4_df_processed.iloc[:, OTU_COLS]
v4_otu_data_fil = v4_df_filtered.iloc[:, OTU_COLS]
v4_metadata_proc = v4_df_processed.iloc[:, META_COLS]
v4_env_var = v4_df_processed.iloc[:, ENV_COLS]

# 37F
f_otu_data_norm = f_df_processed.iloc[:, OTU_COLS]
f_otu_data_fil = f_df_filtered.iloc[:, OTU_COLS]
f_metadata_proc = f_df_processed.iloc[:, META_COLS]
f_env_var = f_df_processed.iloc[:, ENV_COLS]

# V9
v9_otu_data_norm = v9_df_processed.iloc[:, OTU_COLS]
v9_otu_data_fil = v9_df_filtered.iloc[:, OTU_COLS]
v9_metadata_proc = v9_df_processed.iloc[:, META_COLS]
v9_env_var = v9_df_processed.iloc[:, ENV_COLS]

Exception ignored from cffi callback <function _consolewrite_ex at 0x000002770B4253A0>:
Traceback (most recent call last):
  File "c:\Users\houria\AppData\Local\Programs\Python\Python313\Lib\site-packages\rpy2\rinterface_lib\callbacks.py", line 132, in _consolewrite_ex
    s = conversion._cchar_to_str_with_maxlen(buf, n, _CCHAR_ENCODING)
  File "c:\Users\houria\AppData\Local\Programs\Python\Python313\Lib\site-packages\rpy2\rinterface_lib\conversion.py", line 133, in _cchar_to_str_with_maxlen
    s = ffi.string(c, maxlen).decode(encoding)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 17: invalid continuation byte


# Random forest

In [4]:
# Candidate numbers of components handed to the SVD / PCoA handlers:
# every value from 3 to 10, then 20, 30, 40 and 50.
# The coarse range starts at 20 because range(3, 11) already ends at 10;
# starting it at 10 listed that candidate twice.
N_C = list(range(3, 11)) + list(range(20, 51, 10))
# Random-forest wrapper used by the cells that follow (200 trees, 3 CV splits)
rf_settings = {"n_estimators": 200, "cv_splits": 3}
sml = SML(**rf_settings)

In [37]:
# V1V2 marker, original AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(v1v2_otu_data_norm, v1v2_metadata_proc["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(v1v2_otu_data_norm, v1v2_metadata_proc["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(v1v2_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, v1v2_metadata_proc["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(v1v2_otu_data_norm, v1v2_metadata_proc["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
# .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, v1v2_metadata_proc["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 0.931, 'Kappa Score': np.float64(0.925), 'Recall Score': [1.0, 0.0, 1.0, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(
  X_pcoa = pcoa_best[1].iloc[:, :int(pcoa_best[0]["Best NC"])]


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,9.0,0.983,0.947,0.963,0.95,0.926,"[1.0, 0.5, 1.0, 0.9]"
PCoA,3.0,0.735,0.951,0.932,0.973,0.963,"[1.0, 0.5, 1.0, 1.0]"


In [38]:
# V3V4 marker, original AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(v3v4_otu_data_norm, v3v4_metadata_proc["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(v3v4_otu_data_norm, v3v4_metadata_proc["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(v3v4_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, v3v4_metadata_proc["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(v3v4_otu_data_norm, v3v4_metadata_proc["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, v3v4_metadata_proc["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 0.893, 'Kappa Score': np.float64(0.863), 'Recall Score': [1.0, 0.0, 0.75, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,7.0,0.991,0.873,0.975,0.893,0.863,"[1.0, 0.0, 0.75, 1.0]"
PCoA,3.0,0.739,0.951,0.977,0.893,0.863,"[1.0, 0.0, 0.75, 1.0]"


In [39]:
# V4 marker, original AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(v4_otu_data_norm, v4_metadata_proc["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(v4_otu_data_norm, v4_metadata_proc["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(v4_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, v4_metadata_proc["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(v4_otu_data_norm, v4_metadata_proc["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, v4_metadata_proc["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 0.936, 'Kappa Score': np.float64(0.928), 'Recall Score': [1.0, 0.0, 1.0, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,6.0,0.981,0.921,0.977,0.952,0.928,"[1.0, 0.5, 0.8888888888888888, 1.0]"
PCoA,6.0,0.756,0.96,0.992,0.936,0.928,"[1.0, 0.0, 1.0, 1.0]"


In [40]:
# 37F marker, original AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(f_otu_data_norm, f_metadata_proc["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(f_otu_data_norm, f_metadata_proc["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(f_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, f_metadata_proc["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(f_otu_data_norm, f_metadata_proc["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, f_metadata_proc["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 0.869, 'Kappa Score': np.float64(0.81), 'Recall Score': [1.0, 0.0, 0.7142857142857143, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,8.0,0.962,0.904,0.981,0.944,0.908,"[1.0, 0.5, 0.8571428571428571, 1.0]"
PCoA,10.0,0.769,0.962,0.993,0.975,0.954,"[1.0, 1.0, 0.8571428571428571, 1.0]"


In [41]:
# V9 marker, original AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(v9_otu_data_norm, v9_metadata_proc["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(v9_otu_data_norm, v9_metadata_proc["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(v9_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, v9_metadata_proc["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(v9_otu_data_norm, v9_metadata_proc["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, v9_metadata_proc["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 0.938, 'Kappa Score': np.float64(0.932), 'Recall Score': [1.0, 0.0, 1.0, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,8.0,0.959,0.932,0.988,0.919,0.9,"[0.9583333333333334, 0.0, 1.0, 1.0]"
PCoA,5.0,0.831,0.953,0.99,0.882,0.835,"[0.9166666666666666, 0.0, 1.0, 0.9090909090909..."


# 2 classes

In [4]:
# Collapse the AMBI labels into two groups: 1 and 2 become 2; 3, 4 and 5 become 5.
mapping_AMBI = {1: 2, 2: 2, 3: 5, 4: 5, 5: 5}

# Relabelled copy of each marker's metadata; the original tables are not modified.
# NOTE: the "_3classes" suffix notwithstanding, the mapping yields two distinct labels (2 and 5).
v1v2_metadata_proc_3classes = v1v2_metadata_proc.assign(
    AMBI=v1v2_metadata_proc["AMBI"].map(mapping_AMBI)
)

v3v4_metadata_proc_3classes = v3v4_metadata_proc.assign(
    AMBI=v3v4_metadata_proc["AMBI"].map(mapping_AMBI)
)

v4_metadata_proc_3classes = v4_metadata_proc.assign(
    AMBI=v4_metadata_proc["AMBI"].map(mapping_AMBI)
)

f_metadata_proc_3classes = f_metadata_proc.assign(
    AMBI=f_metadata_proc["AMBI"].map(mapping_AMBI)
)

v9_metadata_proc_3classes = v9_metadata_proc.assign(
    AMBI=v9_metadata_proc["AMBI"].map(mapping_AMBI)
)

In [5]:
# Candidate numbers of components handed to the SVD / PCoA handlers:
# every value from 3 to 10, then 20, 30, 40 and 50.
# The coarse range starts at 20 because range(3, 11) already ends at 10;
# starting it at 10 listed that candidate twice.
N_C = list(range(3, 11)) + list(range(20, 51, 10))
# Random-forest wrapper used by the cells that follow (200 trees, 3 CV splits)
rf_settings = {"n_estimators": 200, "cv_splits": 3}
sml = SML(**rf_settings)

In [6]:
# V1V2 marker, two-group AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(v1v2_otu_data_norm, v1v2_metadata_proc_3classes["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(v1v2_otu_data_norm, v1v2_metadata_proc_3classes["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(v1v2_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, v1v2_metadata_proc_3classes["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(v1v2_otu_data_norm, v1v2_metadata_proc_3classes["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
# .iloc[0] extracts the scalar explicitly: int() on a one-element Series is deprecated.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, v1v2_metadata_proc_3classes["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 1.0, 'Kappa Score': np.float64(1.0), 'Recall Score': [1.0, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(
  X_pcoa = pcoa_best[1].iloc[:, :int(pcoa_best[0]["Best NC"])]


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,7.0,0.981,0.995,1.0,1.0,1.0,"[1.0, 1.0]"
PCoA,3.0,0.735,1.0,1.0,1.0,1.0,"[1.0, 1.0]"


In [7]:
# LASSO for feature selection on V1V2

### LASSO + RF
# L1-regularised logistic regression (LASSO-style) used as a feature selector.
# random_state pins the stochastic 'saga' solver so the selected OTU list does not
# change from one run to the next.
# NOTE(review): the selector is fitted on every sample; if ModelTrainer holds out a
# test set internally, the selection has already seen those labels - confirm.
model = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, random_state=0)
model.fit(v1v2_otu_data_norm, v1v2_metadata_proc_3classes["AMBI"])

# Keep the OTUs whose coefficient (first row of coef_) is not numerically zero
coeffs = model.coef_[0]
v1v2_otus_utiles = v1v2_otu_data_norm.columns[np.abs(coeffs) > 1e-5].tolist()
print("Selected OTUs :", v1v2_otus_utiles)
print("len : ", len(v1v2_otus_utiles))

# Train the random forest on the selected OTUs only
model_trainer1 = ModelTrainer(v1v2_otu_data_norm[v1v2_otus_utiles], v1v2_metadata_proc_3classes["AMBI"], "V1V2_")
_, v1v2_ext_performance = model_trainer1.train_random_forest()

print("LASSO + RF : ")
print(v1v2_ext_performance)

Selected OTUs : ['OTU3', 'OTU4', 'OTU6', 'OTU9', 'OTU14', 'OTU16', 'OTU17', 'OTU18', 'OTU19', 'OTU25', 'OTU28', 'OTU31', 'OTU32', 'OTU33', 'OTU39', 'OTU44', 'OTU46', 'OTU50', 'OTU54', 'OTU57', 'OTU58', 'OTU64', 'OTU66', 'OTU68', 'OTU81', 'OTU86', 'OTU94', 'OTU111', 'OTU112', 'OTU114', 'OTU116', 'OTU118', 'OTU124', 'OTU140', 'OTU149', 'OTU151', 'OTU152', 'OTU160', 'OTU162', 'OTU166', 'OTU170', 'OTU181', 'OTU190', 'OTU194', 'OTU198', 'OTU210', 'OTU211', 'OTU213', 'OTU214', 'OTU219', 'OTU222', 'OTU225', 'OTU232', 'OTU237', 'OTU238', 'OTU244', 'OTU254', 'OTU267', 'OTU268', 'OTU273', 'OTU275', 'OTU284', 'OTU294', 'OTU309', 'OTU320', 'OTU337', 'OTU366', 'OTU377', 'OTU378', 'OTU388', 'OTU395', 'OTU402', 'OTU410', 'OTU424', 'OTU425', 'OTU430', 'OTU433', 'OTU440', 'OTU494', 'OTU523', 'OTU543', 'OTU554', 'OTU577', 'OTU604', 'OTU620', 'OTU631', 'OTU642', 'OTU669', 'OTU686', 'OTU699', 'OTU755', 'OTU762', 'OTU782', 'OTU900', 'OTU934', 'OTU1256']
len :  96
LASSO + RF : 
{'F1 Score': 1.0, 'Kappa Scor

In [8]:
# V3V4 marker, two-group AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(v3v4_otu_data_norm, v3v4_metadata_proc_3classes["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(v3v4_otu_data_norm, v3v4_metadata_proc_3classes["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(v3v4_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, v3v4_metadata_proc_3classes["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(v3v4_otu_data_norm, v3v4_metadata_proc_3classes["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, v3v4_metadata_proc_3classes["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 1.0, 'Kappa Score': np.float64(1.0), 'Recall Score': [1.0, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,4.0,0.988,0.977,0.999,1.0,1.0,"[1.0, 1.0]"
PCoA,4.0,0.79,1.0,1.0,1.0,1.0,"[1.0, 1.0]"


In [9]:
# LASSO for feature selection on V3V4

### LASSO + RF
# L1-regularised logistic regression (LASSO-style) used as a feature selector.
# random_state pins the stochastic 'saga' solver so the selected OTU list does not
# change from one run to the next.
# NOTE(review): the selector is fitted on every sample; if ModelTrainer holds out a
# test set internally, the selection has already seen those labels - confirm.
model = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, random_state=0)
model.fit(v3v4_otu_data_norm, v3v4_metadata_proc_3classes["AMBI"])

# Keep the OTUs whose coefficient (first row of coef_) is not numerically zero
coeffs = model.coef_[0]
v3v4_otus_utiles = v3v4_otu_data_norm.columns[np.abs(coeffs) > 1e-5].tolist()
print("Selected OTUs :", v3v4_otus_utiles)
print("len : ", len(v3v4_otus_utiles))

# Train the random forest on the selected OTUs only
model_trainer1 = ModelTrainer(v3v4_otu_data_norm[v3v4_otus_utiles], v3v4_metadata_proc_3classes["AMBI"], "V3V4_")
_, v3v4_ext_performance = model_trainer1.train_random_forest()

print("LASSO + RF : ")
print(v3v4_ext_performance)

Selected OTUs : ['OTU2', 'OTU6', 'OTU14', 'OTU17', 'OTU19', 'OTU20', 'OTU30', 'OTU32', 'OTU35', 'OTU36', 'OTU41', 'OTU42', 'OTU43', 'OTU46', 'OTU57', 'OTU63', 'OTU65', 'OTU79', 'OTU90', 'OTU91', 'OTU92', 'OTU99', 'OTU105', 'OTU108', 'OTU122', 'OTU132', 'OTU135', 'OTU138', 'OTU139', 'OTU140', 'OTU141', 'OTU142', 'OTU148', 'OTU150', 'OTU162', 'OTU165', 'OTU173', 'OTU180', 'OTU181', 'OTU184', 'OTU190', 'OTU193', 'OTU194', 'OTU199', 'OTU201', 'OTU202', 'OTU210', 'OTU212', 'OTU215', 'OTU224', 'OTU231', 'OTU236', 'OTU239', 'OTU245', 'OTU246', 'OTU251', 'OTU253', 'OTU256', 'OTU267', 'OTU268', 'OTU272', 'OTU274', 'OTU287', 'OTU291', 'OTU292', 'OTU294', 'OTU302', 'OTU311', 'OTU324', 'OTU329', 'OTU333', 'OTU337', 'OTU339', 'OTU340', 'OTU344', 'OTU349', 'OTU357', 'OTU383', 'OTU384', 'OTU387', 'OTU388', 'OTU404', 'OTU413', 'OTU428', 'OTU434', 'OTU446', 'OTU459', 'OTU463', 'OTU464', 'OTU467', 'OTU469', 'OTU472', 'OTU473', 'OTU481', 'OTU482', 'OTU495', 'OTU507', 'OTU518', 'OTU520', 'OTU523', 'OTU534

In [10]:
# V4 marker, two-group AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(v4_otu_data_norm, v4_metadata_proc_3classes["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(v4_otu_data_norm, v4_metadata_proc_3classes["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(v4_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, v4_metadata_proc_3classes["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(v4_otu_data_norm, v4_metadata_proc_3classes["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, v4_metadata_proc_3classes["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 1.0, 'Kappa Score': np.float64(1.0), 'Recall Score': [1.0, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,6.0,0.981,0.984,0.998,1.0,1.0,"[1.0, 1.0]"
PCoA,7.0,0.779,0.992,1.0,1.0,1.0,"[1.0, 1.0]"


In [11]:
# LASSO for feature selection on V4

### LASSO + RF
# L1-regularised logistic regression (LASSO-style) used as a feature selector.
# random_state pins the stochastic 'saga' solver so the selected OTU list does not
# change from one run to the next.
# NOTE(review): the selector is fitted on every sample; if ModelTrainer holds out a
# test set internally, the selection has already seen those labels - confirm.
model = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, random_state=0)
model.fit(v4_otu_data_norm, v4_metadata_proc_3classes["AMBI"])

# Keep the OTUs whose coefficient (first row of coef_) is not numerically zero
coeffs = model.coef_[0]
v4_otus_utiles = v4_otu_data_norm.columns[np.abs(coeffs) > 1e-5].tolist()
print("Selected OTUs :", v4_otus_utiles)
print("len :", len(v4_otus_utiles))

# Train the random forest on the selected OTUs only
model_trainer4 = ModelTrainer(v4_otu_data_norm[v4_otus_utiles], v4_metadata_proc_3classes["AMBI"], "V4_")
_, v4_ext_performance = model_trainer4.train_random_forest()

print("LASSO + RF : ")
print(v4_ext_performance)

Selected OTUs : ['OTU0', 'OTU2', 'OTU3', 'OTU5', 'OTU6', 'OTU10', 'OTU11', 'OTU13', 'OTU15', 'OTU17', 'OTU18', 'OTU21', 'OTU22', 'OTU23', 'OTU24', 'OTU25', 'OTU29', 'OTU34', 'OTU36', 'OTU37', 'OTU38', 'OTU44', 'OTU47', 'OTU51', 'OTU57', 'OTU70', 'OTU71', 'OTU72', 'OTU73', 'OTU77', 'OTU83', 'OTU84', 'OTU87', 'OTU88', 'OTU91', 'OTU92', 'OTU93', 'OTU95', 'OTU96', 'OTU98', 'OTU99', 'OTU100', 'OTU101', 'OTU103', 'OTU109', 'OTU112', 'OTU122', 'OTU129', 'OTU131', 'OTU133', 'OTU144', 'OTU162', 'OTU168', 'OTU170', 'OTU171', 'OTU173', 'OTU178', 'OTU185', 'OTU195', 'OTU206', 'OTU212', 'OTU221', 'OTU225', 'OTU226', 'OTU234', 'OTU246', 'OTU247', 'OTU254', 'OTU264', 'OTU274', 'OTU279', 'OTU283', 'OTU287', 'OTU298', 'OTU309', 'OTU313', 'OTU324', 'OTU327', 'OTU329', 'OTU338', 'OTU349', 'OTU351', 'OTU374', 'OTU380', 'OTU382', 'OTU406', 'OTU427', 'OTU451', 'OTU452', 'OTU501', 'OTU509', 'OTU524', 'OTU525', 'OTU563', 'OTU593', 'OTU636', 'OTU654', 'OTU768', 'OTU882', 'OTU892', 'OTU1161', 'OTU1365', 'OTU153

In [12]:
# 37F marker, two-group AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(f_otu_data_norm, f_metadata_proc_3classes["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(f_otu_data_norm, f_metadata_proc_3classes["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(f_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, f_metadata_proc_3classes["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(f_otu_data_norm, f_metadata_proc_3classes["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, f_metadata_proc_3classes["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 1.0, 'Kappa Score': np.float64(1.0), 'Recall Score': [1.0, 1.0]})
TMM ********* SVD
TMM ********* PCoA


  warn(


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,20.0,0.974,0.962,0.989,1.0,1.0,"[1.0, 1.0]"
PCoA,5.0,0.641,0.991,0.99,1.0,1.0,"[1.0, 1.0]"


In [13]:
# LASSO for feature selection on F

### LASSO + RF
# L1-regularised logistic regression (LASSO-style) used as a feature selector.
# random_state pins the stochastic 'saga' solver so the selected OTU list does not
# change from one run to the next.
# NOTE(review): the selector is fitted on every sample; if ModelTrainer holds out a
# test set internally, the selection has already seen those labels - confirm.
model = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, random_state=0)
model.fit(f_otu_data_norm, f_metadata_proc_3classes["AMBI"])

# Keep the OTUs whose coefficient (first row of coef_) is not numerically zero
coeffs = model.coef_[0]
f_otus_utiles = f_otu_data_norm.columns[np.abs(coeffs) > 1e-5].tolist()
print("Selected OTUs :", f_otus_utiles)
print("len :", len(f_otus_utiles))

# Train the random forest on the selected OTUs only
model_trainer_f = ModelTrainer(f_otu_data_norm[f_otus_utiles], f_metadata_proc_3classes["AMBI"], "37F_")
_, f_ext_performance = model_trainer_f.train_random_forest()

print("LASSO + RF : ")
print(f_ext_performance)

Selected OTUs : ['OTU1', 'OTU3', 'OTU4', 'OTU5', 'OTU6', 'OTU7', 'OTU11', 'OTU12', 'OTU13', 'OTU16', 'OTU18', 'OTU19', 'OTU20', 'OTU21', 'OTU24', 'OTU30', 'OTU33', 'OTU38', 'OTU41', 'OTU44', 'OTU49', 'OTU50', 'OTU51', 'OTU64', 'OTU65', 'OTU67', 'OTU71', 'OTU72', 'OTU86', 'OTU87', 'OTU90', 'OTU93', 'OTU95', 'OTU97', 'OTU104', 'OTU107', 'OTU110', 'OTU116', 'OTU117', 'OTU129', 'OTU136', 'OTU138', 'OTU144', 'OTU146', 'OTU148', 'OTU152', 'OTU154', 'OTU159', 'OTU163', 'OTU167', 'OTU168', 'OTU173', 'OTU177', 'OTU186', 'OTU187', 'OTU195', 'OTU200', 'OTU218', 'OTU228', 'OTU235', 'OTU239', 'OTU260', 'OTU280', 'OTU302', 'OTU312', 'OTU322', 'OTU328', 'OTU335', 'OTU336', 'OTU370', 'OTU391', 'OTU410', 'OTU428', 'OTU438', 'OTU439', 'OTU448', 'OTU463', 'OTU472', 'OTU533', 'OTU666', 'OTU682', 'OTU752', 'OTU756', 'OTU772', 'OTU776', 'OTU786', 'OTU804', 'OTU869', 'OTU896', 'OTU968', 'OTU990', 'OTU1028', 'OTU1148', 'OTU1343', 'OTU1478', 'OTU1556', 'OTU2112']
len : 97
LASSO + RF : 
{'F1 Score': 1.0, 'Kappa

In [14]:
# V9 marker, two-group AMBI labels: random forest on all OTUs, then on
# SVD-reduced and on PCoA-reduced features.
print("TMM ********* ALL")
print(sml.random_forest_MDL(v9_otu_data_norm, v9_metadata_proc_3classes["AMBI"]))

# ----- SVD + RF -----
print("TMM ********* SVD")
svd_handler = SVDHandler(n_components_list=N_C)
svd_best = svd_handler.choose_best_n_components(v9_otu_data_norm, v9_metadata_proc_3classes["AMBI"])
best_n_svd = int(svd_best["Best NC"].iloc[0])
# Seed the solver (randomized by default) so this final projection is the same on every run
svd = TruncatedSVD(n_components=best_n_svd, random_state=0)
X_svd = svd.fit_transform(v9_otu_data_norm)
svd_res = sml.random_forest_MDL(X_svd, v9_metadata_proc_3classes["AMBI"])[1]

# ----- PCoA + RF -----
print("TMM ********* PCoA")
pcoa_handler = PCoAHandler(metric="cosine", n_components_list=N_C)
pcoa_best = pcoa_handler.choose_best_n_components(v9_otu_data_norm, v9_metadata_proc_3classes["AMBI"])
# pcoa_best[0] carries the selection summary (its "Best NC" column is read here);
# pcoa_best[1] is the table whose first best_n_pcoa columns are kept.
best_n_pcoa = int(pcoa_best[0]["Best NC"].iloc[0])
X_pcoa = pcoa_best[1].iloc[:, :best_n_pcoa]
pcoa_res = sml.random_forest_MDL(X_pcoa, v9_metadata_proc_3classes["AMBI"])[1]

# Summary table: one row per reduction method
results_df = pd.DataFrame(index=["SVD", "PCoA"])

# Cross-validation figures reported by the component-selection step
for table_col, source_col in [
    ("Best NC", "Best NC"),
    ("Explained Var", "exp var"),
    ("Mean F1 CV", "Mean F1 Score"),
    ("Mean AUC CV", "Mean AUC ROC"),
]:
    results_df[table_col] = [svd_best[source_col].iloc[0], float(pcoa_best[0][source_col].iloc[0])]

# Scores of the random forests fitted on the reduced features
for table_col, metric in [
    ("F1 model", "F1 Score"),
    ("Kappa model", "Kappa Score"),
    ("Recall model", "Recall Score"),
]:
    results_df[table_col] = [svd_res[metric], pcoa_res[metric]]

results_df


TMM ********* ALL
(RandomForestClassifier(n_estimators=200, random_state=0), {'F1 Score': 0.979, 'Kappa Score': np.float64(0.957), 'Recall Score': [1.0, 0.9545454545454546]})
TMM ********* SVD
TMM ********* PCoA


  warn(


Unnamed: 0,Best NC,Explained Var,Mean F1 CV,Mean AUC CV,F1 model,Kappa model,Recall model
SVD,3.0,0.938,0.992,1.0,0.915,0.83,"[0.88, 0.9545454545454546]"
PCoA,20.0,0.972,0.985,1.0,0.957,0.915,"[0.96, 0.9545454545454546]"


In [15]:
### LASSO + RF
# L1-regularised logistic regression (LASSO-style) used as a feature selector on V9.
# random_state pins the stochastic 'saga' solver so the selected OTU list does not
# change from one run to the next.
# NOTE(review): the selector is fitted on every sample; if ModelTrainer holds out a
# test set internally, the selection has already seen those labels - confirm.
model = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, random_state=0)
model.fit(v9_otu_data_norm, v9_metadata_proc_3classes["AMBI"])

# Keep the OTUs whose coefficient (first row of coef_) is not numerically zero
coeffs = model.coef_[0]
v9_otus_utiles = v9_otu_data_norm.columns[np.abs(coeffs) > 1e-5].tolist()
print("Selected OTUs :", v9_otus_utiles)
print("len :", len(v9_otus_utiles))

# Train the random forest on the selected OTUs only
model_trainer_v9 = ModelTrainer(v9_otu_data_norm[v9_otus_utiles], v9_metadata_proc_3classes["AMBI"], "V9_")
_, v9_ext_performance = model_trainer_v9.train_random_forest()

print("LASSO + RF : ")
print(v9_ext_performance)

Selected OTUs : ['OTU10', 'OTU20', 'OTU21', 'OTU22', 'OTU24', 'OTU33', 'OTU40', 'OTU41', 'OTU43', 'OTU61', 'OTU73', 'OTU76', 'OTU82', 'OTU92', 'OTU100', 'OTU102', 'OTU103', 'OTU105', 'OTU107', 'OTU110', 'OTU114', 'OTU117', 'OTU119', 'OTU124', 'OTU142', 'OTU158', 'OTU159', 'OTU161', 'OTU164', 'OTU165', 'OTU175', 'OTU177', 'OTU181', 'OTU190', 'OTU199', 'OTU201', 'OTU220', 'OTU224', 'OTU225', 'OTU228', 'OTU229', 'OTU234', 'OTU235', 'OTU239', 'OTU240', 'OTU244', 'OTU254', 'OTU258', 'OTU272', 'OTU275', 'OTU276', 'OTU287', 'OTU291', 'OTU303', 'OTU304', 'OTU307', 'OTU314', 'OTU325', 'OTU326', 'OTU334', 'OTU335', 'OTU338', 'OTU342', 'OTU346', 'OTU355', 'OTU357', 'OTU363', 'OTU364', 'OTU372', 'OTU382', 'OTU403', 'OTU404', 'OTU407', 'OTU412', 'OTU432', 'OTU440', 'OTU450', 'OTU451', 'OTU452', 'OTU461', 'OTU462', 'OTU479', 'OTU484', 'OTU490', 'OTU491', 'OTU492', 'OTU496', 'OTU504', 'OTU515', 'OTU521', 'OTU525', 'OTU534', 'OTU538', 'OTU539', 'OTU541', 'OTU543', 'OTU545', 'OTU562', 'OTU563', 'OTU589