In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

df = pd.read_excel("/content/Final_DF.xlsx")

In [3]:
# Visualizar count de periodo antes de transformación

df["periodo"].value_counts()

Unnamed: 0_level_0,count
periodo,Unnamed: 1_level_1
1,84
2,84


In [4]:
# Transformación y visualizar count de periodo después de transformación
# con Pre y Post

df["periodo"] = df["periodo"].replace({
    1: "Pre",
    2: "Post"
})

df["periodo"].value_counts()

Unnamed: 0_level_0,count
periodo,Unnamed: 1_level_1
Pre,84
Post,84


In [5]:
# Filtrar datos solo con periodo Pre

df_pre = df[df["periodo"] == "Pre"].copy()

print("Dataset Periodo Pre:", df_pre.shape)
df_pre.head()

Dataset Periodo Pre: (84, 524)


Unnamed: 0,id,visita,grupo,periodo,tiempo,tratamiento,frec_alcohol_audit,puntaje_audit,Palabras_moca,memoria_moca,...,pgh_por_eficiencia,pgh_eficiencia,pgh_perturbaciones,pgh_disfuncion_dia,pgh_duracion_sueño,pgh_total,cat_pgh_total,total_meds,menst_flag,menst_tipo
0,1,1,1,Pre,1,1,2,5,29,4,...,80.0,1,1,0,2,5,1,2,0,MPM
1,1,2,1,Pre,2,1,2,5,29,4,...,104.0,0,1,0,0,1,1,0,0,MPM
4,2,1,2,Pre,1,2,2,2,21,4,...,85.71429,0,1,1,2,4,1,5,0,Hombre
5,2,2,2,Pre,2,2,2,2,21,4,...,83.12342,1,1,2,2,8,2,0,0,Hombre
8,3,1,1,Pre,1,1,2,2,18,3,...,70.0,2,2,0,1,8,2,2,1,MEF


In [6]:
# Confirmación de que ya no aparezca periodo Post

df_pre["periodo"] if "periodo" in df_pre.columns else "Periodo eliminado correctamente", print("Filas Post restantes:", (df_pre == "Post").sum().sum())


Filas Post restantes: 0


(0      Pre
 1      Pre
 4      Pre
 5      Pre
 8      Pre
       ... 
 157    Pre
 160    Pre
 161    Pre
 164    Pre
 165    Pre
 Name: periodo, Length: 84, dtype: object,
 None)

In [7]:
df["periodo"] = df["periodo"].astype(str).str.strip().str.lower()

In [8]:
df_pre = df[df["periodo"] == "pre"].copy()
print(df_pre.shape)

(84, 524)


# Random Forest - BDNF

In [9]:
# Definir objetivo - BDNF

target = "bdnf"

y = df_pre[target]

print("Objetivo:", target)
print("Tamaño de y:", y.shape)

Objetivo: bdnf
Tamaño de y: (84,)


In [10]:
# Definir predictores

X = df_pre.drop(columns=[target, "ID"], errors="ignore")

print("Tamaño de X:", X.shape)

Tamaño de X: (84, 523)


In [11]:
# Eliminar columnas no numéricas

X = X.select_dtypes(include=["int64", "float64"])

print("X solo numéricas:", X.shape)

X solo numéricas: (84, 508)


In [12]:
# Valores faltantes - no NaN

imputer = SimpleImputer(strategy="median")

X_imputed = imputer.fit_transform(X)

In [13]:
X_imputed = pd.DataFrame(X_imputed, columns=X.columns)

print("NaNs restantes:", X_imputed.isna().sum().sum())

NaNs restantes: 0


In [14]:
# Split Train/Test

X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y,
    test_size=0.15,
    random_state=42
)

print("Train:", X_train.shape)
print("Test:", X_test.shape)

Train: (71, 508)
Test: (13, 508)


In [15]:
rf_bdnf = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

rf_bdnf.fit(X_train, y_train)

print("Modelo entrenado: Exitoso")

Modelo entrenado: Exitoso


In [16]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

y_pred = rf_bdnf.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_test, y_pred)

print("Baseline Random Forest - BDNF (Pre)")
print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

Baseline Random Forest - BDNF (Pre)
MSE: 483.2776945769301
RMSE: 21.983577838398602
R2: -0.32202656622432335


In [17]:
# Variables importantes

import numpy as np

importances = rf_bdnf.feature_importances_

feat_imp_bdnf = pd.DataFrame({
    "variable": X.columns,
    "importance": importances
}).sort_values(by="importance", ascending=False)

feat_imp_bdnf.head(10)

Unnamed: 0,variable,importance
82,creatinina_orina,0.066448
483,delta_bdnf,0.058937
496,hads_score,0.048225
209,depresion,0.021275
288,ffq_zanahor,0.019103
453,tom_momento,0.018713
106,cobre,0.018568
480,mortality_risk,0.018334
112,hcm,0.017422
481,phenoage,0.015923


In [18]:
feat_imp_bdnf.to_excel("FeatureImportance_BDNF_Pre.xlsx", index=False)

print("Archivo guardado: FeatureImportance_BDNF_Pre.xlsx")

Archivo guardado: FeatureImportance_BDNF_Pre.xlsx


Se entrenó un modelo baseline mediante Random Forest Regressor utilizando únicamente observaciones del periodo Pre con el objetivo de explorar la capacidad predictiva inicial del conjunto de variables clínicas y de estilo de vida sobre los niveles de BDNF.

El modelo obtuvo un RMSE de 21.98 y un R² negativo (-0.32), lo que sugiere que, bajo las condiciones actuales, no se identifican patrones predictivos robustos. Este resultado es esperable debido al tamaño reducido de la muestra (n=84) en comparación con el elevado número de variables (p>500), así como a la alta variabilidad biológica inherente al biomarcador.

Por lo tanto, este baseline se considera un punto de referencia exploratorio más que un modelo final, alineado con el enfoque científico del proyecto.

# Random Forest - Delta

In [19]:
delta_targets = [
    "delta_bdnf",
    "delta_d2_con_pd",
    "delta_d2_vt_pd",
    "delta_fn_nombre",
    "delta_fn_score",
    "delta_licopeno"
]

In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
def baseline_rf(df, target):

    print("\n==============================")
    print("Baseline Random Forest para:", target)
    print("==============================")

    # Eliminar filas sin target
    df_model = df.dropna(subset=[target]).copy()

    # Definir y
    y = df_model[target]

    # Definir X (quitando target, ID y periodo)
    X = df_model.drop(columns=[target, "ID", "periodo"], errors="ignore")

    # Quedarnos solo con numéricas
    X = X.select_dtypes(include=["int64", "float64"])

    print("Dataset:", X.shape)

    # Imputación
    imputer = SimpleImputer(strategy="median")
    X_imp = imputer.fit_transform(X)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_imp, y,
        test_size=0.2,
        random_state=42
    )

    # Modelo
    rf = RandomForestRegressor(
        n_estimators=300,
        random_state=42
    )

    rf.fit(X_train, y_train)

    # Predicción
    y_pred = rf.predict(X_test)

    # Métricas
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print("RMSE:", rmse)
    print("R2:", r2)

    # Feature importance
    feat_imp = pd.DataFrame({
        "variable": X.columns,
        "importance": rf.feature_importances_
    }).sort_values(by="importance", ascending=False)

    return feat_imp

In [22]:
resultados_importancia = {}

for target in delta_targets:
    feat_imp = baseline_rf(df, target)
    resultados_importancia[target] = feat_imp

    # Guardar top 15 por variable
    feat_imp.head(15).to_excel(f"Top15_{target}.xlsx", index=False)


Baseline Random Forest para: delta_bdnf
Dataset: (84, 508)
RMSE: 21.607165211282194
R2: 0.3742418734897508

Baseline Random Forest para: delta_d2_con_pd
Dataset: (84, 508)
RMSE: 9.937516062386822
R2: 0.5074559718578857

Baseline Random Forest para: delta_d2_vt_pd
Dataset: (84, 508)
RMSE: 12.741192344240831
R2: 0.366894136618806

Baseline Random Forest para: delta_fn_nombre
Dataset: (84, 508)
RMSE: 1.2030636273710829
R2: 0.6155444444444444

Baseline Random Forest para: delta_fn_score
Dataset: (84, 508)
RMSE: 1.9458682555542797
R2: 0.5695237826733107

Baseline Random Forest para: delta_licopeno
Dataset: (84, 508)
RMSE: 0.09831810170697264
R2: 0.626565003804373


In [23]:
summary = []

for target in delta_targets:

    df_model = df.dropna(subset=[target]).copy()
    y = df_model[target]

    X = df_model.drop(columns=[target, "ID", "periodo"], errors="ignore")
    X = X.select_dtypes(include=["int64", "float64"])

    imputer = SimpleImputer(strategy="median")
    X_imp = imputer.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X_imp, y, test_size=0.2, random_state=42
    )

    rf = RandomForestRegressor(n_estimators=300, random_state=42)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    summary.append({
        "Target": target,
        "RMSE": rmse,
        "R2": r2,
        "N_samples": len(df_model)
    })

summary_df = pd.DataFrame(summary)
summary_df

Unnamed: 0,Target,RMSE,R2,N_samples
0,delta_bdnf,21.607165,0.374242,84
1,delta_d2_con_pd,9.937516,0.507456,84
2,delta_d2_vt_pd,12.741192,0.366894,84
3,delta_fn_nombre,1.203064,0.615544,84
4,delta_fn_score,1.945868,0.569524,84
5,delta_licopeno,0.098318,0.626565,84


In [24]:
summary_df.to_excel("Baseline_RF_Deltas_Resumen.xlsx", index=False)
print("Archivo guardado: Baseline_RF_Deltas_Resumen.xlsx")

Archivo guardado: Baseline_RF_Deltas_Resumen.xlsx


In [25]:
import pandas as pd

importances = rf.feature_importances_

feat_imp = pd.DataFrame({
    "Feature": X.columns,
    "Importance": importances
}).sort_values("Importance", ascending=False)

feat_imp.head(10)

Unnamed: 0,Feature,Importance
473,licopeno_trans,0.478868
471,licopeno,0.097932
210,pgh_min_dormirse,0.037506
472,licopeno_5cis,0.029565
60,ldh,0.019083
118,eosinofilos_porcentaje,0.013173
100,homocisteina_basal,0.010729
141,vit_d3,0.008774
103,ac_folico_serico,0.006997
247,cv_p9,0.006896
