In [5]:
!pip -q install mlflow xgboost

import mlflow
import mlflow.sklearn

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import os

# Tracking server and experiment used by every run in this notebook.
# Both can be overridden from the environment so the notebook can be pointed
# at another server without editing code; the defaults are the original values.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://3.94.195.217:8050")
MLFLOW_EXPERIMENT_NAME = os.environ.get("MLFLOW_EXPERIMENT_NAME", "Proyecto_Depresion_Entrega2")

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1771797147308, experiment_id='1', last_update_time=1771797147308, lifecycle_stage='active', name='Proyecto_Depresion_Entrega2', tags={}, workspace='default'>

In [6]:
import os

from google.colab import drive
from google.colab import files

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')

# File that the data-loading cell reads from the working directory.
DATASET_FILENAME = "student_depression_dataset.csv"

# Ask for an upload only when the file is not there yet. Uploading again while
# it already exists makes Colab save the new copy under a suffixed name
# (e.g. "student_depression_dataset (3).csv"), which the loading cell never reads.
if os.path.exists(DATASET_FILENAME):
    uploaded = {}  # nothing uploaded in this execution
else:
    uploaded = files.upload()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving student_depression_dataset.csv to student_depression_dataset (3).csv


In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Column removed to build the reduced feature set B.
col_suic = "Have you ever had suicidal thoughts ?"

# Raw table, read from the working directory.
df = pd.read_csv("student_depression_dataset.csv")

# Target vector.
y = df["Depression"]

# Feature set A: every column except the row id and the target.
X_A = df.drop(columns=["id", "Depression"])

# Feature set B: feature set A without the suicidal-thoughts column.
X_B = X_A.drop(columns=[col_suic])

# A single stratified split is made on A only.
X_train_A, X_test_A, y_train, y_test = train_test_split(
    X_A,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# B reuses exactly the rows of A's split, so both feature sets share
# y_train / y_test; only the column is removed.
X_train_B, X_test_B = (
    X_train_A.drop(columns=[col_suic]),
    X_test_A.drop(columns=[col_suic]),
)

print("Shapes A:", X_train_A.shape, X_test_A.shape)
print("Shapes B:", X_train_B.shape, X_test_B.shape)

Shapes A: (22320, 16) (5581, 16)
Shapes B: (22320, 15) (5581, 15)


In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

def build_preprocessor(X_train, scale_numeric=False):
    """Build the ColumnTransformer that prepares a feature table for a classifier.

    Columns are routed by their dtype in ``X_train``:

    * numeric columns (any integer or float width) are standardised with
      ``StandardScaler`` when ``scale_numeric`` is true, otherwise passed
      through unchanged;
    * every other column (object, bool, category, string, ...) is one-hot
      encoded, with categories unseen during fit ignored at transform time.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; only its column names and dtypes are inspected.
    scale_numeric : bool, default False
        Whether numeric columns are standardised.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with a "num" and a "cat" branch.
    """
    # "number" matches all numeric widths, not only int64/float64. Bool is not
    # part of it, so bool columns keep going to the one-hot branch.
    num_cols = list(X_train.select_dtypes(include="number").columns)

    # The transformer is built without a `remainder` argument, so a column that
    # is listed in neither branch would be dropped from the features. Sending
    # every non-numeric column to the categorical branch prevents that.
    numeric_names = set(num_cols)
    cat_cols = [col for col in X_train.columns if col not in numeric_names]

    num_transformer = StandardScaler() if scale_numeric else "passthrough"

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_transformer, num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ]
    )
    return preprocessor

def train_model(model_name, dataset_name, estimator, X_train, X_test, y_train, y_test,
                extra_params=None, split_test_size=0.2, split_random_state=42):
    """Fit a preprocessing + classifier pipeline and record it as one MLflow run.

    Logged to MLflow:

    * params: ``modelo``, ``dataset``, ``test_size``, ``random_state`` and every
      entry of ``extra_params``;
    * metrics: ``accuracy``, ``precision``, ``recall``, ``f1`` on the test set;
    * model: the whole fitted pipeline (preprocessor + classifier).

    Parameters
    ----------
    model_name : str
        Name logged as ``modelo`` and used in the run name. Numeric features are
        standardised only when it equals ``"LogisticRegression"``.
    dataset_name : str
        Name logged as ``dataset`` and used in the run name.
    estimator : scikit-learn compatible classifier
        Used as a prototype: a clone of it is fitted, the passed instance is
        left untouched so it can be reused for another dataset.
    X_train, X_test, y_train, y_test
        Train/test features and labels.
    extra_params : dict, optional
        Estimator hyperparameters to log.
    split_test_size, split_random_state
        Values logged as ``test_size`` / ``random_state``. They only describe the
        split made by the caller; defaults match the previous hard-coded values.

    Returns
    -------
    tuple
        ``(fitted_pipeline, test_predictions, metrics_dict)``.
    """
    # Local import so the function does not rely on changes to the import cell.
    from sklearn.base import clone

    estimator_params = dict(extra_params) if extra_params else {}

    # Logistic regression gets scaled numeric features; the other models do not.
    scale_numeric = (model_name == "LogisticRegression")
    preprocessor = build_preprocessor(X_train, scale_numeric=scale_numeric)

    # Fitting the caller's instance in place would let a later call with the same
    # instance overwrite the classifier inside a pipeline returned earlier.
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", clone(estimator)),
    ])

    run_name = f"{model_name}_{dataset_name}"

    with mlflow.start_run(run_name=run_name):
        # Batched logging: one request per group instead of one per key.
        mlflow.log_params({
            "modelo": model_name,
            "dataset": dataset_name,
            "test_size": split_test_size,
            "random_state": split_random_state,
        })
        # Logged separately so a key clashing with the basic params is still
        # checked by MLflow instead of being silently merged.
        if estimator_params:
            mlflow.log_params(estimator_params)

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1": f1_score(y_test, y_pred),
        }
        mlflow.log_metrics(metrics)

        # Store the complete pipeline (preprocessing + model).
        mlflow.sklearn.log_model(pipe, artifact_path="model")

    return pipe, y_pred, metrics

In [9]:
# Collected as (label, metrics dict) tuples; later cells append to it.
results = []

# 1) Logistic regression on feature set A.
log_est = LogisticRegression(max_iter=1000)

mA, yA, metA = train_model(
    "LogisticRegression",
    "A",
    log_est,
    X_train_A,
    X_test_A,
    y_train,
    y_test,
    extra_params=dict(max_iter=1000),
)
results += [("Logistic_A", metA)]




🏃 View run LogisticRegression_A at: http://3.94.195.217:8050/#/experiments/1/runs/3071ab02788346689d6b2c1c55d02fc0
🧪 View experiment at: http://3.94.195.217:8050/#/experiments/1


In [None]:
# Local import so this cell does not rely on changes to the import cell.
from sklearn.base import clone

# Prototype estimators. Each run below fits a clone, so the same configuration
# can be used for both feature sets without one fit overwriting the other.
rf_est = RandomForestClassifier(n_estimators=200, random_state=42)
xgb_est = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    random_state=42,
    eval_metric="logloss",
)

# Feature sets by name; both share y_train / y_test.
splits = {
    "A": (X_train_A, X_test_A),
    "B": (X_train_B, X_test_B),
}

# One row per model:
# (name passed to train_model, label prefix in `results`, prototype estimator,
#  hyperparameters to log, feature sets still to run).
# Logistic regression on A is run by the cell that creates `results`,
# so only B is listed for it here.
pending_runs = [
    ("LogisticRegression", "Logistic", log_est,
     {"max_iter": 1000}, ("B",)),
    ("RandomForest", "RF", rf_est,
     {"n_estimators": 200, "random_state": 42}, ("A", "B")),
    ("XGBoost", "XGB", xgb_est,
     {"n_estimators": 400, "max_depth": 5, "learning_rate": 0.05, "random_state": 42}, ("A", "B")),
]

# Re-running this cell must not leave duplicate rows in `results`: drop the
# entries this cell is about to produce (in place, so the list object is kept).
labels_in_this_cell = {
    f"{label}_{dataset_name}"
    for _, label, _, _, dataset_names in pending_runs
    for dataset_name in dataset_names
}
results[:] = [entry for entry in results if entry[0] not in labels_in_this_cell]

# label -> (fitted pipeline, test predictions, metrics) for every run of this cell.
fitted_runs = {}
for model_name, label, prototype, logged_params, dataset_names in pending_runs:
    for dataset_name in dataset_names:
        X_tr, X_te = splits[dataset_name]
        run_label = f"{label}_{dataset_name}"
        fitted_runs[run_label] = train_model(
            model_name, dataset_name,
            clone(prototype), X_tr, X_te, y_train, y_test,
            extra_params=logged_params,
        )
        results.append((run_label, fitted_runs[run_label][2]))

# Keep the names this cell has always left behind (the last A and B runs).
mA, yA, metA = fitted_runs["XGB_A"]
mB, yB, metB = fitted_runs["XGB_B"]

print("‚úÖ Listo. Ya quedaron los 6 runs en MLflow.")

In [16]:
# One row per run: the run label followed by its metric columns.
metric_rows = []
for name, metrics in results:
    row = {"Modelo": name}
    row.update(metrics)
    metric_rows.append(row)

# Best F1 first.
df_metrics = pd.DataFrame(metric_rows)
df_metrics = df_metrics.sort_values(by="f1", ascending=False)

df_metrics

Unnamed: 0,Modelo,accuracy,precision,recall,f1
0,Logistic_A,0.84286,0.857613,0.877295,0.867342
4,XGB_A,0.841426,0.855566,0.877295,0.866294
2,RF_A,0.83838,0.847124,0.883415,0.864889
5,XGB_B,0.798244,0.814261,0.849143,0.831336
1,Logistic_B,0.796452,0.812793,0.847613,0.829838
3,RF_B,0.792331,0.805387,0.850979,0.827555
