In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import mlflow
import mlflow.sklearn

import sqlalchemy

In [2]:
from sqlalchemy import create_engine

In [3]:
# Crear la conexión a la base de datos MySQL
db_uri = "mysql+pymysql://root:supersecret@10.43.101.173:3306/training_data"
engine = create_engine(db_uri)

# ---------------------------------------------------------
# Cargar datos desde archivos CSV en lugar de MySQL
# ---------------------------------------------------------
# Para el primer conjunto (datos del archivo penguins_Iter.csv)
df1 = pd.read_csv('/home/jovyan/work/penguins_lter.csv', sep=",")  # Reemplaza con la ruta correcta

# Para el segundo conjunto (datos del archivo penguins_size.csv)
df2 = pd.read_csv('/home/jovyan/work/penguins_size.csv', sep=",")  # Reemplaza con la ruta correcta

# ---------------------------------------------------------
# Escribir los DataFrames en la base de datos (si las tablas no existen)
# ---------------------------------------------------------
df1.to_sql('raw_penguins_lter', con=engine, if_exists='replace', index=False)
df2.to_sql('raw_penguins_size', con=engine, if_exists='replace', index=False)

344

In [4]:
df1.head()

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,


In [5]:
# ---------------------------------------------------------
# Limpieza de datos: reemplazar valores indeseados y eliminar registros faltantes
# ---------------------------------------------------------
# En ambos DataFrames la columna de sexo viene almacenada como "sex"
df1["Sex"] = df1["Sex"].replace({".": None})
df2["sex"] = df2["sex"].replace({".": None})

# Eliminar registros con valores faltantes en la columna "sex"
df1_clean = df1.dropna(subset=["Sex"])
df2_clean = df2.dropna(subset=["sex"])

# ---------------------------------------------------------
# Renombrar columnas (si es necesario) y seleccionar las columnas de interés

df1_clean = df1_clean.rename(columns={
    "Species": "species",
    "Island": "island",
    "Culmen Length (mm)": "culmen_length_mm",
    "Culmen Depth (mm)": "culmen_depth_mm",
    "Flipper Length (mm)": "flipper_length_mm",
    "Body Mass (g)": "body_mass_g",
    "Sex": "sex"
})
df1_clean = df1_clean[["species", "island", "culmen_length_mm", "culmen_depth_mm",
                       "flipper_length_mm", "body_mass_g", "sex"]]

# ---------------------------------------------------------
# Combinar ambos DataFrames
# ---------------------------------------------------------
df_combined = pd.concat([df1_clean, df2_clean], ignore_index=True)

# ---------------------------------------------------------
# Mapear la variable 'sex': MALE -> 1, FEMALE -> 0
# ---------------------------------------------------------
df_combined["sex"] = df_combined["sex"].map({"MALE": 1, "FEMALE": 0})

# ---------------------------------------------------------
# Codificar la variable categórica "island"
# ---------------------------------------------------------
label_encoder = LabelEncoder()
df_combined["island"] = label_encoder.fit_transform(df_combined["island"])

print("Datos combinados y procesados:")
print(df_combined.head())

Datos combinados y procesados:
                               species  island  culmen_length_mm  \
0  Adelie Penguin (Pygoscelis adeliae)       2              39.1   
1  Adelie Penguin (Pygoscelis adeliae)       2              39.5   
2  Adelie Penguin (Pygoscelis adeliae)       2              40.3   
3  Adelie Penguin (Pygoscelis adeliae)       2              36.7   
4  Adelie Penguin (Pygoscelis adeliae)       2              39.3   

   culmen_depth_mm  flipper_length_mm  body_mass_g  sex  
0             18.7              181.0       3750.0    1  
1             17.4              186.0       3800.0    0  
2             18.0              195.0       3250.0    0  
3             19.3              193.0       3450.0    0  
4             20.6              190.0       3650.0    1  


In [6]:
# Conexión a la base de datos MySQL

db_uri = "mysql+pymysql://root:supersecret@10.43.101.173:3306/training_data"
engine = sqlalchemy.create_engine(db_uri)

# Guardar el DataFrame en una tabla (se reemplaza la tabla si ya existe)
df_combined.to_sql('processed_penguins', engine, if_exists='replace', index=False)
print("Datos procesados guardados en la base de datos.")

Datos procesados guardados en la base de datos.


In [7]:
# Seleccionar las características y la variable objetivo
X = df_combined[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g", "island"]]
y = df_combined["sex"]

# Normalizar las características utilizando StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir el dataset en entrenamiento y prueba (80% entrenamiento, 20% prueba)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)
print("Datos divididos en entrenamiento y prueba.")

Datos divididos en entrenamiento y prueba.


In [8]:
# Conectar al servidor de MLflow y habilitar el autologging
mlflow.set_tracking_uri("http://10.43.101.173:5000")
mlflow.autolog()  # Esto activa el autologging para modelos, parámetros, métricas y artefactos en las librerías compatibles

# Lista de modelos a usar
model_names = ["random_forest", "decision_tree", "svm", "logistic_regression"]

# Configurar el experimento en MLflow
experiment_name = "PenguinClassification"
mlflow.set_experiment(experiment_name)

best_accuracy = 0
best_run_id = None

# Realizar 20 experimentos
for i in range(20):
    with mlflow.start_run(run_name=f"Experiment {i}") as run:
        
        # Seleccionar de forma aleatoria un modelo
        model_type = random.choice(model_names)
        mlflow.log_param("model_type", model_type)
        
        # Definir y registrar hiperparámetros dependiendo del modelo
        if model_type == "random_forest":
            n_estimators = random.randint(50, 150)
            max_depth_val = random.choice([None, 3, 5, 7, 10])
            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_param("max_depth", max_depth_val)
            # Inicializar RandomForest
            model = RandomForestClassifier(n_estimators=n_estimators, 
                                           max_depth=max_depth_val, 
                                           random_state=42)
            
        elif model_type == "decision_tree":
            max_depth_val = random.choice([None, 3, 5, 7, 10])
            min_samples_split = random.randint(2, 10)
            mlflow.log_param("max_depth", max_depth_val)
            mlflow.log_param("min_samples_split", min_samples_split)
            # Inicializar DecisionTree
            model = DecisionTreeClassifier(max_depth=max_depth_val, 
                                           min_samples_split=min_samples_split, 
                                           random_state=42)
            
        elif model_type == "svm":
            C_val = round(random.uniform(0.1, 10), 2)
            kernel_val = random.choice(["linear", "rbf"])
            mlflow.log_param("C", C_val)
            mlflow.log_param("kernel", kernel_val)
            # Inicializar SVC; en este caso usar probability=True si se requiere obtener probabilidades
            model = SVC(C=C_val, kernel=kernel_val, random_state=42, probability=True)
            
        elif model_type == "logistic_regression":
            C_val = round(random.uniform(0.1, 10), 2)
            penalty = random.choice(['l1', 'l2'])
            # Para penalty 'l1' se debe usar un solver compatible como "liblinear"
            solver = "liblinear" if penalty == 'l1' else "lbfgs"
            mlflow.log_param("C", C_val)
            mlflow.log_param("penalty", penalty)
            mlflow.log_param("solver", solver)
            # Inicializar LogisticRegression
            model = LogisticRegression(C=C_val, penalty=penalty, solver=solver, random_state=42, max_iter=1000)
        
        # Entrenar el modelo seleccionado
        model.fit(X_train, y_train)
        
        # Crear un pipeline que incluya el escalador y el modelo ya entrenado
        pipeline = Pipeline(steps=[('scaler', scaler), ('model', model)])
        
        # Evaluar el desempeño en el conjunto de prueba
        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", acc)
        
        # Registrar el modelo en MLflow: se usará el nombre registrado según el tipo de modelo.
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="model",
            registered_model_name=model_type
        )
        
        print(f"Run {i}: Modelo = {model_type}, Accuracy = {acc:.4f}")
        
        if acc > best_accuracy:
            best_accuracy = acc
            best_run_id = run.info.run_id

print(f"Mejor ejecución: {best_run_id} con accuracy = {best_accuracy:.4f}")

2025/03/16 22:51:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/03/16 22:51:48 INFO mlflow.bedrock: Enabled auto-tracing for Bedrock. Note that MLflow can only trace boto3 service clients that are created after this call. If you have already created one, please recreate the client by calling `boto3.client`.
2025/03/16 22:51:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for boto3.
Registered model 'random_forest' already exists. Creating a new version of this model...
2025/03/16 22:52:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 11
Created version '11' of model 'random_forest'.


Run 0: Modelo = random_forest, Accuracy = 0.4925
🏃 View run Experiment 0 at: http://10.43.101.173:5000/#/experiments/1/runs/8b854b49d85a424493779a92d425ef37
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'random_forest' already exists. Creating a new version of this model...
2025/03/16 22:52:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 12
Created version '12' of model 'random_forest'.


Run 1: Modelo = random_forest, Accuracy = 0.4925
🏃 View run Experiment 1 at: http://10.43.101.173:5000/#/experiments/1/runs/9ff5d4ea3122411087f579f663063837
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'decision_tree' already exists. Creating a new version of this model...
2025/03/16 22:52:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: decision_tree, version 12
Created version '12' of model 'decision_tree'.


Run 2: Modelo = decision_tree, Accuracy = 0.4925
🏃 View run Experiment 2 at: http://10.43.101.173:5000/#/experiments/1/runs/edc9f3fd0e7a437182b833aeb8175938
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'logistic_regression' already exists. Creating a new version of this model...
2025/03/16 22:52:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression, version 10
Created version '10' of model 'logistic_regression'.


Run 3: Modelo = logistic_regression, Accuracy = 0.4925
🏃 View run Experiment 3 at: http://10.43.101.173:5000/#/experiments/1/runs/79d10dfd821740eca5bfd70b7864428a
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'random_forest' already exists. Creating a new version of this model...
2025/03/16 22:52:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 13
Created version '13' of model 'random_forest'.


Run 4: Modelo = random_forest, Accuracy = 0.4925
🏃 View run Experiment 4 at: http://10.43.101.173:5000/#/experiments/1/runs/205616171a4442ae8817e6da4dce6fb2
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'random_forest' already exists. Creating a new version of this model...
2025/03/16 22:52:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 14
Created version '14' of model 'random_forest'.


Run 5: Modelo = random_forest, Accuracy = 0.4925
🏃 View run Experiment 5 at: http://10.43.101.173:5000/#/experiments/1/runs/9c0b78a499ca411997e276159ae50e10
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'svm' already exists. Creating a new version of this model...
2025/03/16 22:53:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svm, version 11
Created version '11' of model 'svm'.


Run 6: Modelo = svm, Accuracy = 0.5075
🏃 View run Experiment 6 at: http://10.43.101.173:5000/#/experiments/1/runs/248a46a1d68f418893f700ef876d99ea
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'logistic_regression' already exists. Creating a new version of this model...
2025/03/16 22:53:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression, version 11
Created version '11' of model 'logistic_regression'.


Run 7: Modelo = logistic_regression, Accuracy = 0.4925
🏃 View run Experiment 7 at: http://10.43.101.173:5000/#/experiments/1/runs/8c50fdd06a0d4c9cb11f11897f7b1568
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'logistic_regression' already exists. Creating a new version of this model...
2025/03/16 22:53:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression, version 12
Created version '12' of model 'logistic_regression'.


Run 8: Modelo = logistic_regression, Accuracy = 0.4925
🏃 View run Experiment 8 at: http://10.43.101.173:5000/#/experiments/1/runs/a57713c341114b6c96362a9683641110
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'svm' already exists. Creating a new version of this model...
2025/03/16 22:53:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svm, version 12
Created version '12' of model 'svm'.


Run 9: Modelo = svm, Accuracy = 0.4925
🏃 View run Experiment 9 at: http://10.43.101.173:5000/#/experiments/1/runs/183ed58af8064a9dbb18fb5d72389497
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'decision_tree' already exists. Creating a new version of this model...
2025/03/16 22:53:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: decision_tree, version 13
Created version '13' of model 'decision_tree'.


Run 10: Modelo = decision_tree, Accuracy = 0.4925
🏃 View run Experiment 10 at: http://10.43.101.173:5000/#/experiments/1/runs/24b50e4b38304007aef42b407c6985c0
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'random_forest' already exists. Creating a new version of this model...
2025/03/16 22:53:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 15
Created version '15' of model 'random_forest'.


Run 11: Modelo = random_forest, Accuracy = 0.4925
🏃 View run Experiment 11 at: http://10.43.101.173:5000/#/experiments/1/runs/3ddf3169985f41199908ad6ca16c2798
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'svm' already exists. Creating a new version of this model...
2025/03/16 22:54:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svm, version 13
Created version '13' of model 'svm'.


Run 12: Modelo = svm, Accuracy = 0.4925
🏃 View run Experiment 12 at: http://10.43.101.173:5000/#/experiments/1/runs/056f9c88ec244ec5a0457c73f3516d3f
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'logistic_regression' already exists. Creating a new version of this model...
2025/03/16 22:54:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression, version 13
Created version '13' of model 'logistic_regression'.


Run 13: Modelo = logistic_regression, Accuracy = 0.4925
🏃 View run Experiment 13 at: http://10.43.101.173:5000/#/experiments/1/runs/b37c9c1b27b74cb8a01bae514cf7adf2
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'svm' already exists. Creating a new version of this model...
2025/03/16 22:54:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svm, version 14
Created version '14' of model 'svm'.


Run 14: Modelo = svm, Accuracy = 0.4925
🏃 View run Experiment 14 at: http://10.43.101.173:5000/#/experiments/1/runs/4ea9c66bd7c6403f8b749e4330666377
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'decision_tree' already exists. Creating a new version of this model...
2025/03/16 22:54:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: decision_tree, version 14
Created version '14' of model 'decision_tree'.


Run 15: Modelo = decision_tree, Accuracy = 0.4925
🏃 View run Experiment 15 at: http://10.43.101.173:5000/#/experiments/1/runs/1ea007988e9f4c75ba52cf8db3f085c6
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'logistic_regression' already exists. Creating a new version of this model...
2025/03/16 22:54:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression, version 14
Created version '14' of model 'logistic_regression'.


Run 16: Modelo = logistic_regression, Accuracy = 0.4925
🏃 View run Experiment 16 at: http://10.43.101.173:5000/#/experiments/1/runs/4bd8390a485745d8bf41cdeceff4f358
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'random_forest' already exists. Creating a new version of this model...
2025/03/16 22:54:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 16
Created version '16' of model 'random_forest'.


Run 17: Modelo = random_forest, Accuracy = 0.4925
🏃 View run Experiment 17 at: http://10.43.101.173:5000/#/experiments/1/runs/f303bbfbc49f43cfbcc7e2a26f17bbef
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'svm' already exists. Creating a new version of this model...
2025/03/16 22:54:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svm, version 15
Created version '15' of model 'svm'.


Run 18: Modelo = svm, Accuracy = 0.4925
🏃 View run Experiment 18 at: http://10.43.101.173:5000/#/experiments/1/runs/bf53766cd6324e48b62f991893778e64
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1


Registered model 'svm' already exists. Creating a new version of this model...
2025/03/16 22:55:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svm, version 16


Run 19: Modelo = svm, Accuracy = 0.4925
🏃 View run Experiment 19 at: http://10.43.101.173:5000/#/experiments/1/runs/77715138083d4b34be4922979e93a784
🧪 View experiment at: http://10.43.101.173:5000/#/experiments/1
Mejor ejecución: 248a46a1d68f418893f700ef876d99ea con accuracy = 0.5075


Created version '16' of model 'svm'.
