In [1]:
import numpy as np 
import pandas as pd 
import mlflow
import mlflow.sklearn


In [2]:
df = pd.read_csv('datos/penguins_size.csv')
df.head()
"""  
species->especie
island->isla
culmen_length_mm->longitud_del_culmen_mm
culmen_depth_mm->profundidad_del_culmen_mm
flipper_length_mm->longitud_de_la_aleta_mm
body_mass_g=>masa_corporal_g
sex=>sexo
"""

'  \nspecies->especie\nisland->isla\nculmen_length_mm->longitud_del_culmen_mm\nculmen_depth_mm->profundidad_del_culmen_mm\nflipper_length_mm->longitud_de_la_aleta_mm\nbody_mass_g=>masa_corporal_g\nsex=>sexo\n'

In [3]:
df.describe(include='all')

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
count,344,344,342.0,342.0,342.0,342.0,334
unique,3,3,,,,,3
top,Adelie,Biscoe,,,,,MALE
freq,152,168,,,,,168
mean,,,43.92193,17.15117,200.915205,4201.754386,
std,,,5.459584,1.974793,14.061714,801.954536,
min,,,32.1,13.1,172.0,2700.0,
25%,,,39.225,15.6,190.0,3550.0,
50%,,,44.45,17.3,197.0,4050.0,
75%,,,48.5,18.7,213.0,4750.0,


In [4]:
df.dropna(inplace=True)  # Eliminar filas con valores faltantes (opción simple para este dataset)
df = pd.get_dummies(df, columns=[ 'island', 'sex'], drop_first=True)

In [5]:
from sklearn.model_selection import train_test_split
X = df.drop('species', axis=1)   # Variable objetivo: body_mass_g
y = df['species'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Guardar el orden de las columnas
import pickle
column_order = X_train.columns
with open("column_order.pkl", "wb") as f:
    pickle.dump(column_order, f)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


import mlflow

# Modelos a probar
models = [
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=5)),
    ('SVC', SVC(kernel='linear')),
    ('GaussianNB', GaussianNB()),
    ('MLPClassifier', MLPClassifier(max_iter=1000))  # Ahora MLPClassifier está importado
]

def run_experiment(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        print(f"Iniciando experimentación con el modelo {model_name}")
        
        # Entrenamiento del modelo
        model.fit(X_train, y_train)

        # Predicción
        y_pred = model.predict(X_test)

        # Calcular la precisión
        accuracy = accuracy_score(y_test, y_pred)

        # Registrar el modelo en MLflow
        mlflow.sklearn.log_model(model, model_name)

        # Registrar las métricas
        mlflow.log_metric("accuracy", accuracy)

        # Registrar los parámetros
        if model_name == 'KNeighborsClassifier':
            mlflow.log_param("n_neighbors", model.n_neighbors)
        elif model_name == 'SVC':
            mlflow.log_param("kernel", model.kernel)
        elif model_name == 'MLPClassifier':
            mlflow.log_param("max_iter", model.max_iter)

        print(f"Modelo {model_name} - Accuracy: {accuracy}")
        print(f"Modelo {model_name} registrado en MLflow y almacenado en el bucket S3")
        print(f"Run ID: {mlflow.active_run().info.run_id}")



In [None]:
from sklearn.metrics import accuracy_score

# Realizar pruebas adicionales con un conjunto de datos diferente
df_new = pd.read_csv('datos/penguins_size.csv')
df_new.dropna(inplace=True)
df_new = pd.get_dummies(df_new, columns=['island', 'sex'], drop_first=True)

X_new = df_new.drop('species', axis=1)
y_new = df_new['species']
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

for model_name, model in models:
    for i in range(5):  # Experimentamos también con el nuevo dataset
        run_experiment(model_name, model, X_train_new, X_test_new, y_train_new, y_test_new)

Iniciando experimentación con el modelo KNeighborsClassifier
Modelo KNeighborsClassifier - Accuracy: 0.8208955223880597
Modelo KNeighborsClassifier registrado en MLflow y almacenado en el bucket S3
Run ID: e40fe1e82505448980478616caee07ae
Iniciando experimentación con el modelo KNeighborsClassifier
Modelo KNeighborsClassifier - Accuracy: 0.8208955223880597
Modelo KNeighborsClassifier registrado en MLflow y almacenado en el bucket S3
Run ID: f5169d1191ea4723b389bb35b9808def
Iniciando experimentación con el modelo KNeighborsClassifier


In [None]:
from sklearn.metrics import accuracy_score

# Experimentar con los modelos y registrar los resultados
for model_name, model in models:
    for i in range(5):  # Hacemos 5 experimentos por cada modelo
        run_experiment(model_name, model, X_train, X_test, y_train, y_test)

### Forma de consumir los modelos

In [None]:
import pandas as pd


# DataFrame base con todas las columnas dummy posibles (¡CRUCIAL!)


class Pinguino:
    
    def __init__(self, culmen_length_mm, culmen_depth_mm, flipper_length_mm,body_mass_g ,island, sex):
        self.culmen_length_mm = culmen_length_mm
        self.culmen_depth_mm = culmen_depth_mm
        self.flipper_length_mm = flipper_length_mm
        self.island = island
        self.sex = sex
        self.body_mass_g=body_mass_g
        


    def to_dataframe(self):
        data = {
            'culmen_length_mm': [self.culmen_length_mm],
            'culmen_depth_mm': [self.culmen_depth_mm],
            'flipper_length_mm': [self.flipper_length_mm],
            'island': [self.island],
            'sex': [self.sex],
            'body_mass_g': [self.body_mass_g]
        }
        df = pd.DataFrame(data)

        # 1. Crear un DataFrame *vacío* con las columnas base
        df_nuevo = pd.DataFrame(columns=columnas_base)

        # 2. Asignar los valores del pingüino al DataFrame vacío
        df_nuevo[['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']] = df[['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']].values
        
        # 3. Crear las columnas dummy *manualmente*
        if self.island == 'Dream':
            df_nuevo['island_Dream'] = 1
            df_nuevo['island_Torgersen'] = 0
        elif self.island == 'Torgersen':
            df_nuevo['island_Torgersen'] = 1
            df_nuevo['island_Dream'] = 0
        # Si la isla es 'Biscoe' (u otra), las columnas dummy correspondientes ya están a 0 por defecto

        if self.sex == 'FEMALE':
            df_nuevo['sex_FEMALE'] = 1
            df_nuevo['sex_MALE'] = 0
        elif self.sex == 'MALE':
            df_nuevo['sex_MALE'] = 1
            df_nuevo['sex_FEMALE'] = 0
        # Si el sexo es 'Unknown' (u otro), las columnas dummy correspondientes ya están a 0 por defecto

        return df_nuevo

In [None]:



import pickle
def preprocesar_pinguino(pinguino):
    """Preprocesa un objeto Pinguino para la inferencia."""
    df = pinguino.to_dataframe()
    df.dropna(inplace=True) 
    

    # Asumiendo que X_train es tu DataFrame de entrenamiento

# Cargar el orden de las columnas
    with open("../models/column_order.pkl", "rb") as f:
        column_order = pickle.load(f)
        df = df[column_order]  


    return df

In [None]:
import joblib


# Carga de modelos y scaler
model1 = joblib.load("../models/modelo1.pkl")
model2 = joblib.load("../models/modelo2.pkl")
model3 = joblib.load("../models/modelo3.pkl")
model4 = joblib.load("../models/modelo4.pkl")


# Creación de un nuevo objeto Pinguino
nuevo_pinguino = Pinguino(
    culmen_length_mm=69.1, 
    culmen_depth_mm=188.7, 
    flipper_length_mm=18771.0, 
    body_mass_g=3757770.0,
    island='Torgersen', 
    sex='MALE'
)


""" 
species	culmen_length_mm	culmen_depth_mm	flipper_length_mm	body_mass_g	island_Dream	island_Torgersen	sex_FEMALE	sex_MALE
Adelie	39.1	18.7	181.0	3750.0	                                    False	True	False	True

Gentoo	50.4	15.7	222.0	5750.0	False	False	False	True
"""



In [None]:
# Preprocesamiento del pingüino
nuevos_datos = preprocesar_pinguino(nuevo_pinguino)
nuevos_datos



In [None]:
# Inferencia con el modelo
prediccion1 = model1.predict(nuevos_datos)
prediccion2 = model2.predict(nuevos_datos)
prediccion3 = model3.predict(nuevos_datos)
prediccion4 = model4.predict(nuevos_datos)

print("Predicción:", prediccion1)
print("Predicción:", prediccion2)
print("Predicción:", prediccion3)
print("Predicción:", prediccion4)