In [27]:
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from sklearn.inspection import permutation_importance

import os
import mlflow
import mlflow.sklearn
from typing import Dict, Any
import statsmodels.api as sm
from statsmodels.formula.api import ols

current_dir = os.path.dirname(os.path.abspath('Model.ipynb'))

src_dir = os.path.join(current_dir, '..')

if src_dir not in sys.path:
    sys.path.append(src_dir)

from src.misc import ParameterControl, safe_execution, register_model, transition_model_to_production, model_exists, print_best_results

pc = ParameterControl()

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

retrain = widgets.Checkbox(
    value=True,
    description='Retrain Model',
    disabled=True
)

display(retrain)

Checkbox(value=True, description='Retrain Model', disabled=True)

# Load Data

In [28]:
with safe_execution():
    try:
        df_train = pd.read_parquet(pc.get_path("titanic_modeling_train"))
        df_test = pd.read_parquet(pc.get_path("titanic_modeling_test"))
    except:
        raise Exception('Could not load the parquet files, make sure to run ETL.ipynb first.')

# Modelado

In [30]:
X = df_train.drop(columns=['Survived'])
y = df_train['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

with safe_execution():
    # Definir la ruta de mlruns y la base de datos SQLite
    mlruns_dir = pc.get_path('mlflow_runs')
    db_uri = pc.get_path('mlflow_model_db')

## Logistic Regression

In [31]:
with safe_execution():
    if not model_exists('Titanic Model - Logistic Regression') or retrain.value:
        param_grid = [
            {
                'logreg__penalty': ['l1', 'l2'],
                'logreg__C': [0.01, 0.1, 1, 10, 100],
                'logreg__solver': ['liblinear', 'saga'],
                'logreg__max_iter': [100, 200, 300, 500]
            },
            {
                'logreg__penalty': ['elasticnet'],
                'logreg__C': [0.01, 0.1, 1, 10, 100],
                'logreg__solver': ['saga'],
                'logreg__max_iter': [100, 200, 300, 500],
                'logreg__l1_ratio': [0.5, 0.7, 0.9]
            }
        ]

        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Escalar características
            ('logreg', LogisticRegression())  # Modelo de regresión logística
        ])

        # Definir las métricas de evaluación
        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1': make_scorer(f1_score),
            'roc_auc': make_scorer(roc_auc_score)
        }

        # Configurar el Grid Search con validación cruzada
        grid_search_lr = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring=scoring, refit='accuracy', error_score='raise')

        # Ajustar el Grid Search a los datos de entrenamiento
        grid_search_lr.fit(X_train, y_train)

        best_lr = print_best_results(grid_search_lr, "Logistic Regression")
    else:
            print(f"Modelo {'Titanic Model - Logistic Regression'} ya existe en MLflow y no se necesita retrain.")
            # Cargar el modelo existente desde MLflow
            client = mlflow.tracking.MlflowClient()
            model_uri = client.get_model_version_download_uri('Titanic Model - Logistic Regression', 1)
            best_lr = mlflow.sklearn.load_model(model_uri)
            print(f"Modelo {'Titanic Model - Logistic Regression'} cargado desde MLflow.")

Mejores parámetros encontrados para Logistic Regression: {'logreg__C': 0.01, 'logreg__max_iter': 100, 'logreg__penalty': 'l2', 'logreg__solver': 'saga'}
Mejor puntuación de exactitud para Logistic Regression: 0.7963262090022653


## Random Forest

In [32]:
with safe_execution():
    if not model_exists('Titanic Model - Random Forest') or retrain.value:
        # Define the pipeline
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Scale features
            ('rf', RandomForestClassifier(random_state=42))  # Random Forest model
        ])

        # Define the parameter grid for Grid Search
        param_grid = {
            'rf__n_estimators': [100, 200, 300, 500],
            'rf__max_depth': [None, 10, 20, 30],
            'rf__min_samples_split': [2, 5, 10],
            'rf__min_samples_leaf': [1, 2, 4],
            'rf__bootstrap': [True, False]
        }

        # Define the scoring metrics
        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1': make_scorer(f1_score),
            'roc_auc': make_scorer(roc_auc_score)
        }

        # Configure Grid Search with cross-validation
        grid_search_rf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring=scoring, refit='accuracy', n_jobs=-1)

        # Fit Grid Search to the training data
        grid_search_rf.fit(X_train, y_train)

        best_rf = print_best_results(grid_search_rf, "Random Forest")
    else:
            print(f"Modelo {'Titanic Model - Random Forest'} ya existe en MLflow y no se necesita retrain.")
            # Cargar el modelo existente desde MLflow
            client = mlflow.tracking.MlflowClient()
            model_uri = client.get_model_version_download_uri('Titanic Model - Random Forest', 1)
            best_rf = mlflow.sklearn.load_model(model_uri)
            print(f"Modelo {'Titanic Model - Random Forest'} cargado desde MLflow.")

Mejores parámetros encontrados para Random Forest: {'rf__bootstrap': True, 'rf__max_depth': None, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 10, 'rf__n_estimators': 500}
Mejor puntuación de exactitud para Random Forest: 0.8202206244459764


## Support Vector Machine

In [33]:
with safe_execution():
    if not model_exists('Titanic Model - SVM') or retrain.value:
        # Definir el grid de parámetros para la búsqueda
        param_grid = [{
            'svc__C': [0.01, 0.1, 1, 10, 100, 200, 500],
            'svc__kernel': ['linear', 'rbf'],
            'svc__gamma': ['scale', 'auto'],
            },{
            'svc__C': [0.01, 0.1, 1, 10, 100, 200, 500],
            'svc__kernel': ['poly'],
            'svc__gamma': ['scale', 'auto'],
            'svc__degree': [3, 4, 5]
            }
        ]

        # Definir el pipeline que incluye el escalado de características y el modelo SVM
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Escalar características
            ('svc', SVC())  # Modelo SVM
        ])

        # Definir las métricas de evaluación
        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1': make_scorer(f1_score),
            'roc_auc': make_scorer(roc_auc_score)
        }

        # Configurar el Grid Search con validación cruzada
        grid_search_svm = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring=scoring, refit='accuracy', n_jobs=-1)

        # Ajustar el Grid Search a los datos de entrenamiento
        grid_search_svm.fit(X_train, y_train)

        best_svm = print_best_results(grid_search_svm, "SVM")
    else:
            print(f"Modelo {'Titanic Model - SVM'} ya existe en MLflow y no se necesita retrain.")
            # Cargar el modelo existente desde MLflow
            client = mlflow.tracking.MlflowClient()
            model_uri = client.get_model_version_download_uri('Titanic Model - SVM', 1)
            best_svm = mlflow.sklearn.load_model(model_uri)
            print(f"Modelo {'Titanic Model - SVM'} cargado desde MLflow.")

Mejores parámetros encontrados para SVM: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
Mejor puntuación de exactitud para SVM: 0.817374175120654


## XGBoost

In [34]:
with safe_execution():
    if not model_exists('Titanic Model - XGBoost') or retrain.value:
        # Definir el grid de parámetros para la búsqueda
        param_grid = {
            'xgb__n_estimators': [25, 50, 100, 200, 300, 500],
            'xgb__max_depth': [None, 3, 6, 9],
            'xgb__learning_rate': [0.001, 0.01, 0.1, 0.3],
            'xgb__subsample': [0.8, 1.0],
            'xgb__colsample_bytree': [0.8, 1.0]
        }

        # Definir el pipeline que incluye el escalado de características y el modelo XGBoost
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Escalar características
            ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))  # Modelo XGBoost
        ])

        # Definir las métricas de evaluación
        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1': make_scorer(f1_score),
            'roc_auc': make_scorer(roc_auc_score)
        }

        # Configurar el Grid Search con validación cruzada
        grid_search_xgb = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring=scoring, refit='accuracy', n_jobs=-1)

        # Ajustar el Grid Search a los datos de entrenamiento
        grid_search_xgb.fit(X_train, y_train)

        best_xgb = print_best_results(grid_search_xgb, "XGBoost")
    else:
            print(f"Modelo {'Titanic Model - XGBoost'} ya existe en MLflow y no se necesita retrain.")
            # Cargar el modelo existente desde MLflow
            client = mlflow.tracking.MlflowClient()
            model_uri = client.get_model_version_download_uri('Titanic Model - XGBoost', 1)
            best_xgb = mlflow.sklearn.load_model(model_uri)
            print(f"Modelo {'Titanic Model - XGBoost'} cargado desde MLflow.")

Mejores parámetros encontrados para XGBoost: {'xgb__colsample_bytree': 1.0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': None, 'xgb__n_estimators': 100, 'xgb__subsample': 1.0}
Mejor puntuación de exactitud para XGBoost: 0.8329262287008767


Parameters: { "use_label_encoder" } are not used.



## KNN

In [35]:
with safe_execution():
    if not model_exists('Titanic Model - KNN') or retrain.value:
        # Definir el grid de parámetros para la búsqueda
        param_grid = {
            'knn__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
            'knn__weights': ['uniform', 'distance'],
            'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }

        # Definir el pipeline que incluye el escalado de características y el modelo KNN
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Escalar características
            ('knn', KNeighborsClassifier())  # Modelo KNN
        ])

        # Definir las métricas de evaluación
        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'f1': make_scorer(f1_score),
            'roc_auc': make_scorer(roc_auc_score)
        }

        # Configurar el Grid Search con validación cruzada
        grid_search_knn = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring=scoring, refit='accuracy', n_jobs=-1)

        # Ajustar el Grid Search a los datos de entrenamiento
        grid_search_knn.fit(X_train, y_train)

        best_knn = print_best_results(grid_search_knn, "KNN")
    else:
            print(f"Modelo {'Titanic Model - KNN'} ya existe en MLflow y no se necesita retrain.")
            # Cargar el modelo existente desde MLflow
            client = mlflow.tracking.MlflowClient()
            model_uri = client.get_model_version_download_uri('Titanic Model - KNN', 1)
            best_knn = mlflow.sklearn.load_model(model_uri)
            print(f"Modelo {'Titanic Model - KNN'} cargado desde MLflow.")

Mejores parámetros encontrados para KNN: {'knn__algorithm': 'auto', 'knn__n_neighbors': 11, 'knn__weights': 'uniform'}
Mejor puntuación de exactitud para KNN: 0.814586821629075


# Testing

In [36]:
with safe_execution():
    # Evaluar y seleccionar el mejor modelo
    models = {
        "Logistic Regression": best_lr,
        "Random Forest": best_rf,
        "SVM": best_svm,
        "XGBoost": best_xgb,
        "KNN": best_knn
    }

    searches = {
        "Logistic Regression": grid_search_lr,
        "Random Forest": grid_search_rf,
        "SVM": grid_search_svm,
        "XGBoost": grid_search_xgb,
        "KNN": grid_search_knn
    }

    model_scores = {}
    for name, model in models.items():
        y_pred = model.predict(X_val)
        model_scores[name] = {
            'accuracy': accuracy_score(y_val, y_pred),
            'f1': f1_score(y_val, y_pred),
            'roc_auc': roc_auc_score(y_val, y_pred)
        }

# MLOps

In [37]:
with safe_execution():
    for model in list(models.keys()):
        metrics = model_scores[model]
        
        def clean_key(k): 
            try: 
                return k.split('__')[1] 
            except: 
                return k
            
        params = dict({ (k.split('__')[1], v) for k, v in searches[model].best_params_.items() })

        best_model = models[model]

        register_model(best_model, f"Titanic Model - {model}", params, metrics, registry_uri=mlruns_dir, tracking_uri=db_uri, stage="Staging", verbose=False)
        
    # # Visualización de la interfaz de MLflow
    print(f'mlflow ui --backend-store-uri sqlite:///{db_uri}')

Registered model 'Titanic Model - Logistic Regression' already exists. Creating a new version of this model...
2024/07/11 11:39:10 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Titanic Model - Logistic Regression, version 6
Created version '6' of model 'Titanic Model - Logistic Regression'.
Registered model 'Titanic Model - Random Forest' already exists. Creating a new version of this model...
2024/07/11 11:39:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Titanic Model - Random Forest, version 6
Created version '6' of model 'Titanic Model - Random Forest'.
Registered model 'Titanic Model - SVM' already exists. Creating a new version of this model...
2024/07/11 11:39:18 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     

mlflow ui --backend-store-uri sqlite:///c:\Users\Juanm\OneDrive\Escritorio\RappiChallenge\.mlflow.db


Created version '6' of model 'Titanic Model - KNN'.


# Model Selection

In [38]:
with safe_execution():
    best_model_name = max(model_scores, key=lambda x: model_scores[x]['accuracy'])
    best_model = models[best_model_name]

    # # Supongamos que quieres actualizar la versión 1 del modelo "KNN Model Titanic" a Production
    transition_model_to_production(f"Titanic Model - {best_model_name}", 1)

    print(f"El mejor modelo es: {best_model_name} con las siguientes métricas:")
    print(model_scores[best_model_name])

El mejor modelo es: KNN con las siguientes métricas:
{'accuracy': 0.8156424581005587, 'f1': 0.7659574468085106, 'roc_auc': 0.802960102960103}


# Feature Importance

In [39]:
with safe_execution():
    result = permutation_importance(best_knn, X_val, y_val, n_repeats=30, random_state=42)

    feature_importance = pd.DataFrame(result.importances_mean, index=X_val.columns, columns=['Importance'])
    feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
    feature_importance

Unnamed: 0,Importance
Sex,0.165549
Pclass,0.036313
Parch,0.027374
Embarked,0.025512
SibSp,0.024022
Age,0.017877
Cabin,0.010615
Ticket,0.00987
