# Training Models 

## Load libraries

In [1]:
import os
import warnings
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow.models import infer_signature
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Binarizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings("ignore")

## Load dataset

In [2]:
# Read the raw hotel booking data from the project's data directory
base_df = pd.read_csv('../data/hotel_booking_dataset.csv')

In [3]:
# Display the (rows, columns) dimensions of the raw frame as a quick sanity check
base_df.shape

(109390, 32)

## Preprocessing dataset

In [4]:
# Feature groups, one list per preprocessing route.

# Columns that go through one-hot encoding
cat_cols = [
    'hotel',
    'meal',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'deposit_type',
    'customer_type',
]

# Columns that go through standard scaling
num_cols = [
    'lead_time',
    'days_in_waiting_list',
    'adr',
    'total_stay',
    'total_people',
]

# Columns that go through the Binarizer step
bin_cols = [
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'agent',
    'company',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

In [5]:
# Build the modelling frame from the raw data:
#   1. drop fully duplicated rows and replace every missing value with 0,
#   2. derive total_stay (weekend + week nights) and total_people
#      (adults + children + babies),
#   3. keep only the feature columns plus the is_canceled target,
#   4. cast total_people, agent and company to int64.
dataset = (
    base_df
    .drop_duplicates()
    .fillna(0)
    .assign(
        total_stay=lambda df: df['stays_in_weekend_nights'] + df['stays_in_week_nights'],
        total_people=lambda df: df['adults'] + df['children'] + df['babies'],
    )
    .loc[:, cat_cols + num_cols + bin_cols + ['is_canceled']]
    .astype({'total_people': 'int64', 'agent': 'int64', 'company': 'int64'})
)

In [6]:
# Persist the cleaned (not yet encoded/scaled) frame as a reference CSV, without the index column
dataset.to_csv("../data/hotel_bookings_reference.csv", index=False)

In [7]:
# Define the preprocessing steps, one small pipeline per feature group.

# Categorical columns: one-hot encode into dense columns; categories not seen
# during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Numerical columns: standard scaling.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Columns in bin_cols: thresholded to 0/1 flags using Binarizer's default
# threshold (0.0 in scikit-learn, so any positive value becomes 1).
binary_transformer = Pipeline(steps=[
    ('binarizer', Binarizer())
])

# Route each column group to its transformer and return a pandas DataFrame
# (instead of a NumPy array) so feature names are kept.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_cols),
        ('num', numerical_transformer, num_cols),
        ('bin', binary_transformer, bin_cols)
    ]).set_output(transform="pandas")

# NOTE(review): the preprocessor is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so scaler statistics and
# encoder categories also come from rows that end up in the test set.
# Consider fitting on the training rows only.
processed_data = preprocessor.fit_transform(dataset)

preprocessor_file = "./artifacts/preprocessor_model.pkl"
# Make sure the output directory exists so a fresh checkout can run end to end.
os.makedirs(os.path.dirname(preprocessor_file), exist_ok=True)
joblib.dump(preprocessor, preprocessor_file)

['./artifacts/preprocessor_model.pkl']

In [8]:
# Feature matrix: the preprocessed frame; target: the is_canceled column of the cleaned frame
X = processed_data
y = dataset['is_canceled']

## Training Models

In [9]:
# Point MLflow at the tracking server (here a server listening on localhost:5000)
mlflow.set_tracking_uri("http://localhost:5000")

In [10]:
# Select the MLflow experiment that all runs of this notebook are logged under
mlflow.set_experiment("hotel_cancellation_prediction")

2024/07/13 23:04:02 INFO mlflow.tracking.fluent: Experiment with name 'hotel_cancellation_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1720926242775, experiment_id='1', last_update_time=1720926242775, lifecycle_stage='active', name='hotel_cancellation_prediction', tags={}>

In [11]:
# Enable MLflow autologging for scikit-learn estimators
# NOTE(review): the training loop further down also calls mlflow.sklearn.log_model
# explicitly; confirm that logging the model through both paths is intended.
mlflow.sklearn.autolog()



In [12]:
# Split features and target into training (70%) and test (30%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Save the training and test splits (features + target) as CSV files.
#
# X_train / X_test still carry the original row labels of the cleaned frame
# (shuffled by the split and non-contiguous after drop_duplicates). pandas aligns
# on those labels when a Series is attached as a column, so the target must keep
# the same labels: resetting its index to 0..n-1 first would pair most rows with
# another row's label or with NaN.
train_data = X_train.assign(is_canceled=y_train)
train_data.to_csv('./artifacts/train_data.csv', index=False)

test_data = X_test.assign(is_canceled=y_test)
test_data.to_csv('./artifacts/test_data.csv', index=False)

In [14]:
# Candidate classifiers to compare, keyed by display name.
# The tree-based estimators are stochastic; they are seeded (same seed as the
# train/test split) so that re-running the notebook reproduces the same fitted
# models and metrics.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

In [15]:
# Train, evaluate and register every candidate model, one MLflow run per model.
for name, model in models.items():
    run_name = f"{name} [{(datetime.now().strftime('%Y%m%d-%H%M%S'))}]"

    # The context manager ends the run when the block exits (also on error),
    # so no explicit mlflow.end_run() is needed afterwards.
    with mlflow.start_run(run_name=run_name) as run:
        # Attach the fitted preprocessor and the train/test splits to the run
        mlflow.log_artifact(preprocessor_file, artifact_path='preprocessing')
        mlflow.log_artifact('./artifacts/train_data.csv', artifact_path='train_data')
        mlflow.log_artifact('./artifacts/test_data.csv', artifact_path='test_data')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        signature = infer_signature(X_test, y_pred)

        # Confusion matrix on the test split, logged to MLflow as an image
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot()
        plt.title(f'Matriz de Confusión - {name}')
        cm_path = f'./artifacts/confusion_matrix_{name}.png'
        plt.savefig(cm_path)
        mlflow.log_artifact(cm_path)
        plt.close()

        # Feature importance, when the estimator exposes one: tree ensembles
        # provide feature_importances_; linear models provide coef_, whose
        # absolute values (first row) are used as the importance score.
        if hasattr(model, 'feature_importances_'):
            feature_importances = model.feature_importances_
        elif hasattr(model, 'coef_'):
            feature_importances = np.abs(model.coef_[0])
        else:
            feature_importances = None

        if feature_importances is not None:
            importance_df = pd.DataFrame({
                'Feature': X.columns,
                'Importance': feature_importances
            }).sort_values(by='Importance', ascending=False)
            importance_path = f'./artifacts/feature_importances_{name}.csv'
            importance_df.to_csv(importance_path, index=False)
            mlflow.log_artifact(importance_path)

        # Log the fitted model and register it under a per-model registry name;
        # each execution creates a new version of that registered model
        # (version 1 the first time).
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            signature=signature,
            registered_model_name=f"sk-learn-{(name.lower().replace(' ', '-'))}-reg-model",
        )

        print("Active run_id: {}".format(run.info.run_id))

print("Training and evaluation finished.")

Successfully registered model 'sk-learn-logistic-regression-reg-model'.
2024/07/13 23:04:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-logistic-regression-reg-model, version 1
Created version '1' of model 'sk-learn-logistic-regression-reg-model'.


Active run_id: d416b58a1f39460386dec9aac49228e2


Successfully registered model 'sk-learn-decision-tree-reg-model'.
2024/07/13 23:04:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-decision-tree-reg-model, version 1
Created version '1' of model 'sk-learn-decision-tree-reg-model'.


Active run_id: 1eaa705caf4b473fa1f68abe1ea60244


Successfully registered model 'sk-learn-random-forest-reg-model'.
2024/07/13 23:05:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-random-forest-reg-model, version 1
Created version '1' of model 'sk-learn-random-forest-reg-model'.


Active run_id: 21a3bee7174340d1930188d2e53d2029


Successfully registered model 'sk-learn-gradient-boosting-reg-model'.
2024/07/13 23:05:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-gradient-boosting-reg-model, version 1


Active run_id: 8146f9a904494ee7a0e52ee659692e93
Training and evaluation finished.


Created version '1' of model 'sk-learn-gradient-boosting-reg-model'.


## Manage Models

In [16]:
from mlflow.tracking import MlflowClient

In [17]:
# Client for the MLflow server at the same address used as tracking URI earlier in the notebook
client = MlflowClient(tracking_uri='http://localhost:5000')

# List the experiments known to that server
client.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1720926242775, experiment_id='1', last_update_time=1720926242775, lifecycle_stage='active', name='hotel_cancellation_prediction', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1720926184689, experiment_id='0', last_update_time=1720926184689, lifecycle_stage='active', name='Default', tags={}>]

In [19]:
# Give every trained model's registry entry the "challenger" alias, pointing at
# the first entry of the registry's latest_versions list.
for name in models:
    model_name = f"sk-learn-{(name.lower().replace(' ', '-'))}-reg-model"
    registered_model = client.get_registered_model(model_name)
    challenger_version = registered_model.latest_versions[0].version
    client.set_registered_model_alias(model_name, "challenger", challenger_version)

In [21]:
# Promote the decision-tree model: remove its "challenger" alias and point the
# "champion" alias at the first entry of the registry's latest_versions list.
model_name = "sk-learn-decision-tree-reg-model"
registered_model = client.get_registered_model(model_name)
champion_version = registered_model.latest_versions[0].version
client.delete_registered_model_alias(model_name, "challenger")
client.set_registered_model_alias(model_name, "champion", champion_version)