In [46]:
# bibliotecas
import os
import pandas as pd
import mlflow

from pathlib import Path
from pycaret.classification import setup, create_model, tune_model, save_model, predict_model, pull
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

In [47]:
# Diretório base do projeto
BASE_DIR = Path(os.getcwd()).parent

# Caminho onde os dados de treino estão armazenados
MODEL_INPUT_PATH = BASE_DIR / 'data' / '05_model_input'
X_TRAIN_FILE_PATH = MODEL_INPUT_PATH / '05_model_input_x_train.parquet'
Y_TRAIN_FILE_PATH = MODEL_INPUT_PATH / '05_model_input_y_train.parquet'

# Caminho para salvar os modelos otimizados
RESULTS_PATH = BASE_DIR / 'data' / '06_models'
os.makedirs(RESULTS_PATH, exist_ok=True)
DT_OPTIMIZED_MODEL_PATH = RESULTS_PATH / 'decision_tree_optimized_model'
LR_OPTIMIZED_MODEL_PATH = RESULTS_PATH / 'logistic_regression_optimized_model'

In [48]:
# Configurar o MLflow
mlflow.set_experiment("eng_ml_experiments")

<Experiment: artifact_location='file:///c:/Users/pesso/Documents/DevProjects/GitHubRepositories/ml_models/04_infnet_ml_engineering_pd/04-infnet-ml-engineering-pd/notebooks/mlruns/323894616767193424', creation_time=1743884000557, experiment_id='323894616767193424', last_update_time=1743884000557, lifecycle_stage='active', name='eng_ml_experiments', tags={}>

In [None]:
# Run para o modelo Decision Tree
with mlflow.start_run(run_name="decision_tree"):
    # Ler os dados de treino
    X_train = pd.read_parquet(X_TRAIN_FILE_PATH)
    y_train = pd.read_parquet(Y_TRAIN_FILE_PATH).squeeze()  # Convertendo para Series

    # Combinar X e y em um único DataFrame para PyCaret
    data = pd.concat([X_train, y_train], axis=1)

    # Configurar o PyCaret
    setup(
        data=data,
        target='shot_made_flag',  # Nome da variável dependente
        session_id=17  # Para reprodutibilidade
    )

    # Criar o modelo Decision Tree
    dt_model = create_model('dt')

    dt_tuned_model = tune_model(dt_model, custom_grid={
        'max_depth': [2, 3, 5, 7, 10, 15],  # Testando mais profundidades
        'min_samples_split': [2, 5, 10, 20, 30],  # Mais opções para divisão mínima
        'min_samples_leaf': [1, 2, 5, 10]  # Adicionando um novo parâmetro de folhas mínimas
    })

    # **3. Avaliação com K-Fold Cross Validation**
    # Extrai os resultados da validação cruzada
    cv_results = pull()

    # **4. Comparação com Métricas Anteriores**
    # Avaliação do modelo antes e depois da otimização
    predictions_before_tuning = predict_model(dt_model)
    predictions_after_tuning = predict_model(dt_tuned_model)

    accuracy_before = accuracy_score(predictions_before_tuning['shot_made_flag'], predictions_before_tuning['prediction_label'])
    accuracy_after = accuracy_score(predictions_after_tuning['shot_made_flag'], predictions_after_tuning['prediction_label'])

    print(f"Acurácia antes da tuning: {accuracy_before}")
    print(f"Acurácia após tuning: {accuracy_after}")

    # **5. Experimentos no MLflow**
    # Registrar as métricas do modelo otimizado
    true_labels = predictions_after_tuning['shot_made_flag']
    predicted_labels = predictions_after_tuning['prediction_label']
    predicted_scores = predictions_after_tuning['prediction_score']

    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    log_loss_value = log_loss(true_labels, predicted_scores)

    mlflow.log_metric("accuracy_before_tuning", accuracy_before)
    mlflow.log_metric("accuracy_after_tuning", accuracy_after)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("log_loss", log_loss_value)

    print(f"Métricas calculadas: Accuracy Antes={accuracy_before}, Accuracy Depois={accuracy_after}, Precision={precision}, Recall={recall}, F1-Score={f1}, Log Loss={log_loss_value}")

    # Salvar o modelo otimizado do Decision Tree
    save_model(dt_tuned_model, str(DT_OPTIMIZED_MODEL_PATH))
    mlflow.log_artifact(f"{DT_OPTIMIZED_MODEL_PATH}.pkl")

    # Registrar os parâmetros utilizados
    mlflow.log_param("pycaret_session_id", 17)
    mlflow.log_param("model_type", "Decision Tree")

print(f"Modelo Decision Tree otimizado salvo em:\n{DT_OPTIMIZED_MODEL_PATH}.pkl")

Unnamed: 0,Description,Value
0,Session id,17
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(19416, 7)"
4,Transformed data shape,"(19416, 7)"
5,Transformed train set shape,"(13591, 7)"
6,Transformed test set shape,"(5825, 7)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5728,0.5315,0.6789,0.6082,0.6416,0.1168,0.1179
1,0.5453,0.5102,0.6475,0.5877,0.6161,0.0616,0.062
2,0.5556,0.5187,0.6632,0.5948,0.6272,0.0811,0.0818
3,0.5379,0.5128,0.6423,0.5816,0.6104,0.046,0.0464
4,0.532,0.4951,0.6536,0.5741,0.6112,0.0296,0.03
5,0.546,0.4985,0.6614,0.5856,0.6212,0.0599,0.0606
6,0.5386,0.4914,0.6392,0.5821,0.6093,0.049,0.0493
7,0.5335,0.4909,0.6405,0.5771,0.6072,0.0367,0.037
8,0.5578,0.519,0.668,0.5956,0.6297,0.0853,0.0862
9,0.56,0.5067,0.668,0.5977,0.6309,0.0904,0.0912


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5588,0.5738,0.9595,0.5637,0.7101,0.0018,0.004
1,0.5548,0.5676,0.8773,0.568,0.6896,0.0169,0.023
2,0.5622,0.5602,0.842,0.5764,0.6844,0.0458,0.0555
3,0.5592,0.5594,0.9778,0.5627,0.7144,-0.0041,-0.0127
4,0.5681,0.5665,0.8889,0.5753,0.6985,0.0475,0.0645
5,0.5578,0.5777,0.8941,0.5681,0.6948,0.0204,0.0292
6,0.5681,0.5862,0.885,0.5757,0.6976,0.0487,0.0652
7,0.5666,0.5694,0.9582,0.5682,0.7134,0.0227,0.0462
8,0.5592,0.5579,0.885,0.5699,0.6933,0.0268,0.0369
9,0.574,0.6071,0.9046,0.5776,0.705,0.0574,0.0809


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.5523,0.513,0.6443,0.5947,0.6185,0.0788,0.0792


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.5669,0.5827,0.8799,0.5756,0.6959,0.0466,0.0617


Acurácia antes da tuning: 0.552274678111588
Acurácia após tuning: 0.5668669527896996
Métricas calculadas: Accuracy Antes=0.552274678111588, Accuracy Depois=0.5668669527896996, Precision=0.5482274747831243, Recall=0.5668669527896996, F1-Score=0.5000918411403842, Log Loss=0.6699957172196076
Transformation Pipeline and Model Successfully Saved
Modelo Decision Tree otimizado salvo em:
c:\Users\pesso\Documents\DevProjects\GitHubRepositories\ml_models\04_infnet_ml_engineering_pd\04-infnet-ml-engineering-pd\data\06_models\decision_tree_optimized_model.pkl


In [52]:
# Run para o modelo Logistic Regression
with mlflow.start_run(run_name="logistic_regression"):
    # Ler os dados de treino
    X_train = pd.read_parquet(X_TRAIN_FILE_PATH)
    y_train = pd.read_parquet(Y_TRAIN_FILE_PATH).squeeze()  # Convertendo para Series

    # Combinar X e y em um único DataFrame para PyCaret
    data = pd.concat([X_train, y_train], axis=1)

    # Configurar o PyCaret
    setup(
        data=data,
        target='shot_made_flag',  # Nome da variável dependente
        session_id=17  # Para reprodutibilidade
    )

    # Criar o modelo Logistic Regression
    lr_model = create_model('lr')

    # **2. Verifique os Resultados da Tuning**
    lr_tuned_model = tune_model(lr_model, custom_grid={
        'C': [0.01, 0.1, 1, 10, 100],  # Ajustando regularização
        'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],  # Testando diferentes otimizadores
        'penalty': ['l1', 'l2', 'elasticnet', 'none']  # Ajustando penalizações
    })

    # **3. Avaliação com K-Fold Cross Validation**
    cv_results = pull()
    print("Resultados da Validação Cruzada:\n", cv_results)

    # **4. Comparação com Métricas Anteriores**
    predictions_before_tuning = predict_model(lr_model)
    predictions_after_tuning = predict_model(lr_tuned_model)

    accuracy_before = accuracy_score(predictions_before_tuning['shot_made_flag'], predictions_before_tuning['prediction_label'])
    accuracy_after = accuracy_score(predictions_after_tuning['shot_made_flag'], predictions_after_tuning['prediction_label'])

    print(f"Acurácia antes da tuning: {accuracy_before}")
    print(f"Acurácia após tuning: {accuracy_after}")

    # **5. Experimentos no MLflow**
    true_labels = predictions_after_tuning['shot_made_flag']
    predicted_labels = predictions_after_tuning['prediction_label']
    predicted_scores = predictions_after_tuning['prediction_score']

    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    log_loss_value = log_loss(true_labels, predicted_scores)

    mlflow.log_metric("accuracy_before_tuning", accuracy_before)
    mlflow.log_metric("accuracy_after_tuning", accuracy_after)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("log_loss", log_loss_value)

    print(f"Métricas calculadas: Accuracy Antes={accuracy_before}, Accuracy Depois={accuracy_after}, Precision={precision}, Recall={recall}, F1-Score={f1}, Log Loss={log_loss_value}")

    # Salvar o modelo otimizado de Logistic Regression
    save_model(lr_tuned_model, str(LR_OPTIMIZED_MODEL_PATH))
    mlflow.log_artifact(f"{LR_OPTIMIZED_MODEL_PATH}.pkl")

    # Registrar os parâmetros utilizados
    mlflow.log_param("pycaret_session_id", 17)
    mlflow.log_param("model_type", "Logistic Regression")

print(f"Modelo Logistic Regression otimizado salvo em:\n{LR_OPTIMIZED_MODEL_PATH}.pkl")

Unnamed: 0,Description,Value
0,Session id,17
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(19416, 7)"
4,Transformed data shape,"(19416, 7)"
5,Transformed train set shape,"(13591, 7)"
6,Transformed test set shape,"(5825, 7)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5691,0.5901,0.7794,0.5888,0.6708,0.0812,0.0881
1,0.5622,0.5803,0.7859,0.5828,0.6693,0.0623,0.0686
2,0.557,0.5743,0.7742,0.5802,0.6633,0.0533,0.0582
3,0.5651,0.5755,0.7846,0.5852,0.6704,0.0697,0.0764
4,0.5725,0.5728,0.7882,0.59,0.6749,0.0871,0.0952
5,0.5703,0.5895,0.7869,0.5885,0.6734,0.0822,0.0899
6,0.5659,0.591,0.7974,0.5837,0.674,0.0687,0.0766
7,0.546,0.5662,0.7686,0.572,0.6559,0.0293,0.0322
8,0.5585,0.5626,0.766,0.5819,0.6614,0.06,0.0648
9,0.574,0.6099,0.8144,0.5877,0.6827,0.0833,0.0942


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5684,0.5901,0.7794,0.5882,0.6704,0.0794,0.0863
1,0.5622,0.5803,0.7859,0.5828,0.6693,0.0623,0.0686
2,0.557,0.5743,0.7742,0.5802,0.6633,0.0533,0.0582
3,0.5666,0.5753,0.7859,0.5862,0.6715,0.0728,0.0799
4,0.5725,0.5728,0.7882,0.59,0.6749,0.0871,0.0952
5,0.5673,0.5895,0.7948,0.5852,0.6741,0.0729,0.0809
6,0.5651,0.591,0.7961,0.5833,0.6733,0.0673,0.0749
7,0.546,0.5662,0.7686,0.572,0.6559,0.0293,0.0322
8,0.5585,0.5626,0.766,0.5819,0.6614,0.06,0.0648
9,0.574,0.6096,0.8157,0.5876,0.6831,0.0829,0.094


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Resultados da Validação Cruzada:
       Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.5684  0.5901  0.7794  0.5882  0.6704  0.0794  0.0863
1       0.5622  0.5803  0.7859  0.5828  0.6693  0.0623  0.0686
2       0.5570  0.5743  0.7742  0.5802  0.6633  0.0533  0.0582
3       0.5666  0.5753  0.7859  0.5862  0.6715  0.0728  0.0799
4       0.5725  0.5728  0.7882  0.5900  0.6749  0.0871  0.0952
5       0.5673  0.5895  0.7948  0.5852  0.6741  0.0729  0.0809
6       0.5651  0.5910  0.7961  0.5833  0.6733  0.0673  0.0749
7       0.5460  0.5662  0.7686  0.5720  0.6559  0.0293  0.0322
8       0.5585  0.5626  0.7660  0.5819  0.6614  0.0600  0.0648
9       0.5740  0.6096  0.8157  0.5876  0.6831  0.0829  0.094

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5621,0.5852,0.7894,0.582,0.67,0.0614,0.068


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5621,0.5852,0.7894,0.582,0.67,0.0614,0.068


Acurácia antes da tuning: 0.5620600858369099
Acurácia após tuning: 0.5620600858369099
Métricas calculadas: Accuracy Antes=0.5620600858369099, Accuracy Depois=0.5620600858369099, Precision=0.5450884281499472, Recall=0.5620600858369099, F1-Score=0.529853773557635, Log Loss=0.6745155152016875
Transformation Pipeline and Model Successfully Saved
Modelo Logistic Regression otimizado salvo em:
c:\Users\pesso\Documents\DevProjects\GitHubRepositories\ml_models\04_infnet_ml_engineering_pd\04-infnet-ml-engineering-pd\data\06_models\logistic_regression_optimized_model.pkl
