In [1]:
import pandas as pd
import pycaret.classification as pc
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

import mlflow

from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.metrics import f1_score, log_loss, recall_score, precision_score, make_scorer, roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# Configurar MLFLOW

In [2]:
# Para usar o sqlite como repositorio
mlflow.set_tracking_uri("sqlite:///mlruns.db")

experiment_name = 'Projeto_Kobe'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment(experiment_id)
experiment_id = experiment.experiment_id

2024/04/15 18:13:28 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/04/15 18:13:28 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

# Carregamento dos dados

In [3]:
data_cols = ['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance', 'shot_made_flag']
train_perc = 0.8

with mlflow.start_run(experiment_id = experiment_id, run_name = 'PreparacaoDados'):
    # Leitura do dataframe
    df_dev = pd.read_parquet(path = '../Data/Raw/dataset_kobe_dev.parquet', 
                         columns = data_cols)

    # Remoção de linhas com dados faltantes
    print("Antes do tratamento de dados faltantes:")
    print("\nDimensão dos dados:", df_dev.shape)
    
    print("Número de dados faltantes:")
    print(df_dev.isna().sum())
    
    df_dev_filtered = df_dev.dropna()
    
    print("\n\nDepois do tratamento de dados faltantes:")
    
    print("\nDimensão dos dados:", df_dev_filtered.shape)
    
    print("Número de dados faltantes:")
    print(df_dev_filtered.isna().sum())
    
    df_dev_filtered.to_parquet(path = '../Data/Processed/data_filtered.parquet')

    # Separação da base em treino e teste
    X = df_dev_filtered.drop(['shot_made_flag'], axis = 1)
    y = df_dev_filtered['shot_made_flag']
    
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size = train_perc, stratify = y)

    # Adicionando variavel target as bases
    xtrain['target'] = ytrain
    xtest['target'] = ytest
    
    print("\n\nDimensão dos dados de treino:", xtrain.shape)
    print("\nDimensão dos dados de teste:", xtest.shape)
    
    xtrain.to_parquet(path = '../Data/Processed/base_train.parquet')
    xtest.to_parquet(path = '../Data/Processed/base_test.parquet')

    # Registrando metricas no MLFLOW
    mlflow.log_params({'colunas_selecionadas': data_cols})
    mlflow.log_params({
        'perc_teste': 1 - train_perc,
        'qtd_linhas_teste': xtest.shape[0]
    })
    mlflow.log_params({
        'perc_treino': train_perc,
        'qtd_linhas_treino': xtrain.shape[0]
    })

# Renomeando variavel target
data_cols[-1] = 'target'

# Certifica de que a execução finalizou
mlflow.end_run()


Antes do tratamento de dados faltantes:

Dimensão dos dados: (24271, 7)
Número de dados faltantes:
lat                     0
lon                     0
minutes_remaining       0
period                  0
playoffs                0
shot_distance           0
shot_made_flag       3986
dtype: int64


Depois do tratamento de dados faltantes:

Dimensão dos dados: (20285, 7)
Número de dados faltantes:
lat                  0
lon                  0
minutes_remaining    0
period               0
playoffs             0
shot_distance        0
shot_made_flag       0
dtype: int64


Dimensão dos dados de treino: (16228, 7)

Dimensão dos dados de teste: (4057, 7)


# Analise Exploratoria

In [4]:
def densidade_treino_teste(ytrain, ytest):
    # Definições
    artifact_title = "densidade_treino_teste"
    local_path = f"Artifacts\{artifact_title}_temp.png"

    # Calculando as contagens de valores únicos em y_train e y_test
    train_counts = ytrain.value_counts()
    test_counts = ytest.value_counts()

    # Obtendo os valores únicos de y_train e y_test (para garantir que todas as classes estejam presentes)
    unique_classes = set(train_counts.index).union(set(test_counts.index))

    # Preenchendo contagens ausentes com zero para garantir que todas as classes estejam representadas
    for cls in unique_classes:
        if cls not in train_counts:
            train_counts[cls] = 0
        if cls not in test_counts:
            test_counts[cls] = 0

    # Ordenando as contagens pelo índice
    train_counts = train_counts.sort_index()
    test_counts = test_counts.sort_index()

    # Definindo a largura das barras
    bar_width = 0.35

    # Definindo a posição das barras no eixo x
    x_train = range(len(train_counts))
    x_test = [x + bar_width for x in x_train]

    # Criando o gráfico de barras
    plt.bar(x_train, train_counts.values, width = bar_width, label = 'y_train')
    plt.bar(x_test, test_counts.values, width = bar_width, label = 'y_test')

    # Adicionando legendas, rótulos e título
    plt.xlabel('Classes')
    plt.ylabel('Contagem')
    plt.title('Contagem de Classes')
    plt.xticks([x + bar_width/2 for x in x_train], train_counts.index)
    plt.legend()
    plt.tight_layout()
    plt.savefig(local_path)
    plt.close()

    # Envia para o MLFLOW
    mlflow.log_artifact(local_path = local_path, artifact_path = "AnaliseExploratoria")

In [5]:
def matriz_correlacao(df):
    # Definições
    artifact_title = "matriz_correlacao"
    local_path = f"Artifacts\{artifact_title}_temp.png"

    # Cria matriz de correção
    plt.figure(figsize = (10, 10))
    sns.heatmap(df.corr(), annot =  True)
    plt.title(f"Matriz de Correlação")
    plt.tight_layout()
    plt.savefig(local_path)
    plt.close()

    # Envia para o MLFLOW
    mlflow.log_artifact(local_path = local_path, artifact_path = "AnaliseExploratoria")

In [6]:
def histograma_individual(df):
    # Definições
    artifact_title = "histogramas_individuais"
    local_path = f"Artifacts\{artifact_title}_temp.png"

    # Cria histogramas
    plt.figure(figsize = (12, 8))
    df.hist()
    plt.suptitle("Histograma")
    plt.tight_layout()
    plt.savefig(local_path)
    plt.close()

    # Envia para o MLFLOW
    mlflow.log_artifact(local_path = local_path, artifact_path = "AnaliseExploratoria")

In [7]:
with mlflow.start_run(experiment_id = experiment_id, run_name = 'AnaliseExploratoria'):
    histograma_individual(df_dev_filtered)
    matriz_correlacao(df_dev_filtered)
    densidade_treino_teste(ytrain, ytest)

# Certifica de que a execução finalizou
mlflow.end_run()

# Treinamento do Modelo

In [8]:
def matriz_confusao(cm, prefixo):
    # Definições
    artifact_title = "matriz_confusao"
    local_path = f"Artifacts\{prefixo}_{artifact_title}_temp.png"

    # Cria matriz de correção
    plt.figure(figsize = (10, 10))
    sns.heatmap(cm, annot =  True, cmap = "Blues", cbar = False, fmt = "d")
    plt.title(f"Matriz de Confusão")
    plt.xlabel('Previsto')
    plt.ylabel('Real')
    plt.tight_layout()
    plt.savefig(local_path)
    plt.close()

    # Envia para o MLFLOW
    mlflow.log_artifact(local_path = local_path, artifact_path = "Treinamento")

In [9]:
def grafico_metricas(precision, recall, f1, prefixo, nome):
    # Definições
    artifact_title = "metricas"
    local_path = f"Artifacts\{prefixo}_{artifact_title}_temp.png"

    # Grafico de metricas
    plt.figure(figsize = (10, 10))
    plt.bar(['Precision', 'Recall', 'F1-score'], [precision, recall, f1])
    plt.xlabel('Métricas')
    plt.ylabel('Valores')
    plt.title(f'{nome} Métricas')
    plt.ylim(0, 1)
    plt.grid(axis = 'y')
    plt.savefig(local_path)
    plt.close()

    # Envia para o MLFLOW
    mlflow.log_artifact(local_path = local_path, artifact_path = "Treinamento")

In [10]:
def grafico_curva_validacao(model, prefixo, nome, xlabel, ylabel, params_, xtrain, ytrain):
    scorer = make_scorer(f1_score)
    
    # Treinando validation curve
    train_scores, test_scores = validation_curve(
        model, 
        xtrain.drop('target', axis = 1), 
        ytrain, 
        param_name = xlabel, 
        param_range = params_, 
        scoring = scorer, 
        n_jobs = -1
    )

    # Definições
    artifact_title = "curva_validacao"
    local_path = f"Artifacts\{prefixo}_{artifact_title}_temp.png"

    # Média e Desvio padrão de cada conjunto
    train_mean, test_mean = np.mean(train_scores, axis = 1), np.mean(test_scores, axis = 1)
    train_std, test_std = np.std(train_scores, axis = 1), np.std(test_scores, axis = 1)

    # Plot Curvas de Validação
    plt.figure(figsize = (10, 6))
    plt.title(f"{nome} Curva de Validação F1")
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.semilogx(params_, train_mean, label="F1-score de Treinamento", color = "r")
    plt.fill_between(params_, train_mean - train_std, train_mean + train_std, alpha = 0.2, color = "r")
    plt.semilogx(params_, test_mean, label = "F1-score de Teste", color = "b")
    plt.fill_between(params_, test_mean - test_std, test_mean + test_std, alpha = 0.2, color = "b")
    plt.legend(loc = "best")
    plt.savefig(local_path)
    plt.close()

    # Envia para o MLFLOW
    mlflow.log_artifact(local_path = local_path, artifact_path = "Treinamento")

In [11]:
def curva_roc(ytest, ypred_prob_lr, ypred_prob_dt):
    # Definições
    artifact_title = "curva_roc"
    local_path = f"Artifacts\{artifact_title}_temp.png"

    # Calcular as curvas ROC
    fpr_lr, tpr_lr, _ = roc_curve(ytest, ypred_prob_lr)
    fpr_dt, tpr_dt, _ = roc_curve(ytest, ypred_prob_dt)

    # Calcular as áreas sob a curva ROC (AUC)
    auc_lr = roc_auc_score(ytest, ypred_prob_lr)
    auc_dt = roc_auc_score(ytest, ypred_prob_dt)

    # Plotar as curvas ROC
    plt.figure(figsize = (12, 8))
    plt.plot(fpr_lr, tpr_lr, label = f'Regressão Logística - (AUC = {auc_lr:.2f})', color = 'b')
    plt.plot(fpr_dt, tpr_dt, label=f'Árvore de Decisão - (AUC = {auc_dt:.2f})', color = 'r')
    plt.plot([0, 1], [0, 1], linestyle = '--', color = 'gray', label = 'Aleatório')
    plt.xlabel('% Falso Positivo')
    plt.ylabel('% Positivo')
    plt.title('Curva ROC')
    plt.legend()
    plt.grid(True)
    plt.savefig(local_path)
    plt.close()

    # Envia para o MLFLOW
    mlflow.log_artifact(local_path = local_path, artifact_path = "Treinamento")

In [12]:
with mlflow.start_run(experiment_id = experiment_id, run_name = 'Treinamento'):   
    # Configurando pycaret
    exp = pc.setup(data = xtrain, target = 'target', test_data = xtest, normalize = True, log_experiment = False)
    lista_modelos = exp.compare_models(['lr','dt'], n_select = 2, sort = 'f1')

    ############################################################
    ############## Executando Regressão Logistica ##############
    ############################################################

    logistic_model = [model for model in lista_modelos if str(model).startswith("LogisticRegression")][0]
    # Capturando modelo de predição 
    ymodel_test = exp.predict_model(logistic_model)

    # Separando variaveis dependentes e independentes
    xtest_features = xtest.drop(columns = ['target'])

    # Obtendo probabilidades
    ypred_prob_lr = logistic_model.predict_proba(xtest_features)[:, 1]
    ypred_prob_lr = ypred_prob_lr

    # Calculando as métricas
    logloss = log_loss(ytest, ymodel_test['prediction_label'])
    recall_lr = recall_score(ytest, ymodel_test['prediction_label'])
    precision_lr = precision_score(ytest, ymodel_test['prediction_label'])
    f1_lr = f1_score(ytest, ymodel_test['prediction_label'])
    grafico_metricas(precision_lr, recall_lr, f1_lr, 'lr', 'Regressão Logística')

    # Curva de Validação
    grafico_curva_validacao(logistic_model, 'lr', 'Regressão Logística', 'C', 'F1', [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1], xtrain, ytrain)

    # Matriz de confusão
    matriz_confusao(confusion_matrix(ytest, ymodel_test['prediction_label']), 'lr')

    # Envia para o MLFLOW
    mlflow.log_metrics({
        'lr_log_loss': logloss,
        'lr_f1': f1_lr,
        'lr_recall': recall_lr,
        'lr_precision': precision_lr
    })

    ##########################################################
    ############## Executando Arvore de Decisão ##############
    ##########################################################

    # Obter o modelo de árvore de decisão da lista de modelos
    decision_tree_model = [model for model in lista_modelos if isinstance(model, DecisionTreeClassifier)][0]

    ymodel_test = exp.predict_model(decision_tree_model)
    
    # Obtendo probabilidades
    ypred_prob_dt = ymodel_test.iloc[:, 1]
    
    # Calculando as métricas
    logloss_dt = log_loss(ytest, ymodel_test['prediction_label'])
    recall_dt = recall_score(ytest, ymodel_test['prediction_label'])
    precision_dt = precision_score(ytest, ymodel_test['prediction_label'])
    f1_dt = f1_score(ytest, ymodel_test['prediction_label'])
    grafico_metricas(precision_dt, precision_dt, f1_dt, 'dt', 'Arvore de Decisão')

    # Curva de Validação
    grafico_curva_validacao(decision_tree_model, 'dt', 'Arvore de Decisão', 'max_depth', 'F1', [2, 3, 4, 5, 6, 7, 8, 9, 10], xtrain, ytrain)
    
    # Matriz de confusão
    matriz_confusao(confusion_matrix(ytest, ymodel_test['prediction_label']), 'dt')

    mlflow.log_metrics({
        'dt_log_loss': logloss_dt,
        'dt_recall': recall_dt,
        'dt_precision': precision_dt,
        'dt_f1': f1_dt,
    })

    ########################################################
    ############## Selecionando Melhor Modelo ##############
    ########################################################

    modelo_codigo = 0
    modelo_instancia = logistic_model
    print(f"Melhor modelo: Regressão Logística")

    # Tuning do melhor modelo
    tune_model = exp.tune_model(modelo_instancia,
                                optimize = 'f1',
                                search_library = 'scikit-learn',
                                search_algorithm = 'random',
                                n_iter = 10) 

    ymodel_test = exp.predict_model(tune_model, raw_score = True)  

    mlflow.log_metrics({
        'final_model_log_loss': log_loss(ymodel_test['target'], ymodel_test['prediction_label']),
        'final_model_f1': f1_score(ymodel_test['target'], ymodel_test['prediction_label']),
    })

    # Salvando modelo de teste
    ymodel_test.to_parquet('../Data/Processed/prediction_test.parquet')
    mlflow.log_artifact('../Data/Processed/prediction_test.parquet')

    modelo_tunado = exp.finalize_model(tune_model)

    # Exportando e carregando modelo tunado
    exp.save_model(modelo_tunado, f'./{experiment_name}') 
    model_pipe = exp.load_model(f'./{experiment_name}')

    # Carregando variaveis
    modelo_variaveis = list(xtrain.drop('target', axis = 1).columns)

    infer_signature_local = infer_signature(xtrain[data_cols], model_pipe.predict_proba(xtrain.drop('target', axis = 1))) 

    # Exemplo de entrada para o MLmodel
    df_amostra = {x: xtrain[x].values[:10] for x in modelo_variaveis}

    # Log do pipeline de modelagem do sklearn e registrar como uma nova versao
    mlflow.sklearn.log_model(
        sk_model = model_pipe,
        artifact_path = "sklearn-model",
        registered_model_name = experiment_name,
        signature = infer_signature_local,
        input_example = df_amostra,
        pyfunc_predict_fn = 'predict_proba'
    )

    # Criacao do cliente do servico MLFlow e atualizacao versao modelo
    client = MlflowClient()
    model_version = client.get_latest_versions(experiment_name)[-1].version
    client.set_registered_model_alias(
        name    = experiment_name, 
        alias   = "staging", 
        version = model_version
    ) 

    # Gerando curva ROC
    curva_roc(ytest, ypred_prob_lr, ypred_prob_dt)

# Certifica de que a execução finalizou
mlflow.end_run()

Unnamed: 0,Description,Value
0,Session id,5473
1,Target,target
2,Target type,Binary
3,Original data shape,"(20285, 7)"
4,Transformed data shape,"(20285, 7)"
5,Transformed train set shape,"(16228, 7)"
6,Transformed test set shape,"(4057, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.5391,0.5194,0.5778,0.5153,0.5447,0.0811,0.0817,0.272
lr,Logistic Regression,0.578,0.6006,0.4898,0.5674,0.5256,0.1493,0.1507,0.335


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5719,0.5911,0.4765,0.5608,0.5152,0.1364,0.1378


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.5304,0.5109,0.5782,0.5072,0.5404,0.0646,0.0652


Melhor modelo: Regressão Logística


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5816,0.6064,0.5401,0.5641,0.5518,0.1599,0.16
1,0.5712,0.5915,0.5103,0.5548,0.5316,0.1374,0.1378
2,0.5786,0.603,0.5716,0.5572,0.5643,0.1563,0.1564
3,0.5927,0.6104,0.5523,0.5768,0.5643,0.1823,0.1825
4,0.5736,0.6103,0.5523,0.5537,0.553,0.1454,0.1454
5,0.5595,0.5895,0.5084,0.5412,0.5243,0.1148,0.115
6,0.5823,0.6145,0.5523,0.5639,0.558,0.1621,0.1621
7,0.5958,0.6203,0.5458,0.5818,0.5632,0.1878,0.1881
8,0.553,0.5692,0.5323,0.5316,0.532,0.1042,0.1042
9,0.5629,0.5902,0.5439,0.5418,0.5429,0.1241,0.1241


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5647,0.5911,0.5266,0.5457,0.536,0.1263,0.1264


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


Successfully registered model 'Projeto_Kobe'.
Created version '1' of model 'Projeto_Kobe'.


In [13]:
# Certifica de que a execução finalizou
mlflow.end_run()