In [1]:
# Manipula√ß√£o e visualiza√ß√£o de dados
import pandas as pd
import seaborn as sns
import time

# Bibliotecas para aprendizado de m√°quina
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# MLflow para gerenciamento de experimentos
import mlflow
import mlflow.sklearn

# Supress√£o de avisos
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Busque algum outro dataset no Kaggle para um problema de regress√£o e fa√ßa um novo treino. Lembre de modificar as m√©tricas, ex.: MSE.
!pip install kaggle
import kagglehub

!mkdir -p ./kaggle_datasets

# Baixa e descompacta o dataset
!kaggle datasets download -d mirichoi0218/insurance -p ./kaggle_datasets --unzip

dados = pd.read_csv("./kaggle_datasets/insurance.csv")
dados.head()

Dataset URL: https://www.kaggle.com/datasets/mirichoi0218/insurance
License(s): DbCL-1.0
Downloading insurance.zip to ./kaggle_datasets
  0%|                                               | 0.00/16.0k [00:00<?, ?B/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16.0k/16.0k [00:00<00:00, 19.3MB/s]


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Exibir informa√ß√µes sobre o dataset
print('Dados info:', dados.info())

# Exibir dimens√µes do dataset
print('Dimens√µes:', dados.shape)

# Exibir estat√≠sticas b√°sicas
display(dados.describe())

# Verificar valores ausentes
display(dados.isnull().sum().sort_values(ascending=False))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
Dados info: None
Dimens√µes: (1338, 7)


Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# Dividindo dados para treino e teste
dados_limpos = dados.copy()
dados_limpos = pd.get_dummies(
    dados,
    columns=["sex", "smoker", "region"],
    drop_first=False
)

# Separando as features (X) e o target (y)
X = dados_limpos.drop(columns=["charges"], errors='ignore')
y = dados_limpos["charges"] 

# Divis√£o dos dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Conjunto de treinamento: {X_train.shape}")
print(f"Conjunto de teste: {X_test.shape}")

Conjunto de treinamento: (1070, 11)
Conjunto de teste: (268, 11)


In [5]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

#Comparando modelos de regress√£o
modelos = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "SVR": SVR(kernel="rbf")
}

# O 'mlflow' aqui refere-se ao nome do servi√ßo no docker-compose
MLFLOW_TRACKING_URI = "http://localhost:5050"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(f"Configurando MLflow Tracking URI para: {MLFLOW_TRACKING_URI}")

# Define um experimento
mlflow.set_experiment("Comparacao_Regressao_Gasto_Hospitalar")

resultados = []

# Avaliar cada modelo
for nome, modelo in modelos.items():
    inicio = time.time()
    modelo.fit(X_train, y_train)  # Treinamento
    fim = time.time()

    # Previs√µes
    y_pred = modelo.predict(X_test)

    # M√©tricas
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    tempo_treino = fim - inicio

    # Registrar no MLflow
    with mlflow.start_run(run_name=nome):
        mlflow.log_param("Modelo", nome)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("R2", r2)
        mlflow.log_metric("Tempo de Treinamento", tempo_treino)
        mlflow.sklearn.log_model(modelo, "modelo")

    # Armazenar resultados
    resultados.append({
        "Modelo": nome,
        "MSE": mse,
        "R2": r2,
        "Tempo de Treinamento (s)": tempo_treino
    })
    print(f"Modelo {nome} treinado e registrado no MLflow.")



Configurando MLflow Tracking URI para: http://localhost:5050




üèÉ View run Linear Regression at: http://localhost:5050/#/experiments/1/runs/6de653cf57d040d29cf479c65d832501
üß™ View experiment at: http://localhost:5050/#/experiments/1
Modelo Linear Regression treinado e registrado no MLflow.




üèÉ View run Random Forest at: http://localhost:5050/#/experiments/1/runs/47e77d48465647d0b741ff14196324d6
üß™ View experiment at: http://localhost:5050/#/experiments/1
Modelo Random Forest treinado e registrado no MLflow.




üèÉ View run Gradient Boosting at: http://localhost:5050/#/experiments/1/runs/d76d7d2476b84a988e96af013eb701bc
üß™ View experiment at: http://localhost:5050/#/experiments/1
Modelo Gradient Boosting treinado e registrado no MLflow.




üèÉ View run KNN at: http://localhost:5050/#/experiments/1/runs/4619da33c0764eac8e8a07455629edde
üß™ View experiment at: http://localhost:5050/#/experiments/1
Modelo KNN treinado e registrado no MLflow.




üèÉ View run SVR at: http://localhost:5050/#/experiments/1/runs/a2c1d5142d1b40239004d8106ddb7013
üß™ View experiment at: http://localhost:5050/#/experiments/1
Modelo SVR treinado e registrado no MLflow.


In [6]:
# Criar um DataFrame com os resultados
df_resultados = pd.DataFrame(resultados)
df_resultados.sort_values(by=["MSE", "Tempo de Treinamento (s)"], ascending=[False, True], inplace=True)
print("Resultados da Compara√ß√£o:")
print(df_resultados)

# Exibir o modelo com melhor desempenho
melhor_modelo = df_resultados.iloc[0]
print(f"Melhor Modelo: {melhor_modelo['Modelo']}")

Resultados da Compara√ß√£o:
              Modelo           MSE        R2  Tempo de Treinamento (s)
4                SVR  1.665022e+08 -0.072486                  0.016077
3                KNN  1.087060e+08  0.299795                  0.001075
0  Linear Regression  3.359692e+07  0.783593                  0.002039
1      Random Forest  2.118309e+07  0.863554                  0.282648
2  Gradient Boosting  1.976146e+07  0.872711                  0.097659
Melhor Modelo: SVR


In [7]:
# Recuperar o modelo com melhor desempenho
nome_melhor_modelo = melhor_modelo["Modelo"]
modelo_final = modelos[nome_melhor_modelo]

with mlflow.start_run(run_name="Melhor modelo para o gasto hospitalar"):
    mlflow.log_param("Modelo", nome_melhor_modelo)
    mlflow.log_metric("MSE", melhor_modelo["MSE"])
    mlflow.log_metric("R2", melhor_modelo["R2"])
    mlflow.log_metric("Tempo de Treinamento", melhor_modelo["Tempo de Treinamento (s)"])
    mlflow.sklearn.log_model(modelo_final, "melhor_modelo")

print(f"Melhor modelo ({nome_melhor_modelo}) armazenado com sucesso no MLflow.")



üèÉ View run Melhor modelo para o gasto hospitalar at: http://localhost:5050/#/experiments/1/runs/c414e06638da478f899e173f11b915ce
üß™ View experiment at: http://localhost:5050/#/experiments/1
Melhor modelo (SVR) armazenado com sucesso no MLflow.
