In [14]:
import sys
import os

# Locate the project root: the closest ancestor of the working directory (the
# working directory itself included) that contains a "src" directory, so that
# `src.*` modules can be imported no matter where the notebook is launched from.
current_dir = os.getcwd()
project_root = current_dir

while not os.path.isdir(os.path.join(project_root, "src")):
    parent_dir = os.path.dirname(project_root)
    # dirname() of a filesystem root returns that same root ("/" on POSIX,
    # "C:\\" on Windows), so this stops at the top on every platform instead of
    # only when the path is literally "/".
    if parent_dir == project_root:
        break
    project_root = parent_dir

# Re-running this cell must not keep stacking the same entry on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
print("Project root:", project_root)

Project root: /Users/guilhermealencar/ml-syphilis-congenita


In [23]:
import pandas as pd
import numpy as np
import joblib

from src.data.db_sqlite import load_dataframe_from_db

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

import mlflow
import mlflow.sklearn


ModuleNotFoundError: No module named 'mlflow.sklearn'

In [17]:
# Load the full dataset through the project's SQLite helper and preview it.
df = load_dataframe_from_db()
print("Shape df (SQLite):", df.shape)
display(df.head())

target_col = "VDRL_RESULT"

# Target vector first, then the feature matrix (every column except the target).
y = df[target_col]
X = df.drop(columns=[target_col])

# Numeric columns used in the paper / model.
num_cols = [
    "AGE",
    "NUM_RES_HOUSEHOLD",
    "NUM_LIV_CHILDREN",
    "NUM_ABORTIONS",
    "NUM_PREGNANCIES",
]

# Every remaining feature column is handled as categorical, in its original order.
cat_cols = X.columns[~X.columns.isin(num_cols)].tolist()

print("Numéricas:", num_cols)
print("Categóricas:", len(cat_cols))

Shape df (SQLite): (41762, 26)


Unnamed: 0,VDRL_RESULT,CONS_ALCOHOL,RH_FACTOR,SMOKER,PLAN_PREGNANCY,BLOOD_GROUP,HAS_PREG_RISK,TET_VACCINE,IS_HEAD_FAMILY,MARITAL_STATUS,...,HAS_FAM_INCOME,LEVEL_SCHOOLING,CONN_SEWER_NET,NUM_RES_HOUSEHOLD,HAS_FRU_TREE,HAS_VEG_GARDEN,FAM_INCOME,HOUSING_STATUS,WATER_TREATMENT,AGE
0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,5.0,0.0,2.0,1.0,1.0,0.0,1.0,2.0,25.0
1,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,5.0,...,0.0,5.0,0.0,2.0,1.0,1.0,2.0,1.0,3.0,25.0
2,1.0,1.0,2.0,1.0,1.0,4.0,1.0,0.0,1.0,5.0,...,1.0,6.0,2.0,4.0,1.0,1.0,3.0,0.0,3.0,24.0
3,1.0,1.0,2.0,1.0,0.0,4.0,0.0,0.0,1.0,5.0,...,0.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,2.0,28.0
4,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,4.0,1.0,3.0,1.0,1.0,0.0,0.0,3.0,27.0


Numéricas: ['AGE', 'NUM_RES_HOUSEHOLD', 'NUM_LIV_CHILDREN', 'NUM_ABORTIONS', 'NUM_PREGNANCIES']
Categóricas: 20


In [18]:
# Hold out 20% of the rows for evaluation. The split is stratified on the target
# so both partitions keep its class proportions, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Tamanho treino:", X_train.shape, " | Tamanho teste:", X_test.shape)

Tamanho treino: (33409, 25)  | Tamanho teste: (8353, 25)


In [19]:
# Numeric features: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own transformer.
preprocess = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

# Full model: preprocessing followed by a class-weighted random forest.
rf_clf = Pipeline([
    ("preprocess", preprocess),
    (
        "model",
        RandomForestClassifier(
            class_weight="balanced",
            max_depth=12,
            n_estimators=300,
            random_state=42,
        ),
    ),
])

print("Pipeline montado!")


Pipeline montado!


In [20]:
# Point the MLflow client at the tracking server and select the experiment that
# the runs below are recorded under. The server address can be overridden with
# the MLFLOW_TRACKING_URI environment variable; the previously hard-coded
# address remains the default.
# NOTE(review): the recorded AttributeError on this cell suggests `mlflow`
# resolved to something other than the MLflow package (e.g. a local directory
# named "mlflow") - confirm MLflow is installed in the kernel environment.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "http://mlflow-syphilis:5000")
)
mlflow.set_experiment("syphilis_rf_experiment")

AttributeError: module 'mlflow' has no attribute 'set_tracking_uri'

In [21]:
# Train the random-forest pipeline inside a tracked MLflow run, evaluate it on
# the held-out split, and log parameters, metrics and the fitted pipeline.
with mlflow.start_run(run_name="random_forest_v1"):

    # Hyperparameters: read them from the estimator itself so the logged values
    # cannot drift from what is actually being trained.
    model_params = rf_clf.named_steps["model"].get_params()
    mlflow.log_params({
        "n_estimators": model_params["n_estimators"],
        "max_depth": model_params["max_depth"],
        "class_weight": model_params["class_weight"],
        # Split setting; must match the value passed to train_test_split.
        "test_size": 0.2,
        "random_state": model_params["random_state"],
    })

    # Train the model
    rf_clf.fit(X_train, y_train)
    print("Treinamento concluído!")

    # Predictions: hard labels, plus the probability column of the second class
    # (index 1 of predict_proba) for the ROC AUC.
    y_pred = rf_clf.predict(X_test)
    y_proba = rf_clf.predict_proba(X_test)[:, 1]

    # Metrics
    # NOTE(review): precision/recall/F1 use scikit-learn's default pos_label=1 -
    # confirm that label 1 of VDRL_RESULT is the class of interest.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_proba)

    print("Classification report:")
    print(classification_report(y_test, y_pred))

    # Log to MLflow
    mlflow.log_metrics({
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "roc_auc": roc,
    })

    # Save the model (the whole pipeline) as an MLflow artifact
    mlflow.sklearn.log_model(rf_clf, artifact_path="model")

print("Métricas resumidas:")
print("ACC:", acc)
print("PREC:", prec)
print("REC:", rec)
print("F1:", f1)
print("ROC AUC:", roc)



AttributeError: module 'mlflow' has no attribute 'start_run'