In [3]:
import sys
import os

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a "src" folder.
current_dir = os.getcwd()
project_root = current_dir

while not os.path.isdir(os.path.join(project_root, "src")):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # dirname() no longer changes the path, so this is the filesystem
        # root ("/" on POSIX, a drive root such as "C:\\" on Windows).
        # Stop here instead of comparing against a hard-coded "/", which
        # would loop forever on Windows when no "src" directory exists.
        break
    project_root = parent_dir

# Make the project's packages importable. Skipping the append when the entry
# is already present keeps sys.path free of duplicates on cell re-runs.
if project_root not in sys.path:
    sys.path.append(project_root)
print("Project root:", project_root)


Project root: /Users/guilhermealencar/ml-syphilis-congenita


In [4]:
import pandas as pd
import numpy as np

from src.data.load_data import load_csv


In [5]:
# Read the raw dataset through the project's loader, then show its
# dimensions followed by a preview of the first rows.
raw_csv_path = "../data/raw/data_set.csv"
df = load_csv(raw_csv_path)
print(df.shape)
df.head()


[OK] Dataset carregado com 41762 linhas e 26 colunas.
(41762, 26)


Unnamed: 0,VDRL_RESULT,CONS_ALCOHOL,RH_FACTOR,SMOKER,PLAN_PREGNANCY,BLOOD_GROUP,HAS_PREG_RISK,TET_VACCINE,IS_HEAD_FAMILY,MARITAL_STATUS,...,HAS_FAM_INCOME,LEVEL_SCHOOLING,CONN_SEWER_NET,NUM_RES_HOUSEHOLD,HAS_FRU_TREE,HAS_VEG_GARDEN,FAM_INCOME,HOUSING_STATUS,WATER_TREATMENT,AGE
0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,5.0,0.0,2.0,1.0,1.0,0.0,1.0,2.0,25.0
1,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,5.0,...,0.0,5.0,0.0,2.0,1.0,1.0,2.0,1.0,3.0,25.0
2,1.0,1.0,2.0,1.0,1.0,4.0,1.0,0.0,1.0,5.0,...,1.0,6.0,2.0,4.0,1.0,1.0,3.0,0.0,3.0,24.0
3,1.0,1.0,2.0,1.0,0.0,4.0,0.0,0.0,1.0,5.0,...,0.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,2.0,28.0
4,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,4.0,1.0,3.0,1.0,1.0,0.0,0.0,3.0,27.0


In [6]:
# Negative ages are invalid records: mark them as missing.
invalid_age = df["AGE"] < 0
df["AGE"] = df["AGE"].mask(invalid_age)

# The missing ages are deliberately NOT filled here. A median computed on the
# full dataset before the train/test split would leak test-set information
# into training. The numeric branch of the preprocessing pipeline
# (SimpleImputer(strategy="median") applied to num_cols, which includes
# "AGE") fills them using statistics learned from the data it is fitted on.

# Sanity check: no negative age may remain after the cleaning step.
assert not (df["AGE"] < 0).any(), "negative AGE values remain"

print("Min AGE:", df["AGE"].min(), "| Max AGE:", df["AGE"].max())
# Number of AGE values left for the pipeline's imputer to fill.
print("Nulos em AGE:", df["AGE"].isna().sum())


Min AGE: 0.0 | Max AGE: 51.0
Nulos em AGE: 0


In [7]:
# Separate the prediction target from the feature matrix.
target_col = "VDRL_RESULT"
y = df[target_col]
X = df.drop(columns=[target_col])

# Features treated as numeric; every remaining feature is treated as
# categorical, keeping the column order of X.
num_cols = [
    "AGE",
    "NUM_RES_HOUSEHOLD",
    "NUM_LIV_CHILDREN",
    "NUM_ABORTIONS",
    "NUM_PREGNANCIES",
]
is_categorical = ~X.columns.isin(num_cols)
cat_cols = X.columns[is_categorical].tolist()

print("Colunas numéricas:", num_cols)
print("Colunas categóricas:", cat_cols)


Colunas numéricas: ['AGE', 'NUM_RES_HOUSEHOLD', 'NUM_LIV_CHILDREN', 'NUM_ABORTIONS', 'NUM_PREGNANCIES']
Colunas categóricas: ['CONS_ALCOHOL', 'RH_FACTOR', 'SMOKER', 'PLAN_PREGNANCY', 'BLOOD_GROUP', 'HAS_PREG_RISK', 'TET_VACCINE', 'IS_HEAD_FAMILY', 'MARITAL_STATUS', 'FOOD_INSECURITY', 'FAM_PLANNING', 'TYPE_HOUSE', 'HAS_FAM_INCOME', 'LEVEL_SCHOOLING', 'CONN_SEWER_NET', 'HAS_FRU_TREE', 'HAS_VEG_GARDEN', 'FAM_INCOME', 'HOUSING_STATUS', 'WATER_TREATMENT']


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the target
# so both partitions keep the same class proportions, and the fixed seed
# makes the partition reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Tamanho treino:", X_train.shape, " | Tamanho teste:", X_test.shape)
print("Distribuição y_train:")
print(y_train.value_counts(normalize=True))


Tamanho treino: (33409, 25)  | Tamanho teste: (8353, 25)
Distribuição y_train:
VDRL_RESULT
1.0    0.980215
0.0    0.019785
Name: proportion, dtype: float64


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Numeric features: fill missing values with the median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" lets transform() accept
# categories that were not present when the encoder was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns to its own transformer.
column_routes = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocess = ColumnTransformer(column_routes)

print("Pipeline de pré-processamento criado!")


Pipeline de pré-processamento criado!
