In [17]:
# Ativa o IterativeImputer experimental
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer

# Outros imports
import pandas as pd
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import OrdinalEncoder

import numpy as np

In [18]:
# Carrega os dados
data = pd.read_csv("arquivos/oasis_cross-sectional.csv")

df = data[['M/F', 'Age', 'Educ', 'SES','MMSE', 'eTIV', 'nWBV', 'CDR']].copy()
# Separar colunas por tipo
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436 entries, 0 to 435
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   M/F     436 non-null    object 
 1   Age     436 non-null    int64  
 2   Educ    235 non-null    float64
 3   SES     216 non-null    float64
 4   MMSE    235 non-null    float64
 5   eTIV    436 non-null    int64  
 6   nWBV    436 non-null    float64
 7   CDR     235 non-null    float64
dtypes: float64(5), int64(2), object(1)
memory usage: 27.4+ KB


In [19]:

# Define as colunas
col_ordinal = ["Educ", "SES", "CDR"]
col_categ = ["M/F"]
col_numericas = ["Age", "MMSE", "eTIV", "nWBV"]

# Backup dos mapeamentos originais para "M/F"
sexo_map = {'F': 0, 'M': 1}
inv_sexo_map = {v: k for k, v in sexo_map.items()}

# Codifica variáveis ordinais preservando ordem
ordinal_encoder = OrdinalEncoder()
df[col_ordinal] = ordinal_encoder.fit_transform(df[col_ordinal])

# Codifica "M/F" com códigos numéricos temporários
df["M/F"] = df["M/F"].map(sexo_map)

# Imputação
imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=42)
df_imputado = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)



In [20]:
# Pós-processamento para cada coluna

# "M/F": de volta para 'F' ou 'M'
df_imputado["M/F"] = df_imputado["M/F"].round().astype(int).map(inv_sexo_map)

# "CDR": arredonda para os valores válidos mais próximos
valores_cdr_validos = np.array([0, 1, 2, 3])
df_imputado["CDR"] = df_imputado["CDR"].apply(lambda x: valores_cdr_validos[np.argmin(np.abs(valores_cdr_validos - x))])

# "Educ" e "SES": arredondar para inteiros entre 1 e 5
df_imputado["Educ"] = df_imputado["Educ"].round().clip(1, 5).astype(int)
df_imputado["SES"] = df_imputado["SES"].round().clip(1, 5).astype(int)

df_imputado["MMSE"] = df_imputado["MMSE"].round().clip(1, 30).astype(int)

# "Age": arredondar para inteiro (parte inteira apenas)
df_imputado["Age"] = df_imputado["Age"].astype(int)

In [21]:
df_imputado

Unnamed: 0,M/F,Age,Educ,SES,MMSE,eTIV,nWBV,CDR
0,F,74,1,2,29,1344.0,0.743,0
1,F,55,3,1,29,1147.0,0.810,0
2,F,73,3,2,27,1454.0,0.708,1
3,M,28,3,1,29,1588.0,0.803,0
4,M,18,3,1,30,1737.0,0.848,0
...,...,...,...,...,...,...,...,...
431,M,20,3,1,30,1469.0,0.847,0
432,M,22,3,1,30,1684.0,0.790,0
433,M,22,3,1,30,1580.0,0.856,0
434,F,20,3,1,30,1262.0,0.861,0


In [22]:
df_imputado[col_ordinal] = ordinal_encoder.inverse_transform(df_imputado[col_ordinal])



In [28]:
df_imputado

Unnamed: 0,M/F,Age,Educ,SES,MMSE,eTIV,nWBV,CDR
0,F,74,2.0,3.0,29,1344.0,0.743,0.0
1,F,55,4.0,2.0,29,1147.0,0.810,0.0
2,F,73,4.0,3.0,27,1454.0,0.708,0.5
3,M,28,4.0,2.0,29,1588.0,0.803,0.0
4,M,18,4.0,2.0,30,1737.0,0.848,0.0
...,...,...,...,...,...,...,...,...
431,M,20,4.0,2.0,30,1469.0,0.847,0.0
432,M,22,4.0,2.0,30,1684.0,0.790,0.0
433,M,22,4.0,2.0,30,1580.0,0.856,0.0
434,F,20,4.0,2.0,30,1262.0,0.861,0.0


In [29]:
df_imputado.to_csv("arquivos/oasis_cross-sectional_imputado.csv", index=False)