# Missing Data Imputation Example

In [None]:

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer, SimpleImputer
import matplotlib.pyplot as plt


In [None]:

# Build a synthetic longitudinal dataset: every subject is observed on the
# same consecutive visit days, with two simulated measurements per visit.
np.random.seed(42)
n_subjects = 20
n_visits = 5
n = n_subjects * n_visits  # total number of rows; all column lengths derive from it
df = pd.DataFrame({
    'id': np.repeat(np.arange(1, n_subjects + 1), n_visits),
    'visit_day': np.tile(np.arange(1, n_visits + 1), n_subjects),
    'bp': np.random.normal(120, 10, size=n),
    'glucose': np.random.normal(90, 15, size=n),
})

# Inject missing values artificially: blank out 20% of the 'bp' rows and 15%
# of the 'glucose' rows. The two row samples are drawn independently, each
# with its own fixed seed, so the missingness pattern is reproducible.
df.loc[df.sample(frac=0.2, random_state=1).index, 'bp'] = np.nan
df.loc[df.sample(frac=0.15, random_state=2).index, 'glucose'] = np.nan


In [None]:

# 1. Forward fill: within each subject, carry the last observed value forward
# to later visits. Rows are ordered by subject and visit day first so that
# "previous" means the preceding visit of the same subject.
df_sorted = df.sort_values(by=['id', 'visit_day']).copy()
by_subject = df_sorted.groupby('id')
for filled_col, source_col in {'bp_ffill': 'bp', 'glucose_ffill': 'glucose'}.items():
    df_sorted[filled_col] = by_subject[source_col].ffill()


In [None]:

# 2. Missingness indicator variables: 1 where the original measurement is
# NaN, 0 where it was observed.
for flag_col, source_col in {'bp_missing': 'bp', 'glucose_missing': 'glucose'}.items():
    is_missing = df_sorted[source_col].isna()
    df_sorted[flag_col] = is_missing.astype(int)


In [None]:

# 3. Simple imputation: fill the missing entries of each measurement column
# using the 'median' strategy, fitted on the two measurement columns.
simple_imputer = SimpleImputer(strategy='median')
raw_measurements = df_sorted[['bp', 'glucose']]
median_filled = simple_imputer.fit_transform(raw_measurements)
df_sorted[['bp_simple', 'glucose_simple']] = median_filled


In [None]:

# 4. Iterative imputation (MICE-style): fit the imputer on the two measurement
# columns and fill their missing entries. max_iter=10 caps the number of
# imputation rounds and random_state=0 fixes the seed.
iter_imputer = IterativeImputer(max_iter=10, random_state=0)
mice_input = df_sorted[['bp', 'glucose']]
mice_filled = iter_imputer.fit_transform(mice_input)
df_sorted[['bp_mice', 'glucose_mice']] = mice_filled


In [None]:

# Summary table: show the first rows with the original measurements next to
# each imputed variant, followed by the missingness flags.
summary_columns = [
    'id', 'visit_day',
    'bp', 'bp_ffill', 'bp_simple', 'bp_mice',
    'glucose', 'glucose_ffill', 'glucose_simple', 'glucose_mice',
    'bp_missing', 'glucose_missing',
]
df_sorted[summary_columns].head()
