In [1]:
# impyute 및 autoimpute 패키지 설치
!pip install autoimpute impyute



# autoimpute - MICE

In [2]:
# 필요한 패키지 불러오기

import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

from autoimpute.imputations import SingleImputer, MultipleImputer, MiceImputer

# Load the Titanic dataset
titan_data = sns.load_dataset('titanic')

# Columns to convert to integer codes with label encoding
categorical_cols = ['sex', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']

# One fitted encoder per column, kept so the codes can be decoded again later
label_encoders = {}

# Encode on a copy so the raw frame stays available
titan_copy = titan_data.copy()
for col in categorical_cols:
    encoder = LabelEncoder()
    # Values are cast to str first, so a missing value becomes the string 'nan'
    # and receives its own code
    as_text = titan_copy[col].astype(str)
    titan_copy[col] = encoder.fit_transform(as_text)
    label_encoders[col] = encoder

# Wherever the raw data was missing, take the raw (NaN) value instead of the
# code assigned to 'nan', so those cells are missing again
was_observed = titan_data.notna()
titan_copy_1 = titan_copy.where(was_observed, titan_data)

In [3]:
# MiceImputer settings
#
#   n           : number of imputed datasets to generate. (The number of MICE
#                 iterations run for each dataset is the separate `k`
#                 argument, which is left at its default here.)
#   strategy    : estimator used to impute each listed column.
#   predictors  : which other columns are used when predicting the missing
#                 values of a given column.
#   visit       : order in which the columns with missing values are imputed.
#   return_list : return the imputations as a list of (index, DataFrame)
#                 tuples instead of a lazy generator.

imp = MiceImputer(
    n=3,
    strategy={
        "age": "least squares",
        "embarked": "multinomial logistic",
        "deck": "multinomial logistic",
        "embark_town": "multinomial logistic",
    },
    predictors={"age": "all", "embarked": ["pclass", "sex", "class"]},
    visit="left-to-right",
    return_list=True
)

# Missing-value counts before imputation
print(titan_copy_1.isnull().sum())

# Run MICE
imputed_data_list = imp.fit_transform(titan_copy_1)

# Each list element is indexed as [i][1] to reach its DataFrame;
# take the first imputed dataset
first_imputed = imputed_data_list[0][1]

# Missing-value counts after imputation
print(first_imputed.isnull().sum())

# Preview the first imputed dataset
first_imputed.head()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,1,4,2,0,0
1,1,1,0,38.0,1,0,71.2833,0,0,2,0,2,0,1,0
2,1,3,0,26.0,0,0,7.925,2,2,2,0,4,2,1,1
3,1,1,0,35.0,1,0,53.1,2,0,2,0,2,2,1,0
4,0,3,1,35.0,0,0,8.05,2,2,1,1,4,2,0,1


In [4]:
# Work on a copy: the frame stored in imputed_data_list stays untouched, so this
# cell can be re-run without trying to round already-decoded strings.
imputed_df = imputed_data_list[0][1].copy()

# Restore every column to the representation it has in the raw data
for col in imputed_df.columns:
    original_dtype = titan_data[col].dtype
    if col in categorical_cols:
        # Imputed codes may come back as floats; snap them to integer codes
        codes = np.round(imputed_df[col]).astype(int)
        # Decode the integer codes back to the labels seen by the encoder
        decoded = pd.Series(
            label_encoders[col].inverse_transform(codes),
            index=imputed_df.index,
        )
        if pd.api.types.is_bool_dtype(original_dtype):
            # Decoded labels may be the strings 'True'/'False'; a plain
            # astype(bool) would turn both into True, so compare the text instead
            decoded = decoded.astype(str).eq("True")
        else:
            # Restores object/category dtype (including the original categories)
            decoded = decoded.astype(original_dtype)
        imputed_df[col] = decoded
    else:
        # Numeric columns: cast back to the raw column's dtype
        imputed_df[col] = imputed_df[col].astype(original_dtype)

imputed_df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,E,Southampton,no,False
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,E,Southampton,yes,True
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,E,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,Second,man,True,E,Southampton,no,True
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,34.298373,1,2,23.4500,S,Third,woman,False,E,Southampton,no,False
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## autoimpute - SingleImputer

In [5]:
# Univariate imputation: with "default univariate", the mean is applied to
# numeric columns and the mode to categorical columns automatically.
# Other options such as mean, mode, random, and median are also provided.
si = SingleImputer(strategy="default univariate")

In [6]:
# Load the Titanic dataset again (rebinds titan_data)
titan_data = sns.load_dataset('titanic')
# Fit the SingleImputer and transform the data in one call
imputed_data_si = si.fit_transform(titan_data)
# Display the result
imputed_data_si

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,C,Southampton,no,False
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,C,Southampton,yes,True
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,C,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,Second,man,True,C,Southampton,no,True
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,29.699118,1,2,23.4500,S,Third,woman,False,C,Southampton,no,False
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# impyute

In [7]:
# 필요한 패키지 불러오기
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from impyute.imputation.cs import mean, median, fast_knn, mice

warnings.filterwarnings('ignore')

# Avoid errors caused by NumPy upgrades: the aliases np.float, np.int,
# np.object and np.bool were removed in NumPy 1.24, and older libraries may
# still reference them. Only aliases that are actually missing are restored,
# so an attribute NumPy itself provides (np.bool exists again as a NumPy type
# on NumPy >= 2.0) is never overwritten.
for alias_name, builtin_type in (
    ("float", float),
    ("int", int),
    ("object", object),
    ("bool", bool),
):
    if not hasattr(np, alias_name):
        setattr(np, alias_name, builtin_type)

In [8]:
# Load the Titanic dataset
titanic_data = sns.load_dataset('titanic')

# Keep only the numeric columns and hand them to impyute as a NumPy array
num_cols = titanic_data.select_dtypes(include=['number']).columns
numeric_frame = titanic_data[num_cols]
titanic_numeric = numeric_frame.to_numpy()

# Missing-value counts before imputation
print("결측치 개수 (변환 전):\n", numeric_frame.isna().sum())

# Apply each imputation method to the same numeric array
titanic_imputed_mean = mean(titanic_numeric)            # mean imputation
titanic_imputed_median = median(titanic_numeric)        # median imputation
titanic_imputed_knn = fast_knn(titanic_numeric, k=5)    # KNN imputation (k=5)
titanic_imputed_mice = mice(titanic_numeric)            # MICE imputation

# Wrap the MICE result back into a DataFrame with the original column names
titanic_imputed_df = pd.DataFrame(data=titanic_imputed_mice, columns=num_cols)

# Missing-value counts after imputation
print("\n 결측치 개수 (MICE Imputation 후):\n", titanic_imputed_df.isna().sum())

# Show a few sample rows
titanic_imputed_df.head()

결측치 개수 (변환 전):
 survived      0
pclass        0
age         177
sibsp         0
parch         0
fare          0
dtype: int64

 결측치 개수 (MICE Imputation 후):
 survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0.0,3.0,22.0,1.0,0.0,7.25
1,1.0,1.0,38.0,1.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.925
3,1.0,1.0,35.0,1.0,0.0,53.1
4,0.0,3.0,35.0,0.0,0.0,8.05
