In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator

import warnings
warnings.filterwarnings('ignore')

In [23]:
data: dict = {
    'id' : [1000, 1001, 1002, 1003, 1004, 1005, 1006],
    'date' : ['20230101', '20230102', '20230103', '20230104', '20230105', '20230106', '20230107'],
    'age' : [21, 56, 33, 49, 27, 42, 32],
    'income' : [67000, 220000, 97000, 166000, 81000, 157000, 96000],
    'gender' : ['Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female'],
    'education' : ['Bachelos', 'PhD', 'Masters', 'Masters', 'Bachelos', 'Bachelos', 'Bachelos'],
    'passed' : [False, True, True, True, False, False, True],
    'measurement' : np.random.randn(7).round(2),
}
df = pd.DataFrame(data)
df

Unnamed: 0,id,date,age,income,gender,education,passed,measurement
0,1000,20230101,21,67000,Male,Bachelos,False,0.75
1,1001,20230102,56,220000,Female,PhD,True,-1.21
2,1002,20230103,33,97000,Female,Masters,True,0.06
3,1003,20230104,49,166000,Male,Masters,True,0.42
4,1004,20230105,27,81000,Male,Bachelos,False,-0.07
5,1005,20230106,42,157000,Female,Bachelos,False,-0.58
6,1006,20230107,32,96000,Female,Bachelos,True,-1.66


### 컬럼간의 상관계수 계산

In [24]:
df.corr()

Unnamed: 0,id,age,income,passed,measurement
id,1.0,-0.006207,-0.076124,0.0,-0.540179
age,-0.006207,1.0,0.981673,0.537583,-0.365746
income,-0.076124,0.981673,1.0,0.413131,-0.368254
passed,0.0,0.537583,0.413131,1.0,-0.387029
measurement,-0.540179,-0.365746,-0.368254,-0.387029,1.0


### 컬럼정보 확인

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           7 non-null      int64  
 1   date         7 non-null      object 
 2   age          7 non-null      int64  
 3   income       7 non-null      int64  
 4   gender       7 non-null      object 
 5   education    7 non-null      object 
 6   passed       7 non-null      bool   
 7   measurement  7 non-null      float64
dtypes: bool(1), float64(1), int64(3), object(3)
memory usage: 527.0+ bytes


### 데이터 값 출력

In [31]:
df.values

array([[1000, '20230101', 21, 67000, 'Male', 'Bachelos', False, 0.75],
       [1001, '20230102', 56, 220000, 'Female', 'PhD', True, -1.21],
       [1002, '20230103', 33, 97000, 'Female', 'Masters', True, 0.06],
       [1003, '20230104', 49, 166000, 'Male', 'Masters', True, 0.42],
       [1004, '20230105', 27, 81000, 'Male', 'Bachelos', False, -0.07],
       [1005, '20230106', 42, 157000, 'Female', 'Bachelos', False, -0.58],
       [1006, '20230107', 32, 96000, 'Female', 'Bachelos', True, -1.66]],
      dtype=object)

### 데이터 결측치 임의로 삽입

In [32]:
df2 = df.copy()
df2.iloc[[3,5], [1, 2, 4, 5, 6, 7]] = np.nan
df2

Unnamed: 0,id,date,age,income,gender,education,passed,measurement
0,1000,20230101.0,21.0,67000,Male,Bachelos,False,0.75
1,1001,20230102.0,56.0,220000,Female,PhD,True,-1.21
2,1002,20230103.0,33.0,97000,Female,Masters,True,0.06
3,1003,,,166000,,,,
4,1004,20230105.0,27.0,81000,Male,Bachelos,False,-0.07
5,1005,,,157000,,,,
6,1006,20230107.0,32.0,96000,Female,Bachelos,True,-1.66


### 각 컬럼별 결측치 확인

In [33]:
df2.isnull().sum()

id             0
date           2
age            2
income         0
gender         2
education      2
passed         2
measurement    2
dtype: int64

### 결측치가 포함된 데이터 출력

In [35]:
df2[df2.isnull().any(axis=1)]

Unnamed: 0,id,date,age,income,gender,education,passed,measurement
3,1003,,,166000,,,,
5,1005,,,157000,,,,


### 결측치가 포함된 행 삭제

In [36]:
df2.dropna(axis=0)

Unnamed: 0,id,date,age,income,gender,education,passed,measurement
0,1000,20230101,21.0,67000,Male,Bachelos,False,0.75
1,1001,20230102,56.0,220000,Female,PhD,True,-1.21
2,1002,20230103,33.0,97000,Female,Masters,True,0.06
4,1004,20230105,27.0,81000,Male,Bachelos,False,-0.07
6,1006,20230107,32.0,96000,Female,Bachelos,True,-1.66


### 결측치가 포함된 열 삭제

In [37]:
df2.dropna(axis=1)


Unnamed: 0,id,income
0,1000,67000
1,1001,220000
2,1002,97000
3,1003,166000
4,1004,81000
5,1005,157000
6,1006,96000


### 결측치를 특정 값으로 치환

In [38]:
df2.loc[:,['age','measurement']].fillna(-9999)

Unnamed: 0,age,measurement
0,21.0,0.75
1,56.0,-1.21
2,33.0,0.06
3,-9999.0,-9999.0
4,27.0,-0.07
5,-9999.0,-9999.0
6,32.0,-1.66


In [40]:
df2.loc[:,['age','measurement']].fillna(df2.mean()[['age','measurement']])


Unnamed: 0,age,measurement
0,21.0,0.75
1,56.0,-1.21
2,33.0,0.06
3,33.8,-0.426
4,27.0,-0.07
5,33.8,-0.426
6,32.0,-1.66


### sklearn 패키지를 활용하여 결측치 처리
- impute를 사용하여 결측치 처리(특정값)

In [47]:
imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-9999)
imputed = imp.fit_transform(df2.loc[:, ['age','measurement']].values)
pd.DataFrame(imputed, columns=['age','measurement'])

Unnamed: 0,age,measurement
0,21.0,0.75
1,56.0,-1.21
2,33.0,0.06
3,-9999.0,-9999.0
4,27.0,-0.07
5,-9999.0,-9999.0
6,32.0,-1.66



- impute를 사용하여 결측치 처리(평균)

In [48]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed = imp.fit_transform(df2.loc[:, ['age','measurement']].values)
pd.DataFrame(imputed, columns=['age','measurement'])

Unnamed: 0,age,measurement
0,21.0,0.75
1,56.0,-1.21
2,33.0,0.06
3,33.8,-0.426
4,27.0,-0.07
5,33.8,-0.426
6,32.0,-1.66


### imputer를 활용하여 결측치 처리(n_neighbor)

In [49]:
imp_knn = KNNImputer(n_neighbors=2, weights='uniform')
imputed = imp_knn.fit_transform(df2.loc[:, ['age','measurement']].values)
pd.DataFrame(imputed, columns=['age','measurement'])

Unnamed: 0,age,measurement
0,21.0,0.75
1,56.0,-1.21
2,33.0,0.06
3,33.8,-0.426
4,27.0,-0.07
5,33.8,-0.426
6,32.0,-1.66
