In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import scipy as sp

In [14]:
# Build the demo frame: 5 rows x 3 columns of standard-normal samples.
# Seeding the global NumPy generator makes the values reproducible.
np.random.seed(0)
df = pd.DataFrame(
    data=np.random.standard_normal(size=(5, 3)),
    columns=["C1", "C2", "C3"],
)
df

Unnamed: 0,C1,C2,C3
0,1.764052,0.400157,0.978738
1,2.240893,1.867558,-0.977278
2,0.950088,-0.151357,-0.103219
3,0.410599,0.144044,1.454274
4,0.761038,0.121675,0.443863


In [17]:
# Insert missing values (NaN) at selected positions.
# Every write goes through a single .loc/.iloc call. The chained form
# df["C3"][1] = ... is chained assignment: pandas warns about it, and under
# Copy-on-Write (the default from pandas 3.0) it no longer modifies df.
# Rows 0 and 1 of column C1 (positional: rows 0-1, column 0)
df.iloc[0:2, 0] = np.nan
# Row label 1 of column C3
df.loc[1, "C3"] = np.nan
# Row labels 2 and 3 of column C2
df.loc[[2, 3], "C2"] = np.nan
# Row 4 of column C3 (positional: row 4, column 2)
df.iloc[4, 2] = np.nan

In [18]:
# Display the frame to confirm where the NaN values were inserted.
df

Unnamed: 0,C1,C2,C3
0,,0.400157,0.978738
1,,1.867558,
2,0.950088,,-0.103219
3,0.410599,,1.454274
4,0.761038,0.121675,


In [20]:
# Replace every missing value with one scalar (here 0).
# fillna returns a new frame; df itself is left unchanged.
df.fillna(value=0)


Unnamed: 0,C1,C2,C3
0,0.0,0.400157,0.978738
1,0.0,1.867558,0.0
2,0.950088,0.0,-0.103219
3,0.410599,0.0,1.454274
4,0.761038,0.121675,0.0


In [25]:
# Fill gaps forward: each NaN takes the last valid value above it in the same column.
# Leading NaNs (no earlier valid value, e.g. C1 rows 0-1) stay NaN.
# DataFrame.ffill() replaces fillna(method="ffill"/"pad"): the `method` argument
# of fillna is deprecated and removed in pandas 3.0.
df.ffill()


Unnamed: 0,C1,C2,C3
0,,0.400157,0.978738
1,,1.867558,0.978738
2,0.950088,1.867558,-0.103219
3,0.410599,1.867558,1.454274
4,0.761038,0.121675,1.454274


In [29]:
# Fill gaps backward: each NaN takes the next valid value below it in the same column.
# Trailing NaNs (no later valid value, e.g. C3 row 4) stay NaN.
df.bfill(axis="index")

Unnamed: 0,C1,C2,C3
0,0.950088,0.400157,0.978738
1,0.950088,1.867558,-0.103219
2,0.950088,0.121675,-0.103219
3,0.410599,0.121675,1.454274
4,0.761038,0.121675,


In [30]:
# Use the `limit` argument to cap how many consecutive NaNs get filled.
# With limit=1 only the NaN directly above a valid value is filled, so the
# first NaN of a two-row gap (C1 row 0, C2 row 2) stays NaN.
# DataFrame.bfill() replaces fillna(method="bfill"): the `method` argument
# of fillna is deprecated and removed in pandas 3.0.
df.bfill(limit=1)

Unnamed: 0,C1,C2,C3
0,,0.400157,0.978738
1,0.950088,1.867558,-0.103219
2,0.950088,,-0.103219
3,0.410599,0.121675,1.454274
4,0.761038,0.121675,


In [24]:
# Replace missing values with the per-column mean.
# Two equivalent spellings are shown in the following cells:
#   df.fillna(df.mean())
#   df.where(pd.notnull(df), df.mean(), axis='columns')
print(df.mean(axis="index"))  # one mean per column

C1    0.707242
C2    0.796463
C3    0.776598
dtype: float64


In [31]:
# df.mean() is a Series indexed by column name, so each column's NaNs
# are filled with that column's own mean.
df.fillna(value=df.mean())

Unnamed: 0,C1,C2,C3
0,0.707242,0.400157,0.978738
1,0.707242,1.867558,0.776598
2,0.950088,0.796463,-0.103219
3,0.410599,0.796463,1.454274
4,0.761038,0.121675,0.776598


In [46]:
# Same result via where(): keep cells that are not NaN and take the column mean
# elsewhere. axis="columns" aligns the means Series with the column labels.
df.where(df.notna(), other=df.mean(axis=0), axis="columns")

Unnamed: 0,C1,C2,C3
0,0.707242,0.400157,0.978738
1,0.707242,1.867558,0.776598
2,0.950088,0.796463,-0.103219
3,0.410599,0.796463,1.454274
4,0.761038,0.121675,0.776598


In [47]:
# Use the mean of column C1 as the single replacement value.
# The value is a scalar, so NaNs in every column receive it.
df.fillna(value=df.mean().loc["C1"])

Unnamed: 0,C1,C2,C3
0,0.707242,0.400157,0.978738
1,0.707242,1.867558,0.707242
2,0.950088,0.707242,-0.103219
3,0.410599,0.707242,1.454274
4,0.761038,0.121675,0.707242


In [48]:
# Fill only C1 and C2 with their own column means; C3 keeps its NaNs.
# The label slice "C1":"C2" is inclusive, so the fill Series has no entry for C3.
df.fillna(value=df.mean().loc["C1":"C2"])

Unnamed: 0,C1,C2,C3
0,0.707242,0.400157,0.978738
1,0.707242,1.867558,
2,0.950088,0.796463,-0.103219
3,0.410599,0.796463,1.454274
4,0.761038,0.121675,


In [52]:
# Keep the original column with missing values and put the filled result in a new column.
# C2_NEW takes C2 where it is present and the C1 value from the same row where C2 is NaN.
# Series.fillna(other_series) does this row-aligned coalesce directly, replacing
# np.where(mask == True, ...), where the `== True` comparison on a boolean mask was redundant.
df["C2_NEW"] = df["C2"].fillna(df["C1"])
df

Unnamed: 0,C1,C2,C3,C2_NEW
0,,0.400157,0.978738,0.400157
1,,1.867558,,1.867558
2,0.950088,,-0.103219,0.950088
3,0.410599,,1.454274,0.410599
4,0.761038,0.121675,,0.121675
