## 결측값 채우기 - fillna()

In [1]:
import numpy as np
import pandas as pd

### 데이터 프레임 생성

In [3]:
# Fixed seed so that Restart & Run All reproduces the same sample values;
# an unseeded draw yields a different frame on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# 5 rows x 3 columns of standard-normal samples, columns labelled C1..C3.
df = pd.DataFrame(rng.standard_normal((5, 3)), columns=['C1', 'C2', 'C3'])
df

Unnamed: 0,C1,C2,C3
0,-0.004753,0.057605,-0.920896
1,0.110783,-0.052542,-0.222769
2,1.043662,-1.401253,0.625471
3,0.062306,2.335622,0.438799
4,-0.905146,-1.49213,-0.769429


### 몇 개의 데이터를 NaN으로 변경

In [4]:
# Blank out two cells per column (in place) so every column has missing values
# to demonstrate the fill strategies on.
df.loc[[0, 1], 'C1'] = np.nan
df.loc[[2, 3], 'C2'] = np.nan
df.loc[[1, 4], 'C3'] = np.nan

df

Unnamed: 0,C1,C2,C3
0,,0.057605,-0.920896
1,,-0.052542,
2,1.043662,,0.625471
3,0.062306,,0.438799
4,-0.905146,-1.49213,


### 결측값(NaN)을 특정 값으로 채우기

#### 숫자 0

In [8]:
# Replace every NaN with the constant 0; fillna returns a new frame and
# leaves `df` itself unchanged.
df_0 = df.fillna(value=0)
df_0

Unnamed: 0,C1,C2,C3
0,0.0,0.057605,-0.920896
1,0.0,-0.052542,0.0
2,1.043662,0.0,0.625471
3,0.062306,0.0,0.438799
4,-0.905146,-1.49213,0.0


#### 문자열

In [10]:
# Replace every NaN with the string 'hello'; the filled columns then mix
# strings with the remaining numeric values.
df_string = df.fillna(value='hello')
df_string

Unnamed: 0,C1,C2,C3
0,hello,0.057605,-0.920896
1,hello,-0.052542,hello
2,1.043662,hello,0.625471
3,0.062306,hello,0.438799
4,-0.905146,-1.49213,hello


### 결측값을 앞 방향, 뒤 방향으로 채우기

In [14]:
# Show the frame that still contains the NaNs, as the baseline for the
# directional fills that follow.
df

Unnamed: 0,C1,C2,C3
0,,0.057605,-0.920896
1,,-0.052542,
2,1.043662,,0.625471
3,0.062306,,0.438799
4,-0.905146,-1.49213,


#### 위에서 아래로 채우기

```python
fillna(method='ffill') or fillna(method='pad')  # pandas 2.1+: method 인자는 deprecated → df.ffill() 권장
```

In [15]:
# Forward fill: each NaN takes the last valid value above it in the same
# column; NaNs at the top of a column have no earlier value and stay NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword
# is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,C1,C2,C3
0,,0.057605,-0.920896
1,,-0.052542,-0.920896
2,1.043662,-0.052542,0.625471
3,0.062306,-0.052542,0.438799
4,-0.905146,-1.49213,0.438799


In [16]:
# 'pad' is the legacy alias of 'ffill', so this yields the same forward-filled
# frame. The `method` keyword of fillna() is deprecated since pandas 2.1;
# DataFrame.ffill() is the supported spelling.
df.ffill()

Unnamed: 0,C1,C2,C3
0,,0.057605,-0.920896
1,,-0.052542,-0.920896
2,1.043662,-0.052542,0.625471
3,0.062306,-0.052542,0.438799
4,-0.905146,-1.49213,0.438799


#### 아래에서 위로 채우기

```python
fillna(method='bfill') or fillna(method='backfill')  # pandas 2.1+: method 인자는 deprecated → df.bfill() 권장
```

In [17]:
# Backward fill: each NaN takes the next valid value below it in the same
# column; NaNs at the bottom of a column have no later value and stay NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword
# is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,C1,C2,C3
0,1.043662,0.057605,-0.920896
1,1.043662,-0.052542,0.625471
2,1.043662,-1.49213,0.625471
3,0.062306,-1.49213,0.438799
4,-0.905146,-1.49213,


In [19]:
# 'backfill' is the legacy alias of 'bfill', so this yields the same
# backward-filled frame. The `method` keyword of fillna() is deprecated since
# pandas 2.1; DataFrame.bfill() is the supported spelling.
df.bfill()

Unnamed: 0,C1,C2,C3
0,1.043662,0.057605,-0.920896
1,1.043662,-0.052542,0.625471
2,1.043662,-1.49213,0.625471
3,0.062306,-1.49213,0.438799
4,-0.905146,-1.49213,


#### 앞, 뒤 방향 채우기 횟수 제한

```python
fillna(method='속성', limit = n)  # pandas 2.1+: method 인자는 deprecated → df.ffill(limit=n) / df.bfill(limit=n) 권장
```

In [20]:
# Backward fill, but fill at most one consecutive NaN per gap (limit=1):
# in a run of two NaNs only the one adjacent to the next valid value is
# filled, the other stays NaN.
# DataFrame.bfill(limit=...) replaces fillna(method='bfill', limit=...),
# whose `method` keyword is deprecated since pandas 2.1.
df.bfill(limit=1)

Unnamed: 0,C1,C2,C3
0,,0.057605,-0.920896
1,1.043662,-0.052542,0.625471
2,1.043662,,0.625471
3,0.062306,-1.49213,0.438799
4,-0.905146,-1.49213,


#### 변수 별 평균으로 채우기

```python
fillna(df.mean()) or df.where(pd.notnull(df), df.mean(), axis='columns')
```

In [23]:
# Per-column mean (axis=0 reduces down the rows); NaNs are skipped by default.
df.mean(axis=0)

C1    0.066941
C2   -0.495689
C3    0.047791
dtype: float64

In [22]:
# Passing the Series of column means fills each column's NaNs with that
# column's own mean (the Series index is matched against column labels).
df.fillna(value=df.mean())

Unnamed: 0,C1,C2,C3
0,0.066941,0.057605,-0.920896
1,0.066941,-0.052542,0.047791
2,1.043662,-0.495689,0.625471
3,0.062306,-0.495689,0.438799
4,-0.905146,-1.49213,0.047791


#### 'C1'의 평균값으로 C1, C2, C3 칼럼의 결측값 채우기

```python
df.mean()['C1']
```

In [25]:
# Pick the scalar mean of column C1 and use that single value to fill the
# NaNs of every column.
df.fillna(df.mean().loc['C1'])

Unnamed: 0,C1,C2,C3
0,0.066941,0.057605,-0.920896
1,0.066941,-0.052542,0.066941
2,1.043662,0.066941,0.625471
3,0.062306,0.066941,0.438799
4,-0.905146,-1.49213,0.066941


#### C1, C2만 평균값으로 채우기 (C3의 결측값은 그대로 유지)

```python
df.mean()['C1':'C2']
```

In [26]:
# Label slice 'C1':'C2' keeps only the means of C1 and C2, so only those two
# columns are filled; C3 has no entry in the Series and keeps its NaNs.
df.fillna(df.mean().loc['C1':'C2'])

Unnamed: 0,C1,C2,C3
0,0.066941,0.057605,-0.920896
1,0.066941,-0.052542,
2,1.043662,-0.495689,0.625471
3,0.062306,-0.495689,0.438799
4,-0.905146,-1.49213,
