### 누락된 데이터 처리하기

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a string Series that contains one NaN entry.

string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)

In [3]:
# Display the Series; the output below shows NaN at index 2.
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
# Find the NaN values: isnull() marks each missing entry as True.

string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Replace the first value with None so that it becomes a missing value too.

string_data[0] = None

In [6]:
# Index 0 (the None assigned above) is now reported as missing as well.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 누락된 데이터 골라내기

**dropna** 메소드를 적용하면 null이 아닌 데이터와 색인값만 들어 있는 Series를 얻을 수 있다.

In [7]:
from numpy import nan as NA

In [8]:
# Float Series with missing values at positions 1 and 3.
data = pd.Series([1, NA, 3.5, NA, 7])

In [9]:
# Keep only the non-null entries; their original index labels are preserved.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

위 코드는 다음과 같이 불리언 색인으로 표현해도 동일한 결과를 얻는다.

In [10]:
# Boolean-mask selection of the non-null entries.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

DataFrame 객체의 경우 dropna는 기본적으로 NA값을 하나라도 포함하고 있는 로우를 제외시킨다.

In [11]:
# 4x4 frame: row 1 is NA except for its first value, row 3 is entirely NA.
data = pd.DataFrame([
    [1, 2, 3, 4],
    [1., NA, NA, NA],
    [5, 3, 1, 2],
    [NA, NA, NA, NA],
])

In [12]:
# With default arguments, dropna() removes every row that has at least one NA.
cleaned = data.dropna()

In [13]:
# Display the result: only rows 0 and 2 remain.
cleaned

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
2,5.0,3.0,1.0,2.0


**how = 'all'** 옵션을 넘기면 모두 NA 값인 로우만 제외시킨다.

In [14]:
# Drop only the rows in which every value is NA.
data.dropna(how='all')

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,1.0,,,
2,5.0,3.0,1.0,2.0


컬럼을 제외시키는 방법도 동일하게 동작한다. 옵션으로 axis = 1을 넣어주면 된다.

In [15]:
# Add a new column labelled 4 filled entirely with NA.

data[4] = NA

data

Unnamed: 0,0,1,2,3,4
0,1.0,2.0,3.0,4.0,
1,1.0,,,,
2,5.0,3.0,1.0,2.0,
3,,,,,


In [16]:
# Drop only the columns in which every value is NA.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,1.0,,,
2,5.0,3.0,1.0,2.0
3,,,,


DataFrame의 로우를 제외시키는 방법은 시계열 데이터에 주로 사용되는 경향이 있다.

몇 개 이상의 값이 들어 있는 로우만 살펴보고 싶다면 thresh 인자에 원하는 값을 넘기면 된다.

In [18]:
# 7x3 frame of samples drawn from the standard normal distribution.
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df

Unnamed: 0,0,1,2
0,-0.405878,0.345929,0.167025
1,-1.516207,-0.635372,-0.996764
2,-1.131788,-0.426068,-0.940085
3,0.062558,-0.939764,-1.620824
4,-0.052268,-0.504019,-0.077846
5,-1.119496,-1.444772,0.764476
6,0.349232,0.162422,1.7295


In [20]:
# Set rows 0 through 3 of column 1 to NA.

df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,-0.405878,,0.167025
1,-1.516207,,-0.996764
2,-1.131788,,-0.940085
3,0.062558,,-1.620824
4,-0.052268,-0.504019,-0.077846
5,-1.119496,-1.444772,0.764476
6,0.349232,0.162422,1.7295


In [21]:
# Drop every row that contains a missing value; only rows 4-6 remain.

df.dropna()

Unnamed: 0,0,1,2
4,-0.052268,-0.504019,-0.077846
5,-1.119496,-1.444772,0.764476
6,0.349232,0.162422,1.7295


In [26]:
# Keep only the rows that hold at least 2 non-NA values.
# Only column 1 contains NA here, so every row still has two valid values
# and no row is dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
0,-0.405878,,0.167025
1,-1.516207,,-0.996764
2,-1.131788,,-0.940085
3,0.062558,,-1.620824
4,-0.052268,-0.504019,-0.077846
5,-1.119496,-1.444772,0.764476
6,0.349232,0.162422,1.7295


### 결측치 채우기

누락된 값을 제외시키지 않고 데이터 상의 **구멍**을 어떻게든 메워야 할 때가 있다.

이런 경우에는 **fillna** 메소드를 사용한다.

In [28]:
# Fill every missing value with 0 using fillna.

df.fillna(0)

Unnamed: 0,0,1,2
0,-0.405878,0.0,0.167025
1,-1.516207,0.0,-0.996764
2,-1.131788,0.0,-0.940085
3,0.062558,0.0,-1.620824
4,-0.052268,-0.504019,-0.077846
5,-1.119496,-1.444772,0.764476
6,0.349232,0.162422,1.7295


In [30]:
# Passing a dict to fillna gives each column its own fill value
# (column 1 -> 0.5, column 2 -> 0).

df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.405878,0.5,0.167025
1,-1.516207,0.5,-0.996764
2,-1.131788,0.5,-0.940085
3,0.062558,0.5,-1.620824
4,-0.052268,-0.504019,-0.077846
5,-1.119496,-1.444772,0.764476
6,0.349232,0.162422,1.7295


In [31]:
# Fresh 6x3 frame of samples drawn from the standard normal distribution.
df = pd.DataFrame(np.random.standard_normal((6, 3)))

df

Unnamed: 0,0,1,2
0,1.683297,0.069818,0.802249
1,0.239839,1.307227,0.774159
2,0.077848,0.043853,1.986627
3,0.199861,-1.198788,0.697392
4,1.722663,-0.333607,-1.276646
5,-0.25017,-0.598597,-1.670357


In [35]:
# From the third row (position 2) onward, set column 1 to NA.

df.iloc[2:, 1] = NA
df

Unnamed: 0,0,1,2
0,1.683297,0.069818,0.802249
1,0.239839,1.307227,0.774159
2,0.077848,,1.986627
3,0.199861,,0.697392
4,1.722663,,-1.276646
5,-0.25017,,-1.670357


In [36]:
# From the fifth row (position 4) onward, set column 2 to NA.

df.iloc[4:, 2] = NA

df

Unnamed: 0,0,1,2
0,1.683297,0.069818,0.802249
1,0.239839,1.307227,0.774159
2,0.077848,,1.986627
3,0.199861,,0.697392
4,1.722663,,
5,-0.25017,,


In [37]:
# Forward-fill: propagate the last valid value of each column downward.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna has been deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,1.683297,0.069818,0.802249
1,0.239839,1.307227,0.774159
2,0.077848,1.307227,1.986627
3,0.199861,1.307227,0.697392
4,1.722663,1.307227,0.697392
5,-0.25017,1.307227,0.697392
