In [28]:
import pandas as pd
import numpy as np

In [None]:
# 7.1.1 - 누락된 데이터 골라내기

In [29]:
# Example Series with two missing entries (NaN) among float values.
data = pd.Series([1.0, np.nan, 3.5, np.nan, 7.0])

In [30]:
# Display the Series, including its NaN entries.
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [33]:
# dropna() with default arguments removes the entries that are NaN and
# returns the remaining values with their original index labels.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [39]:
# Same result as dropna(): keep only the entries whose value is not NaN,
# selected through a boolean mask.
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [48]:
# 4x3 example frame: one complete row, two partially missing rows and one
# row that is entirely NaN.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)

In [49]:
# Display the DataFrame, including its NaN cells.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [42]:
# Drop every row that contains at least one NaN (the dropna() defaults,
# written out explicitly), then display the result.
cleaned = data.dropna(axis=0, how='any')
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [44]:
# how='all' drops only the rows in which every value is NaN.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [47]:
# thresh=1 keeps the rows that have at least one non-NaN value, so here only
# the all-NaN row is dropped.
data.dropna(thresh=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [35]:
# Build the 7x3 frame of standard-normal draws in this cell so that it does
# not rely on `df` left over in the kernel from an out-of-order run (without
# this the cell raises NameError on Restart & Run All).
df = pd.DataFrame(np.random.randn(7, 3))
# Create missing values using integer-position indexing:
# rows 0-3 of column 1 and rows 0-1 of column 2 are set to NaN.
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

In [38]:
# axis=1 drops columns instead of rows: every column that contains a NaN is
# removed.
df.dropna(axis=1)

Unnamed: 0,0
0,1.04744
1,-0.060863
2,-0.612635
3,1.262583
4,0.168602
5,1.056161
6,0.549918


In [None]:
#7.1.2 - 결측치 채우기

In [2]:
# 7x3 DataFrame of random draws from np.random.randn (values differ on every
# run because no seed is set).
df = pd.DataFrame(np.random.randn(7,3))

In [9]:
# Create missing values using integer-position indexing:
# rows 0-3 of column 1 and rows 0-1 of column 2 are set to NaN.
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

In [10]:
# Display the frame with the injected NaN values.
df

Unnamed: 0,0,1,2
0,0.071944,,
1,0.684228,,
2,0.64338,,0.372083
3,-1.039014,,-1.492419
4,-0.130281,0.307122,0.821978
5,-0.250097,0.791791,-0.438401
6,-2.042939,-1.549409,-1.10401


In [None]:
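# fillna(value=None, method=None, axis=None,
#        inplace=False, limit=None, downcast=None)
# value: scalar or dict-like object used to fill the missing values
# method: fill method ('ffill', 'bfill', ...)
# axis: axis along which the values are filled
# inplace: modify the original object instead of returning a new one
# limit: maximum number of values to fill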

In [11]:
# Fill every missing value with 0 (returns a new frame; df is unchanged).
df.fillna(0)

Unnamed: 0,0,1,2
0,0.071944,0.0,0.0
1,0.684228,0.0,0.0
2,0.64338,0.0,0.372083
3,-1.039014,0.0,-1.492419
4,-0.130281,0.307122,0.821978
5,-0.250097,0.791791,-0.438401
6,-2.042939,-1.549409,-1.10401


In [17]:
# Fill every missing value with a string; the filled columns then hold a mix
# of floats and strings.
df.fillna('missing')

Unnamed: 0,0,1,2
0,0.071944,missing,missing
1,0.684228,missing,missing
2,0.64338,missing,0.372083
3,-1.039014,missing,-1.49242
4,-0.130281,0.307122,0.821978
5,-0.250097,0.791791,-0.438401
6,-2.042939,-1.54941,-1.10401


In [12]:
# Use a dict to fill each column with its own value:
# column 1 -> 0.5, column 2 -> 0.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.071944,0.5,0.0
1,0.684228,0.5,0.0
2,0.64338,0.5,0.372083
3,-1.039014,0.5,-1.492419
4,-0.130281,0.307122,0.821978
5,-0.250097,0.791791,-0.438401
6,-2.042939,-1.549409,-1.10401


In [13]:
# Create a 6x3 DataFrame of draws from the standard normal distribution
# (mean 0, standard deviation 1).
df1 = pd.DataFrame(np.random.randn(6,3))

In [14]:
# Set rows 2 onward of column 1 and rows 4 onward of column 2 to NaN.
df1.iloc[2:, 1] = np.nan
df1.iloc[4:, 2] = np.nan

In [21]:
# Display the frame with the injected NaN values.
df1

Unnamed: 0,0,1,2
0,-0.82617,-1.88291,0.386715
1,0.490278,-0.758414,1.610822
2,-1.493179,,2.538482
3,0.06875,,0.279243
4,-0.149722,,
5,-0.507127,,


In [18]:
# Forward fill: replace each NaN with the last valid value above it in the
# same column. DataFrame.ffill() is the direct equivalent of
# fillna(method='ffill') / fillna(method='pad'); the `method` keyword of
# fillna is deprecated in recent pandas releases.
df1.ffill()

Unnamed: 0,0,1,2
0,-0.82617,-1.88291,0.386715
1,0.490278,-0.758414,1.610822
2,-1.493179,-0.758414,2.538482
3,0.06875,-0.758414,0.279243
4,-0.149722,-0.758414,0.279243
5,-0.507127,-0.758414,0.279243


In [26]:
# limit caps how many consecutive NaNs are forward-filled after each valid
# value, so with limit=1 only the first NaN of each gap is filled.
# ffill(limit=1) replaces the deprecated fillna(method='ffill', limit=1).
df1.ffill(limit=1)

Unnamed: 0,0,1,2
0,-0.82617,-1.88291,0.386715
1,0.490278,-0.758414,1.610822
2,-1.493179,-0.758414,2.538482
3,0.06875,,0.279243
4,-0.149722,,0.279243
5,-0.507127,,


In [19]:
# Backward fill: replace each NaN with the next valid value below it in the
# same column. DataFrame.bfill() is the direct equivalent of
# fillna(method='bfill') / fillna(method='backfill'), whose `method` keyword
# is deprecated in recent pandas releases.
# Here every NaN sits at the bottom of its column with no valid value after
# it, so nothing is filled.
df1.bfill()

Unnamed: 0,0,1,2
0,-0.82617,-1.88291,0.386715
1,0.490278,-0.758414,1.610822
2,-1.493179,,2.538482
3,0.06875,,0.279243
4,-0.149722,,
5,-0.507127,,


In [27]:
# Fill missing values with each column's mean (df1.mean() is the per-column
# mean, not the median).
df1.fillna(df1.mean())

Unnamed: 0,0,1,2
0,-0.82617,-1.88291,0.386715
1,0.490278,-0.758414,1.610822
2,-1.493179,-1.320662,2.538482
3,0.06875,-1.320662,0.279243
4,-0.149722,-1.320662,1.203816
5,-0.507127,-1.320662,1.203816
