### 09 결측값 알아보기

#### [09 - 1] 결측값이란?

In [2]:
# Bring NumPy's missing-value constant into scope.
# NumPy 2.0 removed the `NaN` and `NAN` aliases, so import the canonical
# lowercase `nan` and recreate the aliases locally; all three names refer to
# the same float object.
from numpy import nan

NaN = NAN = nan

In [3]:
# Comparing a missing value with anything -- even with itself or with one of
# its aliases -- evaluates to False. One result is printed per line.
print(
    *(NaN == other for other in (True, 0, '', NaN, NAN, nan)),
    nan == NAN,
    sep='\n',
)

False
False
False
False
False
False
False


In [4]:
import pandas as pd

# Detect missing values: pd.isnull is applied to each NaN alias in turn and
# one result is printed per line.
print(*(pd.isnull(alias) for alias in (NaN, nan, NAN)), sep='\n')

True
True
True


In [5]:
# Check that values are NOT missing: pd.notnull is applied to NaN, a number
# and a string, printing one result per line.
print(*(pd.notnull(value) for value in (NaN, 42, 'missing')), sep='\n')

False
True
True


#### [09 - 2] 결측값은 왜 생길까?

##### 1. 데이터를 불러올 때 생기는 결측값

In [6]:
# Load the data with read_csv's default settings.
# In the recorded output the empty `dated` field of row 5 is shown as NaN.
visited_file = '../data/survey_visited.csv'
print(pd.read_csv(visited_file))

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [7]:
# Load the data with keep_default_na set to False.
# In the recorded output the empty `dated` field stays blank instead of NaN.
print(pd.read_csv(visited_file, keep_default_na = False))       ## missing values are not converted

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3            
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


In [8]:
# na_values: explicitly declare the empty string as a missing-value marker
# (the default markers remain switched off via keep_default_na=False).
print(pd.read_csv(visited_file, na_values = [""], keep_default_na = False))

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22


##### 2. 데이터를 연결할 때 생기는 결측값

In [9]:
# Load the two survey tables and print them one above the other, separated by
# a dashed rule.
visited = pd.read_csv('../data/survey_visited.csv')
survey = pd.read_csv('../data/survey_survey.csv')
print(visited, "---------------------", survey, sep="\n")

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22
---------------------
    taken person quant  reading
0     619   dyer   rad     9.82
1     619   dyer   sal     0.13
2     622   dyer   rad     7.80
3     622   dyer   sal     0.09
4     734     pb   rad     8.41
5     734   lake   sal     0.05
6     734     pb  temp   -21.50
7     735     pb   rad     7.22
8     735    NaN   sal     0.06
9     735    NaN  temp   -26.00
10    751     pb   rad     4.35
11    751     pb  temp   -18.50
12    751   lake   sal     0.10
13    752   lake   rad     2.19
14    752   lake   sal     0.09
15    752   lake  temp   -16.00
16    752    roe   sal    41.60
17    837   lake   rad     1.46
18    837   lake   sal     0.21
19    837    roe   sal    22.50
20    844    roe   rad    11.25


In [10]:
# Merge the tables: each visit row (key `ident`) is paired with its survey
# rows (key `taken`). `how="inner"` is the default, spelled out for clarity.
# Missing values already present in either table carry over into the result.
vs = visited.merge(
    right=survey,
    how="inner",
    left_on="ident",
    right_on="taken",
)
print(vs)

    ident   site       dated  taken person quant  reading
0     619   DR-1  1927-02-08    619   dyer   rad     9.82
1     619   DR-1  1927-02-08    619   dyer   sal     0.13
2     622   DR-1  1927-02-10    622   dyer   rad     7.80
3     622   DR-1  1927-02-10    622   dyer   sal     0.09
4     734   DR-3  1939-01-07    734     pb   rad     8.41
5     734   DR-3  1939-01-07    734   lake   sal     0.05
6     734   DR-3  1939-01-07    734     pb  temp   -21.50
7     735   DR-3  1930-01-12    735     pb   rad     7.22
8     735   DR-3  1930-01-12    735    NaN   sal     0.06
9     735   DR-3  1930-01-12    735    NaN  temp   -26.00
10    751   DR-3  1930-02-26    751     pb   rad     4.35
11    751   DR-3  1930-02-26    751     pb  temp   -18.50
12    751   DR-3  1930-02-26    751   lake   sal     0.10
13    752   DR-3         NaN    752   lake   rad     2.19
14    752   DR-3         NaN    752   lake   sal     0.09
15    752   DR-3         NaN    752   lake  temp   -16.00
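16    752   DR-3         NaN    752    roe   sal    41.60
17    837  MSK-4  1932-01-14    837   lake   rad     1.46
18    837  MSK-4  1932-01-14    837   lake   sal     0.21
19    837  MSK-4  1932-01-14    837    roe   sal    22.50
20    844   DR-1  1932-03-22    844    roe   rad    11.25
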

##### 3. 직접 입력한 결측값

In [11]:
# Create a Series that contains a missing value: labels and values are given
# separately, and the NaN entry makes the whole Series float64.
num_legs = pd.Series([4, nan], index=['goat', 'amoeba'])
print(num_legs)

goat      4.0
amoeba    NaN
dtype: float64


In [12]:
# Create a DataFrame whose "missing" column consists only of missing values.
scientists = pd.DataFrame(
    dict(
        Name=["Rosaline Franklin", "William Gosset"],
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        missing=[NaN, nan],
    )
)

print(scientists)

                Name    Occupation        Born        Died  missing
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16      NaN
1     William Gosset  Statistician  1876-06-13  1937-10-16      NaN


In [13]:
# Inspect the column dtypes (the all-NaN "missing" column is reported as
# float64 in the recorded output).
print(scientists.dtypes)

Name           object
Occupation     object
Born           object
Died           object
missing       float64
dtype: object


In [14]:
# Add a column in which every value is NaN: build the frame without the
# column first, then assign a single scalar to a new column label.
scientists = pd.DataFrame(
    dict(
        Name=["Rosaline Franklin", "William Gosset"],
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
    )
)

scientists["missing"] = nan     ## the scalar NaN fills every row of "missing"
print(scientists)

                Name    Occupation        Born        Died  missing
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16      NaN
1     William Gosset  Statistician  1876-06-13  1937-10-16      NaN


##### 4. 인덱스를 다시 설정할 때 생기는 결측값

In [15]:
# Load the tab-separated gapminder data.
gapminder = pd.read_csv('../data/gapminder.tsv', delimiter='\t')

# Average life expectancy for each year
life_exp = gapminder.groupby('year')['lifeExp'].agg('mean')
print(life_exp)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [16]:
# Keep only the entries whose year label is later than 2000.
y2000 = life_exp.loc[life_exp.index > 2000]
print(y2000)

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [17]:
# Reindex onto the years 2000-2009: only 2002 and 2007 have data, so every
# other year in the new index is filled with NaN.
print(y2000.reindex(range(2000,2010)))

year
2000          NaN
2001          NaN
2002    65.694923
2003          NaN
2004          NaN
2005          NaN
2006          NaN
2007    67.007423
2008          NaN
2009          NaN
Name: lifeExp, dtype: float64


#### [09 - 3] 결측값 다루기

##### 1. 결측값 처리하기

##### 1) 결측값 개수 구하기

In [None]:
# Load the file
ebola = pd.read_csv('../data/country_timeseries.csv')

# Number of non-NaN values in each column
print(ebola.count())

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64


In [None]:
# Total number of rows in the frame
num_rows = len(ebola)

# Missing values per column = total rows minus the non-missing count
num_missing = num_rows - ebola.count()
print(num_missing)

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64


In [None]:
import numpy as np

# Counting missing values, approach 1: total over the whole frame
print(np.count_nonzero(ebola.isnull()))     ## count_nonzero counts the entries that are not 0 (False)

# Missing values in one specific column
print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))

1214
29


In [33]:
# Counting missing values, approach 2: frequency table of the column.
# dropna=False keeps NaN as its own category instead of discarding it.
cnts = ebola["Cases_Guinea"].value_counts(dropna=False)
print(cnts)

Cases_Guinea
NaN       29
86.0       3
495.0      2
112.0      2
390.0      2
          ..
1199.0     1
1298.0     1
1350.0     1
1472.0     1
49.0       1
Name: count, Length: 89, dtype: int64


In [None]:
# Counting missing values, approach 3: keep only the frequency-table entry
# whose index label is NaN.
print(cnts[pd.isnull(cnts.index)])

Cases_Guinea
NaN    29
Name: count, dtype: int64


In [30]:
# Counting missing values, approach 4: sum the boolean isnull() mask
# (each True counts as 1).
print(ebola.Cases_Guinea.isnull().sum())

29


##### 2) 결측값 대체하기

In [None]:
# Replace every missing value with 0 (first five columns shown)
print(ebola.fillna(0).iloc[:, 0:5])

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0            0.0            10030.0
1      1/4/2015  288        2775.0            0.0             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286           0.0         8157.0                0.0
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            0.0                0.0
119   3/25/2014    3          86.0            0.0                0.0
120   3/24/2014    2          86.0            0.0                0.0
121   3/22/2014    0          49.0            0.0                0.0

[122 rows x 5 columns]


##### 3) 정방향 채우기

In [37]:
# Forward fill: replace each missing value with the last valid value found
# above it in the same column.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in
# recent pandas releases and emits a FutureWarning.
print(ebola.ffill().iloc[:, 0:5])      ## if a column starts with missing values, those leading rows stay NaN

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0            NaN            10030.0
1      1/4/2015  288        2775.0            NaN             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286        2769.0         8157.0             9722.0
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            8.0                6.0
119   3/25/2014    3          86.0            8.0                6.0
120   3/24/2014    2          86.0            8.0                6.0
121   3/22/2014    0          49.0            8.0                6.0

[122 rows x 5 columns]


  print(ebola.fillna(method = 'ffill').iloc[:, 0:5])      ## 열의 첫 행이 결측값이면 해당 데이터는 결측값으로 유지


##### 4) 역방향 채우기

In [38]:
# Backward fill: scanning from the bottom up, replace each missing value with
# the next valid value found below it in the same column. Trailing missing
# values at the end of a column stay NaN.
# DataFrame.bfill() is used because fillna(method='bfill') is deprecated in
# recent pandas releases and emits a FutureWarning.
print(ebola.bfill().iloc[:, 0:5])

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0         8166.0            10030.0
1      1/4/2015  288        2775.0         8166.0             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286        2730.0         8157.0             9633.0
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            NaN                NaN
119   3/25/2014    3          86.0            NaN                NaN
120   3/24/2014    2          86.0            NaN                NaN
121   3/22/2014    0          49.0            NaN                NaN

[122 rows x 5 columns]


  print(ebola.fillna(method = 'bfill').iloc[:, 0:5])


##### 5) 보간법으로 채우기

In [39]:
# Fill each missing value from the values on either side of it
# (interpolate() defaults to linear interpolation, so a single gap gets the
# midpoint of its two neighbours).
# Only the numeric columns are interpolated: calling interpolate() on a frame
# that still holds non-numeric (object) columns such as Date is deprecated in
# pandas and emits a FutureWarning. Non-numeric columns are kept unchanged.
numeric_cols = ebola.select_dtypes(include="number").columns
ebola_interp = ebola.copy()
ebola_interp[numeric_cols] = ebola[numeric_cols].interpolate()
print(ebola_interp.iloc[:, 0:5])

           Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0      1/5/2015  289        2776.0            NaN            10030.0
1      1/4/2015  288        2775.0            NaN             9780.0
2      1/3/2015  287        2769.0         8166.0             9722.0
3      1/2/2015  286        2749.5         8157.0             9677.5
4    12/31/2014  284        2730.0         8115.0             9633.0
..          ...  ...           ...            ...                ...
117   3/27/2014    5         103.0            8.0                6.0
118   3/26/2014    4          86.0            8.0                6.0
119   3/25/2014    3          86.0            8.0                6.0
120   3/24/2014    2          86.0            8.0                6.0
121   3/22/2014    0          49.0            8.0                6.0

[122 rows x 5 columns]


  print(ebola.interpolate().iloc[:, 0:5])


##### 6) 결측값 삭제하기

In [41]:
# Check the size of the ebola data as (rows, columns)
print(ebola.shape)

(122, 18)


In [42]:
# Keep only the rows that contain no missing value at all.
# axis=0 / how='any' are dropna's defaults, spelled out for clarity.
ebola_dropna = ebola.dropna(axis=0, how='any')
print(ebola_dropna.shape)

(1, 18)


##### 7) 결측값이 있는 데이터 계산하기

In [43]:
# Sum the case counts of the three countries row by row. A missing value in
# any of the three operands makes that row's total NaN as well.
ebola["Cases_multiple"] = (
    ebola["Cases_Guinea"]
    .add(ebola["Cases_Liberia"])
    .add(ebola["Cases_SierraLeone"])
)

In [45]:
# Select the three input columns together with their row-wise total and show
# the first ten rows.
ebola_subset = ebola[
    ["Cases_Guinea", "Cases_Liberia", "Cases_SierraLeone", "Cases_multiple"]
]
print(ebola_subset.iloc[:10])

   Cases_Guinea  Cases_Liberia  Cases_SierraLeone  Cases_multiple
0        2776.0            NaN            10030.0             NaN
1        2775.0            NaN             9780.0             NaN
2        2769.0         8166.0             9722.0         20657.0
3           NaN         8157.0                NaN             NaN
4        2730.0         8115.0             9633.0         20478.0
5        2706.0         8018.0             9446.0         20170.0
6        2695.0            NaN             9409.0             NaN
7        2630.0         7977.0             9203.0         19810.0
8        2597.0            NaN             9004.0             NaN
9        2571.0         7862.0             8939.0         19372.0


In [48]:
# Sum the column twice, one result per line: first skipping missing values
# (skipna=True), then without skipping them (skipna=False), which makes the
# total NaN.
print(
    *(ebola["Cases_Guinea"].sum(skipna=skip) for skip in (True, False)),
    sep="\n",
)

84729.0
nan


#### [09 - 4] 판다스 내장 NA 결측값 살펴보기

In [49]:
# Sample frame with an integer "Age" column and no missing values yet.
scientists = pd.DataFrame(
    dict(
        Name=["Rosaline Franklin", "William Gosset"],
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    )
)
print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61


In [50]:
# Column dtypes before any missing value is introduced
print(scientists.dtypes)

Name          object
Occupation    object
Born          object
Died          object
Age            int64
dtype: object


In [None]:
# Replace some values with pandas' built-in missing-value scalar, pd.NA.
# In the recorded output Name shows <NA>, while Age is displayed as 37.0 / NaN,
# i.e. the integer column no longer prints as integers.
scientists.loc[1, "Name"] = pd.NA
scientists.loc[1, "Age"] = pd.NA
print(scientists)

                Name    Occupation        Born        Died   Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16  37.0
1               <NA>  Statistician  1876-06-13  1937-10-16   NaN


In [53]:
# Check the dtypes again after the missing values were assigned
print(scientists.dtypes)

Name           object
Occupation     object
Born           object
Died           object
Age           float64
dtype: object


In [54]:
# Rebuild the same frame from scratch so the effect of assigning NumPy's NaN
# can be compared on identical starting data.
scientists = pd.DataFrame(
    dict(
        Name=["Rosaline Franklin", "William Gosset"],
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    )
)
print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61


In [None]:
# Replace some values with NumPy's NaN.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
scientists.loc[1, "Name"] = np.nan
scientists.loc[1, "Age"] = np.nan
print(scientists)

# Check the dtypes
print(scientists.dtypes)

## Result: whether pd.NA or np.nan is assigned, the resulting column dtypes are the same (Age becomes float64 in both recorded outputs)

                Name    Occupation        Born        Died   Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16  37.0
1                NaN  Statistician  1876-06-13  1937-10-16   NaN
Name           object
Occupation     object
Born           object
Died           object
Age           float64
dtype: object
