# 누락값 확인하기
    - 판다스 라이브러리 사용
        - pd.isnull(x), pd.isna(x)
    - 넘파이 라이브러리 사용
        - np.isnan(x)
        - numpy: 수학이나 과학 연산을 위해 만든 파이썬 라이브러리
    - math 라이브러리 사용

## 1. 누락값이란?
- 누락값은 0, '' 값과는 다른 개념
- 데이터 자체가 없다는 의미
- '같다'라는 개념도 없다.
- 누락값과 True, False, 0, ''를 비교한 결과
- 비교할 값 자체가 없으므로 False 출력

In [None]:
import numpy as np

In [11]:
from numpy import NaN, NAN, nan

In [5]:
NaN == False

False

In [6]:
NaN == True

False

In [27]:
NaN is 'ad'

False

In [28]:
pd.isna(nan)

True

#### 판다스에서 null 파악하기
- pd.isnull(x)
    - pd.isna(x)
- pd.notnull(x)

In [17]:
import pandas as pd
print(pd.isnull(NAN))
print(pd.isnull(NaN))
print(pd.isnull(nan))

print(pd.notnull('da'))

True
True
True
True


## 2 . 누락값이 생기는 이유

In [33]:
visited = pd.read_csv('data/survey_visited.csv')
survey = pd.read_csv('data/survey_survey.csv')

In [34]:
visited.head(3)

Unnamed: 0,ident,site,dated
0,619,DR-1,1927-02-08
1,622,DR-1,1927-02-10
2,734,DR-3,1939-01-07


In [35]:
survey.head(3)

Unnamed: 0,taken,person,quant,reading
0,619,dyer,rad,9.82
1,619,dyer,sal,0.13
2,622,dyer,rad,7.8


In [36]:
vs = visited.merge(survey, left_on='ident', right_on='taken')
print(vs) # dated부분이 nan이다

    ident   site       dated  taken person quant  reading
0     619   DR-1  1927-02-08    619   dyer   rad     9.82
1     619   DR-1  1927-02-08    619   dyer   sal     0.13
2     622   DR-1  1927-02-10    622   dyer   rad     7.80
3     622   DR-1  1927-02-10    622   dyer   sal     0.09
4     734   DR-3  1939-01-07    734     pb   rad     8.41
5     734   DR-3  1939-01-07    734   lake   sal     0.05
6     734   DR-3  1939-01-07    734     pb  temp   -21.50
7     735   DR-3  1930-01-12    735     pb   rad     7.22
8     735   DR-3  1930-01-12    735    NaN   sal     0.06
9     735   DR-3  1930-01-12    735    NaN  temp   -26.00
10    751   DR-3  1930-02-26    751     pb   rad     4.35
11    751   DR-3  1930-02-26    751     pb  temp   -18.50
12    751   DR-3  1930-02-26    751   lake   sal     0.10
13    752   DR-3         NaN    752   lake   rad     2.19
14    752   DR-3         NaN    752   lake   sal     0.09
15    752   DR-3         NaN    752   lake  temp   -16.00
16    752   DR

## 3. 데이터를 입력할 떄 누락값이생기는 이유


In [37]:
#시리즈
num_legs = pd.Series({
    'goat':4
    ,'amoeba':nan
})
print(num_legs)

goat      4.0
amoeba    NaN
dtype: float64


In [38]:
scientists = pd.DataFrame({
    'Name':['Rosaline Franklin','William Gosset']
    ,'Occupation':['Chemist','Statistician']
    ,'Born':['1920-07-05','1930-09-12']
    ,'Died':['1960-02-03','1970-07-11']
    ,'missing':[NaN,nan]
})

## 4. 범위를 지정하여 데이터를 추출할 때 누락값이 생기는 경우

In [52]:
gapminder = pd.read_csv('data/gapminder.tsv',sep='\t')
gapminder.shape # (1704, 6)
gapminder.head(3)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071


In [45]:
life_exp = gapminder.groupby(['year'])
list(life_exp)
list(life_exp)[0]

(1952,                  country continent  year  lifeExp       pop    gdpPercap
 0            Afghanistan      Asia  1952   28.801   8425333   779.445314
 12               Albania    Europe  1952   55.230   1282697  1601.056136
 24               Algeria    Africa  1952   43.077   9279525  2449.008185
 36                Angola    Africa  1952   30.015   4232095  3520.610273
 48             Argentina  Americas  1952   62.485  17876956  5911.315053
 ...                  ...       ...   ...      ...       ...          ...
 1644             Vietnam      Asia  1952   40.412  26246839   605.066492
 1656  West Bank and Gaza      Asia  1952   43.160   1030585  1515.592329
 1668         Yemen, Rep.      Asia  1952   32.548   4963829   781.717576
 1680              Zambia    Africa  1952   42.038   2672000  1147.388831
 1692            Zimbabwe    Africa  1952   48.451   3080907   406.884115
 
 [142 rows x 6 columns])

In [46]:
# year로 groupby를 한 후, lifeExp를 평균내라. -> 그룹마다 lifeExp 평균
life_exp = gapminder.groupby(['year'])['lifeExp'].mean()
life_exp

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [47]:
life_exp.loc[range(2000,2010),]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


year
2000          NaN
2001          NaN
2002    65.694923
2003          NaN
2004          NaN
2005          NaN
2006          NaN
2007    67.007423
2008          NaN
2009          NaN
Name: lifeExp, dtype: float64

In [51]:
y2000 = life_exp[life_exp.index>2000]
y2000

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

## 5.  누락값 개수 구하기
- count()
    - 누락값이 아닌 값의 개수를 구함

In [55]:
ebola = pd.read_csv('data/country_timeseries.csv')

In [60]:
print(ebola.count())
ebola.shape # (122,18)

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64


(122, 18)

In [67]:
# ebola.shape[0]에서 누락값이 아닌 값의 개수를 빼면 누락값의 개수를 구할 수 있다.
num_rows = ebola.shape[0] # 행의 개수
print(num_rows)
print()
num_missing = num_rows - ebola.count()
print(num_missing)
print('전체 누락값 수: ',sum(num_missing))

122

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64
전체 누락값 수:  1214


In [62]:
import numpy as np
np.count_nonzero(ebola['Cases_Guinea'].isnull())

29

In [63]:
np.count_nonzero(ebola.isnull())

1214

In [71]:
#value_counts 메서드는 지정한 열의 빈도를 구하느 메서드
print(ebola.Cases_Guinea.value_counts(dropna=False).head())# 누락값 포함
print(ebola.Cases_Guinea.value_counts(dropna=True).head()) # 누락값 제외

NaN      29
86.0      3
495.0     2
112.0     2
390.0     2
Name: Cases_Guinea, dtype: int64
86.0      3
112.0     2
390.0     2
495.0     2
2597.0    1
Name: Cases_Guinea, dtype: int64


## 6. 누락값 변경하기
- fillna

In [88]:
# fillna(x)메서드를 사용하면 누락값을 x로 변경
# df이 크고 메모리를 효율적으로 사용하고자 할 때 주로 사용
print(ebola[0:10][0:5])
print('-------------')
print(ebola.fillna(0).iloc[0:10,0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone  \
0    1/5/2015  289        2776.0            NaN            10030.0   
1    1/4/2015  288        2775.0            NaN             9780.0   
2    1/3/2015  287        2769.0         8166.0             9722.0   
3    1/2/2015  286           NaN         8157.0                NaN   
4  12/31/2014  284        2730.0         8115.0             9633.0   

   Cases_Nigeria  Cases_Senegal  Cases_UnitedStates  Cases_Spain  Cases_Mali  \
0            NaN            NaN                 NaN          NaN         NaN   
1            NaN            NaN                 NaN          NaN         NaN   
2            NaN            NaN                 NaN          NaN         NaN   
3            NaN            NaN                 NaN          NaN         NaN   
4            NaN            NaN                 NaN          NaN         NaN   

   Deaths_Guinea  Deaths_Liberia  Deaths_SierraLeone  Deaths_Nigeria  \
0         1786.0          

In [94]:
print(ebola.iloc[0:10,0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286           NaN         8157.0                NaN
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0            NaN             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0            NaN             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [95]:
# 누락값을 앞의 값으로 채운다print(ebola.iloc[0:10,0:5])
print(ebola.fillna(method="ffill").iloc[0:10,0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2769.0         8157.0             9722.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         8018.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7977.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [98]:
# 누락값을 뒤의 값으로 채운다
print(ebola.fillna(method='bfill').iloc[0:10, 0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2749.5         8157.0             9677.5
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         7997.5             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7919.5             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [99]:
# 누락값이 있는 양쪽의 평균으로 대체
print(ebola.interpolate().iloc[0:10, 0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2749.5         8157.0             9677.5
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         7997.5             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7919.5             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


##  7.누락값 삭제
    df.dropna()

In [101]:
ebola_dropna = ebola.dropna() 
ebola_dropna.shape # (1, 18)
ebola_dropna 

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
19,11/18/2014,241,2047.0,7082.0,6190.0,20.0,1.0,4.0,1.0,6.0,1214.0,2963.0,1267.0,8.0,0.0,1.0,0.0,6.0


## 8.누락값이 포함된 데이터 계산하기

In [107]:
ebola['Cases_multiple'] = ebola['Cases_Guinea'] \
        + ebola['Cases_Liberia'] + ebola['Cases_SierraLeone']
ebola.shape  # (122, 19)
ebola.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali,Cases_multiple
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,,20657.0
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,,20478.0


In [103]:
ebloa_subset = ebola.loc[:,['Cases_Guinea', 'Cases_Liberia' \
                            , 'Cases_SierraLeone','Cases_multiple']]
ebloa_subset

Unnamed: 0,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_multiple
0,2776.0,,10030.0,
1,2775.0,,9780.0,
2,2769.0,8166.0,9722.0,20657.0
3,,8157.0,,
4,2730.0,8115.0,9633.0,20478.0
...,...,...,...,...
117,103.0,8.0,6.0,117.0
118,86.0,,,
119,86.0,,,
120,86.0,,,


In [108]:
# 누락값을 무시한 채 계산하려면 skipna인잣값을 True로 설정
ebola.Cases_Guinea.sum(skipna=True)

84729.0