## How to handle missing values? 

In [21]:
import pandas as pd

In [22]:
#load the UFO reports CSV from its (shortened) URL into a DataFrame
ufo_path = 'http://bit.ly/uforeports'
ufo = pd.read_csv(filepath_or_buffer=ufo_path)

In [23]:
#show the last five rows to get a first look at where NaN values occur
ufo.tail(n=5)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00
18238,Eagle River,,,WI,12/31/2000 23:45
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45
18240,Ybor,,OVAL,FL,12/31/2000 23:59


### How to find NaN-values with isna and notna

In [24]:
#IMPORTANT: PANDAS
#isna #isnull #NaN #missingvalues #explore #prepare #pandas

#isna returns a boolean mask: True where a value is missing, False otherwise
#(isnull is an alias of isna and behaves identically)
#summing the mask counts True as 1 and False as 0,
#which gives the number of NaN values per column
ufo.isna().sum(axis='index')

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [25]:
#IMPORTANT: PANDAS
#isna #isnull #NaN #missingvalues #explore #prepare #pandas

#share of missing values per column, as a fraction between 0 and 1
#(the mean of a boolean mask is the proportion of True values;
#multiply by 100 to get a percentage)
ufo.isna().mean(axis='index')

City               0.001371
Colors Reported    0.842004
Shape Reported     0.144948
State              0.000000
Time               0.000000
dtype: float64

In [26]:
#IMPORTANT: PANDAS
#isna #isnull #NaN #missingvalues #cols #explore #prepare #pandas

#only keep cols in which less than 15% of the values are NaN
#the columns are selected with a boolean mask on the per-column NaN share
#instead of dropna(thresh=len(ufo)*0.85, axis='columns'):
# - thresh is documented as an int (minimum number of non-NaN values),
#   but len(ufo)*0.85 is a float
# - thresh compares with >=, so a column with exactly 15% NaN would be kept,
#   which contradicts "less than 15%"
max_nan_share = 0.15
ufo.loc[:, ufo.isna().mean() < max_nan_share].head()

Unnamed: 0,City,Shape Reported,State,Time
0,Ithaca,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,OTHER,NJ,6/30/1930 20:00
2,Holyoke,OVAL,CO,2/15/1931 14:00
3,Abilene,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,LIGHT,NY,4/18/1933 19:00


In [27]:
#IMPORTANT: PANDAS
#notna #notnull #NaN #missingvalues #explore #prepare #pandas

#notna is the opposite of isna: True where a value is present, False where it is missing
#(notnull is an alias of notna and behaves identically)
#summing the mask counts True as 1 and False as 0,
#which gives the number of non-missing values per column
ufo.notna().sum(axis='index')

City               18216
Colors Reported     2882
Shape Reported     15597
State              18241
Time               18241
dtype: int64

In [28]:
#demonstration: summing booleans treats True as 1 and False as 0
pd.Series([True, False, True]).sum()

2

In [29]:
#select only the rows in which City is missing
ufo.loc[ufo['City'].isna()]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
21,,,,LA,8/15/1943 0:00
22,,,LIGHT,LA,8/15/1943 0:00
204,,,DISK,CA,7/15/1952 12:30
241,,BLUE,DISK,MT,7/4/1953 14:00
613,,,DISK,NV,7/1/1960 12:00
1877,,YELLOW,CIRCLE,AZ,8/15/1969 1:00
2013,,,,NH,8/1/1970 9:30
2546,,,FIREBALL,OH,10/25/1973 23:30
3123,,RED,TRIANGLE,WV,11/25/1975 23:00
4736,,,SPHERE,CA,6/23/1982 23:00


In [30]:
#compare shapes: the full frame vs. only the rows with a missing City
print(ufo.shape, ufo[ufo.City.isna()].shape, sep='\n')

(18241, 5)
(25, 5)


### How to drop NaN-values with dropna

In [31]:
#option 1: drop every row that contains at least one NaN
ufo.dropna(axis='index', how='any').shape

(2486, 5)

In [32]:
#option 2: drop a row only if all of its values are NaN
ufo.dropna(axis='index', how='all').shape

(18241, 5)

In [33]:
#IMPORTANT: PANDAS
#dropna #NaN #missingvalues #explore #prepare #pandas

#option 3: drop a row if at least one of the listed columns is NaN
ufo.dropna(how='any', subset=['City', 'Shape Reported']).shape

(15576, 5)

In [34]:
#option 4: drop a row only if all of the listed columns are NaN
ufo.dropna(how='all', subset=['City', 'Shape Reported']).shape

(18237, 5)

### How to fill NaN-values with fillna

In [35]:
#IMPORTANT: PANDAS
#value_counts #isna #NaN #missingvalues #explore #prepare #pandas

#count each distinct shape; dropna=False keeps NaN as its own entry
#(here NaN is the second most frequent value)
ufo.loc[:, 'Shape Reported'].value_counts(dropna=False)

LIGHT        2803
NaN          2644
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
ROUND           2
CRESCENT        2
PYRAMID         1
FLARE           1
HEXAGON         1
DOME            1
Name: Shape Reported, dtype: int64

In [36]:
#IMPORTANT: PANDAS
#fillna #NaN #missingvalues #explore #prepare #pandas

#fill in NaN values with a constant value
#the result is assigned back to the column instead of calling
#fillna(..., inplace=True) on ufo['Shape Reported']: the in-place call acts on
#an intermediate object selected from the frame, which triggers a chained-assignment
#warning in pandas 2.x and does not update ufo once Copy-on-Write is enabled
#note: 'VARIOUS' is already an existing category, so the filled rows are merged into it
ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value='VARIOUS')

In [37]:
#check the result: NaN should no longer appear and VARIOUS should have grown accordingly
ufo.loc[:, 'Shape Reported'].value_counts(dropna=False)

VARIOUS      2977
LIGHT        2803
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
ROUND           2
CRESCENT        2
HEXAGON         1
DOME            1
FLARE           1
PYRAMID         1
Name: Shape Reported, dtype: int64

In [38]:
#Information
#scikit-learn offers much more advanced concepts for filling NaNs
#filling NaNs with scikit-learn is much more efficient and less time-consuming when using pipelines