# Handling Missing Data
* Missing data is referred to as NA.
* For numeric data, pandas uses NaN to represent missing data.

In [10]:
import numpy as np
import pandas as pd

# Numeric data: `np.nan` and `None` are both stored as NaN,
# which is why the resulting Series has a float dtype.
data = pd.Series(
    [1, np.nan, 2, 7, None],
)
print(data)

# Boolean mask that is True wherever a value is missing.
pd.isnull(data)

0    1.0
1    NaN
2    2.0
3    7.0
4    NaN
dtype: float64


0    False
1     True
2    False
3    False
4     True
dtype: bool

In [11]:
# Non-numeric data: here `None` is kept as `None` (it is not converted to NaN),
# yet it is still reported as a missing value.
string_data = pd.Series(
    ['a', 'b', np.nan, 'c', None],
)
print(string_data)

# Boolean mask that is True for both the NaN and the None entry.
pd.isnull(string_data)

0       a
1       b
2     NaN
3       c
4    None
dtype: object


0    False
1    False
2     True
3    False
4     True
dtype: bool

Functions related to missing-data handling:
* `dropna`
* `fillna`
* `isnull`, `notnull`

## Filtering out missing data

In [18]:
# For a Series there are two equivalent ways to discard missing entries.
s = pd.Series([1, np.nan, 2, 7, None])

# 1) the dedicated method
s1 = s.dropna()
# 2) selection with a boolean mask of the non-missing entries -- same result
s1 = s.loc[pd.notnull(s)]
s1

0    1.0
2    2.0
3    7.0
dtype: float64

In [23]:
# For DataFrames: a small example whose rows differ in how much data is missing.
df = pd.DataFrame([
    [2, 6, 1],                  # no missing values
    [-1, np.nan, np.nan],       # one non-NA value
    [np.nan, np.nan, np.nan],   # entirely NA
    [np.nan, -6, 9],            # two non-NA values
])
df

Unnamed: 0,0,1,2
0,2.0,6.0,1.0
1,-1.0,,
2,,,
3,,-6.0,9.0


In [None]:
# Rows (axis=0 and how='any' are the defaults, spelled out here for clarity):
df1 = df.dropna(axis=0, how='any')  # drop every row that contains at least one NA value
df1 = df.dropna(axis=0, how='all')  # drop only the rows whose values are all NA
# Columns:
df1 = df.dropna(axis=1, how='any')  # drop every column that contains at least one NA value
df1 = df.dropna(axis=1, how='all')  # drop only the columns whose values are all NA

In [28]:
df.dropna(thresh=2)  # keep rows with at least 2 non-NA values (rows with fewer are dropped)

Unnamed: 0,0,1,2
0,2.0,6.0,1.0
3,,-6.0,9.0


## Filling in missing data

In [None]:
# Fill every NA with the same constant.
df1 = df.fillna(value=0.0)
# Fill with a different value per column: the dict keys are column labels.
df1 = df.fillna(value={0: 0.0, 1: 0.1, 2: 0.2})
# With inplace=True the fill is applied to `df` itself.
df.fillna(value=0.0, inplace=True)

# Data Transformation
## Removing duplicates

In [33]:
# Six rows over two columns; rows 2 and 5 repeat an earlier (k, v) pair.
df = pd.DataFrame({
    'k': [1, 2, 1, 1, 2, 2],
    'v': ['a', 'a', 'a', 'b', 'b', 'b'],
})
df

Unnamed: 0,k,v
0,1,a
1,2,a
2,1,a
3,1,b
4,2,b
5,2,b


In [34]:
df.duplicated()  # True for each row that repeats an earlier row; the first occurrence stays False

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

In [35]:
df.duplicated(subset=['k'])  # same check, but rows are compared on column 'k' only

0    False
1    False
2     True
3     True
4     True
5     True
dtype: bool

In [None]:
df.drop_duplicates()  # remove rows that duplicate an earlier row (all columns compared)
df.drop_duplicates(subset=['k'])  # same, but rows are compared on column 'k' only

## Replacing values

In [51]:
s = pd.Series([15, 12, 999, 6, 999, -1])

# A list of values with a single replacement: 999 and -1 both become NA.
s.replace(to_replace=[999, -1], value=np.nan)
# Two parallel lists: 999 becomes NA and -1 becomes 0.
s.replace(to_replace=[999, -1], value=[np.nan, 0])
# The same mapping written as a dictionary: 999 becomes NA and -1 becomes 0.
s.replace(to_replace={999: np.nan, -1: 0})

0    15.0
1    12.0
2     NaN
3     6.0
4     NaN
5     0.0
dtype: float64

## Transforming values using function or mapping

In [45]:
s = pd.Series(['Al', 'Bo', 'Ca'])

s.str.lower()  # lowercase every element through the `.str` accessor

0    al
1    bo
2    ca
dtype: object

In [49]:
# using map(), passing a function:
def first_letter_lower(text):
    """Return the first character of `text`, converted to lowercase."""
    return text[0].lower()


s1 = s.map(first_letter_lower)
print(s1)

# using map(), passing a dictionary: each element is looked up as a key
d = {'a': 1, 'b': 2, 'c': 3}
s1.map(d)

0    a
1    b
2    c
dtype: object


0    1
1    2
2    3
dtype: int64

## Renaming axis indices

In [72]:
df = pd.DataFrame([['a', 1], ['b', 2], ['c', 3]], columns=['key', 'value'])
print(df, '\n')

# Compute upper-cased column labels. `Index.map` returns a new Index;
# `df` itself is not renamed by this line.
new_col_names = df.columns.map(str.upper)
print(new_col_names)

# Likewise, compute string versions of the row labels without touching `df`.
new_index = df.index.map(str)
print(new_index)

# rename() takes a function (or a dict) per axis and returns a renamed frame:
# row labels become 'r0', 'r1', ... and column labels are title-cased.
df1 = df.rename(index=lambda i: 'r{}'.format(i),
                columns=str.title)

# Rename specific columns with a dict. `rename(..., inplace=True)` would return
# None and the cell would display nothing, so rebind `df` to the renamed frame
# and end the cell with `df` to show the result.
df = df.rename(columns={'key': 'KEY', 'value': 'VALUE'})
df

  key  value
0   a      1
1   b      2
2   c      3 

Index(['KEY', 'VALUE'], dtype='object')
Index(['0', '1', '2'], dtype='object')


Unnamed: 0,KEY,VALUE
0,a,1
1,b,2
2,c,3
