# Handling Missing Data
* Missing data is referred to as NA.
* For numeric data, pandas uses NaN to represent missing data.

In [10]:
import numpy as np
import pandas as pd

# numeric data: both np.nan and `None` end up as NaN in the resulting float Series
raw_values = [1, np.nan, 2, 7, None]
data = pd.Series(raw_values)
print(data)

# boolean mask marking the missing entries
data.isnull()

0    1.0
1    NaN
2    2.0
3    7.0
4    NaN
dtype: float64


0    False
1     True
2    False
3    False
4     True
dtype: bool

In [11]:
# non-numeric data: `None` stays `None` in the Series,
# but it is still treated as NA (just like np.nan)
raw_strings = ['a', 'b', np.nan, 'c', None]
string_data = pd.Series(raw_strings)
print(string_data)

# boolean mask marking the missing entries
string_data.isnull()

0       a
1       b
2     NaN
3       c
4    None
dtype: object


0    False
1    False
2     True
3    False
4     True
dtype: bool

Functions related to missing-data handling:
* `dropna`
* `fillna`
* `isnull`, `notnull`

## Filtering out missing data

In [18]:
# for Series: two equivalent ways of discarding NA entries
s = pd.Series([1, np.nan, 2, 7, None])

s1 = s.dropna()      # dedicated method
not_missing = s.notnull()
s1 = s[not_missing]  # same result, via a boolean mask
s1

0    1.0
2    2.0
3    7.0
dtype: float64

In [23]:
# for DataFrames: a 4x3 frame mixing values and NA
# (row 2 is entirely NA, rows 1 and 3 are partially NA)
rows = [
    [2, 6, 1],
    [-1, np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, -6, 9],
]
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,2.0,6.0,1.0
1,-1.0,,
2,,,
3,,-6.0,9.0


In [None]:
# row-wise (the default axis)
df1 = df.dropna(how='any')  # drop rows that contain at least one NA value
df1 = df.dropna(how='all')  # drop rows whose values are all NA
# column-wise
df1 = df.dropna(axis='columns', how='any')  # drop columns that contain at least one NA value
df1 = df.dropna(axis='columns', how='all')  # drop columns whose values are all NA

In [28]:
# keep only rows with at least 2 non-NA values
# (i.e. drop rows that contain fewer than 2 non-NA values)
df.dropna(thresh=2)

Unnamed: 0,0,1,2
0,2.0,6.0,1.0
3,,-6.0,9.0


## Filling in missing data

In [None]:
df1 = df.fillna(value=0.0)  # every NA becomes 0.0
column_fill = {0: 0.0, 1: 0.1, 2: 0.2}
df1 = df.fillna(value=column_fill)  # a separate fill value for each column label
df.fillna(value=0.0, inplace=True)  # fills in place: `df` itself is modified

# Data Transformation
## Removing duplicates

In [33]:
# small frame in which some (k, v) rows repeat
keys = [1, 2, 1, 1, 2, 2]
vals = ['a', 'a', 'a', 'b', 'b', 'b']
df = pd.DataFrame(list(zip(keys, vals)), columns=['k', 'v'])
df

Unnamed: 0,k,v
0,1,a
1,2,a
2,1,a
3,1,b
4,2,b
5,2,b


In [34]:
# flag each row that is a duplicate of an earlier row (has been observed before);
# the first occurrence of a row is not flagged
df.duplicated()

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

In [35]:
# check duplicates based on column 'k' only: a row is flagged once its
# 'k' value has already appeared in an earlier row
df.duplicated(subset=['k'])

0    False
1    False
2     True
3     True
4     True
5     True
dtype: bool

In [None]:
df.drop_duplicates()  # remove duplicate rows (rows compared on all columns)
df.drop_duplicates(subset=['k'])  # remove rows that are duplicates with respect to column 'k' only

## Replacing values

In [51]:
s = pd.Series([15, 12, 999, 6, 999, -1])
sentinels = [999, -1]
s.replace(to_replace=sentinels, value=np.nan)       # 999 and -1 both become NA
s.replace(to_replace=sentinels, value=[np.nan, 0])  # pairwise: 999 -> NA, -1 -> 0
s.replace(to_replace={999: np.nan, -1: 0})          # same mapping, given as a dict

0    15.0
1    12.0
2     NaN
3     6.0
4     NaN
5     0.0
dtype: float64

## Transforming values using function or mapping

In [45]:
names = ['Al', 'Bo', 'Ca']
s = pd.Series(names)

# vectorized string method: convert each element to lowercase
s.str.lower()

0    al
1    bo
2    ca
dtype: object

In [49]:
# using map(), passing a function:
def first_letter_lower(name):
    """Return the first character of `name`, lowercased."""
    return name[0].lower()

s1 = s.map(first_letter_lower)
print(s1)

# using map(), passing a dictionary (each element is looked up as a key):
d = dict(a=1, b=2, c=3)
s1.map(d)

0    a
1    b
2    c
dtype: object


0    1
1    2
2    3
dtype: int64

## Renaming axis indices

In [50]:
df = pd.DataFrame(data=[['a', 1], ['b', 2], ['c', 3]],
                  columns=['key', 'value'])
print(df, '\n')

# derive new column names (this only builds a new Index; `df` is not changed)
new_col_names = df.columns.map(str.upper)
print(new_col_names)

# derive new index labels (again, `df` is not changed)
new_index = df.index.map(str)
print(new_index)

# rename columns and index
df1 = df.rename(index='r{}'.format, columns=str.title)  # a function is applied to each label
df.rename(columns={'key': 'KEY', 'value': 'VALUE'}, inplace=True)  # dict mapping; modifies `df`

  key  value
0   a      1
1   b      2
2   c      3 

Index(['KEY', 'VALUE'], dtype='object')
Index(['0', '1', '2'], dtype='object')


In [52]:
# prepend 'c_' to every column label
df.add_prefix('c_')

Unnamed: 0,c_KEY,c_VALUE
0,a,1
1,b,2
2,c,3


## Discretization and binning of continuous data

In [8]:
import numpy as np
import pandas as pd

# 20 random values; no seed is set, so they differ from run to run
data = np.random.rand(20)

### categorize `data` into bins [0, 0.2), [0.2, 0.4), ..., [0.8, 1.0)
# * `right=False` makes the intervals closed on their left edge
#   rather than on their right edge
# * the returned `cat` is a Categorical object
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
cat = pd.cut(data, bins=bins, right=False)
cat

[[0.8, 1.0), [0.4, 0.6), [0.2, 0.4), [0.8, 1.0), [0.6, 0.8), ..., [0.4, 0.6), [0.2, 0.4), [0.2, 0.4), [0.2, 0.4), [0.0, 0.2)]
Length: 20
Categories (5, interval[float64]): [[0.0, 0.2) < [0.2, 0.4) < [0.4, 0.6) < [0.6, 0.8) < [0.8, 1.0)]

In [9]:
# codes[i] is the index (into cat.categories) of the bin to which data[i] belongs
cat.codes

array([4, 2, 1, 4, 3, 3, 4, 1, 3, 3, 1, 1, 3, 1, 1, 2, 1, 1, 1, 0],
      dtype=int8)

In [10]:
# the bins themselves, as an IntervalIndex
cat.categories

IntervalIndex([[0.0, 0.2), [0.2, 0.4), [0.4, 0.6), [0.6, 0.8), [0.8, 1.0)],
              closed='left',
              dtype='interval[float64]')

In [11]:
# get value count of each bin
# (the top-level `pd.value_counts` function is deprecated in recent pandas;
#  wrapping the Categorical in a Series and calling the method is the
#  supported spelling)
pd.Series(cat).value_counts()

[0.2, 0.4)    9
[0.6, 0.8)    5
[0.8, 1.0)    3
[0.4, 0.6)    2
[0.0, 0.2)    1
dtype: int64

In [26]:
# specify names/labels of bins (one label per bin, in bin order)
pd.cut(data, bins, right=False, labels=['xs', 's', 'm', 'l', 'xl'])

[xl, m, s, xl, l, ..., m, s, s, s, xs]
Length: 20
Categories (5, object): [xs < s < m < l < xl]

In [19]:
lowest, highest = data.min(), data.max()
print(lowest, highest)

### categorize `data` into equal-length bins
# an integer `bins` asks for that many equal-width bins over the data range;
# `precision` controls how many digits are kept in the bin edges
cat = pd.cut(data, bins=5, right=False, precision=6)
cat.categories

0.16651280924994605 0.8811973399035696


IntervalIndex([[0.166513, 0.30945), [0.30945, 0.452387), [0.452387, 0.595324), [0.595324, 0.73826), [0.73826, 0.881912)],
              closed='left',
              dtype='interval[float64]')

In [25]:
### categorize based on quantiles
# an integer q: bin edges are the 0, 20, 40, 60, 80, 100 percentiles
cat = pd.qcut(data, q=5)
# a list q: specify the quantiles to use as bin edges
quantile_edges = [0.0, 0.1, 0.5, 0.9, 1.0]
cat = pd.qcut(data, q=quantile_edges)

IntervalIndex([(0.166, 0.247], (0.247, 0.307], (0.307, 0.529], (0.529, 0.703], (0.703, 0.881]],
              closed='right',
              dtype='interval[float64]')

## Permutation and random sampling

In [28]:
# random permutation of the integers 0..4
np.random.permutation(5)

array([4, 0, 1, 3, 2])

In [35]:
# random sample, without replacement
grid = np.arange(8).reshape((4, 2))
df = pd.DataFrame(grid)
df.sample(n=2)  # pick 2 of the 4 rows

Unnamed: 0,0,1
2,4,5
1,2,3


In [40]:
# sampling with replacement (allowing repeats), so more rows can be
# drawn (8) than the frame contains (4)
df.sample(8, replace=True)

Unnamed: 0,0,1
1,2,3
3,6,7
1,2,3
2,4,5
0,0,1
3,6,7
3,6,7
2,4,5


## Indicator/dummy variables 

In [49]:
letters = ['a', 'b', 'a', 'c', 'b', 'c']
df = pd.DataFrame({'x': letters, 'y': range(6)})
print(df)

# one indicator column per distinct value of 'x', each named 'X_<value>'
x_dummy = pd.get_dummies(df['x'], prefix='X')
x_dummy

   x  y
0  a  0
1  b  1
2  a  2
3  c  3
4  b  4
5  c  5


Unnamed: 0,X_a,X_b,X_c
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,1
4,0,1,0
5,0,0,1


In [47]:
# place the indicator columns next to column 'y' (rows are matched on the index)
df[['y']].join(x_dummy)

Unnamed: 0,y,X_a,X_b,X_c
0,0,1,0,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,0,1,0
5,5,0,0,1


# String Manipulation
## Vectorized string functions
* [a summary of related methods](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#method-summary)

In [18]:
import numpy as np
import pandas as pd

# e-mail addresses keyed by name; 'Du' has no address (NA)
names = ['Al', 'Bo', 'Ci', 'Du']
emails = ['al@google.com', 'bo@gmail.com', 'ci@gmail.com', np.nan]
data = pd.Series(emails, index=names)
data

Al    al@google.com
Bo     bo@gmail.com
Ci     ci@gmail.com
Du              NaN
dtype: object

In [19]:
# example 1: test each string for the substring 'gmail'; the NaN entry stays NaN
data.str.contains('gmail')

Al    False
Bo     True
Ci     True
Du      NaN
dtype: object

In [22]:
# example 2: take the first five characters of each string; the NaN entry stays NaN
data.str[0:5]

Al    al@go
Bo    bo@gm
Ci    ci@gm
Du      NaN
dtype: object

In [20]:
# example 3
import re

# three capture groups: the part before '@', the part between '@' and '.',
# and the 2-4 letter suffix after the '.'
pattern = r'([a-z0-9_]+)@([a-z0-9_]+)\.([a-z]{2,4})'
matches = data.str.findall(pattern, flags=re.I)  # re.I: case-insensitive matching
matches

Al    [(al, google, com)]
Bo     [(bo, gmail, com)]
Ci     [(ci, gmail, com)]
Du                    NaN
dtype: object

* Note. `matches[i]` is a list (except for the NaN value)

In [21]:
# indexing the list of each (non-NA) element: take its first match tuple
matches.str[0]

Al    (al, google, com)
Bo     (bo, gmail, com)
Ci     (ci, gmail, com)
Du                  NaN
dtype: object

In [24]:
# example 4
# the named groups (GR0, GR1, GR2) become the column names of the result
pattern = r'(?P<GR0>[a-z0-9_]+)@(?P<GR1>[a-z0-9_]+)\.(?P<GR2>[a-z]{2,4})'
# returns a DataFrame where the fields are the captured groups
data.str.extract(pat=pattern, flags=re.I)

Unnamed: 0,GR0,GR1,GR2
Al,al,google,com
Bo,bo,gmail,com
Ci,ci,gmail,com
Du,,,
