# Handling Missing Data

In [1]:

import numpy as np
import pandas as pd

val = np.array([1, None, 3, 4])
val

array([1, None, 3, 4], dtype=object)

In [2]:
for dtype in ['object', 'int']:
    print('dtype: ', dtype)
    %timeit np.arange(1E6, dtype = dtype).sum
    print()

dtype:  object
143 ms ± 1.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype:  int
5.85 ms ± 19.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [3]:
val.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [4]:
val2 = np.array([1, np.nan, 3, 4])
val2 

array([ 1., nan,  3.,  4.])

In [5]:
val2.dtype

dtype('float64')

In [6]:
np.nan +1 

nan

In [7]:
np.nan * 8

nan

In [8]:
# Ordinary aggregations propagate NaN, so each of these results is nan.
val2.sum(), val2.min(), val2.max()

(nan, nan, nan)

In [9]:
np.nansum(val2), np.nanmax(val2), np.nanmin(val2) # NaN-aware versions that ignore the missing values

(8.0, 4.0, 1.0)

In [10]:
pd.Series([1, None, 3, np.nan])

0    1.0
1    NaN
2    3.0
3    NaN
dtype: float64

In [11]:
pd.Series(['c', None, 'a', np.nan])

0       c
1    None
2       a
3     NaN
dtype: object

In [12]:
data = pd.Series([1, None, 'hello', np.nan])

In [13]:
data

0        1
1     None
2    hello
3      NaN
dtype: object

In [14]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [15]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [16]:
data[data.isnull()]

1    None
3     NaN
dtype: object

In [17]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [18]:
data.dropna()

0        1
2    hello
dtype: object

In [19]:
df = pd.DataFrame([[1, np.nan, 3],
                   [4, 5, 6],
                  [np.nan,8,9]])
df

Unnamed: 0,0,1,2
0,1.0,,3
1,4.0,5.0,6
2,,8.0,9


In [20]:
# dropna() drops every row (the default) or column that contains a NaN value
df.dropna()  

Unnamed: 0,0,1,2
1,4.0,5.0,6


In [21]:
df.dropna(axis= 'columns') # drops every column that contains at least one null value

Unnamed: 0,2
0,3
1,6
2,9


In [22]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,3,
1,4.0,5.0,6,
2,,8.0,9,


In [23]:
df.dropna(axis = 'columns', how= 'all') #  how='all' drops only the rows/columns whose values are all null

Unnamed: 0,0,1,2
0,1.0,,3
1,4.0,5.0,6
2,,8.0,9


In [24]:
data1 = pd.Series([1, np.nan, 2, None, 3], index= list('abcde'))
data1

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [25]:
data1.fillna(0)  #  fillna(0) returns a copy in which the null values are replaced by 0

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [26]:
data1

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [27]:
# Forward-fill: propagate the last valid value forward into the gaps.
# Series.ffill() replaces fillna(method='ffill'), which recent pandas deprecates.
data1.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [28]:
# Backward-fill: propagate the next valid value backward into the gaps.
# Series.bfill() replaces fillna(method='bfill'), which recent pandas deprecates.
data1.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [29]:
df

Unnamed: 0,0,1,2,3
0,1.0,,3,
1,4.0,5.0,6,
2,,8.0,9,


In [30]:
# Forward-fill along each row (axis=1); a leading NaN has nothing to its left
# and therefore stays NaN. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill').
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,3.0,3.0
1,4.0,5.0,6.0,6.0
2,,8.0,9.0,9.0


# Hierarchical Indexing

In [31]:
# Population figures keyed by plain (state, year) tuples.
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]

# The naive way of multi-indexing: the tuples themselves are the index labels.
pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [32]:
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [33]:
# Keep only the entries whose tuple label carries the year 2010.
pop[[(state, year) for (state, year) in pop.index if year == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [34]:
# the better way of multi indexing
index1 = pd.MultiIndex.from_tuples(index)

In [35]:
index1

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [36]:
pop = pop.reindex(index1)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [37]:
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

# MultiIndex as an Extra Dimension

In [38]:
pop_df = pop.unstack()   #  unstack() converts the multi-indexed Series into a DataFrame
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [39]:
pop_df.stack() #   stack() is the inverse of unstack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [40]:
# Combine the multi-indexed Series with a second column of values.
pop_df = pd.DataFrame({
    'total' : pop,
    'Under18' :[9267089, 9284094,4687374, 4318033,5906301, 6879014]})
pop_df # DataFrame built from the multi-indexed Series plus an added column

Unnamed: 0,Unnamed: 1,total,Under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


# Methods of MultiIndex Creation

In [41]:
# Passing a list of two (or more) index arrays builds the MultiIndex implicitly.
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.309326,0.736379
a,2,0.281746,0.420957
b,1,0.798712,0.915078
b,2,0.40469,0.31129


In [42]:
# A dict keyed by (state, year) tuples; pandas turns the keys into a MultiIndex.
data = {
    ('California', 2000): 33871648,
    ('California', 2010): 37253956,
    ('Texas', 2000): 20851820,
    ('Texas', 2010): 25145561,
    ('New York', 2000): 18976457,
    ('New York', 2010): 19378102,
}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [43]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1,2,1,2]])


MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [44]:
pd.MultiIndex.from_tuples([('a',1), ('a',2), ('b',1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [45]:
pd.MultiIndex.from_product([['a', 'b'], [1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [46]:
# Build the MultiIndex directly from its internal encoding: `levels` lists the
# unique values of each level, and `codes` gives, per level, the position
# within `levels` of every entry. `codes` is the current name of the old
# `labels` keyword, which pandas deprecated and later removed.
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

  


MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [47]:
pop.index.names = ['state', 'Year'] # Naming the levels of multi Index
pop

state       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

# MultiIndexing for columns

In [48]:
# Hierarchical row index (year x visit) and hierarchical columns (subject x type).
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
column = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

In [49]:
# Mock some Data
data = np.round(np.random.randn(4,6), 1)
data

array([[-0.9, -0.3,  0.9, -1.5, -1.3, -0.6],
       [ 2. , -0.6, -0.8,  1.3,  1. , -1.3],
       [-0.1,  2.4, -0.9, -0.6, -1.3,  0.3],
       [ 0.5, -0.3,  1.6,  1.7, -0.5,  0.7]])

In [50]:
health_data = pd.DataFrame(data, index= index, columns= column)
health_data 

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,-0.9,-0.3,0.9,-1.5,-1.3,-0.6
2013,2,2.0,-0.6,-0.8,1.3,1.0,-1.3
2014,1,-0.1,2.4,-0.9,-0.6,-1.3,0.3
2014,2,0.5,-0.3,1.6,1.7,-0.5,0.7


In [51]:
health_data['Bob']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,-0.9,-0.3
2013,2,2.0,-0.6
2014,1,-0.1,2.4
2014,2,0.5,-0.3


In [52]:
pop

state       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [53]:
pop['California', 2000]

33871648

In [54]:
pop['California']

Year
2000    33871648
2010    37253956
dtype: int64

In [55]:
pop['California': 'New York']

state       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [56]:
pop.loc['California': 'New York']

state       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [57]:
pop[:, 2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [58]:
pop[pop > 22000000]

state       Year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [59]:
pop[['California', 'Texas']]

state       Year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [60]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,-0.9,-0.3,0.9,-1.5,-1.3,-0.6
2013,2,2.0,-0.6,-0.8,1.3,1.0,-1.3
2014,1,-0.1,2.4,-0.9,-0.6,-1.3,0.3
2014,2,0.5,-0.3,1.6,1.7,-0.5,0.7


In [61]:
health_data['Guido', 'HR']

year  visit
2013  1        0.9
      2       -0.8
2014  1       -0.9
      2        1.6
Name: (Guido, HR), dtype: float64

In [62]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,-0.9,-0.3
2013,2,2.0,-0.6


In [63]:
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1       -0.9
      2        2.0
2014  1       -0.1
      2        0.5
Name: (Bob, HR), dtype: float64

In [64]:
health_data.loc[(:, 1), (:, 'HR')]

SyntaxError: invalid syntax (<ipython-input-64-fb34fa30ac09>, line 1)

In [65]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,-0.9,0.9,-1.3
2014,1,-0.1,-0.9,-1.3


# Rearranging Multi-Indices

In [66]:
#Sorted and Unsorted indices
index = pd.MultiIndex.from_product([['a','c','b'], [1,2]])
data = pd.Series(np.random.randn(6), index = index)
data.index.names = ['char', 'int']

In [67]:
data

char  int
a     1      0.436416
      2     -0.940494
c     1      0.714998
      2      2.088456
b     1      0.061988
      2     -1.310234
dtype: float64

In [68]:
# Attempt a partial slice on the MultiIndex and report the error it raises.
try:
    data['a':'b']
except KeyError as err:
    print(type(err), err, sep='\n')

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [69]:
data = data.sort_index()
data

char  int
a     1      0.436416
      2     -0.940494
b     1      0.061988
      2     -1.310234
c     1      0.714998
      2      2.088456
dtype: float64

In [70]:
data['a':'b']

char  int
a     1      0.436416
      2     -0.940494
b     1      0.061988
      2     -1.310234
dtype: float64

In [71]:
#Stacking and unstacking indices
pop

state       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [72]:
pop.unstack(level=0)

state,California,New York,Texas
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [73]:
pop.unstack(level=1)

Year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [74]:
pop.unstack().stack()

state       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [75]:
#Index setting and resetting
pop

state       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [76]:
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,state,Year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [77]:
pop_flat.set_index(['state', 'Year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,Year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [78]:
#Data Aggregations on Multi-Indices
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,-0.9,-0.3,0.9,-1.5,-1.3,-0.6
2013,2,2.0,-0.6,-0.8,1.3,1.0,-1.3
2014,1,-0.1,2.4,-0.9,-0.6,-1.3,0.3
2014,2,0.5,-0.3,1.6,1.7,-0.5,0.7


In [81]:
# Average the visits within each year. Grouping on the index level replaces
# mean(level='year'): reductions no longer accept `level=` in current pandas.
data_mean = health_data.groupby(level='year').mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,0.55,-0.45,0.05,-0.1,-0.15,-0.95
2014,0.2,1.05,0.35,0.55,-0.9,0.5


In [82]:
# Average across subjects for each measurement type. The 'type' level lives on
# the columns, so transpose, group on that level, and transpose back; this
# replaces mean(axis=1, level='type'), which current pandas no longer accepts.
data_mean.T.groupby(level='type').mean().T

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,0.15,-0.5
2014,-0.116667,0.7
