# hierarchical indexing

Hierarchical indexing (MultiIndex) lets a pandas DataFrame represent data with more than two dimensions. Getting data out can be tricky in certain scenarios, especially when a slice (colon) is involved, but let's first show how to build one.
1. from_arrays: takes a list of lists, one per level; each inner list must have the same length as the axis it indexes.
2. from_product: takes a list of lists; the Cartesian product of the lists must have the same length as the axis.

In [18]:
import pandas as pd 
import numpy as np
# Build a 10x3 frame of random values with a two-level index on both axes.
df = pd.DataFrame(
    np.random.randn(10, 3),
    # from_product: the cartesian product of 2 countries x 5 letters gives
    # 10 (country, letter) pairs, one per row.
    index=pd.MultiIndex.from_product([['country_1', 'country_2'], list('abcde')]),
    # from_arrays: parallel arrays with one entry per column, read position
    # by position as (province, letter) pairs.
    columns=pd.MultiIndex.from_arrays(
        [['province_1', 'province_1', 'province_2'], ['A', 'B', 'A']]
    ),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,province_1,province_1,province_2
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,A
country_1,a,0.127382,-0.145379,-0.554296
country_1,b,-0.28355,2.24159,-0.343282
country_1,c,0.711099,-1.908173,-0.485162
country_1,d,1.590531,-2.02379,-1.362654
country_1,e,-1.293502,0.445486,1.794775
country_2,a,-0.370966,-0.363976,0.423996
country_2,b,-0.777755,0.699254,0.457224
country_2,c,-0.543855,-1.033849,1.042942
country_2,d,0.070436,0.970678,-0.974975
country_2,e,1.012209,-1.214662,-0.582336


In [24]:
# Selecting by a single outer-level label works as expected on either axis.
# Only the last expression of the cell is displayed below.
df.loc['country_1']  # rows under the outer row label 'country_1'
df['province_1']  # columns under the outer column label 'province_1'

Unnamed: 0,Unnamed: 1,A,B
country_1,a,0.127382,-0.145379
country_1,b,-0.28355,2.24159
country_1,c,0.711099,-1.908173
country_1,d,1.590531,-2.02379
country_1,e,-1.293502,0.445486
country_2,a,-0.370966,-0.363976
country_2,b,-0.777755,0.699254
country_2,c,-0.543855,-1.033849
country_2,d,0.070436,0.970678
country_2,e,1.012209,-1.214662


Getting at the inner level is harder; by this I mean retrieving just the rows labelled 'a'. Notice that this should return two records, one under country_1 and the other under country_2. Such operations are easy on a Series but need a little extra work on a DataFrame.

In [40]:
# Pull out a single column as a Series; it keeps the two-level row index.
a_series = df['province_1']['A']
# On a Series, .loc accepts one entry per index level: ':' keeps every outer
# label and 'a' picks the inner label.
a_series.loc[:, 'a'] # retrieve by the second (inner) level

country_1    0.127382
country_2   -0.370966
Name: A, dtype: float64

In [46]:
#df.loc[[:, 'a'], :] # this intuitive way will give a syntax error
# A bare ':' is not valid inside a list literal. IndexSlice lets us write the
# per-level selection with ordinary slicing syntax instead.
from pandas import IndexSlice as slicer # this works: wrap the per-level selection in slicer[...]
# Rows: any outer label, inner label 'a'. Columns: everything.
df.loc[slicer[:, 'a'], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,province_1,province_1,province_2
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,A
country_1,a,0.127382,-0.145379,-0.554296
country_2,a,-0.370966,-0.363976,0.423996


In [47]:
# Rows: everything. Columns: any outer label, inner label 'A'.
df.loc[:, slicer[:, 'A']] # this also works for the column index, as long as it is a MultiIndex

Unnamed: 0_level_0,Unnamed: 1_level_0,province_1,province_2
Unnamed: 0_level_1,Unnamed: 1_level_1,A,A
country_1,a,0.127382,-0.554296
country_1,b,-0.28355,-0.343282
country_1,c,0.711099,-0.485162
country_1,d,1.590531,-1.362654
country_1,e,-1.293502,1.794775
country_2,a,-0.370966,0.423996
country_2,b,-0.777755,0.457224
country_2,c,-0.543855,1.042942
country_2,d,0.070436,-0.974975
country_2,e,1.012209,-0.582336


That system does not work with iloc. The iloc indexer simply disregards the multi-index and treats the dataframe as a plain two-dimensional matrix addressed by integer position.

In [52]:
# Purely positional: rows 0, 3 and 5 and columns 0 and 2, regardless of labels.
df.iloc[[0,3,5], [0,2]]

Unnamed: 0_level_0,Unnamed: 1_level_0,province_1,province_2
Unnamed: 0_level_1,Unnamed: 1_level_1,A,A
country_1,a,0.127382,-0.554296
country_1,d,1.590531,-1.362654
country_2,a,-0.370966,0.423996


We can also perform operations on the index itself, arguably more so than on the values. First, we can swap levels, moving an inner level outward or vice versa. To swap the levels of the column index, we need to set axis=1. Before doing so, we name the levels so that they are easier to manipulate later, as we will see momentarily.

In [65]:
# Name the levels of each MultiIndex (outermost first) so that later calls can
# refer to them by name instead of by position.
df.columns.names = ['province', 'letter']
df.index.names = ['country', 'lower_letter']
df

Unnamed: 0_level_0,province,province_1,province_1,province_2
Unnamed: 0_level_1,letter,A,B,A
country,lower_letter,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
country_1,a,0.127382,-0.145379,-0.554296
country_1,b,-0.28355,2.24159,-0.343282
country_1,c,0.711099,-1.908173,-0.485162
country_1,d,1.590531,-2.02379,-1.362654
country_1,e,-1.293502,0.445486,1.794775
country_2,a,-0.370966,-0.363976,0.423996
country_2,b,-0.777755,0.699254,0.457224
country_2,c,-0.543855,-1.033849,1.042942
country_2,d,0.070436,0.970678,-0.974975
country_2,e,1.012209,-1.214662,-0.582336


In [66]:
# Swap the two column levels; axis=1 targets the column index.
df.swaplevel('province', 'letter', axis=1)

Unnamed: 0_level_0,letter,A,B,A
Unnamed: 0_level_1,province,province_1,province_1,province_2
country,lower_letter,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
country_1,a,0.127382,-0.145379,-0.554296
country_1,b,-0.28355,2.24159,-0.343282
country_1,c,0.711099,-1.908173,-0.485162
country_1,d,1.590531,-2.02379,-1.362654
country_1,e,-1.293502,0.445486,1.794775
country_2,a,-0.370966,-0.363976,0.423996
country_2,b,-0.777755,0.699254,0.457224
country_2,c,-0.543855,-1.033849,1.042942
country_2,d,0.070436,0.970678,-0.974975
country_2,e,1.012209,-1.214662,-0.582336


In [67]:
# Swap the two row levels; axis=0 targets the row index.
# The rows keep their original order; only the level order changes.
df.swaplevel('country', 'lower_letter', axis=0)

Unnamed: 0_level_0,province,province_1,province_1,province_2
Unnamed: 0_level_1,letter,A,B,A
lower_letter,country,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,country_1,0.127382,-0.145379,-0.554296
b,country_1,-0.28355,2.24159,-0.343282
c,country_1,0.711099,-1.908173,-0.485162
d,country_1,1.590531,-2.02379,-1.362654
e,country_1,-1.293502,0.445486,1.794775
a,country_2,-0.370966,-0.363976,0.423996
b,country_2,-0.777755,0.699254,0.457224
c,country_2,-0.543855,-1.033849,1.042942
d,country_2,0.070436,0.970678,-0.974975
e,country_2,1.012209,-1.214662,-0.582336


In [73]:
# Swap the row and column axes, i.e. transpose the frame. DataFrame.swapaxes
# is deprecated since pandas 2.1, so use the equivalent transpose() instead.
df.transpose()

Unnamed: 0_level_0,country,country_1,country_1,country_1,country_1,country_1,country_2,country_2,country_2,country_2,country_2
Unnamed: 0_level_1,lower_letter,a,b,c,d,e,a,b,c,d,e
province,letter,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
province_1,A,0.127382,-0.28355,0.711099,1.590531,-1.293502,-0.370966,-0.777755,-0.543855,0.070436,1.012209
province_1,B,-0.145379,2.24159,-1.908173,-2.02379,0.445486,-0.363976,0.699254,-1.033849,0.970678,-1.214662
province_2,A,-0.554296,-0.343282,-0.485162,-1.362654,1.794775,0.423996,0.457224,1.042942,-0.974975,-0.582336


We can still refer to axes and levels by number, but it is clearer to use the names we assigned. axis selects the row index (0) or the column index (1), and level selects which level within that index. The outermost level is level=0, and the number increases as we move inward.

In [79]:
# Sort the columns by their second level, referring to the axis and the level
# by number...
df.sort_index(axis=1, level=1)
# ...and the same sort by name; only this last expression is displayed.
df.sort_index(axis='columns', level='letter') # do it more explicitly
# notice that A comes first and B comes later, and that overrides the province order

Unnamed: 0_level_0,province,province_1,province_2,province_1
Unnamed: 0_level_1,letter,A,A,B
country,lower_letter,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
country_1,a,0.127382,-0.554296,-0.145379
country_1,b,-0.28355,-0.343282,2.24159
country_1,c,0.711099,-0.485162,-1.908173
country_1,d,1.590531,-1.362654,-2.02379
country_1,e,-1.293502,1.794775,0.445486
country_2,a,-0.370966,0.423996,-0.363976
country_2,b,-0.777755,0.457224,0.699254
country_2,c,-0.543855,1.042942,-1.033849
country_2,d,0.070436,-0.974975,0.970678
country_2,e,1.012209,-0.582336,-1.214662
