# Hierarchical indexing in pandas

In [54]:
import pandas as pd
import numpy as np

### A pandas Series accepts an index with more than one level

In [55]:
# A list of two equal-length label lists as `index` gives a two-level index:
# the first list holds the outer labels, the second the inner labels.
data = pd.Series(
    np.random.randn(10),
    index=[list('aaabbbcccd'), [1, 2, 3] * 3 + [3]],
)
data

a  1   -0.379255
   2    0.077867
   3   -0.365154
b  1   -0.615564
   2   -0.536015
   3    1.600266
c  1    0.073475
   2   -0.537540
   3   -1.618616
d  3   -0.085408
dtype: float64

In [56]:
# Show the MultiIndex that pandas built from the two label lists passed as `index`.
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 2]])

### Subgroups of the data can be selected through the index levels

In [57]:
# Select every row under outer label 'b'; the result is indexed by the inner level only.
data.loc['b']

1   -0.615564
2   -0.536015
3    1.600266
dtype: float64

In [58]:
# Label slice on the outer level; both endpoints ('b' and 'c') are included.
data.loc['b':'c']

b  1   -0.615564
   2   -0.536015
   3    1.600266
c  1    0.073475
   2   -0.537540
   3   -1.618616
dtype: float64

In [59]:
# Keep every outer label and pick inner label 2; outer labels without a 2 entry are absent.
data.loc[:, 2]

a    0.077867
b   -0.536015
c   -0.537540
dtype: float64

### `unstack` rearranges the Series into a DataFrame

In [60]:
# Pivot the inner index level into columns; label combinations that are not
# in the Series appear as missing values (see row 'd' in the output).
data.unstack()

Unnamed: 0,1,2,3
a,-0.379255,0.077867,-0.365154
b,-0.615564,-0.536015,1.600266
c,0.073475,-0.53754,-1.618616
d,,,-0.085408


In [61]:
# stack() pivots the columns back into an inner index level, undoing unstack();
# the output matches the original Series.
data.unstack().stack()

a  1   -0.379255
   2    0.077867
   3   -0.365154
b  1   -0.615564
   2   -0.536015
   3    1.600266
c  1    0.073475
   2   -0.537540
   3   -1.618616
d  3   -0.085408
dtype: float64

### Reordering and sorting levels

In [62]:
# Exchange the two index levels; the rows keep their existing order, so the
# new outer level is no longer grouped (see output).
data.swaplevel()

1  a   -0.379255
2  a    0.077867
3  a   -0.365154
1  b   -0.615564
2  b   -0.536015
3  b    1.600266
1  c    0.073475
2  c   -0.537540
3  c   -1.618616
   d   -0.085408
dtype: float64

In [63]:
# Sort the rows by the inner level (level 1), with ties ordered by the remaining level.
# Series.sortlevel was removed from pandas; sort_index(level=...) is its replacement.
data.sort_index(level=1)

a  1   -0.379255
b  1   -0.615564
c  1    0.073475
a  2    0.077867
b  2   -0.536015
c  2   -0.537540
a  3   -0.365154
b  3    1.600266
c  3   -1.618616
d  3   -0.085408
dtype: float64

### Sorting on the level that will end up outermost before swapping usually gives a tidier, lexicographically sorted Series

In [64]:
# Sort on the inner level first so that, after the levels are swapped, the new
# outer level is grouped and ordered.
# Series.sortlevel was removed from pandas; sort_index(level=...) is its replacement.
y = data.sort_index(level=1).swaplevel()
y

1  a   -0.379255
   b   -0.615564
   c    0.073475
2  a    0.077867
   b   -0.536015
   c   -0.537540
3  a   -0.365154
   b    1.600266
   c   -1.618616
   d   -0.085408
dtype: float64

### Descriptive statistics can be computed per index level

In [65]:
# Sum the values within each label of the outer level (level 0).
# The `level=` keyword of sum() was removed in pandas 2.0; grouping by the
# index level is the supported spelling.
y.groupby(level=0).sum()

1   -0.921344
2   -0.995688
3   -0.468912
dtype: float64

### Using an existing column as an index

In [66]:
# Build a 3x4 frame of random integers drawn from [0, 100) and append a
# label column that can later be promoted to the index.
frame = pd.DataFrame(
    np.random.randint(low=0, high=100, size=(3, 4))
).assign(classifier=['one', 'one', 'two'])
frame

Unnamed: 0,0,1,2,3,classifier
0,5,90,90,56,one
1,24,27,29,33,one
2,32,65,55,59,two


In [67]:
# Promote the 'classifier' column to the row index. The result is rebound to
# `frame` instead of using inplace=True, which keeps the step explicit and chainable.
# Pass drop=False to set_index to also keep 'classifier' as a regular column.
frame = frame.set_index('classifier')
frame

Unnamed: 0_level_0,0,1,2,3
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,5,90,90,56
one,24,27,29,33
two,32,65,55,59


In [68]:
# Move the index back into an ordinary column and restore the default integer
# index. The result is rebound to `frame` rather than mutated with inplace=True.
frame = frame.reset_index()
frame

Unnamed: 0,classifier,0,1,2,3
0,one,5,90,90,56
1,one,24,27,29,33
2,two,32,65,55,59


In [69]:
# Add a second label column so that two columns are available to form a two-level index.
frame['type'] = ['stone','stone','wood']
frame

Unnamed: 0,classifier,0,1,2,3,type
0,one,5,90,90,56,stone
1,one,24,27,29,33,stone
2,two,32,65,55,59,wood


In [70]:
# Probably the most useful way to obtain a MultiIndex: promote several existing
# columns to index levels at once (first name listed becomes the outer level).
# The result is rebound to `frame` rather than mutated with inplace=True.
frame = frame.set_index(['classifier', 'type'])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
classifier,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,stone,5,90,90,56
one,stone,24,27,29,33
two,wood,32,65,55,59


In [71]:
# Mean of column 0 within each label of the inner index level (level 1, named 'type').
# The `level=` keyword of mean() was removed in pandas 2.0; grouping by the
# index level is the supported spelling.
frame[0].groupby(level=1).mean()

type
stone    14.5
wood     32.0
Name: 0, dtype: float64

### MultiIndex for columns

In [72]:
# Row index: every (Year, Visit) combination.
index = pd.MultiIndex.from_product(
    iterables=[[2016, 2017], [1, 2]],
    names=['Year', 'Visit'],
)
# Column index: every (Name, Visit type) combination.
columns = pd.MultiIndex.from_product(
    iterables=[['Jim', 'Bob', 'Frank'], ['Business', 'Casual']],
    names=['Name', 'Visit type'],
)

In [73]:
# Draw a 4x6 block of standard-normal noise, rounded to one decimal place.
data = np.random.randn(4, 6).round(1)
# Scale every other column (0, 2, 4) by a factor of ten.
data[:, 0::2] = data[:, 0::2] * 10
# Shift every value up by 37.
data = data + 37

In [74]:
# Combine the value array with the two-level row index and the two-level column index.
df = pd.DataFrame(data, index=index, columns=columns)

In [75]:
# Display the frame: both the rows and the columns carry two index levels.
df

Unnamed: 0_level_0,Name,Jim,Jim,Bob,Bob,Frank,Frank
Unnamed: 0_level_1,Visit type,Business,Casual,Business,Casual,Business,Casual
Year,Visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2016,1,37.0,37.6,47.0,38.4,41.0,38.2
2016,2,31.0,36.0,28.0,34.8,35.0,36.7
2017,1,38.0,37.0,19.0,38.2,28.0,38.2
2017,2,38.0,37.8,35.0,35.2,44.0,35.9


In [76]:
# Selecting an outer column label returns the sub-frame of the columns beneath it.
df['Jim']

Unnamed: 0_level_0,Visit type,Business,Casual
Year,Visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2016,1,37.0,37.6
2016,2,31.0,36.0
2017,1,38.0,37.0
2017,2,38.0,37.8


In [77]:
# Pivot the outer row level ('Year', position 0) into the columns, leaving 'Visit' as the row index.
df['Jim'].unstack(level='Year')

Visit type,Business,Business,Casual,Casual
Year,2016,2017,2016,2017
Visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,37.0,38.0,37.6,37.0
2,31.0,38.0,36.0,37.8


In [78]:
# Pivot the inner row level ('Visit', position 1) into the columns, leaving 'Year' as the row index.
df['Jim'].unstack(level='Visit')

Visit type,Business,Business,Casual,Casual
Visit,1,2,1,2
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2016,37.0,31.0,37.6,36.0
2017,38.0,38.0,37.0,37.8


In [79]:
# Sort the columns by their 'Name' level.
# DataFrame.sortlevel was removed from pandas; sort_index(axis=1, level=...) is
# its replacement. The result is rebound to `df` rather than mutated in place.
df = df.sort_index(axis=1, level='Name')
df

Unnamed: 0_level_0,Name,Bob,Bob,Frank,Frank,Jim,Jim
Unnamed: 0_level_1,Visit type,Business,Casual,Business,Casual,Business,Casual
Year,Visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2016,1,47.0,38.4,41.0,38.2,37.0,37.6
2016,2,28.0,34.8,35.0,36.7,31.0,36.0
2017,1,19.0,38.2,28.0,38.2,38.0,37.0
2017,2,35.0,35.2,44.0,35.9,38.0,37.8


In [80]:
# IndexSlice allows slice syntax (`:`) inside a per-level selector for a MultiIndex.
idx = pd.IndexSlice
# Rows: any Year with Visit == 1. Columns: any Name with Visit type == 'Business'.
df.loc[idx[:,1], idx[:,'Business']]

Unnamed: 0_level_0,Name,Bob,Frank,Jim
Unnamed: 0_level_1,Visit type,Business,Business,Business
Year,Visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2016,1,47.0,41.0,37.0
2017,1,19.0,28.0,38.0
