In [1]:
# Hierarchical Indexing

#  Hierarchical indexing is an important feature of pandas enabling you to have multiple
#  (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you
#  to work with higher dimensional data in a lower dimensional form.
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
print(data)
data.index  # MultiIndex as its index

a  1    0.145711
   2   -0.093945
   3    0.350624
b  1    1.082565
   2    0.372194
   3   -1.204185
c  1    0.284388
   2   -1.703306
d  2    2.432252
   3    0.272251
dtype: float64


MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [2]:
# With a hierarchically-indexed object, so-called partial indexing is possible, enabling you to concisely select subsets of the data
print(data['b'])
print("")
print(data['b':'c'])
print("")
data.ix[['b', 'd']] # not working

1    1.082565
2    0.372194
3   -1.204185
dtype: float64

b  1    1.082565
   2    0.372194
   3   -1.204185
c  1    0.284388
   2   -1.703306
dtype: float64



AttributeError: 'Series' object has no attribute 'ix'

In [None]:
# Selection is even possible in some cases from an “inner” level
data[:,2]

In [None]:
# this data could be rearranged into a DataFrame using its unstack method
print(data.unstack())
#The inverse operation of unstack is stack
data.unstack().stack()

In [16]:
#  With a DataFrame, either axis can have a hierarchical index
frame=DataFrame(np.arange(12).reshape(4,3),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio', 'Ohio', 'Colorado'],['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [17]:
# The hierarchical levels can have names (as strings or any Python objects). If so, these
# will show up in the console output (don’t confuse the index names with the axis labels!)

frame.index.names=['key1','key2']
frame.columns.names=['state','color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [18]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [19]:
# A MultiIndex can be created by itself and then reused; the columns in the above Data
# Frame with level names could be created like this:
#  MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
#                        names=['state', 'color'])

In [20]:
# Reordering and Sorting Levels

In [25]:
# The swaplevel takes two level numbers or names and
#  returns a new object with the levels interchanged (but the data is otherwise unaltered)

print(frame.swaplevel('key1','key2'))


state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11


In [26]:
# Summary Statistics by Level

# Many descriptive and summary statistics on DataFrame and Series have a level option
#  in which you can specify the level you want to sum by on a particular axis. Consider
#  the above DataFrame; we can sum by level on either the rows or columns like so

print( frame.sum(level='key2'))
print('')
frame.sum(level='color',axis=1)

state  Ohio     Colorado
color Green Red    Green
key2                    
1         6   8       10
2        12  14       16



  print( frame.sum(level='key2'))
  frame.sum(level='color',axis=1)


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [27]:
#  Using a DataFrame’s Columns

#  It’s not unusual to want to use one or more columns from a DataFrame as the row
#  index; alternatively, you may wish to move the row index into the DataFrame’s col
# umns.

frame=DataFrame({
    'a':range(7),
    'b':range(7,0,-1),
    'c':['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd':[0, 1, 2, 0, 1, 2, 3]
})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [29]:
# DataFrame’s set_index function will create a new DataFrame using one or more of itscolumns as the index

frame2=frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [30]:
frame.set_index(['c','d'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [31]:
# reset_index, on the other hand, does the opposite of set_index; the hierarchical index
#  levels are are moved into the columns

frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [32]:
# Other pandas Topics
#  Integer Indexing

ser = Series(np.arange(3.))
ser[-1] # generates error

KeyError: -1

In [33]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [34]:
# to solve the above error
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]

2.0

In [None]:
# END PANDAS