In [None]:
import numpy as np
import pandas as pd

# 1. Hierarchical Indexing (MultiIndex)
## 1.1. Creating a MultiIndex (hierarchical index) object

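A `MultiIndex` can be created from a list of arrays with `MultiIndex.from_arrays()`, or from a list of tuples with `MultiIndex.from_tuples()`. The example below zips two arrays into a list of tuples and then uses `MultiIndex.from_tuples()`.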

In [2]:
# Two parallel label lists; zipping them pairs the entries position by position.
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]

# Each tuple is one (outer label, inner label) pair.
tuples = [(outer, inner) for outer, inner in zip(*arrays)]
print(tuples)

[('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]


In [3]:
# Build a two-level index from the (first, second) tuples and name both levels.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
print(index)

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])


In [5]:
# Eight random values, one per entry of `index`.
# NOTE(review): no random seed is set in the cells shown, so these values
# change on every run.
s = pd.Series(np.random.randn(8), index=index)
print(s)

first  second
bar    one      -0.570037
       two      -0.221636
baz    one       0.422605
       two      -0.488381
foo    one      -0.722394
       two      -1.764910
qux    one       0.370350
       two       0.733640
dtype: float64


Use `MultiIndex.from_product()` to pair the elements in two iterables

In [6]:
# One iterable per level; the product pairs every outer label with every inner label.
iterables = [
    ['bar', 'baz', 'foo', 'qux'],
    ['one', 'two'],
]

pd.MultiIndex.from_product(iterables, names=['first', 'second'])

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

Use `MultiIndex.from_frame()` to construct a `MultiIndex` from a `DataFrame` directly. This is a complementary method to `MultiIndex.to_frame()`

In [7]:
# Each row of the frame becomes one index entry; the column names become
# the level names.
df = pd.DataFrame({
    'first': ['bar', 'bar', 'foo', 'foo'],
    'second': ['one', 'two', 'one', 'two'],
})

pd.MultiIndex.from_frame(df)

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('foo', 'one'),
            ('foo', 'two')],
           names=['first', 'second'])

You can pass a list of arrays directly into `Series` or `DataFrame` to construct a `MultiIndex` automatically:

In [8]:
# One NumPy label array per index level.
arrays = [
    np.array(labels)
    for labels in (
        ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
        ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
    )
]

# Passing the list of arrays as `index` builds the MultiIndex implicitly.
s = pd.Series(data=np.random.randn(8), index=arrays)
print(s)

bar  one   -0.726450
     two   -0.661707
baz  one    0.402740
     two    1.611377
foo  one    0.327615
     two   -0.000124
qux  one    0.118475
     two   -0.446800
dtype: float64


In [29]:
# The same list of label arrays used as `index` gives the frame a two-level row index.
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-1.136077,0.47731,0.720747,0.588976
bar,two,2.140633,0.643084,0.110517,-1.467639
baz,one,-0.921315,-1.044878,0.934945,0.479942
baz,two,-1.726369,0.733819,-1.361579,-0.98742
foo,one,0.439748,0.222286,0.685588,-1.602266
foo,two,-0.009369,-0.642167,-0.124715,0.074216
qux,one,-0.052937,0.618344,-0.43485,-1.350119
qux,two,0.101521,1.697946,1.638083,-0.521474


All of the `MultiIndex` constructors accept a `names` argument which stores string names for the levels themselves. If no names are provided, `None` will be assigned:

In [30]:
# The index was built from raw arrays without `names`, so both level names are None.
df.index.names

FrozenList([None, None])

Similar example, but the `index` of `df` has `names`

In [31]:
# An 8x4 frame whose row index carries level names.
# `MultiIndex.from_arrays` accepts the list of label arrays directly, so the
# arrays do not have to be zipped into a list of tuples first.
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=pd.MultiIndex.from_arrays(arrays, names=['first', 'second']),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bar,one,-0.481756,-0.848918,-1.586441,0.71043
bar,two,-1.205549,-1.203687,1.992126,-0.481374
baz,one,-0.637913,0.626489,1.083724,0.547852
baz,two,2.138898,2.505168,-0.757101,1.542089
foo,one,0.648321,-0.127591,-0.002398,1.298445
foo,two,-2.017968,-1.139336,-1.570778,0.681335
qux,one,-2.70886,-0.221162,0.375286,1.284937
qux,two,0.56272,0.032988,1.509337,-0.542731


In [32]:
# This time the level names passed via `names=` are reported.
df.index.names

FrozenList(['first', 'second'])

This index can back any axis of a pandas object, and the number of **levels** of the index is up to you

In [35]:
# Here the MultiIndex labels the columns, while the rows use plain labels.
df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
print(df)

first        bar                 baz                 foo                 qux  \
second       one       two       one       two       one       two       one   
A       1.943865 -0.253741 -0.935066  0.569808 -1.142609  0.247549 -0.566648   
B       0.090774  1.082529  1.783811  1.665499 -0.524537 -0.567881 -0.404460   
C      -0.788228  0.409432 -1.762442  1.306270  0.199252 -0.967424 -0.124007   

first             
second       two  
A      -0.249839  
B       0.363176  
C      -0.865870  


In [38]:
# The first six entries of the same MultiIndex label both the rows and the columns.
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,1.100674,-0.402734,1.541273,0.516486,-0.130038,0.415169
bar,two,0.027089,0.350919,0.091036,0.745868,-1.054244,1.445607
baz,one,0.4115,1.948296,-1.074115,0.856349,-0.408564,-0.918802
baz,two,-1.940771,-0.536189,0.794369,0.017011,1.233258,0.358535
foo,one,1.159481,0.674674,1.17786,1.146307,-0.782274,-1.374995
foo,two,1.032526,1.199748,0.366244,-0.319034,-0.085805,0.533111


The higher levels of the index are "sparsified" to make the console output a bit easier on the eyes. How the index is displayed can be controlled with the `display.multi_sparse` option, set globally via `pandas.set_option()` or temporarily via `pandas.option_context()`:

In [40]:
# Switch sparsification off for this `with` block only, so the outer label
# is repeated above every column instead of being shown once per group.
with pd.option_context('display.multi_sparse', False):
    print(df)

first        bar       bar       baz       baz       foo       foo       qux  \
second       one       two       one       two       one       two       one   
A       1.943865 -0.253741 -0.935066  0.569808 -1.142609  0.247549 -0.566648   
B       0.090774  1.082529  1.783811  1.665499 -0.524537 -0.567881 -0.404460   
C      -0.788228  0.409432 -1.762442  1.306270  0.199252 -0.967424 -0.124007   

first        qux  
second       two  
A      -0.249839  
B       0.363176  
C      -0.865870  


In [42]:
# A plain list of tuples passed as `index`: as the output shows, each tuple
# is kept as a single label rather than split into two levels.
pd.Series(np.random.randn(8), index=tuples)

(bar, one)    1.006283
(bar, two)   -1.149321
(baz, one)    1.113813
(baz, two)   -0.440520
(foo, one)    1.092848
(foo, two)   -0.452644
(qux, one)    0.162460
(qux, two)    3.429784
dtype: float64

The reason that the `MultiIndex` matters is that it allows you to do grouping, selection, and reshaping operations, as described below and in subsequent areas of the documentation. As you will see in later sections, you can find yourself working with hierarchically-indexed data without ever creating a `MultiIndex` explicitly yourself. However, when loading data from a file, you may wish to generate your own `MultiIndex` when preparing the data set.

## 1.2. Reconstructing the level labels

The method `get_level_values()` will return a vector of labels for each location at a particular level:

In [44]:
# Labels of the outer level ('first'), one per entry of the index.
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [46]:
# Labels of the inner level ('second'), one per entry of the index.
index.get_level_values(1)

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

## 1.3. Basic indexing on axis with MultiIndex

One of the important features of hierarchical indexing is that you can select data by a "partial" label identifying a subgroup in the data. **Partial** selection "drops" levels of the hierarchical index in the result in a completely analogous way to selecting a column in a regular DataFrame:

In [50]:
# Partial key: selects every column under 'bar'; the result keeps only the 'second' level.
df["bar"]

second,one,two
A,1.943865,-0.253741
B,0.090774,1.082529
C,-0.788228,0.409432


In [51]:
# Full (first, second) key: selects a single column, returned as a Series named (bar, one).
df["bar", "one"]

A    1.943865
B    0.090774
C   -0.788228
Name: (bar, one), dtype: float64

In [52]:
# Chained form: pick 'bar' first, then 'one' from that result.
# Same values as the tuple key, but the Series is named just 'one'.
df["bar"]["one"]

A    1.943865
B    0.090774
C   -0.788228
Name: one, dtype: float64

In [54]:
# Partial selection on a Series: returns the inner-level entries under 'qux'.
s["qux"]

one    0.118475
two   -0.446800
dtype: float64

## 1.4. Defined levels

The `MultiIndex` keeps all the defined levels of an index, even if they are not actually used. When slicing an index, you may notice this. For example:

In [58]:
# Levels stored on the original column MultiIndex.
df.columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [59]:
# After selecting only 'foo' and 'qux', the stored levels still list all
# four outer labels.
df[['foo', 'qux']].columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

This is done to avoid a recomputation of the levels in order to make slicing highly performant. If you want to see only the labels that are actually used, you can use the `to_numpy()` or `get_level_values()` methods:

In [60]:
# The column labels actually present in the selection, as an array of tuples.
df[['foo', 'qux']].columns.to_numpy()

array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],
      dtype=object)

In [66]:
# Labels actually present in the selection, for one specific level (here the outer one).
df[['foo', 'qux']].columns.get_level_values(0)

Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

To reconstruct the `MultiIndex` with only the used levels, the `remove_unused_levels()` method may be used

In [68]:
# Rebuild the column index so that its stored levels contain only labels in use.
new_mi = df[['foo', 'qux']].columns.remove_unused_levels()

# Only 'foo' and 'qux' remain in the outer level.
new_mi.levels

FrozenList([['foo', 'qux'], ['one', 'two']])

## 1.5. Data alignment and using `reindex`

Operations between differently-indexed objects having `MultiIndex` on the axes will work as you expect; data alignment will work the same as an Index of tuples:

In [77]:
# Show the Series used in the alignment examples that follow.
s

bar  one   -0.726450
     two   -0.661707
baz  one    0.402740
     two    1.611377
foo  one    0.327615
     two   -0.000124
qux  one    0.118475
     two   -0.446800
dtype: float64

In [74]:
# The right operand lacks the last two entries, so those labels come out as NaN.
s + s[:-2]

bar  one   -1.452899
     two   -1.323414
baz  one    0.805480
     two    3.222754
foo  one    0.655230
     two   -0.000249
qux  one         NaN
     two         NaN
dtype: float64

In [75]:
# The right operand holds only every second entry, so the 'two' rows come out as NaN.
s + s[::2]

bar  one   -1.452899
     two         NaN
baz  one    0.805480
     two         NaN
foo  one    0.655230
     two         NaN
qux  one    0.236949
     two         NaN
dtype: float64

The `reindex()` method of `Series`/`DataFrame` can be called with another `MultiIndex`, or even a list or array of tuples

In [88]:
# Reindex with the first three entries of the named MultiIndex; the result
# carries that index's level names.
s.reindex(index[:3])

first  second
bar    one      -0.726450
       two      -0.661707
baz    one       0.402740
dtype: float64

In [80]:
# A list of tuples works as well; rows come back in the order requested.
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])

foo  two   -0.000124
bar  one   -0.726450
qux  one    0.118475
baz  one    0.402740
dtype: float64

# Reference

https://pandas.pydata.org/docs/user_guide/advanced.html#hierarchical-indexing-multiindex