# This cookbook walks through examples of creating hierarchical DataFrame columns in pandas using a MultiIndex

# Multi-index Column Example

In [1]:
import pandas as pd

In [19]:
# Raw values for the example: four rows of three letters each.
data = [
    ['a', 'b', 'c'],
    ['d', 'e', 'f'],
    ['g', 'h', 'i'],
    ['j', 'k', 'l'],
]
data

[['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i'], ['j', 'k', 'l']]

In [4]:
# Labels for the innermost column level, one per value in a data row.
column_names = [
    '1st col',
    '2nd col',
    '3rd col',
]
column_names

['1st col', '2nd col', '3rd col']

In [5]:
# Label shared by every column at the outermost column level.
# NOTE(review): the data actually holds the letters a-l, but later cells select
# columns by this exact string, so the value is left unchanged here.
dataset_description = 'letter matrix a-i'
dataset_description

'letter matrix a-i'

In [9]:
# Label shared by every column at the middle column level.
timestamp = 'Nov 1, 2017'
timestamp

'Nov 1, 2017'

In [10]:
# Build one (dataset description, timestamp, column name) tuple per column.
# Each tuple later becomes a single hierarchical column label.
mi_list = [
    (dataset_description, timestamp, column_name)
    for column_name in column_names
]
mi_list

[('letter matrix a-i', 'Nov 1, 2017', '1st col'),
 ('letter matrix a-i', 'Nov 1, 2017', '2nd col'),
 ('letter matrix a-i', 'Nov 1, 2017', '3rd col')]

In [30]:
# Names of the three column levels, ordered outermost to innermost.
top_mi_lbl, mid_mi_lbl, lower_mi_lbl = (
    'Dataset Description:',
    'Timestamp Info:',
    'Column Description:',
)
# Every tuple in mi_list supplies one value per level, in that same order.
mi_col = pd.MultiIndex.from_tuples(
    mi_list,
    names=[top_mi_lbl, mid_mi_lbl, lower_mi_lbl],
)
mi_col

MultiIndex(levels=[[u'letter matrix a-i'], [u'Nov 1, 2017'], [u'1st col', u'2nd col', u'3rd col']],
           labels=[[0, 0, 0], [0, 0, 0], [0, 1, 2]],
           names=[u'Dataset Description:', u'Timestamp Info:', u'Column Description:'])

In [12]:
# Row labels: the integers 1 through 4, one per row of data.
index_data = list(range(1, 5))
index_data

[1, 2, 3, 4]

In [29]:
# Name shown for the row index (spelling of "Description" corrected).
index_name = 'Index Description:'

In [14]:
# Row index built from index_data and labelled with index_name.
idx = pd.Index(index_data, name=index_name)
idx

Int64Index([1, 2, 3, 4], dtype='int64', name=u'Index Desciption')

In [28]:
# Assemble the frame: values from data, row labels from idx,
# hierarchical column labels from mi_col.
df = pd.DataFrame(data, index=idx, columns=mi_col)
df

Dataset Description,letter matrix a-i,letter matrix a-i,letter matrix a-i
Timestamp Info,"Nov 1, 2017","Nov 1, 2017","Nov 1, 2017"
Column Description,1st col,2nd col,3rd col
Index Desciption,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
1,a,b,c
2,d,e,f
3,g,h,i
4,j,k,l


In [63]:
# Each column label is a tuple holding one value per MultiIndex level:
list(df.columns)

[('letter matrix a-i', 'Nov 1, 2017', '1st col'),
 ('letter matrix a-i', 'Nov 1, 2017', '2nd col'),
 ('letter matrix a-i', 'Nov 1, 2017', '3rd col')]

In [31]:
# If the data arrives in the other orientation (or otherwise needs transposing),
# swap rows and columns like this:
list(map(list, zip(*data)))

[['a', 'd', 'g', 'j'], ['b', 'e', 'h', 'k'], ['c', 'f', 'i', 'l']]

# Slicing a MultiIndex DataFrame

## Using Filters

In [39]:
# Keep only the columns whose label contains the substring '1st'
# (here that matches on the column-description level):
df.filter(like='1st', axis=1)

Dataset Description,letter matrix a-i
Timestamp Info,"Nov 1, 2017"
Column Description,1st col
Index Desciption,Unnamed: 1_level_3
1,a
2,d
3,g
4,j


In [40]:
# Keep the columns whose label contains the substring 'letter'.
# Every column shares the same dataset description, so all of them are kept:
df.filter(like='letter', axis=1)

Dataset Description,letter matrix a-i,letter matrix a-i,letter matrix a-i
Timestamp Info,"Nov 1, 2017","Nov 1, 2017","Nov 1, 2017"
Column Description,1st col,2nd col,3rd col
Index Desciption,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
1,a,b,c
2,d,e,f
3,g,h,i
4,j,k,l


In [41]:
# Keep the columns whose label contains the substring 'Nov'.
# Every column shares the same timestamp, so all of them are kept:
df.filter(like='Nov', axis=1)

Dataset Description,letter matrix a-i,letter matrix a-i,letter matrix a-i
Timestamp Info,"Nov 1, 2017","Nov 1, 2017","Nov 1, 2017"
Column Description,1st col,2nd col,3rd col
Index Desciption,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
1,a,b,c
2,d,e,f
3,g,h,i
4,j,k,l


## Using xs on columns

In [59]:
# Supplying a value for every column level returns that one column as a Series.
df.xs(
    key=('letter matrix a-i', 'Nov 1, 2017', '1st col'),
    axis=1,
)

Index Desciption
1    a
2    d
3    g
4    j
Name: (letter matrix a-i, Nov 1, 2017, 1st col), dtype: object

In [60]:
# Match on levels 0 and 2 only; the matched levels are dropped from the result,
# leaving the timestamp level as the remaining column label.
df.xs(
    key=('letter matrix a-i', '1st col'),
    axis=1,
    level=[0, 2],
)

Timestamp Info,"Nov 1, 2017"
Index Desciption,Unnamed: 1_level_1
1,a
2,d
3,g
4,j


## By passing list of columns

In [106]:
# Drill down one column level per lookup: description -> timestamp -> column.
by_description = df['letter matrix a-i']
by_timestamp = by_description['Nov 1, 2017']
by_timestamp['1st col']

Index Desciption
1    a
2    d
3    g
4    j
Name: 1st col, dtype: object

In [105]:
# Drill down through the two outer levels, then pass a list to keep several columns.
by_description = df['letter matrix a-i']
by_timestamp = by_description['Nov 1, 2017']
by_timestamp[['1st col', '2nd col']]

Column Description,1st col,2nd col
Index Desciption,Unnamed: 1_level_1,Unnamed: 2_level_1
1,a,b
2,d,e
3,g,h
4,j,k


## By using loc and passing axis argument

In [109]:
# A bare `:` inside pd.IndexSlice is the same as slice(None): it accepts every
# value on that level, so only the innermost level is constrained here.
df.loc(axis=1)[pd.IndexSlice[:, :, '1st col']]

Dataset Description,letter matrix a-i
Timestamp Info,"Nov 1, 2017"
Column Description,1st col
Index Desciption,Unnamed: 1_level_3
1,a
2,d
3,g
4,j
