## Multi-Index and Index Hierarchy

Let us go over how to work with a MultiIndex. First, we'll create a quick example of what a multi-indexed DataFrame looks like:

In [7]:
import pandas as pd
import numpy as np

In [4]:
# Index levels: an outer group label and an inner number for each row
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]
# Pair the two levels element-wise into (outer, inner) tuples ...
index_tuples = list(zip(outside, inside))
# ... and build the MultiIndex from them. The tuple list gets its own name so
# that `hier_index` is only ever bound to a MultiIndex, never to a plain list.
hier_index = pd.MultiIndex.from_tuples(index_tuples)

In [5]:
# Display the MultiIndex: one (outer, inner) entry per row
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [8]:
# Seed NumPy's global RNG so the random values are identical on every run.
# NOTE(review): the saved outputs in this notebook predate the seed; re-run
# the notebook to refresh the displayed numbers.
np.random.seed(42)
# 6 rows x 2 columns of standard-normal values, indexed by the two-level MultiIndex
df = pd.DataFrame(np.random.randn(6, 2), index=hier_index, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.169364,-1.496473
G1,2,-0.957845,2.055482
G1,3,-0.140894,-1.557342
G2,1,0.759153,-0.395368
G2,2,1.882309,-1.184145
G2,3,-0.027654,2.498326


Now let's see how to index this! To select along the row index hierarchy we use `df.loc[]`; if the hierarchy were on the columns axis, you would just use normal bracket notation, `df[]`. Selecting one label from the outer level of the index returns the sub-DataFrame:

In [9]:
# Selecting an outer-level label returns the sub-DataFrame of its rows,
# indexed by the remaining inner level
df.loc['G1']

Unnamed: 0,A,B
1,-0.169364,-1.496473
2,-0.957845,2.055482
3,-0.140894,-1.557342


In [10]:
# Drill down one level at a time: first the outer label 'G1', then the inner
# label 1, which yields that single row as a Series
df.loc['G1'].loc[1]

A   -0.169364
B   -1.496473
Name: 1, dtype: float64

In [11]:
# Names of the index levels; both are None because none were given when the
# MultiIndex was created
df.index.names

FrozenList([None, None])

In [12]:
# Name the outer and inner index levels so they can be referred to by name
# (e.g. via the `level=` argument of xs). Note: this modifies `df` in place.
df.index.names = ['Group','Num']

In [13]:
# Display the frame again; the index levels now show their names
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.169364,-1.496473
G1,2,-0.957845,2.055482
G1,3,-0.140894,-1.557342
G2,1,0.759153,-0.395368
G2,2,1.882309,-1.184145
G2,3,-0.027654,2.498326


In [14]:
# Same outer-level selection as before; the remaining index level is now labelled 'Num'
df.loc['G1']

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.169364,-1.496473
2,-0.957845,2.055482
3,-0.140894,-1.557342


In [18]:
# Cross-section with a full key tuple (one label per index level):
# returns the single matching row as a Series
df.xs(('G1',1))

A   -0.169364
B   -1.496473
Name: (G1, 1), dtype: float64

In [15]:
# Cross-section on the inner level, addressed by name: the rows with Num == 1
# from every group. The 'Num' level is dropped from the result's index.
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.169364,-1.496473
G2,0.759153,-0.395368


----------------------------

## `dropna` and the `thresh` parameter

In [19]:
# Small frame with missing values: column A has one NaN, B has two, C has none.
# NOTE: this rebinds `df`, replacing the multi-indexed frame used earlier.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

In [20]:
# Display the frame with its missing values
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [21]:
# With no arguments, dropna() removes every row that contains a NaN;
# here only row 0 is fully populated
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


`thresh=n` keeps only the rows that have at least `n` non-NaN values:

In [26]:
# Keep the rows that have at least 1 non-NaN value; every row qualifies here
# because column C has no missing values
df.dropna(thresh=1)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [23]:
# Keep the rows that have at least 2 non-NaN values; row 2 (only C present) is dropped
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [24]:
# Keep the rows that have at least 3 non-NaN values; with three columns,
# only the fully populated row 0 remains
df.dropna(thresh=3)

Unnamed: 0,A,B,C
0,1.0,5.0,1
