In [1]:
import numpy as np
import pandas as pd

# Hierarchical Indexing
- tow or more labels of index in our dataset

In [6]:
# a Series with hierarchical indexing
np.random.seed(100)
data = pd.Series(np.random.randn(9), index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],[1,2,3,1,3,1,2,2,3]])
data

a  1   -1.749765
   2    0.342680
   3    1.153036
b  1   -0.252436
   3    0.981321
c  1    0.514219
   2    0.221180
d  2   -1.070043
   3   -0.189496
dtype: float64

In [7]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

### Accessing data using outer indexing

In [8]:
data['a']

1   -1.749765
2    0.342680
3    1.153036
dtype: float64

In [9]:
data['c']

1    0.514219
2    0.221180
dtype: float64

In [10]:
data['a':'c']

a  1   -1.749765
   2    0.342680
   3    1.153036
b  1   -0.252436
   3    0.981321
c  1    0.514219
   2    0.221180
dtype: float64

In [11]:
data[['b','d']]

b  1   -0.252436
   3    0.981321
d  2   -1.070043
   3   -0.189496
dtype: float64

In [12]:
data.loc[['b','d']]

b  1   -0.252436
   3    0.981321
d  2   -1.070043
   3   -0.189496
dtype: float64

### Accessing data using inner indexing

In [13]:
data

a  1   -1.749765
   2    0.342680
   3    1.153036
b  1   -0.252436
   3    0.981321
c  1    0.514219
   2    0.221180
d  2   -1.070043
   3   -0.189496
dtype: float64

In [18]:
# Find the data values for inner index = 2
data[: , 2]

a    0.342680
c    0.221180
d   -1.070043
dtype: float64

In [29]:
# Find the values b and c, but only the inner index values ranging from 1 to 2
data.loc['b':'c'][[1,2]]

b  3    0.981321
c  1    0.514219
dtype: float64

In [32]:
data[['b','c']][0:3]

b  1   -0.252436
   3    0.981321
c  1    0.514219
dtype: float64

In [33]:
data

a  1   -1.749765
   2    0.342680
   3    1.153036
b  1   -0.252436
   3    0.981321
c  1    0.514219
   2    0.221180
d  2   -1.070043
   3   -0.189496
dtype: float64

In [34]:
type(data)

pandas.core.series.Series

In [35]:
# arrange data into a DataFrame - unstack()
data.unstack()

Unnamed: 0,1,2,3
a,-1.749765,0.34268,1.153036
b,-0.252436,,0.981321
c,0.514219,0.22118,
d,,-1.070043,-0.189496


In [36]:
type(data.unstack())

pandas.core.frame.DataFrame

In [38]:
# inverse of unstack()
data.unstack().stack()

a  1   -1.749765
   2    0.342680
   3    1.153036
b  1   -0.252436
   3    0.981321
c  1    0.514219
   2    0.221180
d  2   -1.070043
   3   -0.189496
dtype: float64

In [42]:
df = pd.DataFrame(np.arange(12).reshape((4,3)), 
                  index = [['a','a','b','b'], 
                           [1,2,1,2]], 
                  columns=[['Alberta', 'Alberta', 'Ontario'], 
                           ['Sunny', 'Cloudy', 'Cloudy']])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Alberta,Alberta,Ontario
Unnamed: 0_level_1,Unnamed: 1_level_1,Sunny,Cloudy,Cloudy
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [43]:
df.loc['b',1]['Ontario']

Cloudy    8
Name: (b, 1), dtype: int32

In [44]:
df.loc['b',1]['Alberta','Cloudy']

7

In [47]:
df.loc['b',1]['Ontario', 'Cloudy']

8

In [48]:
df.loc['b',1]

Alberta  Sunny     6
         Cloudy    7
Ontario  Cloudy    8
Name: (b, 1), dtype: int32

In [50]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Alberta,Alberta,Ontario
Unnamed: 0_level_1,Unnamed: 1_level_1,Sunny,Cloudy,Cloudy
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [53]:
df.loc['b'].loc[1]['Ontario']['Cloudy']

8

In [54]:
df.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [55]:
df.columns

MultiIndex([('Alberta',  'Sunny'),
            ('Alberta', 'Cloudy'),
            ('Ontario', 'Cloudy')],
           )

In [56]:
df.index.names = ['Outer', 'Inner']
df.columns.names = ['Province', 'Weather']
df

Unnamed: 0_level_0,Province,Alberta,Alberta,Ontario
Unnamed: 0_level_1,Weather,Sunny,Cloudy,Cloudy
Outer,Inner,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [57]:
df.T

Unnamed: 0_level_0,Outer,a,a,b,b
Unnamed: 0_level_1,Inner,1,2,1,2
Province,Weather,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Alberta,Sunny,0,3,6,9
Alberta,Cloudy,1,4,7,10
Ontario,Cloudy,2,5,8,11


In [59]:
df = df.swaplevel('Outer', 'Inner')
df

Unnamed: 0_level_0,Province,Alberta,Alberta,Ontario
Unnamed: 0_level_1,Weather,Sunny,Cloudy,Cloudy
Inner,Outer,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [60]:
df = df.sort_index(level=0)
df

Unnamed: 0_level_0,Province,Alberta,Alberta,Ontario
Unnamed: 0_level_1,Weather,Sunny,Cloudy,Cloudy
Inner,Outer,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [61]:
df = df.swaplevel(0, 1).sort_index(level = 0)
df

Unnamed: 0_level_0,Province,Alberta,Alberta,Ontario
Unnamed: 0_level_1,Weather,Sunny,Cloudy,Cloudy
Outer,Inner,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [62]:
# default operation will be applied on column axis levels
df.sum()

Province  Weather
Alberta   Sunny      18
          Cloudy     22
Ontario   Cloudy     26
dtype: int64

In [63]:
df.sum(level=0)

  df.sum(level=0)


Province,Alberta,Alberta,Ontario
Weather,Sunny,Cloudy,Cloudy
Outer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [64]:
df.sum(level=1)

  df.sum(level=1)


Province,Alberta,Alberta,Ontario
Weather,Sunny,Cloudy,Cloudy
Inner,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [65]:
df

Unnamed: 0_level_0,Province,Alberta,Alberta,Ontario
Unnamed: 0_level_1,Weather,Sunny,Cloudy,Cloudy
Outer,Inner,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [66]:
df['Alberta']

Unnamed: 0_level_0,Weather,Sunny,Cloudy
Outer,Inner,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [67]:
df['Alberta']['Sunny']

Outer  Inner
a      1        0
       2        3
b      1        6
       2        9
Name: Sunny, dtype: int32

In [68]:
df[['Alberta', 'Ontario']]

Unnamed: 0_level_0,Province,Alberta,Alberta,Ontario
Unnamed: 0_level_1,Weather,Sunny,Cloudy,Cloudy
Outer,Inner,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [73]:
df = pd.read_csv('stocks.csv', usecols=['date', 'open','close','Name'], nrows=5000)
df.head()

Unnamed: 0,date,open,close,Name
0,2013-02-08,15.07,14.75,AAL
1,2013-02-11,14.89,14.46,AAL
2,2013-02-12,14.45,14.27,AAL
3,2013-02-13,14.3,14.66,AAL
4,2013-02-14,14.94,13.99,AAL


In [74]:
df.set_index(['Name', 'date'], inplace=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,open,close
Name,date,Unnamed: 2_level_1,Unnamed: 3_level_1
AAL,2013-02-08,15.070,14.75
AAL,2013-02-11,14.890,14.46
AAL,2013-02-12,14.450,14.27
AAL,2013-02-13,14.300,14.66
AAL,2013-02-14,14.940,13.99
...,...,...,...
ABBV,2017-12-08,94.620,95.95
ABBV,2017-12-11,95.880,96.47
ABBV,2017-12-12,96.620,96.30
ABBV,2017-12-13,96.600,97.35


In [75]:
df.index

MultiIndex([( 'AAL', '2013-02-08'),
            ( 'AAL', '2013-02-11'),
            ( 'AAL', '2013-02-12'),
            ( 'AAL', '2013-02-13'),
            ( 'AAL', '2013-02-14'),
            ( 'AAL', '2013-02-15'),
            ( 'AAL', '2013-02-19'),
            ( 'AAL', '2013-02-20'),
            ( 'AAL', '2013-02-21'),
            ( 'AAL', '2013-02-22'),
            ...
            ('ABBV', '2017-12-01'),
            ('ABBV', '2017-12-04'),
            ('ABBV', '2017-12-05'),
            ('ABBV', '2017-12-06'),
            ('ABBV', '2017-12-07'),
            ('ABBV', '2017-12-08'),
            ('ABBV', '2017-12-11'),
            ('ABBV', '2017-12-12'),
            ('ABBV', '2017-12-13'),
            ('ABBV', '2017-12-14')],
           names=['Name', 'date'], length=5000)

In [79]:
df.loc['AAL'].head(2)

Unnamed: 0_level_0,open,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-02-08,15.07,14.75
2013-02-11,14.89,14.46


In [80]:
df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,open,close
Name,date,Unnamed: 2_level_1,Unnamed: 3_level_1
AAL,2013-02-08,15.07,14.75
AAL,2013-02-11,14.89,14.46


In [81]:
# get the opening and the closing prices of AAL on 2013-02-13
df.loc[('AAL','2013-02-13')]

open     14.30
close    14.66
Name: (AAL, 2013-02-13), dtype: float64

In [82]:
df.loc[('AAL','2013-02-13'), :]

open     14.30
close    14.66
Name: (AAL, 2013-02-13), dtype: float64

In [83]:
# get the opening price of AAL on 2013-02-13
df.loc[('AAL','2013-02-13')]['open']

14.3

In [84]:
df.loc[('AAL', '2013-02-13'), 'open']

14.3

In [85]:
# get the opening price of AAL and ABBV on 2013-02-13
df.loc[(['AAL', 'ABBV'], '2013-02-13'), 'open']

Name  date      
AAL   2013-02-13    14.30
ABBV  2013-02-13    35.42
Name: open, dtype: float64

In [86]:
df.loc[[('AAL','2013-02-13'),('ABBV','2013-02-13')],'open']

Name  date      
AAL   2013-02-13    14.30
ABBV  2013-02-13    35.42
Name: open, dtype: float64

In [87]:
# get the opening price of AAL and ABBV on 2013-02-13 and 14-02-2013
df.loc[(['AAL', 'ABBV'], ['2013-02-13', '2013-02-14']), 'open']

Name  date      
AAL   2013-02-13    14.30
      2013-02-14    14.94
ABBV  2013-02-13    35.42
      2013-02-14    35.05
Name: open, dtype: float64

In [91]:
df.loc[(['AAL', 'AAPL', 'AAP', 'ABBV'], '2013-02-14'), 'open']

Name  date      
AAL   2013-02-14    14.9400
AAPL  2013-02-14    66.3599
AAP   2013-02-14    78.6600
ABBV  2013-02-14    35.0500
Name: open, dtype: float64

In [88]:
df.loc[(slice(None),'2013-02-14'), 'open']

Name  date      
AAL   2013-02-14    14.9400
AAPL  2013-02-14    66.3599
AAP   2013-02-14    78.6600
ABBV  2013-02-14    35.0500
Name: open, dtype: float64

In [89]:
df['open'].unique()

array([15.07 , 14.89 , 14.45 , ..., 95.88 , 96.6  , 97.345])