# Hierarchical Indexing
## Hierarchical indexing

In [95]:
import numpy as np
import pandas as pd

# Nine random values labelled by two aligned arrays; passing a list of arrays
# as `index` produces a two-level MultiIndex.
data = pd.Series(
    np.random.randn(9),
    index=[list('aaabbccdd'), [1, 2, 3, 1, 3, 1, 2, 2, 3]],
)
data.index.names = ['key1', 'key2']  # name the outer and inner levels
data

key1  key2
a     1      -0.727056
      2       1.433781
      3       0.152062
b     1       2.373529
      3      -0.304197
c     1      -0.141102
      2       0.842371
d     2       0.236001
      3       0.076705
dtype: float64

In [9]:
# Inspect the index object backing the series (a MultiIndex with levels 'key1' and 'key2').
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]],
           names=['key1', 'key2'])

In [15]:
# Partial indexing: select by an outer-level ('key1') label only.
data['b']

key2
1    1.350293
3   -0.996060
dtype: float64

In [14]:
# Select outer label 'b' via .loc, with an explicit full slice on the inner level.
data.loc['b', :]

key1  key2
b     1       1.350293
      3      -0.996060
dtype: float64

In [16]:
# Label slice over the outer level; label slices include both endpoints ('b' and 'c').
data.loc['b':'c']

key1  key2
b     1       1.350293
      3      -0.996060
c     1      -0.836520
      2      -1.246362
dtype: float64

In [18]:
# Keep every outer label and slice the inner level by label; of the labels
# covered by 2:4 only 2 and 3 exist in 'key2'.
data.loc[:, 2:4]

key1  key2
a     2      -0.689159
      3      -0.051046
b     3      -0.996060
c     2      -1.246362
d     2       1.300064
      3      -1.076836
dtype: float64

## Swapping and sorting levels 

In [19]:
# Show the series again as the starting point for the level-swapping examples.
data

key1  key2
a     1       0.877426
      2      -0.689159
      3      -0.051046
b     1       1.350293
      3      -0.996060
c     1      -0.836520
      2      -1.246362
d     2       1.300064
      3      -1.076836
dtype: float64

In [20]:
# swaplevel exchanges two index levels, addressed either by name or by position.
# It does not reorder the rows. Only the last expression of a cell is rendered,
# so the displayed result comes from the second call.
data.swaplevel('key1', 'key2')
data.swaplevel(0, 1)  # same

key2  key1
1     a       0.877426
2     a      -0.689159
3     a      -0.051046
1     b       1.350293
3     b      -0.996060
1     c      -0.836520
2     c      -1.246362
      d       1.300064
3     d      -1.076836
dtype: float64

In [22]:
# After swapping, sort the rows by the new outer level (position 0, i.e. 'key2').
data.swaplevel('key1', 'key2').sort_index(level=0)

key2  key1
1     a       0.877426
      b       1.350293
      c      -0.836520
2     a      -0.689159
      c      -1.246362
      d       1.300064
3     a      -0.051046
      b      -0.996060
      d      -1.076836
dtype: float64

## Summary statistics by level 

In [25]:
# Show the series again as the input to the per-level aggregation.
data

key1  key2
a     1       0.877426
      2      -0.689159
      3      -0.051046
b     1       1.350293
      3      -0.996060
c     1      -0.836520
      2      -1.246362
d     2       1.300064
      3      -1.076836
dtype: float64

In [24]:
# Sum the values within each label of the outer index level (level 0, 'key1').
# `Series.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent and gives the same result.
data.groupby(level=0).sum()

key1
a    0.137221
b    0.354233
c   -2.082883
d    0.223228
dtype: float64

## Setting indices using columns 

In [32]:
# Small demo frame: two numeric columns plus 'c' and 'd', which are used as
# index levels in the following cells.
df = pd.DataFrame({
    'a': np.arange(5),
    'b': np.arange(5, 0, -1),
    'c': list('xxyyy'),
    'd': [0, 1, 0, 1, 2],
})
df

Unnamed: 0,a,b,c,d
0,0,5,x,0
1,1,4,x,1
2,2,3,y,0
3,3,2,y,1
4,4,1,y,2


In [33]:
# Build a new frame whose row index is the two-level (c, d) combination;
# the result is bound to `df2`, `df` is not reassigned.
df2 = df.set_index(['c', 'd'])  # move columns 'c' and 'd' into the row index
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
x,0,0,5
x,1,1,4
y,0,2,3
y,1,3,2
y,2,4,1


In [34]:
# Inverse of set_index: the index levels become ordinary columns again and
# the rows get a fresh 0..n-1 index.
df2.reset_index()  # moves index levels back into columns

Unnamed: 0,c,d,a,b
0,x,0,0,5
1,x,1,1,4
2,y,0,2,3
3,y,1,3,2
4,y,2,4,1


# Combining and Merging Datasets
## Joining DataFrames
* Conceptually, an inner join of 2 DataFrames on a key is the same as a cross join filtered to retain only records with matching values of the key.

In [84]:
# Left table: keys repeat, and 'c' has no partner in the right table.
df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': range(7)})
# Right table: one row per key, and 'd' has no partner in the left table.
df2 = pd.DataFrame({'key': list('abd'), 'data2': range(3)})

print(df1, '\n')
print(df2, '\n')

# The function form and the method form are interchangeable; `how` defaults
# to an inner join. Only the last expression is rendered as the cell output.
pd.merge(df1, df2, on='key')
df1.merge(df2, on='key')

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6 

  key  data2
0   a      0
1   b      1
2   d      2 



Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [86]:
# Left join: every row of df1 is kept; rows whose key is missing from df2
# (here 'c') get NaN in the columns coming from df2.
pd.merge(df1, df2, on='key', how='left')  # left join
df1.merge(df2, on='key', how='left')  # same

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [88]:
# When the join columns carry different names in the two frames, name them
# explicitly with left_on / right_on; both key columns appear in the result.
df1 = pd.DataFrame({'key1': list('bbacaab'), 'data1': range(7)})
df2 = pd.DataFrame({'key2': list('abc'), 'data2': range(3)})
pd.merge(df1, df2, left_on='key1', right_on='key2')
df1.merge(df2, left_on='key1', right_on='key2')  # method form, same result

Unnamed: 0,key1,data1,key2,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0
6,c,3,c,2


In [55]:
# Both frames have a non-key column called 'data'; `suffixes` decides how the
# two are renamed in the merged result.
df1 = pd.DataFrame({'key': list('bbacaab'), 'data': range(7)})
df2 = pd.DataFrame({'key': list('abc'), 'data': range(3)})
pd.merge(df1, df2, on='key', suffixes=('_left', '_right'))

Unnamed: 0,key,data_left,data_right
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0
6,c,3,2


In [57]:
# Same merge as before, but sort=True orders the result by the join key.
pd.merge(df1, df2, on='key', suffixes=('_left', '_right'), sort=True)

Unnamed: 0,key,data_left,data_right
0,a,2,0
1,a,4,0
2,a,5,0
3,b,0,1
4,b,1,1
5,b,6,1
6,c,3,2


## Joining on index 

In [62]:
df1 = pd.DataFrame({'key': list('bbaca'), 'value1': range(5)})
df2 = pd.DataFrame({'value2': range(10, 13)}, index=list('abc'))
print(df1, '\n\n', df2, '\n')

# Match the 'key' column of df1 against the row labels of df2 instead of
# against one of its columns.
pd.merge(df1, df2, left_on='key', right_index=True)

  key  value1
0   b       0
1   b       1
2   a       2
3   c       3
4   a       4 

    value2
a      10
b      11
c      12 



Unnamed: 0,key,value1,value2
0,b,0,11
1,b,1,11
2,a,2,10
4,a,4,10
3,c,3,12


In [73]:
# DataFrame.join merges two frames on their row index.
df1 = pd.DataFrame({'value1': range(5)}, index=list('bbaca'))
df2 = pd.DataFrame({'value2': [10, 11, 12]}, index=list('abd'))
print(df1, '\n\n', df2, '\n')

df1.join(df2)  # how='left' unless stated otherwise

   value1
b       0
b       1
a       2
c       3
a       4 

    value2
a      10
b      11
d      12 



Unnamed: 0,value1,value2
a,2,10.0
a,4,10.0
b,0,11.0
b,1,11.0
c,3,


In [74]:
# Outer join: keep the union of the index labels of both frames; labels
# present on one side only ('c', 'd') get NaN for the other side's column.
df1.join(df2, how='outer') 

Unnamed: 0,value1,value2
a,2.0,10.0
a,4.0,10.0
b,0.0,11.0
b,1.0,11.0
c,3.0,
d,,12.0


In [77]:
# joining 3 DataFrames:
# passing a list to join() merges all of the listed frames onto df1 by index.
df3 = pd.DataFrame({'value3': [20, 21, 22]}, index=['b', 'a', 'c'])
df1.join([df2, df3], how='outer')

Unnamed: 0,value1,value2,value3
a,2.0,10.0,21.0
a,4.0,10.0,21.0
b,0.0,11.0,20.0
b,1.0,11.0,20.0
c,3.0,,22.0
d,,12.0,


## Concatenating along axis 
### Concatenating Series

In [126]:
s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))

# axis=0 places the series one after the other in a single, longer series.
pd.concat([s1, s2], axis=0)

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [127]:
s3 = pd.Series([5, 6, 7], index=list('abc'))
# axis=1 aligns the series on their index labels and returns a DataFrame with
# one column per input series.
pd.concat([s1, s3], axis=1, join='outer', sort=False)

Unnamed: 0,0,1
a,0.0,5
b,1.0,6
c,,7


In [128]:
# Retain only the rows labelled 'a' and 'b'.
# The `join_axes` argument of `pd.concat` was deprecated in pandas 0.25 and
# removed in 1.0, so reindex each input to the wanted labels before
# concatenating. No missing values are introduced for these labels, so the
# integer dtype is preserved.
pd.concat([s1.reindex(['a', 'b']), s3.reindex(['a', 'b'])], axis=1, sort=False)

Unnamed: 0,0,1
a,0,5
b,1,6


In [102]:
# `keys` labels each input series; along axis 0 those labels become the outer
# level of a hierarchical index on the result.
pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])  # creates hierarchical index

one    a    0
       b    1
two    a    0
       b    1
three  a    5
       b    6
       c    7
dtype: int64

In [105]:
# Along axis=1 the keys become the column names of the resulting DataFrame.
# Passing a dict is shorthand for passing its values with its keys as `keys`.
pd.concat([s1, s1, s3], axis=1, join='outer',
          keys=['one', 'two', 'three'], sort=False)
pd.concat({'one': s1, 'two': s1, 'three': s3},
          axis=1, join='outer',  sort=False)  # same

Unnamed: 0,one,two,three
a,0.0,0.0,5
b,1.0,1.0,6
c,,,7


### Concatenating DataFrames/Series

In [122]:
# Row-wise concatenation (axis 0) of two frames whose columns only partly overlap.
df1 = pd.DataFrame(np.random.randn(2, 3), columns=list('ABC'))
df2 = pd.DataFrame(np.random.randn(2, 2), columns=list('AC'))
print(df1, '\n\n', df2)

# First call keeps each frame's own row labels (0, 1, 0, 1).
pd.concat([df1, df2], axis=0, join='outer', sort=False)
# Second call discards them and renumbers the rows 0..n-1; this is the result displayed.
pd.concat([df1, df2], axis=0, join='outer', ignore_index=True, sort=False)

          A         B         C
0 -0.020517  0.553872  0.861029
1 -0.725692  0.332890 -0.804782 

           A         C
0  1.507996  0.677283
1 -0.542809  1.397429


Unnamed: 0,A,B,C
0,-0.020517,0.553872,0.861029
1,-0.725692,0.33289,-0.804782
2,1.507996,,0.677283
3,-0.542809,,1.397429


In [113]:
# Column-wise concatenation (axis 1): rows are aligned on their index labels.
df1 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=list('abc'), columns=['one', 'two'])
df2 = pd.DataFrame(np.arange(5, 9).reshape(2, 2),
                   index=list('ac'), columns=['three', 'four'])
print(df1, '\n\n', df2)

# `keys` adds an outer column level naming the source table; `names` labels
# the two column levels.
pd.concat(
    [df1, df2],
    axis=1,
    join='outer',
    keys=['table1', 'table2'],
    names=['upper', 'lower'],
    sort=False,
)

   one  two
a    0    1
b    2    3
c    4    5 

    three  four
a      5     6
c      7     8


upper,table1,table1,table2,table2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [138]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=list('abc'), columns=['one', 'two'])
s2 = pd.Series([10, 11, 12], index=list('abc'), name='X')
print(df1, '\n')
print(s2)

# A named Series concatenated along axis=1 becomes a column carrying its name.
pd.concat([df1, s2], axis=1)

   one  two
a    0    1
b    2    3
c    4    5 

a    10
b    11
c    12
Name: X, dtype: int64


Unnamed: 0,one,two,X
a,0,1,10
b,2,3,11
c,4,5,12


## Combine with overlap
* `combine_first()`

In [143]:
df1 = pd.DataFrame({'a': [np.nan, 1, 2],
                    'b': [5, 6, np.nan],
                    'c': range(10, 16, 2)})
df2 = pd.DataFrame({'a': [-1, 4, np.nan, 9], 'b': range(4)})
print(df1, '\n')
print(df2)

# Patch df1 with df2: wherever df1[i, j] is NA or absent, the value from
# df2[i, j] is used in the result.
df1.combine_first(df2)

     a    b   c
0  NaN  5.0  10
1  1.0  6.0  12
2  2.0  NaN  14 

     a  b
0 -1.0  0
1  4.0  1
2  NaN  2
3  9.0  3


Unnamed: 0,a,b,c
0,-1.0,5.0,10.0
1,1.0,6.0,12.0
2,2.0,2.0,14.0
3,9.0,3.0,


# Reshaping and Pivoting 
## Reshaping with hierarchical indexing
* `stack()`, `unstack()`

In [154]:
# 2x3 frame with a named row index ('key1') and a named column index ('V').
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(list('ab'), name='key1'),
    columns=pd.Index(['v1', 'v2', 'v3'], name='V'),
)
print(data)

# stack() pivots the columns into the innermost level of a hierarchical row
# index, turning the frame into a Series.
s1 = data.stack()
s1

V     v1  v2  v3
key1            
a      0   1   2
b      3   4   5


key1  V 
a     v1    0
      v2    1
      v3    2
b     v1    3
      v2    4
      v3    5
dtype: int64

In [164]:
# unstack() is the inverse of stack(): by default the innermost index level
# moves into the columns. The level may also be given by position or by name.
# Only the last expression is rendered as the cell output.
s1.unstack()  # the innermost level of index becomes columns
s1.unstack(1)  # same
s1.unstack('V')  # same

V,v1,v2,v3
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0,1,2
b,3,4,5


In [165]:
# Unstack the outer level instead: 'key1' labels become the columns and 'V'
# stays as the row index.
s1.unstack('key1')  # the index level 'key1' becomes columns
s1.unstack(0)  # same

key1,a,b
V,Unnamed: 1_level_1,Unnamed: 2_level_1
v1,0,3
v2,1,4
v3,2,5


## Pivoting "long" to "wide" format 

In [166]:
# "Long" layout: one row per (year, item) observation, with a random value.
df = pd.DataFrame({
    'year': [y for y in (2001, 2002, 2003) for _ in range(3)],
    'item': list('ABC') * 3,
    'value': np.random.randn(9),
})
df

Unnamed: 0,year,item,value
0,2001,A,-1.034591
1,2001,B,0.818607
2,2001,C,0.009776
3,2002,A,-0.114091
4,2002,B,-1.306164
5,2002,C,-0.81794
6,2003,A,-1.575751
7,2003,B,-0.522755
8,2003,C,0.555181


In [189]:
# Long -> wide: one row per 'year', one column per 'item', cells taken from 'value'.
# The arguments are passed by keyword because positional arguments to
# `DataFrame.pivot` are rejected by pandas 2.0 and later.
df1 = df.pivot(index='year', columns='item', values='value')
df1

item,A,B,C
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,-1.034591,0.818607,0.009776
2002,-0.114091,-1.306164,-0.81794
2003,-1.575751,-0.522755,0.555181


In [215]:
# Same reshaping via the index: make (year, item) the row index, then move
# the inner level 'item' into the columns. Unlike pivot with an explicit
# values column, the result keeps 'value' as an outer column level.
df1 = df.set_index(['year', 'item']).unstack()  # same
df1

Unnamed: 0_level_0,value,value,value
item,A,B,C
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2001,-1.034591,0.818607,0.009776
2002,-0.114091,-1.306164,-0.81794
2003,-1.575751,-0.522755,0.555181


## Pivoting "wide" to "long" format

In [216]:
# Flatten the two-level column header (drop the outer 'value' level) and turn
# 'year' back into an ordinary column.
# Work on a copy so that `df1` is not mutated: reassigning `df1.columns` in
# place made this cell fail when run a second time, because the outer level
# had already been dropped.
df2 = df1.copy()
df2.columns = df2.columns.droplevel([0])
df2 = df2.reset_index()
df2

item,year,A,B,C
0,2001,-1.034591,0.818607,0.009776
1,2002,-0.114091,-1.306164,-0.81794
2,2003,-1.575751,-0.522755,0.555181


In [219]:
# Wide -> long: keep 'year' as the identifier and fold every other column
# into (variable, value) pairs. The variable column is called 'item' here
# because that is the name of df2's column index.
pd.melt(df2, id_vars=['year'])

Unnamed: 0,year,item,value
0,2001,A,-1.034591
1,2002,A,-0.114091
2,2003,A,-1.575751
3,2001,B,0.818607
4,2002,B,-1.306164
5,2003,B,-0.522755
6,2001,C,0.009776
7,2002,C,-0.81794
8,2003,C,0.555181


In [220]:
# value_vars restricts the melt to the listed columns; column 'C' is left out.
pd.melt(df2, id_vars=['year'], value_vars=['A', 'B'])

Unnamed: 0,year,item,value
0,2001,A,-1.034591
1,2002,A,-0.114091
2,2003,A,-1.575751
3,2001,B,0.818607
4,2002,B,-1.306164
5,2003,B,-0.522755


In [223]:
# Without id_vars no identifier column is carried over: the result contains
# only the variable name and the value.
pd.melt(df2, value_vars=['A', 'B'])

Unnamed: 0,item,value
0,A,-1.034591
1,A,-0.114091
2,A,-1.575751
3,B,0.818607
4,B,-1.306164
5,B,-0.522755
