In [None]:
# There are a number of fundamental operations for rearranging tabular data. These are
# referred to interchangeably as reshape or pivot operations.

In [None]:
# Reshaping with Hierarchical Indexing

In [4]:
#  Hierarchical indexing provides a consistent way to rearrange data in a DataFrame.
#  There are two primary actions:
#  • stack: this “rotates” or pivots from the columns in the data to the rows
#  • unstack: this pivots from the rows into the columns

import numpy as np
import pandas as pd

# Small 2x3 example frame: rows are labelled by 'state', columns by 'number'.
# Both axes carry a name so the levels can be referred to by name later on.
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [7]:
# stack() moves the column labels into the row index; applied to `data`
# it yields a Series indexed by (state, number).
result = data.stack(level=-1)  # -1 = innermost column level (the default)
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [10]:
# unstack() is the inverse step: it turns a hierarchically indexed Series
# back into a DataFrame by moving an index level into the columns.
result.unstack(level=-1)  # -1 = innermost index level (the default)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [11]:
# By default the innermost level is unstacked (the same holds for stack).
# A different level can be selected by passing its number or its name.
result.unstack(level=0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [12]:
result.unstack(level='state')  # alternate option: select the level by name

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [14]:
# Unstacking can introduce missing data when some values of the unstacked
# level are not present in every subgroup.
s1 = pd.Series([0, 1, 2, 3], index=list('abcd'))
s2 = pd.Series([4, 5, 6], index=list('cde'))

# The mapping keys become the outer index level ('one' / 'two').
data2 = pd.concat({'one': s1, 'two': s2})
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [16]:
# stack() drops missing values by default, which makes unstack -> stack a
# round trip; dropna=False keeps the NaN rows instead.
# NOTE(review): newer pandas releases appear to deprecate `dropna` on
# DataFrame.stack -- confirm against the installed version.
print(
    data2.unstack().stack(),
    data2.unstack().stack(dropna=False),
    sep="\n\n",  # one blank line between the two printed Series
)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64


In [19]:
# When a DataFrame is unstacked, the level that is moved out of the index
# becomes the lowest (innermost) level of the resulting columns.
df = pd.DataFrame(
    {'left': result, 'right': result.add(5)},
    columns=pd.Index(['left', 'right'], name='side'),
)
print(df)
df.unstack()

side             left  right
state    number             
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10


side,left,left,left,right,right,right
number,one,two,three,one,two,three
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Ohio,0,1,2,5,6,7
Colorado,3,4,5,8,9,10


In [22]:
# Move 'state' from the rows into the columns, then move 'side' from the
# columns back into the rows; both levels are selected by name.
print(df.unstack(level='state'))
df.unstack(level='state').stack(level='side')

side   left          right         
state  Ohio Colorado  Ohio Colorado
number                             
one       0        3     5        8
two       1        4     6        9
three     2        5     7       10


Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


In [1]:
# Pivoting “long” to “wide” Format

# NOTE: The examples in this section are reproduced from the book as comments only;
# the dataset behind `ldata` is not available here, so they cannot be executed.

In [None]:
#  A common way to store multiple time series in databases and CSV is in so-called long
#  or stacked format:
#  In [116]: ldata[:10]
#  Out[116]:
#                  date     item     value
#  0 1959-03-31 00:00:00  realgdp  2710.349
#  1 1959-03-31 00:00:00     infl     0.000
#  2 1959-03-31 00:00:00    unemp     5.800
#  3 1959-06-30 00:00:00  realgdp  2778.801
#  4 1959-06-30 00:00:00     infl     2.340
#  5 1959-06-30 00:00:00    unemp     5.100
#  6 1959-09-30 00:00:00  realgdp  2775.488
#  7 1959-09-30 00:00:00     infl     2.740
#  8 1959-09-30 00:00:00    unemp     5.300
#  9 1959-12-31 00:00:00  realgdp  2785.204

In [None]:
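# You might prefer to have a DataFrame containing one column per distinct
#  item value indexed by timestamps in the date column. DataFrame’s pivot method
#  performs exactly this transformation:
#  In [117]: pivoted = ldata.pivot('date', 'item', 'value')
#  (NOTE: newer pandas versions may require keyword arguments here, i.e.
#   ldata.pivot(index='date', columns='item', values='value') -- verify
#   against the installed version.)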
#  In [118]: pivoted.head()
#  Out[118]:
#  item        infl   realgdp  unemp
#  date
#  1959-03-31  0.00  2710.349    5.8
#  1959-06-30  2.34  2778.801    5.1
#  1959-09-30  2.74  2775.488    5.3
#  1959-12-31  0.27  2785.204    5.6
#  1960-03-31  2.31  2847.699    5.2

In [None]:
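# Note that pivot is just a shortcut for creating a hierarchical index using set_index and
#  reshaping with unstack.
#  (The output below also shows a value2 column that does not appear in the ldata preview
#  above; presumably a second value column was added to ldata before this step.)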
#  In [124]: unstacked = ldata.set_index(['date', 'item']).unstack('item')
#  In [125]: unstacked[:7]
#  Out[125]:
#             value                     value2
#  item         infl   realgdp  unemp      infl   realgdp     unemp
#  date
#  1959-03-31   0.00  2710.349    5.8 -0.438570  1.669025 -0.539741
#  1959-06-30   2.34  2778.801    5.1  3.248944  0.476985 -1.021228
#  1959-09-30   2.74  2775.488    5.3  0.124121 -0.577087  0.302614
#  1959-12-31   0.27  2785.204    5.6  0.000940  0.523772  1.343810
#  1960-03-31   2.31  2847.699    5.2 -0.831154 -0.713544 -2.370232
#  1960-06-30   0.14  2834.390    5.2 -0.860757 -1.860761  0.560145
#  1960-09-30   2.70  2839.022    5.6  0.119827 -1.265934 -1.063512

In [None]:
# END