In [1]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

In [2]:
# Reshaping with hierarchical indexing
# stack: "rotates" the columns of the data into rows -- the "stacking" operation; 2-D data is piled up into 1-D (stacked tall)
# unstack: "rotates" the rows of the data into columns -- the "un-stacking" operation; 1-D data is spread back out into 2-D

In [4]:
# Build a small 2x3 frame whose row axis is named 'state' and whose
# column axis is named 'number'; the axis names are what stack/unstack
# later use to label the index levels they create.
state_index = pd.Index(['Ohio', 'Colorado'], name='state')
number_columns = pd.Index(['one', 'two', 'three'], name='number')
cell_values = np.arange(6).reshape(2, 3)
data = DataFrame(cell_values, index=state_index, columns=number_columns)
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [6]:
# stack pivots the (only) column level into the innermost index level,
# turning the 2-D DataFrame into a 1-D Series. Because every row holds
# several columns, the result is naturally a two-level (state, number) index.
result = data.stack(level=-1)  # -1 (the innermost column level) is the default
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [7]:
# unstack turns a Series with a multi-level index back into a DataFrame
# by moving the innermost index level (the default, level=-1) to the columns.
result.unstack(level=-1)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [10]:
# By default stack/unstack operate on the innermost level: unstack moves the
# innermost index level to the columns, stack moves the columns to the
# innermost index level. Passing a level changes which one is moved.
print(result.unstack(level=0))  # the level can be given by position
result.unstack(level='state')  # ... or explicitly by level name

state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5


state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [12]:
# Two Series whose labels only partly overlap ('c' and 'd' are shared);
# concatenating them with keys produces a two-level index (key, label).
s1 = Series([0, 1, 2, 3], index=list('abcd'))
s2 = Series([4, 5, 6], index=list('cde'))
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [13]:
# When unstacking, the new columns are the union of the inner labels found
# under every outer key, so combinations with no data show up as NaN.
data2.unstack(level=-1)

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [14]:
# Conversely, stacking a DataFrame that contains NaN does not keep those
# entries: they are dropped from the result (see the output of this cell).
# NOTE(review): this relies on stack's default NaN handling, which differs
# between pandas versions -- confirm against the installed release.
data2.unstack(level=-1).stack(level=-1)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [15]:
# Two columns built from the stacked Series (so the frame inherits its
# two-level row index); the column axis is given the name 'side'.
right_values = result + 5
side_columns = pd.Index(['left', 'right'], name='side')
df = DataFrame({'left': result, 'right': right_values}, columns=side_columns)
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [16]:
# A DataFrame can be unstacked too, as long as its index is hierarchical.
# The inner index level is pulled out and becomes the innermost level
# of the columns.
df.unstack(level=-1)

side,left,left,left,right,right,right
number,one,two,three,one,two,three
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Ohio,0,1,2,5,6,7
Colorado,3,4,5,8,9,10


In [17]:
# Unstack the outer 'state' level instead of the default innermost level.
df.unstack(level='state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [18]:
# Move 'state' from the rows to the columns, then move the 'side' column
# level down into the rows.
df_by_state = df.unstack(level='state')
df_by_state.stack(level='side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


In [34]:
# Load the price series in "long" form: the first line of the file is
# skipped and the remaining two columns are named by hand; a constant
# 'item' column is then added so the frame has a (time, item) key pair.
spx_csv_path = '../examples/spx.csv'
ldata = pd.read_csv(spx_csv_path, skiprows=1, header=None)
ldata.columns = ['time', 'value']
ldata['item'] = 'unknown'
ldata

Unnamed: 0,time,value,item
0,1990-02-01 00:00:00,328.79,unknown
1,1990-02-02 00:00:00,330.92,unknown
2,1990-02-05 00:00:00,331.85,unknown
3,1990-02-06 00:00:00,329.66,unknown
4,1990-02-07 00:00:00,333.75,unknown
...,...,...,...
5467,2011-10-10 00:00:00,1194.89,unknown
5468,2011-10-11 00:00:00,1195.54,unknown
5469,2011-10-12 00:00:00,1207.25,unknown
5470,2011-10-13 00:00:00,1203.66,unknown


In [36]:
# Visual separator before the pivot section.
# NOTE(review): the stored output of this cell does not look like the output
# of this print call -- it may be stale from an earlier edit; re-run to confirm.
print("######################################################")
# pivot: unlike the stack/unstack operations, which apply to hierarchical indexes,
# pivot applies to data with a single-level index and spreads it out

item,unknown
time,Unnamed: 1_level_1
1990-02-01 00:00:00,328.79
1990-02-02 00:00:00,330.92
1990-02-05 00:00:00,331.85
1990-02-06 00:00:00,329.66
1990-02-07 00:00:00,333.75


In [37]:
# In a relational database, data is typically stored the way ldata is:
# the time and item columns act as the primary key and value holds the
# observation at each time. pivot reshapes it into a more DataFrame-like
# layout where each value sits at a specific (index, column) position.
#
# pivot takes three things: index= (the row labels), columns= (the column
# labels) and values= (the data placed at each index/column pair).
# They are passed by keyword because recent pandas releases reject
# positional arguments to DataFrame.pivot.
pivoted = ldata.pivot(index='time', columns='item', values='value')
pivoted.head()

item,unknown
time,Unnamed: 1_level_1
1990-02-01 00:00:00,328.79
1990-02-02 00:00:00,330.92
1990-02-05 00:00:00,331.85
1990-02-06 00:00:00,329.66
1990-02-07 00:00:00,333.75


In [38]:
# Add a second value column of standard-normal noise, one draw per row,
# so the frame has two value columns to reshape at once.
row_count = len(ldata)
ldata['value2'] = np.random.randn(row_count)
ldata

Unnamed: 0,time,value,item,value2
0,1990-02-01 00:00:00,328.79,unknown,-0.128324
1,1990-02-02 00:00:00,330.92,unknown,-0.556554
2,1990-02-05 00:00:00,331.85,unknown,0.951119
3,1990-02-06 00:00:00,329.66,unknown,0.569570
4,1990-02-07 00:00:00,333.75,unknown,0.893959
...,...,...,...,...
5467,2011-10-10 00:00:00,1194.89,unknown,-1.253363
5468,2011-10-11 00:00:00,1195.54,unknown,0.491323
5469,2011-10-12 00:00:00,1207.25,unknown,1.515392
5470,2011-10-13 00:00:00,1203.66,unknown,-0.099124


In [39]:
# With no values= argument, every remaining column is pivoted, which
# produces hierarchical columns: the unspecified data columns (value,
# value2) form the outer level and 'item' the inner level.
# Arguments are passed by keyword because recent pandas releases reject
# positional arguments to DataFrame.pivot.
pivoted2 = ldata.pivot(index='time', columns='item')
pivoted2

Unnamed: 0_level_0,value,value2
item,unknown,unknown
time,Unnamed: 1_level_2,Unnamed: 2_level_2
1990-02-01 00:00:00,328.79,-0.128324
1990-02-02 00:00:00,330.92,-0.556554
1990-02-05 00:00:00,331.85,0.951119
1990-02-06 00:00:00,329.66,0.569570
1990-02-07 00:00:00,333.75,0.893959
...,...,...
2011-10-10 00:00:00,1194.89,-1.253363
2011-10-11 00:00:00,1195.54,0.491323
2011-10-12 00:00:00,1207.25,1.515392
2011-10-13 00:00:00,1203.66,-0.099124


In [41]:
# set_index can also move columns into the index; using both key columns
# yields a two-level (time, item) row index.
key_columns = ['time', 'item']
ununstacked = ldata.set_index(keys=key_columns)
ununstacked

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value2
time,item,Unnamed: 2_level_1,Unnamed: 3_level_1
1990-02-01 00:00:00,unknown,328.79,-0.128324
1990-02-02 00:00:00,unknown,330.92,-0.556554
1990-02-05 00:00:00,unknown,331.85,0.951119
1990-02-06 00:00:00,unknown,329.66,0.569570
1990-02-07 00:00:00,unknown,333.75,0.893959
...,...,...,...
2011-10-10 00:00:00,unknown,1194.89,-1.253363
2011-10-11 00:00:00,unknown,1195.54,0.491323
2011-10-12 00:00:00,unknown,1207.25,1.515392
2011-10-13 00:00:00,unknown,1203.66,-0.099124


In [42]:
# Unstacking the innermost index level ('item') gives the same layout as
# the pivot without values= shown earlier.
ununstacked.unstack(level=-1)

Unnamed: 0_level_0,value,value2
item,unknown,unknown
time,Unnamed: 1_level_2,Unnamed: 2_level_2
1990-02-01 00:00:00,328.79,-0.128324
1990-02-02 00:00:00,330.92,-0.556554
1990-02-05 00:00:00,331.85,0.951119
1990-02-06 00:00:00,329.66,0.569570
1990-02-07 00:00:00,333.75,0.893959
...,...,...
2011-10-10 00:00:00,1194.89,-1.253363
2011-10-11 00:00:00,1195.54,0.491323
2011-10-12 00:00:00,1207.25,1.515392
2011-10-13 00:00:00,1203.66,-0.099124
