In [1]:
# Core numerical / tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Seed NumPy's global RNG so frames built from np.random later in the notebook are reproducible.
np.random.seed(42)

# NOTE(review): hard-coded absolute Windows path that is not referenced in the visible cells;
# consider a configurable relative path if it is needed at all.
data_dir = "D:/python/np_pd_mat/datasets/"

### 分层索引
在DataFrame中，每个轴都可以拥有分层索引:

In [2]:
# 4x3 frame whose rows and columns each carry a two-level (hierarchical) index.
frame = DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), ['1a', '2a', '1b', '2b']],
    columns=[['SH', 'SH', 'SZ'], ['PD', 'PX', 'NS']],
)

# Every level of a hierarchical index may be given a name.
frame.index.names, frame.columns.names = ['k1', 'k2'], ['city', 'dist']
frame

Unnamed: 0_level_0,city,SH,SH,SZ
Unnamed: 0_level_1,dist,PD,PX,NS
k1,k2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1a,0,1,2
a,2a,3,4,5
b,1b,6,7,8
b,2b,9,10,11


In [3]:
# Selecting an outer-level column label returns the sub-frame beneath it, keyed by the inner 'dist' level.
frame['SZ']

Unnamed: 0_level_0,dist,NS
k1,k2,Unnamed: 2_level_1
a,1a,2
a,2a,5
b,1b,8
b,2b,11


### 重排序和层级排序

In [4]:
# swaplevel takes two level numbers or level names and returns a new object
# with those levels interchanged (the data itself is unchanged).
frame.swaplevel('k1', 'k2')

Unnamed: 0_level_0,city,SH,SH,SZ
Unnamed: 0_level_1,dist,PD,PX,NS
k2,k1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1a,a,0,1,2
2a,a,3,4,5
1b,b,6,7,8
2b,b,9,10,11


In [5]:
# After rearranging levels, chain sort_index so the result is ordered lexicographically by the given level.
frame.swaplevel('k1', 'k2').sort_index(level='k2')

Unnamed: 0_level_0,city,SH,SH,SZ
Unnamed: 0_level_1,dist,PD,PX,NS
k2,k1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1a,a,0,1,2
1b,b,6,7,8
2a,a,3,4,5
2b,b,9,10,11


### 按层级进行数据汇总

In [6]:
# Sum the rows within each group of the outer row level 'k1'.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# groupby(level=...).sum() is the documented replacement and gives the same table.
frame.groupby(level='k1').sum()

city,SH,SH,SZ
dist,PD,PX,NS
k1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [7]:
# Sum the columns that share the same outer column level 'city'.
# DataFrame.sum(level=..., axis=1) was removed in pandas 2.0, and groupby(axis=1)
# is deprecated as well, so group on the transposed frame and transpose back.
frame.T.groupby(level='city').sum().T

Unnamed: 0_level_0,city,SH,SZ
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1a,1,2
a,2a,7,5
b,1b,13,8
b,2b,19,11


### 使用DataFrame的列进行索引
经常需要把DataFrame中的一个或多个列用作行索引；反过来，也可能需要将行索引移回到DataFrame的列中：

In [8]:
# Small demo frame: two numeric columns plus two columns ('c', 'd') that will serve as index keys.
df = DataFrame({
    'a': range(5),
    'b': range(5, 0, -1),
    'c': ['one', 'one', 'two', 'two', 'two'],
    'd': [0, 1, 0, 1, 2],
})
df

Unnamed: 0,a,b,c,d
0,0,5,one,0
1,1,4,one,1
2,2,3,two,0
3,3,2,two,1
4,4,1,two,2


In [9]:
# set_index builds a new DataFrame that uses one or more columns as its index.
# By default those columns are removed from the frame; pass drop=False to keep them as columns too.
df2 = df.set_index(['c', 'd'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,5
one,1,1,4
two,0,2,3
two,1,3,2
two,2,4,1


In [10]:
# reset_index moves the index levels ('c', 'd') back into ordinary columns.
df2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,5
1,one,1,1,4
2,two,0,2,3
3,two,1,3,2
4,two,2,4,1


### 联合与合并数据集 --> 数据库风格的DataFrame连接

In [11]:
# Inputs for the merge examples: 'key' repeats in df1 and is unique in df2.
df1 = DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a'],
    'data1': range(5),
})
df2 = DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})
print(df1, '------------', df2, sep='\n')

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
------------
  key  data2
0   a      0
1   b      1
2   d      2


如果连接的键没有指定，merge会自动将重叠列名作为连接的键，但最好显式地指定连接键。默认情况下，merge做的是内连接，返回两张表的交集:

In [12]:
pd.merge(df1, df2, on='key', how='inner')          # keys present in both frames are 'a' and 'b'; how also accepts 'outer', 'left', 'right'

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0


如果每个对象的列名是不同的，可以分别为它们指定列名：

In [13]:
# The join key has a different column name on each side, so name both explicitly.
df3 = DataFrame({
    'lkey': ['b', 'b', 'a', 'c', 'a'],
    'data1': range(5),
})
df4 = DataFrame({
    'rkey': ['a', 'b', 'd'],
    'data2': range(3),
})
df3.merge(df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,a,2,a,0
3,a,4,a,0


使用多个键进行合并时，传入一个列名的列表：

In [14]:
# Two frames sharing the composite key (key1, key2); the outer merge keeps unmatched rows from both sides.
left = DataFrame({
    'key1': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'Lval': [1, 2, 3],
})
right = DataFrame({
    'key1': ['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'Rval': [4, 5, 6, 7],
})
print(left, '----------------------', right, '----------------------', sep='\n')
left.merge(right, on=['key1', 'key2'], how='outer')

  key1 key2  Lval
0  foo  one     1
1  foo  two     2
2  bar  one     3
----------------------
  key1 key2  Rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7
----------------------


Unnamed: 0,key1,key2,Lval,Rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


合并操作中最后一个要考虑的问题是如何处理重叠的列名：

In [15]:
# Merging on 'key1' alone: both frames also carry a 'key2' column, so merge tells them apart with the default _x / _y suffixes.
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,Lval,key2_y,Rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [16]:
# suffixes= supplies the strings appended to overlapping column names in place of the default _x / _y.
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,Lval,key2_right,Rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


### 根据索引合并
在某些情况下，DataFrame中用于合并的键是它的索引，此时，可以传递left_index=True或者right_index=True(或者都传)来表示索引需要作为合并的键:

In [17]:
left1 = DataFrame({
    'key': ['a', 'b', 'a', 'b', 'c'],
    'value': range(5),
})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
# Join left1's 'key' column against right1's index.
left1.merge(right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
1,b,1,7.0
3,b,3,7.0


在多层索引的情况下，在索引上连接是一个隐式的多键合并：

In [18]:
left3 = DataFrame({
    'key1': ['SH', 'SH', 'SZ'],
    'key2': [2000, 2001, 2001],
    'data': np.arange(3.0),
})
right3 = DataFrame(
    np.arange(12).reshape(6, 2),
    index=[
        ['SZ', 'SZ', 'SH', 'SH', 'SH', 'SH'],
        [2000 + offset for offset in (1, 0, 0, 0, 1, 2)],
    ],
    columns=['e1', 'e2'],
)
print(left3, '-------------------', right3, '----------------------------', sep='\n')

# With a two-level index on the right, the left-hand keys must be given as a list of columns
# (how='outer' is used here to cope with the duplicated index values).
left3.merge(right3, left_on=['key1', 'key2'], right_index=True, how='outer')

  key1  key2  data
0   SH  2000   0.0
1   SH  2001   1.0
2   SZ  2001   2.0
-------------------
         e1  e2
SZ 2001   0   1
   2000   2   3
SH 2000   4   5
   2000   6   7
   2001   8   9
   2002  10  11
----------------------------


Unnamed: 0,key1,key2,data,e1,e2
0,SH,2000,0.0,4,5
0,SH,2000,0.0,6,7
1,SH,2001,1.0,8,9
2,SZ,2001,2.0,0,1
2,SZ,2000,,2,3
2,SH,2002,,10,11


使用两边的索引进行合并也是可以的：

In [19]:
# Two frames with partially overlapping row labels and disjoint column names.
left4 = DataFrame(
    [[1, 2], [3, 4], [5, 6]],
    index=list('ace'),
    columns=['SH', 'SZ'],
)
right4 = DataFrame(
    [[7, 8], [9, 10], [11, 12]],
    index=list('bcd'),
    columns=['East', 'South'],
)
print(left4, '---------------', right4, '---------------------', sep='\n')

# Use the index of both sides as the join key.
left4.merge(right4, how='outer', left_index=True, right_index=True)

   SH  SZ
a   1   2
c   3   4
e   5   6
---------------
   East  South
b     7      8
c     9     10
d    11     12
---------------------


Unnamed: 0,SH,SZ,East,South
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,,


DataFrame有一个方便的join实例方法，用于按照索引合并，也可以用于合并**多个索引相同或相似但没有重叠列**的DataFrame对象：

In [20]:
# join aligns the two frames on their index; how='outer' keeps the union of both sets of row labels.
left4.join(right4, how='outer')

Unnamed: 0,SH,SZ,East,South
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,,


### 沿轴向连接

In [21]:
s1 = Series({'a': 0, 'b': 1})
s2 = Series({'c': 2, 'd': 3, 'e': 4})
# concat glues the values and the index labels of its inputs together end to end.
pd.concat([s1, s2])

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [22]:
# concat works along axis=0 by default; axis=1 places the Series side by side as columns instead.
pd.concat([s1, s2], axis=1, sort=False)

Unnamed: 0,0,1
a,0.0,
b,1.0,
c,,2.0
d,,3.0
e,,4.0


In [23]:
s3 = pd.concat([s1, s2])
pd.concat([s1, s3], axis=1, sort=False)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
c,,2
d,,3
e,,4


In [24]:
# With join="inner" only the labels shared by every input are kept, so 'c', 'd' and 'e' disappear.
pd.concat([s1, s3], axis=1, join="inner", sort=False)

Unnamed: 0,0,1
a,0,0
b,1,1


在连接的轴上创建一个多层索引：

In [25]:
# keys= labels each input piece, producing an outer index level ('one' / 'two') along the concatenation axis.
result = pd.concat([s1, s2], keys=['one', 'two'])
result

one  a    0
     b    1
two  c    2
     d    3
     e    4
dtype: int64

沿着轴向axis=1连接Series时，keys则成为DataFrame的列头：

In [26]:
# Along axis=1 the keys become the column headers of the resulting DataFrame.
pd.concat([s1, s2], axis=1, keys=['one', 'two'], sort=False)

Unnamed: 0,one,two
a,0.0,
b,1.0,
c,,2.0
d,,3.0
e,,4.0


将相同的逻辑拓展到DataFrame对象：

In [27]:
df1 = DataFrame(
    np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
df2 = DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=list('ac'),
    columns=['three', 'four'],
)
print(df1, '---------------', df2, '---------------------', sep='\n')

# Equivalent to: pd.concat({'level1': df1, 'level2': df2}, axis=1, sort=False)
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], sort=False)

   one  two
a    0    1
b    2    3
c    4    5
---------------
   three  four
a      5     6
c      7     8
---------------------


Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


需要考虑行索引中不包含任何相关数据的DataFrame：

In [28]:
# Two random frames whose default integer row labels carry no meaning of their own.
df5 = DataFrame(np.random.randn(2, 3), columns=list('abc'))
df6 = DataFrame(np.random.randn(1, 2), columns=list('bd'))
print(
    df5,
    '----------------------------------------',
    df6,
    '----------------------------------------',
    sep='\n',
)
# Stack the rows of both frames; the original row labels are carried over unchanged.
pd.concat([df5, df6], sort=False)

          a         b         c
0  0.496714 -0.138264  0.647689
1  1.523030 -0.234153 -0.234137
----------------------------------------
          b         d
0  1.579213  0.767435
----------------------------------------


Unnamed: 0,a,b,c,d
0,0.496714,-0.138264,0.647689,
1,1.52303,-0.234153,-0.234137,
0,,1.579213,,0.767435


In [29]:
pd.concat([df5, df6], ignore_index=True, sort=False)     # the previous result had index [0, 1, 0]; ignore_index=True renumbers the rows

Unnamed: 0,a,b,c,d
0,0.496714,-0.138264,0.647689,
1,1.52303,-0.234153,-0.234137,
2,,1.579213,,0.767435


### 联合重叠数据

In [30]:
# Two Series whose missing values sit at complementary positions.
a = Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan])
b = Series([0.0, np.nan, 2.0, np.nan, np.nan, 5.0])
np.where(pd.isnull(a), b, a)           # take b where a is NaN, otherwise a; a.combine_first(b) yields the same values

array([0. , 2.5, 0. , 3.5, 4.5, 5. ])

在DataFrame中，combine_first逐列做相同的操作，可以认为它是根据传入的对象来"修补"调用对象的缺失值:

In [31]:
# df1 has gaps in 'a' and 'b'; df2 covers the same columns with one extra row and no 'c' column.
df1 = DataFrame({
    'a': [1, np.nan, 5, np.nan],
    'b': [np.nan, 2, np.nan, 6],
    'c': [2, 6, 10, 14],
})
df2 = DataFrame({
    'a': [5, 4, np.nan, 3, 7],
    'b': [np.nan, 3, 4, 6, 8],
})
print(df1, '---------------', df2, sep='\n')

     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
---------------
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0


In [32]:
# Patch df1's missing values with df2's values at the same labels; row 4, present only in df2, is added to the result.
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


### 重塑和透视 --> 使用多层索引进行重塑
- stack: 堆叠，该操作会将列中的数据透视到行
- unstack: 拆堆，该操作会将行中的数据透视到列

In [33]:
# 2x3 frame for the reshape examples; both axes get a name so the pivoted levels stay identifiable.
data = DataFrame(
    np.arange(6).reshape(2, 3),
    index=['SH', 'SZ'],
    columns=['one', 'two', 'three'],
)
data.index.name = 'city'
data.columns.name = 'num'
data

num,one,two,three
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SH,0,1,2
SZ,3,4,5


In [34]:
# stack pivots the column labels ('num') into an inner index level, yielding a Series.
result = data.stack()
result

city  num  
SH    one      0
      two      1
      three    2
SZ    one      3
      two      4
      three    5
dtype: int32

In [35]:
# unstack moves the innermost index level ('num') back into the columns.
result.unstack()

num,one,two,three
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SH,0,1,2
SZ,3,4,5


默认情况下，拆堆(unstack)作用于最内层(stack方法也是一样)，可以传入一个层级序号或名称来拆分一个不同的层级:

In [36]:
result.unstack(0)     # equivalent to: result.unstack('city')

city,SH,SZ
num,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


如果层级中的所有值并未包含于每个子分组中时，拆分可能会引入缺失值:

In [37]:
# Two Series whose label sets only partly overlap, combined under the outer keys 'one' / 'two'.
s1 = Series({'a': 0, 'b': 1, 'c': 2, 'd': 3})
s2 = Series({'c': 4, 'd': 5, 'e': 6})
data2 = pd.concat({'one': s1, 'two': s2})
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [38]:
# Labels absent from a group ('e' under 'one'; 'a' and 'b' under 'two') come out as NaN.
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


默认情况下，堆叠(stack)会过滤掉缺失值，因此先拆堆再堆叠的操作是可逆的：

In [39]:
# stack drops the NaN entries here, so the round trip recovers the original labels (the values are now float).
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [40]:
# dropna=False keeps the NaN entries introduced by unstack instead of filtering them out.
# NOTE(review): recent pandas releases add a reworked stack (future_stack=True) that does not
# accept dropna — confirm against the pandas version this notebook targets.
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

当在DataFrame中拆堆时，被拆堆的层级会变为结果中最低的层级：

In [41]:
# Build a two-column frame from the stacked Series; the columns Index is given the name 'side'.
df = DataFrame({'left': result, 'right': result+5}, columns=pd.Index(['left', 'right'], name='side'))
df

Unnamed: 0_level_0,side,left,right
city,num,Unnamed: 2_level_1,Unnamed: 3_level_1
SH,one,0,5
SH,two,1,6
SH,three,2,7
SZ,one,3,8
SZ,two,4,9
SZ,three,5,10


In [42]:
# The unstacked level ('city') becomes the innermost column level of the result.
df.unstack('city')

side,left,left,right,right
city,SH,SZ,SH,SZ
num,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


在调用stack方法时，可以指明需要堆叠的轴向名称：

In [43]:
# Name the column level to stack: 'side' moves into the row index, leaving 'city' as the columns.
df.unstack('city').stack('side')

Unnamed: 0_level_0,city,SH,SZ
num,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,0,3
one,right,5,8
two,left,1,4
two,right,6,9
three,left,2,5
three,right,7,10
