In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
from pandas import Series, DataFrame

### 数据分组 --> GroupBy机制

In [2]:
# Demo frame: two label columns used as group keys and two columns of
# standard-normal draws. data1 is drawn before data2, so the seeded values
# land in the same columns as before.
df = pd.DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.496714,-0.234137
1,a,two,-0.138264,1.579213
2,b,one,0.647689,0.767435
3,b,two,1.52303,-0.469474
4,a,one,-0.234153,0.54256


In [3]:
# Group the data1 values by the labels in key1.
grouped = df['data1'].groupby(df['key1'])
grouped                  # grouped is now a GroupBy object; nothing has actually been computed yet

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001B04BB774F0>

In [4]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a    0.041432
b    1.085359
Name: data1, dtype: float64

In [5]:
# Passing a list of key Series groups by both keys; the result is indexed by (key1, key2).
means = df.data1.groupby([df.key1, df.key2]).mean()
means

key1  key2
a     one     0.131280
      two    -0.138264
b     one     0.647689
      two     1.523030
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.13128,-0.138264
b,0.647689,1.52303


分组键也可以是正确长度的任何数组:

In [7]:
# Group keys supplied as plain NumPy arrays, one label per row of df.
cities = np.array(['SH', 'SZ', 'SZ', 'SH', 'SH'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df.data1.groupby([cities, years]).mean()

SH  2005    1.009872
    2006   -0.234153
SZ  2005   -0.138264
    2006    0.647689
Name: data1, dtype: float64

分组信息作为想要继续处理的数据，通常包含在同一个DataFrame中，这种情况下，可以传递列名(无论列名是字符串、数字或其他Python对象)作为分组键:

In [8]:
# Average the numeric columns within each key1 group. key2 holds strings, so
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises TypeError instead.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.041432,0.629212
b,1.085359,0.14898


In [9]:
# Group by two column names at once; the remaining columns (data1, data2) are averaged.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.13128,0.154212
a,two,-0.138264,1.579213
b,one,0.647689,0.767435
b,two,1.52303,-0.469474


size方法返回一个包含组大小信息的Series:

In [10]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

注意，分组键中的任何缺失值会被排除在结果之外！

### 数据分组 --> 遍历各分组
GroupBy对象支持迭代，会生成一个由(组名, 数据块)二元元组构成的序列:

In [11]:
# Iterating a GroupBy yields (group name, chunk of rows) pairs.
for name, group in df.groupby('key1'):
    print('Group Name: ', name)
    print(group)
    print('---------------------------------')

Group Name:  a
  key1 key2     data1     data2
0    a  one  0.496714 -0.234137
1    a  two -0.138264  1.579213
4    a  one -0.234153  0.542560
---------------------------------
Group Name:  b
  key1 key2     data1     data2
2    b  one  0.647689  0.767435
3    b  two  1.523030 -0.469474
---------------------------------


在多个分组键的情况下，元组中的第一个元素是键值的元组:

In [12]:
# With two keys the group name is itself a tuple, unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print('Group Name: ', (k1, k2))
    print(group)
    print('---------------------------------')

Group Name:  ('a', 'one')
  key1 key2     data1     data2
0    a  one  0.496714 -0.234137
4    a  one -0.234153  0.542560
---------------------------------
Group Name:  ('a', 'two')
  key1 key2     data1     data2
1    a  two -0.138264  1.579213
---------------------------------
Group Name:  ('b', 'one')
  key1 key2     data1     data2
2    b  one  0.647689  0.767435
---------------------------------
Group Name:  ('b', 'two')
  key1 key2    data1     data2
3    b  two  1.52303 -0.469474
---------------------------------


计算出数据块的字典:

In [13]:
# Map each key1 value to the chunk of rows belonging to it.
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces

{'a':   key1 key2     data1     data2
 0    a  one  0.496714 -0.234137
 1    a  two -0.138264  1.579213
 4    a  one -0.234153  0.542560,
 'b':   key1 key2     data1     data2
 2    b  one  0.647689  0.767435
 3    b  two  1.523030 -0.469474}

In [14]:
# Look up the chunk stored for group 'b'.
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,0.647689,0.767435
3,b,two,1.52303,-0.469474


默认情况下，groupby在axis=0的轴向上分组，也可以在其他任意轴向上进行分组:

In [15]:
# Per-column dtypes; they serve as the group key in the next cell.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [16]:
# Group the *columns* by their dtype. groupby(..., axis=1) is deprecated since
# pandas 2.1, so the frame is transposed and grouped along the default axis.
grouped = df.T.groupby(df.dtypes)

for dtype, group in grouped:
    print('Dtype: ', dtype)
    # group.index holds the column labels of this dtype. Selecting them from df
    # (instead of transposing the object-typed chunk back) keeps the original
    # column dtypes, and therefore the original number formatting, in the printout.
    print(df[group.index])
    print('---------------------')

Dtype:  float64
      data1     data2
0  0.496714 -0.234137
1 -0.138264  1.579213
2  0.647689  0.767435
3  1.523030 -0.469474
4 -0.234153  0.542560
---------------------
Dtype:  object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
---------------------


### 数据分组 --> 选择一列或所有列的子集
将从DataFrame创建的GroupBy对象用列名称或列名称数组进行索引时，会产生用于聚合的列子集的效果，这表明：  
df.groupby('key1')\['data1'\] 是 df\['data1'\].groupby(df\['key1'\])   
df.groupby('key1')\[\['data2'\]\] 是 df\[\['data2'\]\].groupby(df\['key1'\])   
的语法糖！

In [17]:
df.groupby(['key1', 'key2'])['data2'].mean()      # equivalent to: df['data2'].groupby([df['key1'], df['key2']]).mean()

key1  key2
a     one     0.154212
      two     1.579213
b     one     0.767435
      two    -0.469474
Name: data2, dtype: float64

In [18]:
# Indexing with a list of column names keeps a grouped DataFrame, so the result is a one-column DataFrame.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.154212
a,two,1.579213
b,one,0.767435
b,two,-0.469474


上面两个语句表明：如果传递的是列表或数组，则此索引操作返回的对象是分组的DataFrame；如果只有单个列作为标量传递，则为分组的Series。

### 数据分组 --> 使用字典和Series分组
分组信息可能会以非数组形式存在:

In [19]:
# 5x5 frame of standard-normal draws, one row per weekday.
weekday = DataFrame(
    np.random.randn(5, 5),
    index=['Mon', 'Tues', 'Wed', 'Thur', 'Fri'],
    columns=['a', 'b', 'c', 'd', 'e'],
)
# Blank out two cells of the 'Wed' row so the aggregations below have NaNs to skip.
weekday.loc['Wed', ['b', 'c']] = np.nan
weekday

Unnamed: 0,a,b,c,d,e
Mon,-0.463418,-0.46573,0.241962,-1.91328,-1.724918
Tues,-0.562288,-1.012831,0.314247,-0.908024,-1.412304
Wed,1.465649,,,-1.424748,-0.544383
Thur,0.110923,-1.150994,0.375698,-0.600639,-0.291694
Fri,-0.601707,1.852278,-0.013497,-1.057711,0.822545


假设有各列的分组对应关系，并把各列按组累加：

In [20]:
# Column label -> group name. 'f' matches no column of weekday and is ignored.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
# groupby(..., axis=1) is deprecated since pandas 2.1: group the transposed
# frame (columns become rows) and transpose the aggregate back afterwards.
by_column = weekday.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Mon,-1.671318,-2.654065
Tues,-0.593777,-2.987422
Wed,-1.424748,0.921266
Thur,-0.224941,-1.331765
Fri,-1.071208,2.073116


Series也有相同的功能，可以视为固定大小的映射:

In [21]:
# The same mapping as a Series: index = column labels, values = group names.
map_series = Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [22]:
# Count the non-NA cells per colour group in each row. axis=1 grouping is
# deprecated since pandas 2.1, so group the transposed frame and flip the result back.
weekday.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Mon,2,3
Tues,2,3
Wed,1,2
Thur,2,3
Fri,2,3


### 数据分组 --> 使用函数分组
与使用字典或Series分组相比，使用Python函数是定义分组关系的一种更为通用的方式。作为分组键传递的函数将会按照每个索引值调用一次，同时返回值会被用作分组名称：

In [23]:
# Group by the length of each index label
weekday.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.400524,1.386548,0.228465,-4.395739,-1.446756
4,-0.451365,-2.163825,0.689945,-1.508663,-1.703997


可将函数与数组，字典或Series进行混合，所有的对象都会在内部转换为数组:

In [24]:
# Mix a function with a list: rows are grouped by (len(index label), key_list entry).
key_list = ['one', 'one', 'one', 'two', 'two']
weekday.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.463418,-0.46573,0.241962,-1.91328,-1.724918
3,two,-0.601707,1.852278,-0.013497,-1.057711,0.822545
4,one,-0.562288,-1.012831,0.314247,-0.908024,-1.412304
4,two,0.110923,-1.150994,0.375698,-0.600639,-0.291694


### 数据分组 --> 根据索引层级分组

In [25]:
# Two-level column index: outer level 'city', inner level 'tenor'.
city_labels = ['sh', 'sh', 'sh', 'sz', 'sz']
tenor_labels = [1, 3, 5, 1, 3]
columns = pd.MultiIndex.from_arrays(
    [city_labels, tenor_labels],
    names=['city', 'tenor'],
)
city_df = DataFrame(np.random.randn(4, 5), columns=columns)
city_df

city,sh,sh,sh,sz,sz
tenor,1,3,5,1,3
0,-1.220844,0.208864,-1.95967,-1.328186,0.196861
1,0.738467,0.171368,-0.115648,-0.301104,-1.478522
2,-0.719844,-0.460639,1.057122,0.343618,-1.76304
3,0.324084,-0.385082,-0.676922,0.611676,1.031


根据层级分组时，将层级数值或层级名称传递给level关键字：

In [26]:
# Count the columns under each 'city' level value for every row. axis=1 grouping
# is deprecated since pandas 2.1, so group the transposed frame by level and
# transpose the result back.
city_df.T.groupby(level='city').count().T

city,sh,sz
0,3,2
1,3,2
2,3,2
3,3,2


### 数据聚合
聚合是指所有根据数组产生标量值的数据转换过程，例如以下优化的GroupBy方法：
- count: 分组中的非NA值数量
- sum: 非NA值的累和
- mean: 非NA值的均值
- median: 非NA值的算数中位数
- std、var: 无偏的(n-1分母)的标准差和方差
- min、max: 非NA值的最小值、最大值
- prod: 非NA值的乘积
- first、last: 非NA值的第一个和最后一个值  

可以使用自行定义的聚合，也可以调用已经在分组对象上定义好的方法，例如可以使用quantile计算Series或DataFrame列的样本分位数。尽管quantile并不是显式地为GroupBy对象实现的，但它是Series的方法，因此也可以用于聚合。在内部，GroupBy有效地对Series进行切片，为每一块调用piece.quantile(0.9)，然后将这些结果一起组装到结果对象中:

In [27]:
# Show df again as the reference for the aggregation examples that follow.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.496714,-0.234137
1,a,two,-0.138264,1.579213
2,b,one,0.647689,0.767435
3,b,two,1.52303,-0.469474
4,a,one,-0.234153,0.54256


In [28]:
# Regroup the whole frame by key1, then take the 0.9 quantile of data1 per group.
grouped = df.groupby('key1')
grouped['data1'].quantile(q=0.9)

key1
a    0.369718
b    1.435496
Name: data1, dtype: float64

要使用自定的聚合函数，需要将函数传递给aggregate或者agg方法:

In [29]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` may be any object exposing ``max()`` and ``min()`` (for example a
    pandas Series or a NumPy array).
    """
    highest, lowest = arr.max(), arr.min()
    return highest - lowest


# Apply the custom aggregation to the numeric columns only. key2 holds strings,
# for which max() - min() is undefined; pandas >= 2.0 raises TypeError there
# instead of silently dropping the column.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.730868,1.81335
b,0.875341,1.236909


自定义聚合函数通常比优化的GroupBy方法慢得多，因为在构造中间组数据块时有一些额外的开销(函数调用、数据重新排列等)！

### 数据聚合 --> 使用指定分组值填充缺失值
假设需要填充值按组来变化，一个方法是对数据分组后使用apply和一个在每个数据块上都调用fillna的函数：

In [30]:
# Eight cities; the first four are labelled East and the last four West.
cities = [
    'Shanghai', 'Suzhou', 'Hangzhou', 'Hefei',
    'Xian', 'Lanzhou', 'Chengdu', 'Lasa',
]
group_key = [region for region in ('East', 'West') for _ in range(4)]
data = Series(np.random.randn(len(cities)), index=cities)
# Knock out three entries so each region has something to fill.
data.loc[['Hangzhou', 'Lanzhou', 'Lasa']] = np.nan
data

Shanghai    0.931280
Suzhou     -0.839218
Hangzhou         NaN
Hefei       0.331263
Xian        0.975545
Lanzhou          NaN
Chengdu    -0.185659
Lasa             NaN
dtype: float64

In [31]:
# Mean per region; the NaN entries do not contribute to the mean.
data.groupby(group_key).mean()

East    0.141109
West    0.394943
dtype: float64

使用分组的平均值来填充NA值：

In [32]:
# Replace each NaN with the mean of its own region. group_keys=False keeps the
# original flat city index: from pandas 2.0 the default (True) also prepends
# the group key when the applied function returns a like-indexed object,
# which would turn the result index into (region, city) pairs.
data.groupby(group_key, group_keys=False).apply(lambda g: g.fillna(g.mean()))

Shanghai    0.931280
Suzhou     -0.839218
Hangzhou    0.141109
Hefei       0.331263
Xian        0.975545
Lanzhou     0.394943
Chengdu    -0.185659
Lasa        0.394943
dtype: float64

在另一种情况下，可能在代码中已经为每个分组预定义了填充值，由于每个分组都有一个内置的name属性，可以这样使用：

In [33]:
# Fill value predefined for each region.
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Fill the NaNs of one group with the value registered under the group's name.

    ``g`` is the chunk handed over by ``GroupBy.apply``; its ``name`` attribute
    carries the group key and is used to look up ``fill_values``.
    """
    return g.fillna(fill_values[g.name])


# group_keys=False keeps the flat city index on pandas >= 2.0, where the
# default would prepend the region to every index entry.
data.groupby(group_key, group_keys=False).apply(fill_func)

Shanghai    0.931280
Suzhou     -0.839218
Hangzhou    0.500000
Hefei       0.331263
Xian        0.975545
Lanzhou    -1.000000
Chengdu    -0.185659
Lasa       -1.000000
dtype: float64