In [1]:
# Notebook header: echo the description/author metadata as plain text.
# The empty first and last entries reproduce the blank lines that surround
# the metadata in the printed output.
print("\n".join([
    "",
    "@Description: How to Think About Group Operations",
    "@Author(s): Stephen CUI",
    "@LastEditor(s): Stephen CUI",
    "@CreatedTime: 2023-08-06 17:38:50",
    "",
]))


@Description: How to Think About Group Operations
@Author(s): Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime: 2023-08-06 17:38:50



# How to Think About Group Operations

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Small demo frame used by the following cells: two key columns that contain
# missing values (None in key1, a missing entry in the nullable-integer key2)
# plus two numeric data columns.
# A locally seeded Generator makes the random columns identical on every
# Restart & Run All; the seed value itself is arbitrary.
rng = np.random.default_rng(12345)
df = pd.DataFrame({"key1" : ["a", "a", None, "b", "b", "a", None],
                   "key2" : pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
                   "data1" : rng.standard_normal(7),
                   "data2" : rng.standard_normal(7)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-1.23307,1.258685
1,a,2.0,-2.924705,0.484069
2,,1.0,0.206532,2.723181
3,b,2.0,-0.557765,0.374275
4,b,1.0,1.645875,-2.298543
5,a,,-0.706208,-1.757133
6,,1.0,0.199733,0.508407


In [4]:
# Split the data1 values into groups labelled by the key1 column.
# The result is a SeriesGroupBy object; aggregations are applied to it in
# the cells that follow.
grouped = df["data1"].groupby(df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001D128F11D90>

In [5]:
# Mean of data1 within each key1 group. Rows whose key1 is missing do not
# appear in the result (compare the dropna=False examples further down).
grouped.mean()

key1
a   -1.621327
b    0.544055
Name: data1, dtype: float64

In [6]:
# Group data1 by both key columns at once; the aggregated result is indexed
# by a two-level (key1, key2) index.
means = (
    df["data1"]
    .groupby([df["key1"], df["key2"]])
    .mean()
)
means

key1  key2
a     1      -1.233070
      2      -2.924705
b     1       1.645875
      2      -0.557765
Name: data1, dtype: float64

In [7]:
# Pivot the inner index level (key2) into columns, giving a key1 x key2 table.
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.23307,-2.924705
b,1.645875,-0.557765


In [10]:
# size() counts the rows in each (key1, key2) group. By default, any row
# with a missing value in either grouping key is dropped.
df.groupby(["key1", "key2"]).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [11]:
# dropna=False keeps rows with a missing key as a group of their own
# (labelled NaN in the result).
df.groupby("key1", dropna=False).size()

key1
a      3
b      2
NaN    2
dtype: int64

In [12]:
# With several keys, dropna=False also keeps every key combination in which
# one of the keys is missing.
df.groupby(["key1", "key2"], dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [14]:
# count() gives the number of non-null values per column within each group.
# Rows whose key is missing do not form a group of their own unless
# dropna=False is passed to groupby().
df.groupby("key1").count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


## 迭代组

GroupBy 对象支持迭代：每次迭代生成一个二元元组，依次为分组名和该分组对应的数据块。

In [15]:
# Iterating a GroupBy yields (group name, sub-frame) pairs; print the name
# on its own line followed by the rows that belong to that group.
for name, group in df.groupby("key1"):
    print(name, group, sep="\n")

a
  key1  key2     data1     data2
0    a     1 -1.233070  1.258685
1    a     2 -2.924705  0.484069
5    a  <NA> -0.706208 -1.757133
b
  key1  key2     data1     data2
3    b     2 -0.557765  0.374275
4    b     1  1.645875 -2.298543


对于多重键的情况，每个二元元组的第一个元素是由各键值组成的元组。

In [16]:
# With two grouping keys the group name is itself a (key1, key2) tuple,
# unpacked here into k1 and k2 before being printed above the group's rows.
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2), group, sep="\n")

('a', 1)
  key1  key2    data1     data2
0    a     1 -1.23307  1.258685
('a', 2)
  key1  key2     data1     data2
1    a     2 -2.924705  0.484069
('b', 1)
  key1  key2     data1     data2
4    b     1  1.645875 -2.298543
('b', 2)
  key1  key2     data1     data2
3    b     2 -0.557765  0.374275


In [18]:
# Materialise the (name, sub-frame) pairs into a dict keyed by group name,
# then look up the rows that belong to group "b".
pieces = dict(list(df.groupby("key1")))
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,-0.557765,0.374275
4,b,1,1.645875,-2.298543


In [24]:
# Group the *columns* of df by a label -> group-name mapping.
# The older spelling, df.groupby(mapping, axis='columns'), is slated for
# deprecation, so the frame is transposed first: the column labels become
# the row index and the dict maps each of them to its group.
grouped = df.T.groupby(
    {"key1": "key", "key2": "key", "data1": "data", "data2": "data"}
)
for group_key, grouped_value in grouped:
    print(group_key, grouped_value, sep="\n")

data
              0         1         2         3         4         5         6
data1  -1.23307 -2.924705  0.206532 -0.557765  1.645875 -0.706208  0.199733
data2  1.258685  0.484069  2.723181  0.374275 -2.298543 -1.757133  0.508407
key
      0  1     2  3  4     5     6
key1  a  a  None  b  b     a  None
key2  1  2     1  2  1  <NA>     1


## 选取一列或列的子集

In [25]:
# Indexing a DataFrameGroupBy with a single column name selects that column
# and yields a SeriesGroupBy.
df.groupby("key1")["data1"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001D13957FB10>

In [26]:
# Indexing with a list of column names keeps the result a DataFrameGroupBy,
# even when the list holds a single name.
df.groupby("key1")[["data2"]]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D13957C690>

上面两种写法（先分组、再用列名索引）分别是下面两种写法的语法糖。

In [27]:
# Long form of df.groupby("key1")["data1"]: pick the column first, then
# group it by the key1 values.
df["data1"].groupby(df["key1"])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001D128EC9E90>

In [28]:
# Long form of df.groupby("key1")[["data2"]]: take a one-column frame first,
# then group it by the key1 values.
df[["data2"]].groupby(df["key1"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D13A20D910>

In [29]:
# Group by both keys but keep only the data2 column; selecting a single
# column name gives a SeriesGroupBy.
s_grouped = df.groupby(["key1", "key2"])["data2"]
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001D13A125010>

In [30]:
# Mean of data2 for every (key1, key2) combination.
s_grouped.mean()

key1  key2
a     1       1.258685
      2       0.484069
b     1      -2.298543
      2       0.374275
Name: data2, dtype: float64

## 通过字典或 Series 进行分组

In [32]:
# 5x5 frame of standard-normal draws: one row per person, columns a-e.
# A locally seeded Generator makes the values identical on every
# Restart & Run All; the seed value itself is arbitrary.
rng = np.random.default_rng(12346)
people = pd.DataFrame(rng.standard_normal((5, 5)),
                      columns=["a", "b", "c", "d", "e"],
                      index=["Joe", "Steve", "Wanda", "Jill", "Trey"])
# Blank out Wanda's "b" and "c" values (row position 2, column positions 1
# and 2) so the aggregations below have missing data to deal with.
people.loc["Wanda", ["b", "c"]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.561075,-0.278912,0.981564,-2.118902,0.574776
Steve,-0.887901,-2.151963,-0.332642,-1.006194,-1.189322
Wanda,-0.224952,,,1.225954,0.596564
Jill,-0.41315,0.599174,-0.81273,0.843372,-0.606284
Trey,0.394217,-0.531279,-2.297491,0.098857,-0.833303


In [33]:
 # Column label -> colour group used to group the columns of `people`.
 # "f" matches none of the columns a-e; presumably it is included to show
 # that unused mapping keys are harmless.
 mapping = dict(a="red", b="red", c="blue",
                d="blue", e="red", f="orange")

In [36]:
# Transpose so the column labels a-e become the index, group those labels by
# the colour they map to, sum within each group, then transpose back so the
# people are rows again. The mapping's "f" key matches no label, so no
# "orange" group appears in the result.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,-1.137338,0.85694
Steve,-1.338836,-4.229186
Wanda,1.225954,0.371612
Jill,0.030642,-0.420259
Trey,-2.198634,-0.970365


In [39]:
# A Series can be viewed as a fixed-size mapping from index labels to values,
# so it can serve as a grouping key just like the dict it is built from.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [41]:
# Group on the transposed frame and transpose back instead of passing
# axis='columns' to groupby, which is deprecated for future pandas versions.
# count() reports the number of non-null cells per colour group for each
# person.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wanda,1,2
Jill,2,3
Trey,2,3


## 通过函数进行分组

In [42]:
# A function passed as the key is called once per index label; here the
# length of each person's name becomes the group key.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.561075,-0.278912,0.981564,-2.118902,0.574776
4,-0.018932,0.067895,-3.110222,0.94223,-1.439587
5,-1.112853,-2.151963,-0.332642,0.21976,-0.592758


In [44]:
# Functions can be mixed with lists of labels in one groupby call.
key_list = ["one", "one", "one", "two", "two"]
# `len` is applied to every index label, i.e. it contributes the same keys
# as list(map(len, people.index)); key_list supplies the second key.
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.561075,-0.278912,0.981564,-2.118902,0.574776
4,two,-0.41315,-0.531279,-2.297491,0.098857,-0.833303
5,one,-0.887901,-2.151963,-0.332642,-1.006194,-1.189322


## 根据索引级别分组

In [56]:
# Two-level column index: the outer level is the country code ("cty"), the
# inner level the tenor.
columns = pd.MultiIndex.from_arrays(
    [
        ["US", "US", "US", "JP", "JP"],
        [1, 3, 5, 1, 3],
    ],
    names=["cty", "tenor"],
)

In [57]:
# 4x5 frame of standard-normal draws under the hierarchical (cty, tenor)
# columns. A locally seeded Generator makes the values identical on every
# Restart & Run All; the seed value itself is arbitrary.
rng = np.random.default_rng(12347)
hier_df = pd.DataFrame(rng.standard_normal((4, 5)),
                       columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.536851,1.614498,0.138663,-1.956374,1.305015
1,-0.675382,1.331327,1.444221,0.303998,-0.46366
2,0.353462,0.442711,0.416734,0.398578,0.530968
3,-1.096948,-0.736004,0.601824,0.243234,1.234884


In [60]:
# After transposing, the (cty, tenor) pairs form the row index; group on the
# "cty" level by name and count the non-null entries per country.
hier_df.T.groupby(level="cty").count()

Unnamed: 0_level_0,0,1,2,3
cty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JP,2,2,2,2
US,3,3,3,3


In [62]:
# Transposed view: the (cty, tenor) column index becomes the row index.
hier_df.T

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
cty,tenor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
US,1,-0.536851,-0.675382,0.353462,-1.096948
US,3,1.614498,1.331327,0.442711,-0.736004
US,5,0.138663,1.444221,0.416734,0.601824
JP,1,-1.956374,0.303998,0.398578,0.243234
JP,3,1.305015,-0.46366,0.530968,1.234884
