In [1]:
# Print the notebook's metadata header: file name, author(s), last editor(s),
# creation time, contact e-mail and a short description.
print("""
@File         : Aggregations.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-12-26 21:23:08
@Email        : cuixuanstephen@gmail.com
@Description  : Aggregations
""")


@File         : Aggregations.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-12-26 21:23:08
@Email        : cuixuanstephen@gmail.com
@Description  : Aggregations



In [2]:
import pandas as pd
import numpy as np

In [3]:
# Seed NumPy's global RNG so the random draws below are reproducible.
np.random.seed(42)
# 10,000 random samples stored with pandas' nullable Float64 extension dtype.
ser = pd.Series(data=np.random.rand(10_000), dtype="Float64")
ser

0        0.37454
1       0.950714
2       0.731994
3       0.598658
4       0.156019
          ...   
9995    0.857656
9996    0.897509
9997    0.946708
9998    0.397488
9999     0.21714
Length: 10000, dtype: Float64

pandas 库提供了许多常用聚合的方法，例如 `pd.Series.count`、`pd.Series.mean`、`pd.Series.std`、`pd.Series.min`、`pd.Series.max` 和 `pd.Series.sum`：

In [4]:
# Each statistic is computed by its own Series method; print one per line.
print(
    f"Count is: {ser.count()}",
    f"Mean value is: {ser.mean()}",
    f"Standard deviation is: {ser.std()}",
    f"Minimum value is: {ser.min()}",
    f"Maximum value is: {ser.max()}",
    f"Summation is: {ser.sum()}",
    sep="\n",
)

Count is: 10000
Mean value is: 0.49415955768429964
Standard deviation is: 0.2876301265269928
Minimum value is: 1.1634755366141114e-05
Maximum value is: 0.9997176732861306
Summation is: 4941.595576842997


除了直接调用这些方法之外，更通用的做法是使用 `pd.Series.agg`，并以字符串形式传入想要执行的聚合的名称：

In [5]:
# The same six statistics, this time requested from `agg` by aggregation name.
print(
    "\n".join(
        f"{label} is: {ser.agg(name)}"
        for label, name in (
            ("Count", "count"),
            ("Mean value", "mean"),
            ("Standard deviation", "std"),
            ("Minimum value", "min"),
            ("Maximum value", "max"),
            ("Summation", "sum"),
        )
    )
)

Count is: 10000
Mean value is: 0.49415955768429964
Standard deviation is: 0.2876301265269928
Minimum value is: 1.1634755366141114e-05
Maximum value is: 0.9997176732861306
Summation is: 4941.595576842997


使用 `pd.Series.agg` 的一个优点是它可以执行多个聚合。例如，如果您想在一个步骤中计算某个字段的最小值和最大值，可以通过向 `pd.Series.agg` 提供一个列表来实现：

In [6]:
# Passing a list of names runs several aggregations in one call;
# the result is labelled by aggregation name.
ser.agg(func=["min", "max"])

min    0.000012
max    0.999718
dtype: float64

In [8]:
# Seed NumPy's global RNG so the frame below is reproducible.
np.random.seed(42)
# 10,000 rows x 6 columns (a-f) of random draws ...
df = pd.DataFrame(data=np.random.randn(10_000, 6), columns=list("abcdef"))
# ... converted to pandas' NumPy-backed nullable dtypes.
df = df.convert_dtypes(dtype_backend="numpy_nullable")
df

Unnamed: 0,a,b,c,d,e,f
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137
1,1.579213,0.767435,-0.469474,0.54256,-0.463418,-0.46573
2,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247
3,-0.908024,-1.412304,1.465649,-0.225776,0.067528,-1.424748
4,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694
...,...,...,...,...,...,...
9995,1.951254,0.324704,1.937021,-0.125083,0.589664,0.869128
9996,0.624062,-0.31734,-1.636983,2.390878,-0.597118,2.670553
9997,-0.470192,1.511932,0.718306,0.764051,-0.495094,-0.273401
9998,-0.259206,0.274769,-0.084735,-0.406717,-0.815527,-0.716988


In [9]:
# Sum down each column (axis="index" is the default): one total per column.
df.sum(axis="index")

a    -21.365908
b     -7.963987
c    152.032992
d   -180.727498
e     29.399311
f     25.042078
dtype: Float64

如果想要聚合每行的数据，可以指定 `axis=1` 参数，最好显式指明 `axis='columns'`。

> 需要注意的是，pandas 针对 `axis=0` 的操作做了更多优化，因此按行聚合（`axis=1`）可能比按列聚合慢得多。即便如此，这仍是 pandas 一个相当独特的功能，在性能不是主要考量时非常有用：

In [10]:
# Sum across the columns of each row: one total per row.
df.sum(axis="columns")

0       2.060878
1       1.490586
2      -4.657107
3      -2.437675
4      -2.101088
          ...   
9995     5.54669
9996    3.134053
9997    1.755601
9998   -2.008404
9999   -3.314518
Length: 10000, dtype: Float64

In [12]:
# Several aggregations at once: one result row per aggregation name,
# computed for every column.
df.agg(func=["max", "min"], axis="index")

Unnamed: 0,a,b,c,d,e,f
max,3.602415,3.745379,3.727833,4.479084,3.691625,3.942331
min,-4.295391,-3.436062,-3.9224,-4.465604,-3.836656,-4.157734


> 对于更复杂的情况，可以传入可调用对象。每个可调用对象都应接受一个 `pd.Series` 作为唯一参数，并将其归约为一个标量：

In [13]:
def mean_and_add_42(ser: pd.Series):
    """Return the mean of ``ser`` shifted up by 42."""
    average = ser.mean()
    return average + 42


def mean_and_sub_42(ser: pd.Series):
    """Return the mean of ``ser`` shifted down by 42."""
    average = ser.mean()
    return average - 42

In [14]:
# Re-seed and rebuild the sample series before aggregating.
np.random.seed(42)
ser = pd.Series(data=np.random.rand(10_000), dtype="Float64")
# Aggregate with user-defined callables; results are labelled by function name.
ser.agg(func=[mean_and_add_42, mean_and_sub_42])

mean_and_add_42    42.49416
mean_and_sub_42   -41.50584
dtype: float64