# 数据聚合

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
df = DataFrame({'data1': np.random.randn(5) * 2,
                'data2': np.random.randn(5) * 2,
                'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one']})

In [4]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.127699,-2.760985,a,one
1,-1.464567,-2.195542,a,two
2,-1.965913,-1.096304,b,one
3,-1.563437,0.073929,b,two
4,0.921956,-2.249545,a,one


In [5]:
grouped = df.groupby('key1')

In [7]:
grouped['data1'].quantile(0.9)

key1
a    0.712025
b   -1.603685
Name: data1, dtype: float64

In [8]:
def peak_to_peak(arr):
    return arr.max() - arr.min()

In [10]:
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.386523,0.565443
b,0.402476,1.170233


In [11]:
grouped.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,count,3.0,3.0
a,mean,-0.223437,-2.402024
a,std,1.196139,0.31204
a,min,-1.464567,-2.760985
a,25%,-0.796133,-2.505265
a,50%,-0.127699,-2.249545
a,75%,0.397128,-2.222543
a,max,0.921956,-2.195542
b,count,2.0,2.0
b,mean,-1.764675,-0.511188


In [13]:
df.median()

data1   -1.464567
data2   -2.195542
dtype: float64

In [14]:
df.prod()

data1    0.529969
data2    1.105210
dtype: float64

In [15]:
tips = pd.read_csv('/Users/wonderful/Desktop/tips.csv')

In [16]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [17]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [19]:
tips['tip_pct']

0      0.059447
1      0.160542
2      0.166587
3      0.139780
4      0.146808
5      0.186240
6      0.228050
7      0.116071
8      0.130319
9      0.218539
10     0.166504
11     0.141804
12     0.101816
13     0.162778
14     0.203641
15     0.181650
16     0.161665
17     0.227747
18     0.206246
19     0.162228
20     0.227679
21     0.135535
22     0.141408
23     0.192288
24     0.160444
25     0.131387
26     0.149589
27     0.157604
28     0.198157
29     0.152672
         ...   
214    0.230742
215    0.085271
216    0.106572
217    0.129422
218    0.186047
219    0.102522
220    0.180921
221    0.259314
222    0.223776
223    0.187735
224    0.117735
225    0.153657
226    0.198216
227    0.146699
228    0.204819
229    0.130199
230    0.083299
231    0.191205
232    0.291990
233    0.136490
234    0.193175
235    0.124131
236    0.079365
237    0.035638
238    0.130338
239    0.203927
240    0.073584
241    0.088222
242    0.098204
243    0.159744
Name: tip_pct, dtype: fl

## 面向列的多函数应用

In [20]:
grouped = tips.groupby(['sex', 'smoker'])

In [21]:
grouped

<pandas.core.groupby.DataFrameGroupBy object at 0x113894c50>

In [22]:
grouped_pct = grouped['tip_pct']

In [23]:
grouped_pct.agg('mean')

sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64

In [24]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


In [25]:
grouped_pct.agg([('foo', 'mean'), ('bat', np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bat
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,0.156921,0.036421
Female,Yes,0.18215,0.071595
Male,No,0.160669,0.041849
Male,Yes,0.152771,0.090588


In [26]:
functions = ['count', 'mean', 'max']

In [28]:
result = grouped['tip_pct', 'total_bill'].agg(functions)

In [29]:
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,No,54,0.156921,0.252672,54,18.105185,35.83
Female,Yes,33,0.18215,0.416667,33,17.977879,44.3
Male,No,97,0.160669,0.29199,97,19.791237,48.33
Male,Yes,60,0.152771,0.710345,60,22.2845,50.81


In [30]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,54,0.156921,0.252672
Female,Yes,33,0.18215,0.416667
Male,No,97,0.160669,0.29199
Male,Yes,60,0.152771,0.710345


In [31]:
ftuples = [('Durchschnitt', 'mean'), ('Abweuchung', np.var)]

In [32]:
grouped['tip_pct', 'total_bill'].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweuchung,Durchschnitt,Abweuchung
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,No,0.156921,0.001327,18.105185,53.092422
Female,Yes,0.18215,0.005126,17.977879,84.451517
Male,No,0.160669,0.001751,19.791237,76.152961
Male,Yes,0.152771,0.008206,22.2845,98.244673


In [33]:
grouped.agg({'tip': np.max, 'size': 'std'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,5.2,1.073146
Female,Yes,6.5,0.613917
Male,No,9.0,0.989094
Male,Yes,10.0,0.89253


In [34]:
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'],
             'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,No,0.056797,0.252672,0.156921,0.036421,140
Female,Yes,0.056433,0.416667,0.18215,0.071595,74
Male,No,0.071804,0.29199,0.160669,0.041849,263
Male,Yes,0.035638,0.710345,0.152771,0.090588,150


# 以“无索引”的形式返回聚合数据

In [36]:
tips.groupby(['sex', 'smoker'], as_index=False).mean()

Unnamed: 0,sex,smoker,total_bill,tip,size,tip_pct
0,Female,No,18.105185,2.773519,2.592593,0.156921
1,Female,Yes,17.977879,2.931515,2.242424,0.18215
2,Male,No,19.791237,3.113402,2.71134,0.160669
3,Male,Yes,22.2845,3.051167,2.5,0.152771
