# Chapter 4 分组
参考DataWhale：https://datawhalechina.github.io/joyful-pandas/build/html/%E7%9B%AE%E5%BD%95/ch4.html#id10

In [1]:
import numpy as np
import pandas as pd

## 1. 分组模式及其对象
### 1.1 分组的一般模式   
按照A字段分组，并B字段对作出某个计算。   
df.groupby(分组依据A)[数据来源B].使用操作

In [2]:
df=pd.read_csv('/Users/jie/Documents/Python/joyful-pandas-master/data/learn_pandas.csv')
df.groupby('Gender')['Height'].mean()

Gender
Female    159.19697
Male      173.62549
Name: Height, dtype: float64

### 1.2 分组依据的本质

In [3]:
#根据多个维度分组
df.groupby(['School','Gender'])['Height'].mean()

School                         Gender
Fudan University               Female    158.776923
                               Male      174.212500
Peking University              Female    158.666667
                               Male      172.030000
Shanghai Jiao Tong University  Female    159.122500
                               Male      176.760000
Tsinghua University            Female    159.753333
                               Male      171.638889
Name: Height, dtype: float64

In [4]:
#增加了限制条件的分组
condition=df.Weight>df.Weight.mean()
df.groupby(condition)['Weight'].mean()

Weight
False    47.343750
True     71.114754
Name: Weight, dtype: float64

In [5]:
#练一练
df_ex=df.copy()
df_ex.loc[df_ex.Weight>df_ex.Weight.quantile(0.75), 'group']='high'
df_ex.loc[(df_ex.Weight<=df_ex.Weight.quantile(0.75)) & (df.Weight>df_ex.Weight.quantile(0.25)), 'group']='normal'
df_ex.loc[df_ex.Weight<=df_ex.Weight.quantile(0.25), 'group']='low'
df_ex.groupby('group')['Height'].mean()

group
high      174.935714
low       154.119149
normal    162.255294
Name: Height, dtype: float64

### 1.3 Groupby对象
一些groupby的属性

In [6]:
gb=df.groupby(['School','Grade'])
gb

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x120f1b070>

In [7]:
#n of groups
gb.ngroups

16

In [8]:
#return to index
res=gb.groups
res.keys()

dict_keys([('Fudan University', 'Freshman'), ('Fudan University', 'Junior'), ('Fudan University', 'Senior'), ('Fudan University', 'Sophomore'), ('Peking University', 'Freshman'), ('Peking University', 'Junior'), ('Peking University', 'Senior'), ('Peking University', 'Sophomore'), ('Shanghai Jiao Tong University', 'Freshman'), ('Shanghai Jiao Tong University', 'Junior'), ('Shanghai Jiao Tong University', 'Senior'), ('Shanghai Jiao Tong University', 'Sophomore'), ('Tsinghua University', 'Freshman'), ('Tsinghua University', 'Junior'), ('Tsinghua University', 'Senior'), ('Tsinghua University', 'Sophomore')])

In [9]:
#练一练
#用groups.keys()

In [10]:
#size (n of values)
gb.size()

School                         Grade    
Fudan University               Freshman      9
                               Junior       12
                               Senior       11
                               Sophomore     8
Peking University              Freshman     13
                               Junior        8
                               Senior        8
                               Sophomore     5
Shanghai Jiao Tong University  Freshman     13
                               Junior       17
                               Senior       22
                               Sophomore     5
Tsinghua University            Freshman     17
                               Junior       22
                               Senior       14
                               Sophomore    16
dtype: int64

In [11]:
#row of group
gb.get_group(('Fudan University', 'Freshman')).iloc[:3,:3]

Unnamed: 0,School,Grade,Name
15,Fudan University,Freshman,Changqiang Yang
28,Fudan University,Freshman,Gaoqiang Qin
63,Fudan University,Freshman,Gaofeng Zhao


### 1.4 分组的三大操作
聚合（agg）：返回一个标量值，平均值、中位数、组容量size等；  
变换（transform）：返回一个Series类型，做原序列的标准化处理；   
过滤（filter）：返回DataFrame类型，整个行；

## 2. 聚合函数
### 2.1 内置聚合函数  
包括的函数：max, min, mean, median, count, all, any, idxmax, idxmin, mad(median absolute deviation), nunique, skew(skewness), quantile, sum, std, var, sem(standard error of mean), size, prod.

In [12]:
gb.idxmin()

Unnamed: 0_level_0,Unnamed: 1_level_0,Height,Weight,Test_Number
School,Grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fudan University,Freshman,63,108,70
Fudan University,Junior,195,90,26
Fudan University,Senior,49,49,66
Fudan University,Sophomore,37,3,48
Peking University,Freshman,185,45,1
Peking University,Junior,159,159,20
Peking University,Senior,30,30,116
Peking University,Sophomore,120,120,61
Shanghai Jiao Tong University,Freshman,121,121,0
Shanghai Jiao Tong University,Junior,143,143,31


In [13]:
#练一练
#done

In [14]:
#多个数据来源
gb=df.groupby('Gender')[['Height','Weight']]
gb.max()

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,170.2,63.0
Male,193.9,89.0


### 2.2 agg方法
agg解决四类问题：   
1. 无法同时使用多个函数；   
2. 无法对特定的列使用特定的聚合函数；   
3. 无法使用自定义的聚合函数；   
4. 无法直接对结果的列名在聚合前进行自定义命名；   

In [15]:
#多个函数
gb.agg(['sum','idxmax','skew'])

Unnamed: 0_level_0,Height,Height,Height,Weight,Weight,Weight
Unnamed: 0_level_1,sum,idxmax,skew,sum,idxmax,skew
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,21014.0,28,-0.219253,6469.0,28,-0.268482
Male,8854.9,193,0.437535,3929.0,2,-0.332393


In [16]:
#特定的列--特定的聚合函数
gb.agg({'Height':['mean','max'],'Weight':'count'})

Unnamed: 0_level_0,Height,Height,Weight
Unnamed: 0_level_1,mean,max,count
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,159.19697,170.2,135
Male,173.62549,193.9,54


In [17]:
#练一练
gb.agg({'Height':['sum','idxmax','skew'], 'Weight':['sum','idxmax','skew']})

Unnamed: 0_level_0,Height,Height,Height,Weight,Weight,Weight
Unnamed: 0_level_1,sum,idxmax,skew,sum,idxmax,skew
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,21014.0,28,-0.219253,6469.0,28,-0.268482
Male,8854.9,193,0.437535,3929.0,2,-0.332393


In [18]:
#使用自定义函数，逐列计算
gb.agg(lambda x: x.mean()-x.min())

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,13.79697,13.918519
Male,17.92549,21.759259


In [19]:
#练一练
a=gb.describe().Height
b=gb.describe().Weight
print('Height:\n', a.groupby('Gender')['mean'].sum()-a.groupby('Gender')['min'].sum())
print('Weight:\n', b.groupby('Gender')['mean'].sum()-b.groupby('Gender')['min'].sum())

Height:
 Gender
Female    13.79697
Male      17.92549
dtype: float64
Weight:
 Gender
Female    13.918519
Male      21.759259
dtype: float64


In [20]:
#聚合结果重命名，元组里第一个元素是新的名字，第二个元素是函数（聚合或自定义）
gb.agg([('range', lambda x: x.max()-x.min()), ('my_sum', 'sum')])

Unnamed: 0_level_0,Height,Height,Weight,Weight
Unnamed: 0_level_1,range,my_sum,range,my_sum
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,24.8,21014.0,29.0,6469.0
Male,38.2,8854.9,38.0,3929.0


In [21]:
gb.agg({'Height': [('my_func', lambda x: x.min()), 'sum'],
       'Weight': lambda x:x.max()})

Unnamed: 0_level_0,Height,Height,Weight
Unnamed: 0_level_1,my_func,sum,<lambda>
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,145.4,21014.0,63.0
Male,155.7,8854.9,89.0


## 3. 变换和过滤
### 3.1 变换函数与transform方法
常用的内置变换函数：cumcount, cumsum, cumprod, cummax, cummin

In [22]:
gb.cummax().head()

Unnamed: 0,Height,Weight
0,158.9,46.0
1,166.5,70.0
2,188.9,89.0
3,,46.0
4,188.9,89.0


In [23]:
#练一练
#rank()：对其他非聚类列的属于同类数据进行数值大小排序
df.groupby(['School']).rank(method='min',ascending=False)

Unnamed: 0,Height,Weight,Test_Number
0,38.0,41.0,29.0
1,11.0,6.0,18.0
2,1.0,1.0,11.0
3,,37.0,5.0
4,6.0,4.0,5.0
...,...,...,...
195,29.0,30.0,5.0
196,37.0,40.0,1.0
197,49.0,44.0,29.0
198,9.0,10.0,11.0


In [24]:
#transform方法
gb.transform(lambda x: (x-x.mean())/x.std()).head()

Unnamed: 0,Height,Weight
0,-0.05876,-0.354888
1,-1.010925,-0.355
2,2.167063,2.089498
3,,-1.279789
4,0.053133,0.159631


In [25]:
#练一练
#对指定列使用特定的变换
def func(x):
    res=x.min()
    if x.name =='Weight':
        res=x.sum()
    return res

gb.transform(func).head()

Unnamed: 0,Height,Weight
0,145.4,6469.0
1,155.7,3929.0
2,155.7,3929.0
3,145.4,6469.0
4,155.7,3929.0


### 3.2 组索引与过滤
过滤在分组中是对于组的过滤，索引是对于行的过滤。   
当组内所有行都满足条件才会被保留。

In [26]:
#过滤所有容量大于100的组
gb.filter(lambda x: x.shape[0]>100).head()

Unnamed: 0,Height,Weight
0,158.9,46.0
3,,41.0
5,158.0,51.0
6,162.5,52.0
7,161.9,50.0


In [27]:
#练一练
#使用filter完成loc[]的功能
df.loc[[2,3,4]]

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer,Test_Number,Test_Date,Time_Record
2,Shanghai Jiao Tong University,Senior,Mei Sun,Male,188.9,89.0,N,2,2019/9/12,0:05:22
3,Fudan University,Sophomore,Xiaojuan Sun,Female,,41.0,N,2,2020/1/3,0:04:08
4,Fudan University,Sophomore,Gaojuan You,Male,174.0,74.0,N,2,2019/11/6,0:05:22


In [28]:
df.groupby(df.index.isin(range(2,5))).filter(lambda x: x.name)
#一点一点试出来的。。。需要再review一遍，理解不深刻。大致思路是把每一行都看成一组，然后filter。

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer,Test_Number,Test_Date,Time_Record
2,Shanghai Jiao Tong University,Senior,Mei Sun,Male,188.9,89.0,N,2,2019/9/12,0:05:22
3,Fudan University,Sophomore,Xiaojuan Sun,Female,,41.0,N,2,2020/1/3,0:04:08
4,Fudan University,Sophomore,Gaojuan You,Male,174.0,74.0,N,2,2019/11/6,0:05:22


## 4. 跨列分组
### 4.1 apply的引入
apply函数用于多列数据同时处理。agg函数只能逐列处理！！   
### 4.2 apply的使用

In [29]:
#计算BMI
def BMI(x):
    Height=x['Height']/100
    Weight=x['Weight']
    BMI_value=Weight/Height**2
    return BMI_value.mean()

gb.apply(BMI)

Gender
Female    18.860930
Male      24.318654
dtype: float64

In [30]:
#标量情况，得到Series
gb=df.groupby(['Gender','Test_Number'])[['Height','Weight']]
gb.apply(lambda x:0)

Gender  Test_Number
Female  1              0
        2              0
        3              0
Male    1              0
        2              0
        3              0
dtype: int64

In [31]:
#Series情况，得到DataFrame
gb.apply(lambda x: pd.Series([0,0],index=['a','b']))

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
Gender,Test_Number,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,1,0,0
Female,2,0,0
Female,3,0,0
Male,1,0,0
Male,2,0,0
Male,3,0,0


In [32]:
#练一练
#没看明白题目是什么意思。。。需要Review

In [33]:
#DataFrame情况：得到DataFrame
gb.apply(lambda x: pd.DataFrame(np.ones((2,2)),
                               index=['a','b'],
                               columns=pd.Index([('w','x'),('y','z')])))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,w,y
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,x,z
Gender,Test_Number,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,1,a,1.0,1.0
Female,1,b,1.0,1.0
Female,2,a,1.0,1.0
Female,2,b,1.0,1.0
Female,3,a,1.0,1.0
Female,3,b,1.0,1.0
Male,1,a,1.0,1.0
Male,1,b,1.0,1.0
Male,2,a,1.0,1.0
Male,2,b,1.0,1.0


In [34]:
#练一练
#使用apply函数实现与gb.cov()同样的功能
gb.cov()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Height,Weight
Gender,Test_Number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,1,Height,20.9636,21.452034
Female,1,Weight,21.452034,26.438244
Female,2,Height,31.61568,30.38617
Female,2,Weight,30.38617,34.56825
Female,3,Height,23.582395,20.801307
Female,3,Weight,20.801307,23.22807
Male,1,Height,42.638234,48.785833
Male,1,Weight,48.785833,67.669951
Male,2,Height,57.041732,38.224183
Male,2,Weight,38.224183,37.869281


In [35]:
gb.apply(lambda x: pd.DataFrame((df.Height.cov(df.Weight)),
                               index=['Height','Weight'],
                               columns=['Height','Weight']))

#这个练一练的思路应该是对先把df分组然后对应组求cov，最后拼在一起，待Review

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Height,Weight
Gender,Test_Number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,1,Height,104.652083,104.652083
Female,1,Weight,104.652083,104.652083
Female,2,Height,104.652083,104.652083
Female,2,Weight,104.652083,104.652083
Female,3,Height,104.652083,104.652083
Female,3,Weight,104.652083,104.652083
Male,1,Height,104.652083,104.652083
Male,1,Weight,104.652083,104.652083
Male,2,Height,104.652083,104.652083
Male,2,Weight,104.652083,104.652083


## 5. 练习
### EX1: 汽车数据集

In [36]:
df1=pd.read_csv('/Users/jie/Documents/Python/joyful-pandas-master/data/car.csv')
df1.head(3)

Unnamed: 0,Brand,Price,Country,Reliability,Mileage,Type,Weight,Disp.,HP
0,Eagle Summit 4,8895,USA,4.0,33,Small,2560,97,113
1,Ford Escort 4,7402,USA,2.0,33,Small,2345,114,90
2,Ford Festiva 4,6319,Korea,4.0,37,Small,1845,81,63


In [37]:
#1.
df1_1=df1.copy()
#Step 1
a=df1_1.groupby('Country').filter(lambda x: x.shape[0]>2)
a

Unnamed: 0,Brand,Price,Country,Reliability,Mileage,Type,Weight,Disp.,HP
0,Eagle Summit 4,8895,USA,4.0,33,Small,2560,97,113
1,Ford Escort 4,7402,USA,2.0,33,Small,2345,114,90
2,Ford Festiva 4,6319,Korea,4.0,37,Small,1845,81,63
3,Honda Civic 4,6635,Japan/USA,5.0,32,Small,2260,91,92
4,Mazda Protege 4,6599,Japan,5.0,32,Small,2440,113,103
6,Nissan Sentra 4,7399,Japan/USA,5.0,33,Small,2275,97,90
7,Pontiac LeMans 4,7254,Korea,1.0,28,Small,2350,98,74
8,Subaru Loyale 4,9599,Japan,5.0,25,Small,2295,109,90
9,Subaru Justy 3,5866,Japan,,34,Small,1900,73,73
10,Toyota Corolla 4,8748,Japan/USA,5.0,29,Small,2390,97,102


In [38]:
#Step 2
a.groupby('Country')['Price'].agg(['mean',('Cov', lambda x: x.std()/x.mean()),'count'])

Unnamed: 0_level_0,mean,Cov,count
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,13938.052632,0.387429,19
Japan/USA,10067.571429,0.24004,7
Korea,7857.333333,0.243435,3
USA,12543.269231,0.203344,26


In [39]:
#2.
df1_2=df1.copy()
df1_2.shape[0] #60 rows
df1_2.groupby(df1_2.Country.mask(df1_2.index.isin(range(0,20)),'First').mask(df1_2.index.isin(range(20,40)),'Second').mask(df1_2.index.isin(range(40,60)),'Third'))['Price'].mean()

Country
First      9069.95
Second    13356.40
Third     15420.65
Name: Price, dtype: float64

In [40]:
#3.
df1_3=df1.copy()
gb1_3=df1_3.groupby('Type')[['Price','HP']]
gb1_3.agg({'Price': 'max', 'HP': 'min'})

Unnamed: 0_level_0,Price,HP
Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Compact,18900,95
Large,17257,150
Medium,24760,110
Small,9995,63
Sporty,13945,92
Van,15395,106


In [41]:
#4.
df1_4=df1.copy()
df1_4.groupby('Type')['HP'].transform(lambda x: (x-x.min())/(x.max()-x.min()))

0     1.000000
1     0.540000
2     0.000000
3     0.580000
4     0.800000
5     0.380000
6     0.540000
7     0.220000
8     0.540000
9     0.200000
10    0.780000
11    0.300000
12    0.740000
13    0.586466
14    0.060150
15    1.000000
16    0.135338
17    0.120301
18    0.360902
19    0.360902
20    0.000000
21    0.037594
22    0.276596
23    0.319149
24    0.000000
25    0.978723
26    0.063830
27    0.638298
28    0.319149
29    0.148936
30    1.000000
31    0.914894
32    0.319149
33    0.531915
34    0.744681
35    0.425532
36    0.404255
37    0.625000
38    0.000000
39    0.500000
40    0.462500
41    0.500000
42    0.375000
43    0.375000
44    0.000000
45    0.600000
46    0.625000
47    0.000000
48    0.312500
49    1.000000
50    0.750000
51    1.000000
52    0.000000
53    0.090909
54    1.000000
55    0.886364
56    1.000000
57    0.022727
58    0.727273
59    0.000000
Name: HP, dtype: float64

In [42]:
#5.
df1_5=df1.copy()
gb1_5=df1_5.groupby('Type')[['Disp.','HP']]
gb1_5.corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,Disp.,HP
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Compact,Disp.,1.0,0.586087
Compact,HP,0.586087,1.0
Large,Disp.,1.0,-0.242765
Large,HP,-0.242765,1.0
Medium,Disp.,1.0,0.370491
Medium,HP,0.370491,1.0
Small,Disp.,1.0,0.603916
Small,HP,0.603916,1.0
Sporty,Disp.,1.0,0.871426
Sporty,HP,0.871426,1.0
