# pandas.groupby用法

In [27]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None
# Uncomment to also show every row:
# pd.options.display.max_rows = None
# help(data.groupby)

In [2]:
# Load the test-set CSV (path is relative to the notebook) into `data`.
data = pd.read_csv(filepath_or_buffer='./input/test_set.csv')

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,25318,51,housemaid,married,unknown,no,174,no,no,telephone,29,jul,308,3,-1,0,unknown
1,25319,32,management,married,tertiary,no,6059,yes,no,cellular,20,nov,110,2,-1,0,unknown
2,25320,60,retired,married,primary,no,0,no,no,telephone,30,jul,130,3,-1,0,unknown
3,25321,32,student,single,tertiary,no,64,no,no,cellular,30,jun,598,4,105,5,failure
4,25322,41,housemaid,married,secondary,no,0,yes,yes,cellular,15,jul,368,4,-1,0,unknown


In [60]:
# Count the 'housing' values within each 'job' group.
data.groupby(by='job').housing.value_counts()

job            housing
admin.         yes         736
               no          479
blue-collar    yes        1662
               no          645
entrepreneur   yes         187
               no          142
housemaid      no          222
               yes          94
management     no         1187
               yes        1112
retired        no          430
               yes         125
self-employed  no          208
               yes         183
services       yes         650
               no          316
student        no          188
               yes          61
technician     yes         997
               no          845
unemployed     no          193
               yes         130
unknown        no           58
               yes           2
Name: housing, dtype: int64

In [8]:
# Average 'age' within each 'job' group.
data.groupby(by='job').age.mean()

job
admin.           39.622222
blue-collar      40.163849
entrepreneur     42.395137
housemaid        46.892405
management       40.614615
retired          61.318919
self-employed    40.398977
services         38.942029
student          26.353414
technician       39.380565
unemployed       40.399381
unknown          47.316667
Name: age, dtype: float64

![](./img/groupby.png)
```
for name, sub in data.groupby(['job']):
    ...
```
groupby 之后，原数据集会按分组键被拆分成若干个子数据集，如上图所示。

其中一轮循环中：
- name：分组键 job 的一个取值，即 `data.job.unique()` 中的某个元素，比如 `student`（注意：按列表 `['job']` 分组时，pandas 2.0 及以上版本中 name 是长度为 1 的元组，如 `('student',)`）
- sub：子数据集，相当于`data[data.job=='student']`

```
for name, sub in data.groupby(['job'])['age']:
    ...
```
其中一轮循环中：
- name：同上
- sub：相当于`data[data.job=='student']['age']`

[pandas：apply和transform方法的性能比较](https://www.cnblogs.com/wkang/p/9794678.html)

## groupby+transform

In [56]:
%%time
# 相当于data[data.job=='student']['age'].sum()
data['job_age_count']=data.groupby(['job'])['age'].transform(lambda x:x.count())
data['job_age_sum']=data.groupby(['job'])['age'].transform(lambda x:x.sum())
data['job_age_max']=data.groupby(['job'])['age'].transform(lambda x:x.max())
data['job_age_min']=data.groupby(['job'])['age'].transform(lambda x:x.min())
data['job_age_mean']=data.groupby(['job'])['age'].transform(lambda x:x.mean())

Wall time: 41 ms


In [66]:
%%time
# python内置方法 速度更快
data['job_age_count1']=data.groupby(['job'])['age'].transform('count')
data['job_age_sum1']=data.groupby(['job'])['age'].transform(sum)
data['job_age_max1']=data.groupby(['job'])['age'].transform(max)
data['job_age_min1']=data.groupby(['job'])['age'].transform(min)
data['job_age_mean1']=data.groupby(['job'])['age'].transform('mean')

Wall time: 16.6 ms


## groupby+apply
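apply 需要传入可调用对象（如 lambda），一般不像 transform/agg 那样直接使用 `'count'`、`'mean'` 这类字符串方法名；另外，聚合函数经 apply 得到的结果以分组键（job）为索引，而不是原数据的行索引，赋值回 `data` 之前需要先映射到每一行。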

In [64]:
%%time
data['job_age_count']=data.groupby(['job'])['age'].apply(lambda x:x.count())
data['job_age_sum']=data.groupby(['job'])['age'].apply(lambda x:x.sum)
data['job_age_max']=data.groupby(['job'])['age'].apply(lambda x:x.max)
data['job_age_min']=data.groupby(['job'])['age'].apply(lambda x:x.min)
data['job_age_mean']=data.groupby(['job'])['age'].apply(lambda x:x.mean())

Wall time: 35.1 ms


## groupby+agg

In [76]:
%%time
data.groupby(['job'])['age'].agg(lambda x:x.count())
data.groupby(['job'])['age'].agg(lambda x:x.sum)
data.groupby(['job'])['age'].agg(lambda x:x.max)
data.groupby(['job'])['age'].agg(lambda x:x.min)
data.groupby(['job'])['age'].agg(lambda x:x.mean())

Wall time: 13.7 ms


In [77]:
%%time
data.groupby(['job'])['age'].agg('count')
data.groupby(['job'])['age'].agg(sum)
data.groupby(['job'])['age'].agg(max)
data.groupby(['job'])['age'].agg(min)
data.groupby(['job'])['age'].agg('mean')

Wall time: 9.76 ms


In [94]:
# Named aggregation: every keyword becomes an output column holding the given
# statistic of 'age' for each job (the result is indexed by 'job').
# Aggregations are passed as pandas string names instead of the Python
# builtins sum/max/min, which pandas 2.1+ deprecates (FutureWarning).
# Output column names are kept exactly as before.
aggcount = data.groupby(['job'])['age'].agg(
    _job_age_count2='count',
    job_age_sum2='sum',
    job_age_max2='max',
    job_age_min2='min',
    job_age_mean2='mean',
)

In [95]:
# Display the per-job age aggregates held in `aggcount`.
aggcount

Unnamed: 0_level_0,_job_age_count2,job_age_sum2,job_age_max2,job_age_min2,job_age_mean2
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
admin.,1215,48141,69,20,39.622222
blue-collar,2307,92658,75,21,40.163849
entrepreneur,329,13948,84,26,42.395137
housemaid,316,14818,83,23,46.892405
management,2299,93373,81,21,40.614615
retired,555,34032,94,33,61.318919
self-employed,391,15796,76,22,40.398977
services,966,37618,60,20,38.942029
student,249,6562,41,18,26.353414
technician,1842,72539,70,21,39.380565


In [96]:
# Attach the per-job aggregates to every row with a left join on 'job'.
# Aggregate columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not pile up duplicated *_x / *_y columns.
data = (
    data
    .drop(columns=aggcount.columns, errors='ignore')
    .merge(aggcount, on='job', how='left')
)

## groupby 众数特征

In [99]:
from scipy import stats
# help(stats.mode)

In [111]:
# Most frequent 'age' within each job / marital group, broadcast to every row.
# Series.mode() returns all modes sorted ascending, so .iloc[0] picks the
# smallest one on ties (the same tie-break scipy.stats.mode uses).
# The previous stats.mode(x)[0][0] indexing breaks on SciPy >= 1.11, where
# mode() returns a scalar for 1-D input.
# NOTE(review): a group whose ages are all NaN would yield an empty mode and
# raise IndexError -- confirm 'age' has no such group.
data['job_age_mode'] = data.groupby(['job'])['age'].transform(lambda x: x.mode().iloc[0])
data['marital_age_mode'] = data.groupby(['marital'])['age'].transform(lambda x: x.mode().iloc[0])

## groupby 多列

```
for name,sub in data.groupby(['job','housing']):
    print(name)
```
其中一轮循环中：
- name：('student', 'yes')
- sub：相当于`data[(data.job=='student') & (data.housing=='yes')]`

In [162]:
# data[(data.job=='student') & (data.housing=='yes')]['age']

In [160]:
# Mean 'age' of each (job, housing) combination, broadcast back to every row.
data.groupby(by=['job', 'housing']).age.transform('mean')

0        49.648649
1        39.910072
2        62.881395
3        25.984043
4        40.382979
           ...    
10847    39.910072
10848    41.274642
10849    38.004615
10850    38.705163
10851    39.157641
Name: age, Length: 10852, dtype: float64

In [174]:
# [i[1] for j,i in enumerate(t.index)]
# [i[0] if j%2==0 else '' for j,i in enumerate(t.index)]

# transform用法

In [44]:
# transform用法
# help(data.transform)

In [20]:
# Small demo frame: column A holds 0..2 and column B holds 1..3.
temp_df = pd.DataFrame(dict(A=range(3), B=range(1, 4)))
temp_df

Unnamed: 0,A,B
0,0,1
1,1,2
2,2,3


In [21]:
# Add 1 to every value; transform returns a frame of the same shape.
temp_df.transform(lambda col: col + 1)

Unnamed: 0,A,B
0,1,2
1,2,3
2,3,4


In [28]:
# Demo series holding the integers 0, 1, 2.
s = pd.Series(data=range(0, 3))
s

0    0
1    1
2    2
dtype: int64

In [29]:
# Apply several functions at once; the result has one column per function.
s.transform(func=[np.sqrt, np.exp])

Unnamed: 0,sqrt,exp
0,0.0,1.0
1,1.0,2.718282
2,1.414214,7.389056


# transform用法

In [44]:
# transform用法
# help(data.transform)

In [20]:
# Small demo frame: column A holds 0..2 and column B holds 1..3.
temp_df = pd.DataFrame(dict(A=range(3), B=range(1, 4)))
temp_df

Unnamed: 0,A,B
0,0,1
1,1,2
2,2,3


In [21]:
# Add 1 to every value; transform returns a frame of the same shape.
temp_df.transform(lambda col: col + 1)

Unnamed: 0,A,B
0,1,2
1,2,3
2,3,4


In [28]:
# Demo series holding the integers 0, 1, 2.
s = pd.Series(data=range(0, 3))
s

0    0
1    1
2    2
dtype: int64

In [29]:
# Apply several functions at once; the result has one column per function.
s.transform(func=[np.sqrt, np.exp])

Unnamed: 0,sqrt,exp
0,0.0,1.0
1,1.0,2.718282
2,1.414214,7.389056
