In [59]:
# coding:utf8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None) # show every column instead of eliding wide frames
pd.set_option('display.max_rows', 10) # frames longer than 10 rows are displayed truncated

# Echo the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese labels render
plt.rcParams['axes.unicode_minus']=False # keep the minus sign rendering correctly

## GroupBy机制
> 1、主要是series或者是dataframe的groupby函数，针对不同的列进行分组，在分组的基础上对某一列或者某几列进行聚合运算；    
> 2、如果是对一列进行聚合运算，使用自带的聚合函数(mean,sum,min,max等)或者是agg函数（可以进行多种运算或者指定运算后的字段名称，根据参数样式而定）；    
> 3、如果是对多列进行聚合运算，使用apply函数调用自定义聚合函数或者是自带的聚合函数；    
> 4、pivot_table(透视表) 和crosstab（交叉表）分组聚合属于特殊的聚合，均可以使用groupby函数实现，只不过使用这两个函数更方便一些 

###   单分组 和 多分组
- 1、单分组：可以先选择数据，然后进行分组；也可以先分组然后对数据进行过滤     
- 2、多分组：形成层次化的索引，使用unstack()函数转换     
- 3、既可以使用已有的数据进行分组，也可以使用给定的数据进行分组（相当于新增一个分组列，最好与原数据有对应关系）    
- 4、分组对象均有size方法  grouped.size()
- 5、GroupBy的对象支持迭代，产生一组二元元组

In [60]:
# Sample data: two key columns and two columns of random values
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two'] * 2 + ['one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})

df.head()

# Single key: group the frame, then pick the column
# (equivalent to df['data1'].groupby(df['key1']))
grouped = df.groupby('key1')['data1']
grouped.mean()

# Several keys: the result carries a hierarchical index; unstack() pivots key2 into columns
by_both_keys = df.groupby(['key1', 'key2'])['data1'].mean()
by_both_keys.unstack()
del by_both_keys

# Group keys may be columns, or any arrays of the right length
# (several arrays together form a hierarchical index)
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.373109,0.177871
1,a,two,-0.201382,0.30769
2,b,one,-0.871747,-0.115794
3,b,two,0.534387,1.076124
4,a,one,1.384384,-1.746285


key1
a    0.518704
b   -0.168680
Name: data1, dtype: float64

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.878747,-0.201382
b,-0.871747,0.534387


California  2005   -0.201382
            2006   -0.871747
Ohio        2005    0.453748
            2006    1.384384
Name: data1, dtype: float64

###  对分组进行迭代 
groupby对象支持迭代，会产生一个二元元组（由分组名和数据块组成） 
- groupby 默认在axis=0(纵轴)上进行分组，也可以在其他轴上进行分组 
- 可以对分组的结果选取一个或者一组列（注意，如果是单个列，最好也以[[]]形式进行选取,这样返回的结果是df形式） df.groupby('key1')[['data1']]   
- 可以根据字典或者series数组进行分组
单独定义分组列，不过如果没有映射，感觉意义不大    
- 根据函数进行分组    
主要针对的是index上的函数，根据其返回结果进行分组（可以简化数据的处理流程）   
- 根据索引级别进行分组
针对的是层次化索引，使用参数level进行识别（level=索引名称或者级别编号，从0开始）

In [61]:
# Iterating a GroupBy yields (group name, sub-frame) pairs
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

# With several keys the group name is a tuple holding one value per key
for keys, group in df.groupby(['key1', 'key2']):
    print(keys, group, sep='\n')

# Grouping by a function: the function is called once per index label and
# its return value becomes the group key.
df1 = df.set_index(['key2'])
def isE(str1):
    """Return True when the index label ``str1`` contains the letter 'e'."""
    return 'e' in str1
# numeric_only=True restricts the sum to data1/data2. Without it the string
# column key1 is either silently dropped (old pandas) or concatenated
# (pandas >= 2.0), so the result would depend on the installed version.
df1.groupby(isE).sum(numeric_only=True)

a
  key1 key2     data1     data2
0    a  one  0.373109  0.177871
1    a  two -0.201382  0.307690
4    a  one  1.384384 -1.746285
b
  key1 key2     data1     data2
2    b  one -0.871747 -0.115794
3    b  two  0.534387  1.076124
('a', 'one')
  key1 key2     data1     data2
0    a  one  0.373109  0.177871
4    a  one  1.384384 -1.746285
('a', 'two')
  key1 key2     data1    data2
1    a  two -0.201382  0.30769
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.871747 -0.115794
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.534387  1.076124


Unnamed: 0,data1,data2
False,0.333005,1.383814
True,0.885746,-1.684207


### 选取一列或列的子集
先聚合还是先选取的问题
> 将函数跟数组、列表、字典、Series混合使用都可以作为分组键  

```python
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
```


In [62]:
# Selecting with a list ([['data2']]) keeps the aggregated result a DataFrame
# rather than a Series.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.784207
a,two,0.30769
b,one,-0.115794
b,two,1.076124


In [63]:
people = pd.DataFrame(np.random.randn(5, 5),
                columns=['a', 'b', 'c', 'd', 'e'],
                index=['Joe', 'Steve', 'Wes', 'Jim','Travis'])

# Blank out Wes's 'b' and 'c' so the sums below show how NaN is skipped
people.iloc[2:3, [1, 2]] = np.nan

people

# Group the columns through a dict that maps column label -> group name.
# The unused key 'f' is harmless: labels absent from the data are ignored.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
# groupby(..., axis=1) is deprecated in recent pandas. Transposing, grouping
# the rows and transposing back produces the same per-person totals on every
# pandas version.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,a,b,c,d,e
Joe,0.016321,0.287742,-1.694148,-1.352075,-0.632951
Steve,1.137762,-0.556526,-0.475484,-0.153054,0.971634
Wes,0.688767,,,-0.475025,0.355686
Jim,2.010125,0.903614,0.361482,0.356674,0.004169
Travis,0.572829,2.220143,-0.490563,2.126048,-2.887948


Unnamed: 0,blue,red
Joe,-3.046223,-0.328889
Steve,-0.628538,1.55287
Wes,-0.475025,1.044454
Jim,0.718156,2.917907
Travis,1.635485,-0.094976


### 根据索引级别分组
将函数跟数组、列表、字典、Series混合使用也不是问题，因为任何东西在内部都会被转换为数组;    
主要是level参数的使用（层次索引级别，最外层为0，也可使用名称来指定）

In [64]:
# Two-level column index: country ('cty') on the outside, tenor inside
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP','JP'],
                         [1, 3, 5, 1, 3]],
                        names=['cty', 'tenor'])

hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df.head()

# Count the columns per country; level='cty' is the same as level=0.
# groupby(..., axis=1) is deprecated in recent pandas, so group the
# transposed frame by its (now row) level and transpose back.
hier_df.T.groupby(level='cty').count().T

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.518538,-0.651616,0.535095,-1.157795,-0.128922
1,1.375532,0.841721,1.219608,0.953384,1.105049
2,0.378373,1.193113,1.395024,1.333981,-0.056006
3,-1.208415,-0.056799,0.349898,-0.541631,1.189298


cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 数据聚合
1、主要针对的是数值型的列    
2、可以使用自带的聚合运算方法(直接调用或者使用agg/apply调用)，也可以自定义（只能agg/apply调用）    
3、对单列或者多列使用的调取参数不同（agg/apply）

quantile可以计算Series或DataFrame列的样本分位数,但其并非严格的聚合函数；类似的还有describe()函数

常见的聚合运算：  

函数名| 说明
--|--
count|分组中非NA值的数量
mean|
median| 非NA值的算术中位数
std、var| 标准差和方差
min/max|
prod| 非NA值的积
first/last| 第一个和最后一个非NA值 

In [65]:
df.head()

# Group on key1 once and reuse the GroupBy object for the aggregations below
grouped = df.groupby('key1')
# 0.9 sample quantile of data1 within each key1 group
grouped['data1'].quantile(0.9)

def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

# Apply the custom aggregation to the numeric columns only. Run on the whole
# group it would also reach the string column key2, where max() - min() is
# undefined; recent pandas raises there instead of silently dropping it.
grouped[['data1', 'data2']].agg(peak_to_peak)

# describe() is not a strict aggregation but works per group as well
grouped.describe()

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.373109,0.177871
1,a,two,-0.201382,0.30769
2,b,one,-0.871747,-0.115794
3,b,two,0.534387,1.076124
4,a,one,1.384384,-1.746285


key1
a    1.182129
b    0.393773
Name: data1, dtype: float64

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.585766,2.053975
b,1.406134,1.191918


Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.518704,0.802846,-0.201382,0.085864,0.373109,0.878747,1.384384,3.0,-0.420241,1.15022,-1.746285,-0.784207,0.177871,0.242781,0.30769
b,2.0,-0.16868,0.994287,-0.871747,-0.520214,-0.16868,0.182853,0.534387,2.0,0.480165,0.842813,-0.115794,0.182185,0.480165,0.778145,1.076124


### 面向列的多函数应用
使用groupby agg方法。参数为dict形式的最为方便~ 可以为每一列指定统计方法，也可每一列指定多个统计方法
```python
grouped.agg({'col_A' : np.max, 'col_B' : 'sum'}) 
grouped.agg({'col_A' : ['min', 'max', 'mean', 'std'], 'col_B' : 'sum'})
```

In [66]:
# Restaurant tipping example
tips = pd.read_csv('./examples/tips.csv')
# Rename 'size' to 'sizes': as a column name it collides with the
# DataFrame.size attribute under attribute-style access.
tips = tips.rename(columns={'size': 'sizes'})

# Add the tip as a fraction of the total bill
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()

# Group on two keys and pick the single column to aggregate
grouped = tips.groupby(['day', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')  # one statistic
grouped_pct.agg(['mean', 'std', peak_to_peak])  # several statistics for one column
# A list of (name, function) tuples uses each tuple's first element as the
# result column name (think of the list as an ordered mapping).
# 'std' is passed by name rather than as np.std: pandas maps np.std onto its
# own sample standard deviation and warns that this aliasing will go away, so
# the string keeps the displayed numbers stable.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])


Unnamed: 0,total_bill,tip,smoker,day,time,sizes,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [67]:
functions = ['count', 'mean', 'max']
# Several columns x several statistics: every function is applied to every
# selected column, and the result column names cannot be customised this way.
# The columns are selected with a list: tuple-style grouped['a', 'b'] was
# deprecated and then removed in pandas 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [68]:
# (name, function) tuples give custom result names, but the same functions
# still run on every selected column; for a different function per column
# pass a dict to agg instead.
# 'var' is given by name (np.var is merely aliased to it by pandas, with a
# warning on recent versions), and the columns are selected with a list
# because tuple-style grouped['a', 'b'] was removed in pandas 2.0.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
grouped[['tip_pct', 'total_bill']].agg(ftuples)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [69]:
grouped = tips.groupby(['day', 'smoker'])
# A dict maps each column to its own aggregation: the most flexible form.
# Functions are named by string ('max' rather than np.max) so pandas does not
# have to alias a NumPy callable, which warns on recent versions.
grouped.agg({'tip_pct' : 'max', 'total_bill' : 'sum'})
# A column may also map to a list of aggregations
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'], 'total_bill' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.187735,73.68
Fri,Yes,0.26348,252.2
Sat,No,0.29199,884.78
Sat,Yes,0.325733,893.62
Sun,No,0.252672,1168.88
Sun,Yes,0.710345,458.28
Thur,No,0.266312,770.09
Thur,Yes,0.241255,326.24


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,73.68
Fri,Yes,0.103555,0.26348,0.174783,0.051293,252.2
Sat,No,0.056797,0.29199,0.158048,0.039767,884.78
Sat,Yes,0.035638,0.325733,0.147906,0.061375,893.62
Sun,No,0.059447,0.252672,0.160113,0.042347,1168.88
Sun,Yes,0.06566,0.710345,0.18725,0.154134,458.28
Thur,No,0.072961,0.266312,0.160298,0.038774,770.09
Thur,Yes,0.090014,0.241255,0.163863,0.039389,326.24


### 以“没有行索引”的形式返回聚合数据
groupby 函数参数 as_index=False:分组列不作为结果的行索引，避免层次索引的需要进一步操作；

In [70]:
# as_index=False returns the group keys as ordinary columns.
# numeric_only=True skips the string column 'time'; pandas >= 2.0 raises
# when asked to average it instead of dropping it silently.
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,sizes,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


## apply:一般性的拆分-应用-合并
> apply调用函数的主参数，是每个分组对象（该点需要注意）

apply方法会将待处理的对象拆分为多个片段，然后对各个片段调用传入的参数，最后尝试将各个片段组合在一起（结果是以分组列为索引）  
如果apply方法调用的是自定义函数，且分组采用的是普通列，会形成MultiIndex  

- 可以使用group_keys=False禁止使用分组键做索引， 避免产生层次化索引 

**疑惑：**
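<span class="burk">如果调用的外部函数如下面的top函数，在apply的过程中需要外部传参该怎么办？</span>  
答：把额外参数直接写在函数名之后传给apply即可，例如 `tips.groupby('smoker').apply(top, n=1, column='total_bill')`，apply会把这些参数原样转交给top。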

In [71]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame
        Frame (or one group of a GroupBy) to select from.
    n : int, default 5
        Number of rows to keep.
    column : str, default 'tip_pct'
        Column to rank by.

    Returns
    -------
    DataFrame
        The selected rows in ascending order of ``column``.
    """
    # tail(n) rather than [-n:]: the slice [-0:] would return every row when
    # n == 0, whereas tail(0) correctly returns an empty frame.
    return df.sort_values(by=column).tail(n)

top(tips, n=6)

tips.groupby('smoker').apply(top) # call top on each group's sub-frame, then stitch the results together

Unnamed: 0,total_bill,tip,smoker,day,time,sizes,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,sizes,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### 禁止分组键
分组键会跟原始对象的索引共同构成结果对象中的层次化索引。将group_keys=False传入groupby即可禁止该效果,功能有些类似as_index

In [72]:
# group_keys=False keeps the group label out of the result index, so the rows
# retain their original index instead of gaining a hierarchical one.
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,sizes,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### 分位数和桶分析 
> 所谓的分位数和桶分析就是将数据按照分位数(quantile)或者桶(bucket)进行拆分，然后groupby分组   
> 主要会用到df的一些特殊工具方法：cut()等长分组切分函数   qcut()分位数切分函数  

也可以将数据按照指定的规则进行类别划分，然后根据类别进行分组

- pd.cut 将数据集切分为几组（几类） 类似于hist的bins；其对象可以直接用于groupby 

In [73]:
# Two columns of 100 standard-normal draws each
frame = pd.DataFrame({'data1': np.random.randn(100), 'data2': np.random.randn(100)})
# Cut data1 into 4 equal-width intervals; the result labels every row with its interval
factor = pd.cut(frame['data1'], bins=4)

factor.head(10)

def get_stats(group):
    """Summarise ``group`` as a dict with its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}
# Bucket data2 by the data1 interval each row was assigned to
grouped = frame.data2.groupby(factor)
# get_stats returns one dict per bucket; unstack() turns the statistic names into columns
grouped.apply(get_stats).unstack()

0     (0.0597, 1.208]
1      (1.208, 2.357]
2    (-1.089, 0.0597]
3     (0.0597, 1.208]
4     (0.0597, 1.208]
5    (-1.089, 0.0597]
6     (0.0597, 1.208]
7    (-2.242, -1.089]
8      (1.208, 2.357]
9    (-1.089, 0.0597]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.242, -1.089] < (-1.089, 0.0597] < (0.0597, 1.208] < (1.208, 2.357]]

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.242, -1.089]",17.0,1.445063,0.049868,-1.127439
"(-1.089, 0.0597]",32.0,1.535552,0.134834,-2.149252
"(0.0597, 1.208]",38.0,1.491195,0.04697,-2.339475
"(1.208, 2.357]",13.0,0.94634,-0.303124,-1.856765


### 示例：用特定于分组的值填充缺失值 
> eg:用每组剩余值的均值 或者 指定每组缺失值的填充值 来填充该组的缺失值（比直接使用整体均值填充缺失值效果要好一些）  

In [74]:
states = list('ABCDEFGH')
group_keys = ['East']*4 + ['West']*4
data = pd.Series(np.random.randn(8), index=states)
data[['B','D','G']] = np.nan
# group_keys is a plain list, not part of `data`: it is matched to the rows
# of `data` by position, so it must be the same length as the Series.
data.groupby(group_keys).mean()


def fill_mean(g):
    """Replace the NaNs of group ``g`` with the mean of its remaining values."""
    return g.fillna(g.mean())


# group_keys=False keeps the original flat A..H index. From pandas 2.0 on,
# the default would prepend the group label and return a hierarchical index
# for results indexed like their input.
data.groupby(group_keys, group_keys=False).apply(fill_mean)

# A fixed fill value can also be chosen per group
fill_values = {'East':0.5,'West':-1}


def fill_func(g):
    """Replace the NaNs of group ``g`` with the value registered for its label.

    Inside apply each group carries its label in ``g.name``.
    """
    return g.fillna(fill_values[g.name])


data.groupby(group_keys, group_keys=False).apply(fill_func)

East   -0.023104
West    0.614333
dtype: float64

A   -0.244303
B   -0.023104
C    0.198096
D   -0.023104
E   -0.048790
F   -0.239018
G    0.614333
H    2.130807
dtype: float64

A   -0.244303
B    0.500000
C    0.198096
D    0.500000
E   -0.048790
F   -0.239018
G   -1.000000
H    2.130807
dtype: float64

### 示例：随机采样和排列 
> 这一块涉及到样本的选择和模型的交叉验证，需要注意        

- 方法:使用np.random.permutation(N)[:k] N是完整数据的大小，k是期望样本的大小  

In [75]:
# Build a 52-card deck as a Series: the index is the card name (rank + suit),
# the value is its score (A = 1, 2-10 at face value, J/K/Q = 10).
# Suits: Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
# Adding lists concatenates them (like list.extend); * 4 repeats the 13 scores per suit
card_val = (list(range(1, 11)) + [10, 10, 10]) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Suit-major order, so each run of 13 names lines up with one run of 13 scores
cards = [str(rank) + suit for suit in suits for rank in base_names]

deck = pd.Series(card_val, index=cards)
deck[:13]

# Random draw helper: picks n cards from the deck WITHOUT replacement
# (a permutation never repeats a position), with no other constraints.
def draw(deck, n=5):
    """Return ``n`` randomly chosen entries of ``deck`` in random order."""
    shuffled_positions = np.random.permutation(len(deck))
    return deck.take(shuffled_positions[:n])

draw(deck)

# Draw two random cards from every suit
get_suit = lambda card: card[-1] # the suit is the last character of the card name
deck.groupby(get_suit).apply(draw, n=2)  # a function key is called on each index label of the Series

# Same draw, but without the suit prepended to the result index
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

AH      1
2H      2
3H      3
4H      4
5H      5
       ..
9H      9
10H    10
JH     10
KH     10
QH     10
Length: 13, dtype: int64

6C     6
JH    10
8C     8
4H     4
9S     9
dtype: int64

C  6C     6
   JC    10
D  8D     8
   2D     2
H  6H     6
   4H     4
S  AS     1
   8S     8
dtype: int64

5C     5
9C     9
6D     6
7D     7
KH    10
8H     8
7S     7
JS    10
dtype: int64

### 示例：分组加权平均数和相关系数 
> 计算df列与列之间、两个series之间的分组加权、相关系数等标准化作业 

In [76]:
# Eight observations in two categories, each with a random weight in [0, 1)
df = pd.DataFrame({
    'category': list('aaaabbbb'),
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})
grouped = df.groupby('category')


def get_wavg(g):
    """Weighted mean of g['data'], using g['weights'] as the weights."""
    return np.average(g['data'], weights=g['weights'])


# Weighted average within each category
grouped.apply(get_wavg)

category
a    0.170021
b   -0.100319
dtype: float64

In [77]:
# Closing prices, indexed by the dates parsed from the first CSV column
close_px = pd.read_csv('./examples/stock_px.csv', index_col=0, parse_dates=True)
close_px.head()

# Daily returns (percent change); the leading row without a return is dropped
rets = close_px.pct_change().dropna()
# Correlation of every column with SPX inside one group
spx_corr = lambda x: x.corrwith(x['SPX'])
# Group the returns by the calendar year of their index
by_year = rets.groupby(lambda x: x.year)
# Yearly correlation of each column's daily returns with SPX
by_year.apply(spx_corr).head()

# Yearly correlation between two individual columns
by_year.apply(lambda yearly: yearly['AAPL'].corr(yearly['MSFT'])).head()

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-01,4.98,7.86,2.87,16.79,4.27,0.51,6.04,328.79,6.12
1990-02-02,5.04,8.0,2.87,16.89,4.37,0.51,6.09,330.92,6.24
1990-02-05,5.07,8.18,2.87,17.32,4.34,0.51,6.05,331.85,6.25
1990-02-06,5.01,8.12,2.88,17.56,4.32,0.51,6.15,329.66,6.23
1990-02-07,5.04,7.77,2.91,17.93,4.38,0.51,6.17,333.75,6.33


Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,0.595024,0.545067,0.752187,0.738361,0.801145,0.586691,0.783168,1.0,0.517586
1991,0.453574,0.365315,0.759607,0.557046,0.646401,0.524225,0.641775,1.0,0.569335
1992,0.39818,0.498732,0.632685,0.262232,0.51574,0.492345,0.473871,1.0,0.318408
1993,0.259069,0.238578,0.447257,0.211269,0.451503,0.425377,0.385089,1.0,0.318952
1994,0.428549,0.26842,0.572996,0.385162,0.372962,0.436585,0.450516,1.0,0.395078


1990    0.408271
1991    0.266807
1992    0.450592
1993    0.236917
1994    0.361638
dtype: float64

### 示例：组级别的线性回归

In [78]:
import statsmodels.api as sm

def regress(data, yvar, xvars):
    """Fit an ordinary-least-squares regression of ``yvar`` on ``xvars``.

    Parameters
    ----------
    data : DataFrame
        Observations (one group when used through ``GroupBy.apply``).
    yvar : str
        Name of the dependent-variable column.
    xvars : list of str
        Names of the explanatory columns.

    Returns
    -------
    The ``params`` of the fitted model: one coefficient per entry of
    ``xvars`` plus ``'intercept'``.
    """
    Y = data[yvar]
    # assign() builds a new frame with the constant column appended. Setting
    # X['intercept'] on the slice data[xvars] writes into a derived object
    # and triggers pandas' SettingWithCopyWarning.
    X = data[xvars].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    return result.params

# Arguments after the function are forwarded to regress (yvar='AAPL',
# xvars=['SPX']), giving one regression per year group.
by_year.apply(regress, 'AAPL', ['SPX'])

Unnamed: 0,SPX,intercept
1990,1.512772,0.001395
1991,1.187351,0.000396
1992,1.832427,0.000164
1993,1.390470,-0.002657
1994,1.190277,0.001617
...,...,...
2007,1.198761,0.003438
2008,0.968016,-0.001110
2009,0.879103,0.002954
2010,1.052608,0.001261


## 透视表与交叉表
- 透视表：pivot_table，选择两个系列(单列或者多个列构成的list)形成横纵坐标，对某一列或者某几列进行计算、累加、求和等运算   参数有data, index, columns, aggfunc等 pd和df均有该函数，不过一般都是df调用    
```python
pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')
```
- 交叉表：crosstab  用于计算分组频数（每个分组项的个数，使用pivot_table+aggfunc=len也能实现）的特殊透视表，pd级别函数   
```python  
# 该函数的values参数基本上是不用的
crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, dropna=True, normalize=False)
```

> 这两个函数是一种更特殊意义上的分组函数；均可使用groupby函数实现；这两个函数使用场景较多但是使用有较大限制，所以单独出来，方便使用

In [80]:
# Group means (the default aggfunc) by day and smoker. The numeric value
# columns are listed explicitly: left to "all remaining columns", the string
# column 'time' would be averaged too, which pandas >= 2.0 rejects instead of
# silently dropping it.
tips.pivot_table(values=['sizes', 'tip', 'tip_pct', 'total_bill'], index=['day', 'smoker'])

# Two value columns, rows keyed by (time, day), one column group per smoker value
tips.pivot_table(['tip_pct', 'sizes'], index=['time', 'day'],columns='smoker')

# margins=True adds an 'All' row and column computed with the same aggfunc.
# Combinations with no data show up as NaN; pass fill_value=... to replace them.
# 'sum' is given by name so pandas does not have to alias the Python builtin.
tips.pivot_table(values=['tip_pct','sizes'], index=['time','day'], columns='smoker',aggfunc='sum',margins=True)
tips.pivot_table('tip_pct', index=['time', 'smoker'],columns='day', aggfunc=len, margins=True)

# Cross-tabulate group frequencies (how many rows fall in each combination)
pd.crosstab([tips.time,tips.day], tips.smoker, margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,sizes,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


Unnamed: 0_level_0,Unnamed: 1_level_0,sizes,sizes,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


Unnamed: 0_level_0,Unnamed: 1_level_0,sizes,sizes,sizes,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,6.0,20.0,26,0.418867,1.488126,1.906993
Dinner,Sat,115.0,104.0,219,7.112145,6.212055,13.324199
Dinner,Sun,167.0,49.0,216,9.126438,3.557756,12.684194
Dinner,Thur,2.0,,2,0.159744,,0.159744
Lunch,Fri,3.0,11.0,14,0.187735,1.13362,1.321354
Lunch,Thur,110.0,40.0,150,7.053669,2.785676,9.839345
All,,403.0,224.0,627,24.058598,15.177232,39.23583


Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,3.0,45.0,57.0,1.0,106.0
Dinner,Yes,9.0,42.0,19.0,,70.0
Lunch,No,1.0,,,44.0,45.0
Lunch,Yes,6.0,,,17.0,23.0
All,,19.0,87.0,76.0,62.0,244.0


Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
