# 《利用Python进行数据分析》1st Edition chapter 9

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [2]:
df = DataFrame({'key1':['a', 'a', 'b', 'b', 'a'], 
                'key2':[u'一', u'二', u'一', u'二', u'一'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,0.48027,-0.1183,a,一
1,-0.165531,-1.439285,a,二
2,-0.348254,-1.306511,b,一
3,-2.015902,0.761617,b,二
4,0.365189,1.005399,a,一


In [3]:
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000000000B213550>

In [4]:
grouped.mean()

key1
a    0.226643
b   -1.182078
Name: data1, dtype: float64

In [5]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [6]:
means

key1  key2
a     一       0.422730
      二      -0.165531
b     一      -0.348254
      二      -2.015902
Name: data1, dtype: float64

In [7]:
means.unstack()

key2,一,二
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.42273,-0.165531
b,-0.348254,-2.015902


In [8]:
# 以上代码中分组标记均为Series对象，实际上分组键可以是任意长度适当的数组

In [9]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,一,0.42273,0.443549
a,二,-0.165531,-1.439285
b,一,-0.348254,-1.306511
b,二,-2.015902,0.761617


In [10]:
df.groupby(['key1']).mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.226643,-0.184062
b,-1.182078,-0.272447


In [11]:
for name, group in df.groupby(['key1']):
    print name
    print group

a
      data1     data2 key1 key2
0  0.480270 -0.118300    a    一
1 -0.165531 -1.439285    a    二
4  0.365189  1.005399    a    一
b
      data1     data2 key1 key2
2 -0.348254 -1.306511    b    一
3 -2.015902  0.761617    b    二


In [12]:
for (k1, k2), gorup in df.groupby(['key1', 'key2']):    # Groupby对象迭代产生一组二元元组，多重键的情况下，元组的第一个元素是由键值组成的元组
    print k1, k2
    print group

a 一
      data1     data2 key1 key2
2 -0.348254 -1.306511    b    一
3 -2.015902  0.761617    b    二
a 二
      data1     data2 key1 key2
2 -0.348254 -1.306511    b    一
3 -2.015902  0.761617    b    二
b 一
      data1     data2 key1 key2
2 -0.348254 -1.306511    b    一
3 -2.015902  0.761617    b    二
b 二
      data1     data2 key1 key2
2 -0.348254 -1.306511    b    一
3 -2.015902  0.761617    b    二


In [15]:
pieces = dict(list(df.groupby('key1')))
pieces

{'a':       data1     data2 key1 key2
 0  0.480270 -0.118300    a    一
 1 -0.165531 -1.439285    a    二
 4  0.365189  1.005399    a    一, 'b':       data1     data2 key1 key2
 2 -0.348254 -1.306511    b    一
 3 -2.015902  0.761617    b    二}

In [16]:
pieces['a']      # 默认groupby是在axis=0上进行分组

Unnamed: 0,data1,data2,key1,key2
0,0.48027,-0.1183,a,一
1,-0.165531,-1.439285,a,二
4,0.365189,1.005399,a,一


In [18]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [21]:
grouped = df.groupby(df.dtypes, axis=1)
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000B7241D0>

In [22]:
dict(list(grouped))

{dtype('float64'):       data1     data2
 0  0.480270 -0.118300
 1 -0.165531 -1.439285
 2 -0.348254 -1.306511
 3 -2.015902  0.761617
 4  0.365189  1.005399, dtype('O'):   key1 key2
 0    a    一
 1    a    二
 2    b    一
 3    b    二
 4    a    一}

In [26]:
type(df.dtypes)

pandas.core.series.Series

In [27]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.48027,-0.1183,a,一
1,-0.165531,-1.439285,a,二
2,-0.348254,-1.306511,b,一
3,-2.015902,0.761617,b,二
4,0.365189,1.005399,a,一


In [33]:
grouped1 = df['data1'].groupby(df['key1'])
dict(list(grouped1))

{'a': 0    0.480270
 1   -0.165531
 4    0.365189
 Name: data1, dtype: float64, 'b': 2   -0.348254
 3   -2.015902
 Name: data1, dtype: float64}

In [34]:
df.groupby('key1')['data1']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000000000B724CF8>

In [35]:
df.groupby('key2')[['data2']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000B2133C8>

In [36]:
df.groupby('key2')[['data2']].mean()

Unnamed: 0_level_0,data2
key2,Unnamed: 1_level_1
一,-0.139804
二,-0.338834


In [40]:
df[['data2']].groupby(df['key2']).mean()

Unnamed: 0_level_0,data2
key2,Unnamed: 1_level_1
一,-0.139804
二,-0.338834


In [41]:
df.groupby(['key1', 'key2'])['data1'].mean()

key1  key2
a     一       0.422730
      二      -0.165531
b     一      -0.348254
      二      -2.015902
Name: data1, dtype: float64

In [43]:
people = DataFrame(np.random.rand(5, 5), columns=['A', 'B', 'C', 'D', 'E'], index=['zhao', 'qian', 'sun', 'lee', 'zhou'])

In [44]:
people

Unnamed: 0,A,B,C,D,E
zhao,0.54391,0.464061,0.587523,0.25387,0.247382
qian,0.010143,0.114389,0.281588,0.919699,0.552688
sun,0.812748,0.200824,0.113116,0.546018,0.685945
lee,0.657343,0.612778,0.829705,0.491825,0.515259
zhou,0.903453,0.93277,0.215283,0.067131,0.683653


In [50]:
people.iloc[0:3, :]

Unnamed: 0,A,B,C,D,E
zhao,0.54391,0.464061,0.587523,0.25387,0.247382
qian,0.010143,0.114389,0.281588,0.919699,0.552688
sun,0.812748,0.200824,0.113116,0.546018,0.685945


In [56]:
people = DataFrame(np.random.rand(5, 5), columns=['A', 'B', 'C', 'D', 'E'], index=['zhao', 'qian', 'sun', 'lee', 'zhou'])

In [57]:
people.iloc[2:3, 1:2] = np.nan

In [59]:
people.iloc[2:3, 1:3] = np.nan

In [60]:
people

Unnamed: 0,A,B,C,D,E
zhao,0.504444,0.807315,0.122617,0.699247,0.923364
qian,0.116606,0.730907,0.827195,0.048301,0.719618
sun,0.521371,,,0.645175,0.784654
lee,0.643295,0.527388,0.306142,0.404313,0.127879
zhou,0.952948,0.893635,0.144446,0.051907,0.014468


In [61]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}
people.groupby(mapping, axis = 1)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000B753278>

In [62]:
people.mean()

A    0.547733
B    0.739811
C    0.350100
D    0.369788
E    0.513996
dtype: float64

In [63]:
by_column  = people.groupby(mapping, axis = 1)
by_column.sum()

zhao
qian
sun
lee
zhou


In [64]:
map_Series = Series(mapping)
map_Series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [65]:
people.groupby(map_Series, axis=1).count()

zhao
qian
sun
lee
zhou


In [66]:
people.groupby(len).sum()     # len 指人名的长度

Unnamed: 0,A,B,C,D,E
3,1.164666,0.527388,0.306142,1.049488,0.912533
4,1.573998,2.431856,1.094258,0.799454,1.657449


In [67]:
people = DataFrame(np.random.rand(5, 5), columns=['A', 'B', 'C', 'D', 'E'], index=['zhao', 'qian', 'sun', 'lee', 'wu'])

In [68]:
people.groupby(len).sum()

Unnamed: 0,A,B,C,D,E
2,0.837479,0.817902,0.392246,0.082072,0.413685
3,1.764755,0.550375,0.982107,1.506288,0.549586
4,0.531093,0.965326,1.4736,0.787239,1.151913


In [69]:
key_list = ['one', 'one', 'two', 'two', 'one']
people.groupby([len, key_list]).min() 

Unnamed: 0,Unnamed: 1,A,B,C,D,E
2,one,0.837479,0.817902,0.392246,0.082072,0.413685
3,two,0.857862,0.0665,0.374311,0.751401,0.273828
4,one,0.23958,0.401828,0.541286,0.393427,0.167627


In [70]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,A,B,C,D,E
2,two,0.837479,0.817902,0.392246,0.082072,0.413685
3,one,0.906892,0.483876,0.607795,0.754887,0.275758
3,two,0.857862,0.0665,0.374311,0.751401,0.273828
4,one,0.23958,0.401828,0.541286,0.393427,0.167627


In [73]:
people.groupby(len).describe()

Unnamed: 0_level_0,A,A,A,A,A,A,A,A,B,B,...,D,D,E,E,E,E,E,E,E,E
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
2,1.0,0.837479,,0.837479,0.837479,0.837479,0.837479,0.837479,1.0,0.817902,...,0.082072,0.082072,1.0,0.413685,,0.413685,0.413685,0.413685,0.413685,0.413685
3,2.0,0.882377,0.034669,0.857862,0.87012,0.882377,0.894635,0.906892,2.0,0.275188,...,0.754015,0.754887,2.0,0.274793,0.001364,0.273828,0.274311,0.274793,0.275276,0.275758
4,2.0,0.265547,0.036722,0.23958,0.252563,0.265547,0.27853,0.291513,2.0,0.482663,...,0.393716,0.393812,2.0,0.575957,0.577466,0.167627,0.371792,0.575957,0.780122,0.984287


In [74]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'CN', 'CN'],['1','3','5','1','3']], names=['cty', 'tenor'])
columns

MultiIndex(levels=[[u'CN', u'US'], [u'1', u'3', u'5']],
           codes=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=[u'cty', u'tenor'])

In [75]:
hier_df = DataFrame(np.random.randn(4,5), columns=columns)
hier_df

cty,US,US,US,CN,CN
tenor,1,3,5,1,3
0,-0.573135,0.59656,0.459979,-0.278313,-0.533433
1,1.081157,-0.899238,0.550656,-1.226071,0.067305
2,-0.58755,0.451502,0.554521,1.391345,0.161304
3,1.305748,1.057441,-1.289419,0.744225,-0.082199


In [76]:
hier_df.groupby(level='cty', axis=1).count()

cty,CN,US
0,2,3
1,2,3
2,2,3
3,2,3


In [78]:
hier_df.groupby(level='tenor', axis=1).count()

tenor,1,3,5
0,2,2,1
1,2,2,1
2,2,2,1
3,2,2,1


In [79]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.48027,-0.1183,a,一
1,-0.165531,-1.439285,a,二
2,-0.348254,-1.306511,b,一
3,-2.015902,0.761617,b,二
4,0.365189,1.005399,a,一


### 数据聚合

In [80]:
grouped = df.groupby('key1')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000B753048>

In [81]:
grouped['data1'].quantile(0.9)

key1
a    0.457254
b   -0.515018
Name: data1, dtype: float64

In [82]:
def peek_to_peek(arr):
    return arr.max() -arr.min()

In [83]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.226643,0.344472,-0.165531,0.099829,0.365189,0.42273,0.48027,3.0,-0.184062,1.223668,-1.439285,-0.778793,-0.1183,0.443549,1.005399
b,2.0,-1.182078,1.179205,-2.015902,-1.59899,-1.182078,-0.765166,-0.348254,2.0,-0.272447,1.462387,-1.306511,-0.789479,-0.272447,0.244585,0.761617


In [84]:
grouped.agg(peek_to_peek)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.645801,2.444684
b,1.667648,2.068128


In [87]:
tips = pd.read_csv('G:/dataAnalysis/data/tips.csv')

In [88]:
tips[:10]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [89]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [90]:
tips[:10]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
5,25.29,4.71,Male,No,Sun,Dinner,4,0.18624
6,8.77,2.0,Male,No,Sun,Dinner,2,0.22805
7,26.88,3.12,Male,No,Sun,Dinner,4,0.116071
8,15.04,1.96,Male,No,Sun,Dinner,2,0.130319
9,14.78,3.23,Male,No,Sun,Dinner,2,0.218539


In [91]:
grouped = tips.groupby(['sex', 'smoker'])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000B89C198>

In [93]:
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')     #将函数名直接作为参数传进来

sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64

In [96]:
grouped_pct.agg(['mean', 'std', peek_to_peek])   #传入一组函数名

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peek_to_peek
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


In [97]:
#传入一个（name, func） 格式的元组列表，各元组的第一个元素将会被用作DataFrame的列名
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,0.156921,0.036421
Female,Yes,0.18215,0.071595
Male,No,0.160669,0.041849
Male,Yes,0.152771,0.090588


In [99]:
#对特定的列应用不同的函数
funcs = ['mean', 'count', 'max']
res = grouped['tip_pct', 'tip', 'total_bill'].agg(funcs)
res

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip,tip,tip,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,max,mean,count,max,mean,count,max
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Female,No,0.156921,54,0.252672,2.773519,54,5.2,18.105185,54,35.83
Female,Yes,0.18215,33,0.416667,2.931515,33,6.5,17.977879,33,44.3
Male,No,0.160669,97,0.29199,3.113402,97,9.0,19.791237,97,48.33
Male,Yes,0.152771,60,0.710345,3.051167,60,10.0,22.2845,60,50.81


In [100]:
res['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count,max
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,54,0.252672
Female,Yes,0.18215,33,0.416667
Male,No,0.160669,97,0.29199
Male,Yes,0.152771,60,0.710345


In [102]:
#对不同的列应用不同的函数
grouped.agg({'size':'sum', 'tip':np.max})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,5.2,140
Female,Yes,6.5,74
Male,No,9.0,263
Male,Yes,10.0,150


In [103]:
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
             'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,No,0.056797,0.252672,0.156921,0.036421,140
Female,Yes,0.056433,0.416667,0.18215,0.071595,74
Male,No,0.071804,0.29199,0.160669,0.041849,263
Male,Yes,0.035638,0.710345,0.152771,0.090588,150


In [104]:
tips.groupby(['sex', 'smoker'], as_index=False).mean()

Unnamed: 0,sex,smoker,total_bill,tip,size,tip_pct
0,Female,No,18.105185,2.773519,2.592593,0.156921
1,Female,Yes,17.977879,2.931515,2.242424,0.18215
2,Male,No,19.791237,3.113402,2.71134,0.160669
3,Male,Yes,22.2845,3.051167,2.5,0.152771


In [105]:
tips.groupby(['sex', 'smoker']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,No,18.105185,2.773519,2.592593,0.156921
Female,Yes,17.977879,2.931515,2.242424,0.18215
Male,No,19.791237,3.113402,2.71134,0.160669
Male,Yes,22.2845,3.051167,2.5,0.152771


### 分组运算和转换

In [106]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.48027,-0.1183,a,一
1,-0.165531,-1.439285,a,二
2,-0.348254,-1.306511,b,一
3,-2.015902,0.761617,b,二
4,0.365189,1.005399,a,一


In [107]:
k1_means = df.groupby('key1').mean().add_prefix('mean_')

In [108]:
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.226643,-0.184062
b,-1.182078,-0.272447


In [109]:
key = ['one', 'two', 'one', 'two', 'one']

In [110]:
people

Unnamed: 0,A,B,C,D,E
zhao,0.291513,0.401828,0.932314,0.393812,0.984287
qian,0.23958,0.563498,0.541286,0.393427,0.167627
sun,0.906892,0.483876,0.607795,0.754887,0.275758
lee,0.857862,0.0665,0.374311,0.751401,0.273828
wu,0.837479,0.817902,0.392246,0.082072,0.413685


In [111]:
people.groupby(key).transform(np.mean)

Unnamed: 0,A,B,C,D,E
zhao,0.678628,0.567869,0.644119,0.410257,0.55791
qian,0.548721,0.314999,0.457799,0.572414,0.220728
sun,0.678628,0.567869,0.644119,0.410257,0.55791
lee,0.548721,0.314999,0.457799,0.572414,0.220728
wu,0.678628,0.567869,0.644119,0.410257,0.55791


In [112]:
people.groupby(key).mean()

Unnamed: 0,A,B,C,D,E
one,0.678628,0.567869,0.644119,0.410257,0.55791
two,0.548721,0.314999,0.457799,0.572414,0.220728


In [113]:
people

Unnamed: 0,A,B,C,D,E
zhao,0.291513,0.401828,0.932314,0.393812,0.984287
qian,0.23958,0.563498,0.541286,0.393427,0.167627
sun,0.906892,0.483876,0.607795,0.754887,0.275758
lee,0.857862,0.0665,0.374311,0.751401,0.273828
wu,0.837479,0.817902,0.392246,0.082072,0.413685


In [114]:
def demean(arr):
    return arr - arr.mean()

demeaned = people.groupby(key).transform(demean)
demeaned

Unnamed: 0,A,B,C,D,E
zhao,-0.387115,-0.166041,0.288196,-0.016445,0.426377
qian,-0.309141,0.248499,0.083487,-0.178987,-0.053101
sun,0.228264,-0.083993,-0.036323,0.34463,-0.282152
lee,0.309141,-0.248499,-0.083487,0.178987,0.053101
wu,0.158851,0.250034,-0.251873,-0.328185,-0.144225


In [115]:
demeaned.groupby(key).mean()

Unnamed: 0,A,B,C,D,E
one,0.0,3.700743e-17,-7.401487e-17,-5.5511150000000004e-17,3.700743e-17
two,0.0,0.0,0.0,0.0,0.0


### Apply : 一般性的 “拆分-应用-合并”

In [125]:
# 找出 tips 占比最高的前n列
def top(df, n=5, column='tip_pct'):
    return df.nlargest(n, column)

top(tips, n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525


In [126]:
tips.groupby('smoker').apply(top)           #分组键和原始对象的索引共同构成结果对象中的层次化索引

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525


In [128]:
tips.groupby(['smoker', 'day']).apply(top, n=5, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Fri,91,22.49,3.5,Male,No,Fri,Dinner,2,0.155625
No,Fri,223,15.98,3.0,Female,No,Fri,Lunch,3,0.187735
No,Fri,99,12.46,1.5,Male,No,Fri,Dinner,2,0.120385
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sat,59,48.27,6.73,Male,No,Sat,Dinner,4,0.139424
No,Sat,23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
No,Sat,238,35.83,4.67,Female,No,Sat,Dinner,3,0.130338
No,Sat,39,31.27,5.0,Male,No,Sat,Dinner,3,0.159898
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799


In [130]:
tips.groupby('smoker', group_keys=False).apply(top)    #去掉分组键索引

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
