# GroupBy 技术

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Toy frame for the groupby examples: two string key columns plus two columns
# of standard-normal noise (unseeded, so the numbers differ on every run).
df = DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two'] * 2 + ['one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))

In [3]:
df

Unnamed: 0,data1,data2,key1,key2
0,-1.131665,-0.31797,a,one
1,-0.747197,0.460761,a,two
2,0.529429,1.420046,b,one
3,1.462588,0.04698,b,two
4,-0.422757,1.604075,a,one


In [4]:
# Group the data1 column by the labels in key1. The result is a SeriesGroupBy
# object (displayed in the next cell), on which aggregations such as mean()
# are then called.
grouped = df['data1'].groupby(df['key1'])

In [5]:
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x10c4b8128>

In [6]:
# Mean of data1 within each key1 group; the result is indexed by key1.
grouped.mean()

key1
a   -0.767207
b    0.996008
Name: data1, dtype: float64

In [7]:
# Passing a list of Series groups by two keys at once; the result is a Series
# with a two-level (key1, key2) index.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [8]:
means

key1  key2
a     one    -0.777211
      two    -0.747197
b     one     0.529429
      two     1.462588
Name: data1, dtype: float64

In [9]:
# Pivot the inner index level (key2) into columns, leaving key1 as the rows.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.777211,-0.747197
b,0.529429,1.462588


In [10]:
states = np.array(['Ohio', 'Clifornia', 'California', 'Ohio', 'Ohio'])

In [11]:
years = np.array([2005, 2005, 2006, 2005, 2006])

In [12]:
df['data1'].groupby([states, years]).mean()

California  2006    0.529429
Clifornia   2005   -0.747197
Ohio        2005    0.165461
            2006   -0.422757
Name: data1, dtype: float64

In [13]:
df

Unnamed: 0,data1,data2,key1,key2
0,-1.131665,-0.31797,a,one
1,-0.747197,0.460761,a,two
2,0.529429,1.420046,b,one
3,1.462588,0.04698,b,two
4,-0.422757,1.604075,a,one


In [14]:
# Mean of every numeric column per key1 group. numeric_only=True skips the
# string column key2; without it pandas >= 2.0 tries to average key2 and
# raises TypeError. The displayed result is unchanged.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.767207,0.582289
b,0.996008,0.733513


In [15]:
# Per-key1 mean of data1, selecting the column first and then grouping it by
# the key1 Series.
df['data1'].groupby(df['key1']).mean()

key1
a   -0.767207
b    0.996008
Name: data1, dtype: float64

In [16]:
# Same numbers, obtained by aggregating the whole frame and selecting data1
# afterwards. numeric_only=True keeps the string column key2 out of the mean,
# which would raise TypeError on pandas >= 2.0.
df.groupby('key1').mean(numeric_only=True)['data1']

key1
a   -0.767207
b    0.996008
Name: data1, dtype: float64

In [17]:
# Group by two column names at once; the result has a (key1, key2) row index.
# numeric_only=True makes the restriction to numeric columns explicit so the
# call behaves the same across pandas versions.
df.groupby(['key1', 'key2']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.777211,0.643053
a,two,-0.747197,0.460761
b,one,0.529429,1.420046
b,two,1.462588,0.04698


In [18]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## 对分组进行迭代

In [19]:
df.groupby('key1')

<pandas.core.groupby.DataFrameGroupBy object at 0x10c487780>

In [20]:
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs.
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -1.131665 -0.317970    a  one
1 -0.747197  0.460761    a  two
4 -0.422757  1.604075    a  one
b
      data1     data2 key1 key2
2  0.529429  1.420046    b  one
3  1.462588  0.046980    b  two


In [21]:
# With several keys the group key is a tuple, unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(k1, k2)
    print(group)

a one
      data1     data2 key1 key2
0 -1.131665 -0.317970    a  one
4 -0.422757  1.604075    a  one
a two
      data1     data2 key1 key2
1 -0.747197  0.460761    a  two
b one
      data1     data2 key1 key2
2  0.529429  1.420046    b  one
b two
      data1    data2 key1 key2
3  1.462588  0.04698    b  two


In [22]:
# Collect the groups into a plain dict: key1 value -> sub-DataFrame for it.
pieces = {key: frame for key, frame in df.groupby('key1')}

In [23]:
pieces

{'a':       data1     data2 key1 key2
 0 -1.131665 -0.317970    a  one
 1 -0.747197  0.460761    a  two
 4 -0.422757  1.604075    a  one, 'b':       data1     data2 key1 key2
 2  0.529429  1.420046    b  one
 3  1.462588  0.046980    b  two}

In [24]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,0.529429,1.420046,b,one
3,1.462588,0.04698,b,two


In [25]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [26]:
# Group the columns (axis=1) by their dtype instead of grouping rows.
# Note: this rebinds `grouped`, which earlier held the data1-by-key1 grouping.
grouped = df.groupby(df.dtypes, axis=1)

In [27]:
# Materialise the (dtype, sub-DataFrame) pairs produced by the grouping.
list(grouped)

[(dtype('float64'),       data1     data2
  0 -1.131665 -0.317970
  1 -0.747197  0.460761
  2  0.529429  1.420046
  3  1.462588  0.046980
  4 -0.422757  1.604075), (dtype('O'),   key1 key2
  0    a  one
  1    a  two
  2    b  one
  3    b  two
  4    a  one)]

In [28]:
# The same pairs as a dict keyed by dtype.
dict(list(grouped))

{dtype('float64'):       data1     data2
 0 -1.131665 -0.317970
 1 -0.747197  0.460761
 2  0.529429  1.420046
 3  1.462588  0.046980
 4 -0.422757  1.604075, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

## 选取一个或一组列

In [29]:
# Indexing the GroupBy with a *list* of column names ([['data2']]) aggregates
# only those columns and gives a DataFrame result.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.643053
a,two,0.460761
b,one,1.420046
b,two,0.04698


In [30]:
# Confirm that the list-indexed form returns a DataFrame.
type(df.groupby(['key1', 'key2'])[['data2']].mean())

pandas.core.frame.DataFrame

In [31]:
# Grouping the data2 Series directly by the two key Series gives the same
# numbers as a Series.
df['data2'].groupby([df['key1'], df['key2']]).mean()

key1  key2
a     one     0.643053
      two     0.460761
b     one     1.420046
      two     0.046980
Name: data2, dtype: float64

In [32]:
# Confirm that the Series-based form returns a Series.
type(df['data2'].groupby([df['key1'], df['key2']]).mean())

pandas.core.series.Series

In [33]:
# Wrapping the Series result in DataFrame() yields the same one-column frame
# as the list-indexed form.
DataFrame(df['data2'].groupby([df['key1'], df['key2']]).mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.643053
a,two,0.460761
b,one,1.420046
b,two,0.04698


In [34]:
# Indexing the GroupBy with a single column name gives a SeriesGroupBy
# (displayed in the next cell).
s_grouped = df.groupby(['key1', 'key2'])['data2']

In [35]:
s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x10c4d8f60>

In [36]:
# Mean of data2 per (key1, key2) group, returned as a Series.
s_grouped.mean()

key1  key2
a     one     0.643053
      two     0.460761
b     one     1.420046
      two     0.046980
Name: data2, dtype: float64

In [37]:
# List selection ([['data2']]) -> DataFrame result, for comparison with the
# single-name selection in the next cell.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.643053
a,two,0.460761
b,one,1.420046
b,two,0.04698


In [38]:
# Single-name selection (['data2']) -> Series result.
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.643053
      two     0.460761
b     one     1.420046
      two     0.046980
Name: data2, dtype: float64

In [39]:
# Convert the Series result into a one-column DataFrame.
DataFrame(df.groupby(['key1', 'key2'])['data2'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.643053
a,two,0.460761
b,one,1.420046
b,two,0.04698


## 通过字典或 Series 进行分组

In [40]:
# 5x5 frame of standard-normal noise (unseeded): rows are labelled by person
# name, columns by the letters a-e.
people = DataFrame(
    np.random.randn(5, 5),
    index='Joe Steve Wes Jim Travis'.split(),
    columns=list('abcde'),
)

In [41]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.678536,0.80117,-1.590546,1.48585,0.905955
Steve,-1.24251,0.431962,0.758675,-1.912489,-0.966027
Wes,0.898017,1.587037,-1.019709,-1.654957,0.878539
Jim,1.909997,-1.239043,0.929766,-1.055194,-0.044571
Travis,0.004724,1.218192,-0.614771,-1.09402,-0.232412


In [43]:
# Blank out columns b and c for the third row (Wes) to introduce missing data.
# The deprecated .ix indexer (removed in pandas 1.0) is replaced by .loc; the
# row is still picked by position 2:3 via people.index, and the columns by label.
people.loc[people.index[2:3], ['b', 'c']] = np.nan

In [44]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.678536,0.80117,-1.590546,1.48585,0.905955
Steve,-1.24251,0.431962,0.758675,-1.912489,-0.966027
Wes,0.898017,,,-1.654957,0.878539
Jim,1.909997,-1.239043,0.929766,-1.055194,-0.044571
Travis,0.004724,1.218192,-0.614771,-1.09402,-0.232412


In [45]:
# Column name -> colour group. 'f' is deliberately included even though
# people has no column of that name.
mapping = dict(a='red', b='red', c='blue',
               d='blue', e='red', f='orange')

In [46]:
# Group the columns of people (axis=1) by the colour each column name maps to.
by_column = people.groupby(mapping, axis=1)

In [48]:
by_column

<pandas.core.groupby.DataFrameGroupBy object at 0x10c6c1f28>

In [51]:
# Row-wise sum within each colour group. 'orange' does not appear because no
# column of people maps to it.
by_column.sum()

Unnamed: 0,blue,red
Joe,-0.104696,2.385661
Steve,-1.153814,-1.776575
Wes,-1.654957,1.776557
Jim,-0.125428,0.626383
Travis,-1.708791,0.990504


In [53]:
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [52]:
# The same column -> colour mapping as a Series indexed by column name.
map_series = Series(mapping)

In [54]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [55]:
# A Series works as a grouping key just like the dict. count() tallies the
# non-NaN values per group, which is why Wes shows 1 and 2.
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## 通过函数进行分组

In [56]:
# A function used as the key is applied to the index labels; len therefore
# groups the rows by the length of each person's name (3, 5, 6).
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,3.48655,-0.437873,-0.66078,-1.224301,1.739923
5,-1.24251,0.431962,0.758675,-1.912489,-0.966027
6,0.004724,1.218192,-0.614771,-1.09402,-0.232412


In [57]:
# One extra label per row of people: the first three rows 'one', the last two 'two'.
key_list = ['one'] * 3 + ['two'] * 2

In [58]:
# Functions and lists can be mixed as keys: rows are grouped by name length
# together with the matching key_list entry, then the column-wise minimum is taken.
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.678536,0.80117,-1.590546,-1.654957,0.878539
3,two,1.909997,-1.239043,0.929766,-1.055194,-0.044571
5,one,-1.24251,0.431962,0.758675,-1.912489,-0.966027
6,two,0.004724,1.218192,-0.614771,-1.09402,-0.232412


## 根据索引级别分组

In [60]:
# Two-level column index: country code ('cty') on the outer level and
# tenor on the inner level.
columns = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['CN'] * 2, [1, 3, 5, 1, 5]],
    names=['cty', 'tenor'],
)

In [62]:
# 4x5 frame of random normals whose columns carry the two-level (cty, tenor) index.
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)

In [65]:
hier_df

cty,US,US,US,CN,CN
tenor,1,3,5,1,5
0,0.962339,0.528003,-0.069246,-0.890043,-0.578803
1,-1.503524,1.639463,0.871461,0.305199,-1.262396
2,-0.519284,0.080873,-1.621404,0.393922,-0.802996
3,1.993114,-0.880089,-0.036966,-0.617675,0.806898


In [67]:
# Group the columns by the 'cty' level of the column MultiIndex and count the
# values per row in each country group (2 for CN, 3 for US).
hier_df.groupby(level='cty', axis=1).count()

cty,CN,US
0,2,3
1,2,3
2,2,3
3,2,3
