In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# GroupBy on DataFrames

In [2]:
# Demo frame: two string key columns (k1, k2) plus two columns holding five
# standard-normal draws each. No seed is set in this cell, so the numeric
# values change from run to run.
dframe = DataFrame({
    'k1': ['X', 'X', 'Y', 'Y', 'Z'],
    'k2': ['alpha', 'beta'] * 2 + ['alpha'],
    'dataset1': np.random.randn(5),
    'dataset2': np.random.randn(5),
})
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.551346,-0.205045,X,alpha
1,0.066088,-1.344748,X,beta
2,0.828882,1.724347,Y,alpha
3,1.335327,1.42789,Y,beta
4,-2.680459,-1.776427,Z,alpha


In [3]:
# Grab 'dataset1' and group it by the values of 'k1'.
group1 = dframe['dataset1'].groupby(dframe['k1'])
group1  # the repr shows only the GroupBy object, not the grouped rows

<pandas.core.groupby.SeriesGroupBy object at 0x1005343d0>

In [4]:
# Mean of 'dataset1' within each 'k1' group.
group1.mean()

k1
X   -0.242629
Y    1.082104
Z   -2.680459
Name: dataset1, dtype: float64

In [5]:
# External grouping keys: two string arrays with five labels each.
cities = np.array('NY LA LA NY NY'.split())
month = np.array('JAN FEB JAN FEB JAN'.split())

In [6]:
# Take 'dataset1' and group it by the two external key arrays (city, month).
# Rows 0 and 4 both carry the key (NY, JAN), so that entry of the result is
# the mean of those two 'dataset1' values.
dframe['dataset1'].groupby([cities, month]).mean()

LA  FEB    0.066088
    JAN    0.828882
NY  FEB    1.335327
    JAN   -1.615903
Name: dataset1, dtype: float64

In [7]:
# Per-'k1' mean of the numeric columns. The columns are selected explicitly so
# the string column 'k2' is never handed to mean(); relying on it being
# dropped implicitly fails on current pandas releases.
dframe.groupby('k1')[['dataset1', 'dataset2']].mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.242629,-0.774897
Y,1.082104,1.576119
Z,-2.680459,-1.776427


In [8]:
# Group by both key columns and average what is left; the result is indexed
# by the (k1, k2) pairs.
dframe.groupby(['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.551346,-0.205045
X,beta,0.066088,-1.344748
Y,alpha,0.828882,1.724347
Y,beta,1.335327,1.42789
Z,alpha,-2.680459,-1.776427


In [9]:
# Number of rows in each 'k1' group.
dframe.groupby(['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

### 列印各個group裡的資料

In [10]:
# Show the full frame again for reference before iterating over its groups.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.551346,-0.205045,X,alpha
1,0.066088,-1.344748,X,beta
2,0.828882,1.724347,Y,alpha
3,1.335327,1.42789,Y,beta
4,-2.680459,-1.776427,Z,alpha


In [11]:
# Iterating over dframe.groupby('k1') yields (group name, sub-DataFrame)
# pairs; with the single string column 'k1' as key, each name is a str.
# The single-argument print(...) form is used so the cell parses and prints
# the same text under both Python 2 and Python 3.
for name, group in dframe.groupby('k1'):
    print('This is the %s group' % name)
    print(group)
    print('\n')

This is the X group
   dataset1  dataset2 k1     k2
0 -0.551346 -0.205045  X  alpha
1  0.066088 -1.344748  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2  0.828882  1.724347  Y  alpha
3  1.335327  1.427890  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -2.680459 -1.776427  Z  alpha




In [12]:
# With two grouping columns each group name is a (k1, k2) tuple, unpacked
# directly in the loop header.
# The single-argument print(...) form is used so the cell parses and prints
# the same text under both Python 2 and Python 3.
for (k1, k2), group in dframe.groupby(['k1', 'k2']):
    print("Key1=%s Key2=%s" % (k1, k2))
    print(group)
    print('\n')

Key1=X Key2=alpha
   dataset1  dataset2 k1     k2
0 -0.551346 -0.205045  X  alpha


Key1=X Key2=beta
   dataset1  dataset2 k1    k2
1  0.066088 -1.344748  X  beta


Key1=Y Key2=alpha
   dataset1  dataset2 k1     k2
2  0.828882  1.724347  Y  alpha


Key1=Y Key2=beta
   dataset1  dataset2 k1    k2
3  1.335327   1.42789  Y  beta


Key1=Z Key2=alpha
   dataset1  dataset2 k1     k2
4 -2.680459 -1.776427  Z  alpha




In [13]:
# Materialise the groups as a dict keyed by group name; each value is the
# sub-DataFrame holding that group's rows.
group_dict = {name: frame for name, frame in dframe.groupby('k1')}
group_dict

{'X':    dataset1  dataset2 k1     k2
 0 -0.551346 -0.205045  X  alpha
 1  0.066088 -1.344748  X   beta, 'Y':    dataset1  dataset2 k1     k2
 2  0.828882  1.724347  Y  alpha
 3  1.335327  1.427890  Y   beta, 'Z':    dataset1  dataset2 k1     k2
 4 -2.680459 -1.776427  Z  alpha}

In [14]:
# Pull the sub-DataFrame for group 'X' out of the dict.
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.551346,-0.205045,X,alpha
1,0.066088,-1.344748,X,beta


In [15]:
# Group the columns (axis=1) by their dtype and collect the pieces in a dict
# keyed by dtype.
group_dict_axis1 = {
    col_dtype: cols
    for col_dtype, cols in dframe.groupby(dframe.dtypes, axis=1)
}
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -0.551346 -0.205045
 1  0.066088 -1.344748
 2  0.828882  1.724347
 3  1.335327  1.427890
 4 -2.680459 -1.776427, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [16]:
# Look up the object-dtype (string) columns. The builtin `object` is passed to
# np.dtype because the `np.object` alias is no longer available in current
# NumPy releases; both spell the same dtype. The float columns can be fetched
# the same way with np.dtype(np.float64).
group_dict_axis1[np.dtype(object)]

Unnamed: 0,k1,k2
0,X,alpha
1,X,beta
2,Y,alpha
3,Y,beta
4,Z,alpha


In [17]:
# Keep only 'dataset2' after grouping.
# Selecting with a list ([[...]]) gives a DataFrameGroupBy;
# selecting with a bare label ([...]) gives a SeriesGroupBy.
dataset2_group = dframe.groupby(['k1', 'k2'])[['dataset2']]

In [18]:
# Mean of 'dataset2' for every (k1, k2) combination.
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-0.205045
X,beta,-1.344748
Y,alpha,1.724347
Y,beta,1.42789
Z,alpha,-1.776427


# GroupBy Dict and Series

In [19]:
# 4x4 frame holding the integers 0..15: one row per animal, columns W to Z.
animals = DataFrame(
    np.arange(16).reshape(4, -1),
    index=['Dog', 'Cat', 'Bird', 'Mouse'],
    columns=['W', 'X', 'Y', 'Z'],
)
animals

Unnamed: 0,W,X,Y,Z
Dog,0,1,2,3
Cat,4,5,6,7
Bird,8,9,10,11
Mouse,12,13,14,15


In [20]:
# Blank out 'W' and 'Y' for the row at position 1 so the frame contains
# missing values. The row is addressed by its label through .loc, because the
# mixed positional/label .ix indexer is no longer available in current pandas.
animals.loc[animals.index[1], ['W', 'Y']] = np.nan
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [21]:
# Approach 1: map each column label to a behaviour group with a plain dict.
behavior_map = dict(W='good', X='bad', Y='good', Z='bad')

In [22]:
# Group the columns (axis=1) by the label each one maps to in behavior_map.
animal_col = animals.groupby(behavior_map, axis=1)

In [23]:
# Sum across the columns of each behaviour group, row by row.
animal_col.sum()

Unnamed: 0,bad,good
Dog,4.0,2.0
Cat,12.0,
Bird,20.0,18.0
Mouse,28.0,26.0


In [24]:
# Approach 2: express the same column -> group mapping as a Series.
behav_series = Series(behavior_map)
behav_series

W    good
X     bad
Y    good
Z     bad
dtype: object

In [25]:
# Grouping by the Series works like grouping by the dict.
animals.groupby(behav_series, axis=1).sum()  # also try .count()

Unnamed: 0,bad,good
Dog,4.0,2.0
Cat,12.0,
Bird,20.0,18.0
Mouse,28.0,26.0


In [31]:
# Count the non-missing entries per behaviour group in each row.
animals.groupby(behav_series, axis=1).count()

Unnamed: 0,bad,good
Dog,2,2
Cat,2,0
Bird,2,2
Mouse,2,2


In [26]:
# Group rows by a function applied to each index label: here len(), i.e. the
# length of the row name.
animals.groupby(len).sum()

Unnamed: 0,W,X,Y,Z
3,0.0,6,2.0,10
4,8.0,9,10.0,11
5,12.0,13,14.0,15


In [27]:
# Combine two row keys: the length of the index label and an explicit list
# with one entry per row.
keys = ['A', 'A', 'B', 'B']
animals.groupby([len, keys]).max()

Unnamed: 0,Unnamed: 1,W,X,Y,Z
3,A,0.0,5,2.0,7
4,B,8.0,9,10.0,11
5,B,12.0,13,14.0,15


### Groupby with hierarchical index levels
不使用groupby

In [28]:
# Build a two-level (hierarchical) column index: 'City' on the outer level,
# 'sub_value' on the inner level.
hier_col = pd.MultiIndex.from_arrays(
    [
        ['NY'] * 3 + ['SF'] * 2,
        [1, 2, 3, 1, 2],
    ],
    names=['City', 'sub_value'],
)
hier_col

MultiIndex(levels=[[u'NY', u'SF'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]],
           names=[u'City', u'sub_value'])

In [29]:
# Create a 5x5 frame of the integers 0..24 whose columns are labelled by the
# hierarchical index hier_col.
dframe_hr = DataFrame(np.arange(25).reshape(5,5), columns=hier_col)
dframe_hr

City,NY,NY,NY,SF,SF
sub_value,1,2,3,1,2
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [30]:
# Multiply values by 100 for clarity.
# NOTE(review): this rebinds dframe_hr to a scaled version of itself, so
# running the cell a second time scales the values again.
dframe_hr = dframe_hr*100
dframe_hr

City,NY,NY,NY,SF,SF
sub_value,1,2,3,1,2
0,0,100,200,300,400
1,500,600,700,800,900
2,1000,1100,1200,1300,1400
3,1500,1600,1700,1800,1900
4,2000,2100,2200,2300,2400
