In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [2]:
# Demo frame: two label columns (k1, k2) that serve as grouping keys, plus
# two columns of np.random.randn draws. No seed is set in the cells shown
# here, so the numeric values change on every run.
dframe = pd.DataFrame(
    {
        'k1': ['X', 'X', 'Y', 'Y', 'Z'],
        'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
        'dataset1': np.random.randn(5),
        'dataset2': np.random.randn(5),
    }
)
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.151759,0.837681
1,X,beta,0.909495,-0.190872
2,Y,alpha,0.730644,0.766154
3,Y,beta,-1.346827,1.79985
4,Z,alpha,0.133993,-1.004785


In [3]:
# Group the dataset1 Series, using the k1 column as the key.
# Printing the result only shows the SeriesGroupBy object itself; an
# aggregation has to be called on it to get numbers.
group1 = dframe['dataset1'].groupby(by=dframe['k1'])
print(group1)

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f3043f67550>


In [4]:
# Mean of dataset1 within each k1 group; the result is a Series indexed by k1.
group1.mean()

k1
X    0.378868
Y   -0.308092
Z    0.133993
Name: dataset1, dtype: float64

In [6]:
# Group keys do not have to be columns of dframe: arrays with one entry per
# row work as well. Here every row is tagged with a city and a month.
cities = np.array(['NY', 'LA', 'LA', 'NY', 'NY'])
month = np.array(['JAN', 'FEB', 'JAN', 'FEB', 'JAN'])

# Mean of dataset1 for every (city, month) combination that occurs.
dframe['dataset1'].groupby(by=[cities, month]).mean()

LA  FEB    0.909495
    JAN    0.730644
NY  FEB   -1.346827
    JAN   -0.008883
Name: dataset1, dtype: float64

In [8]:
# Group the whole frame by its k1 column and average every group.
# numeric_only=True restricts the mean to dataset1/dataset2: the remaining
# k2 column holds strings, and pandas >= 2.0 raises TypeError for it instead
# of silently dropping it as older versions did.
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.378868,0.323405
Y,-0.308092,1.283002
Z,0.133993,-1.004785


In [9]:
# Passing a list of column names groups by every distinct (k1, k2) pair;
# the result carries a two-level (k1, k2) index.
dframe.groupby(by=['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.151759,0.837681
X,beta,0.909495,-0.190872
Y,alpha,0.730644,0.766154
Y,beta,-1.346827,1.79985
Z,alpha,0.133993,-1.004785


In [10]:
# Number of rows in each k1 group: X and Y have two rows each, Z has only one.
dframe.groupby(by='k1').size()

k1
X    2
Y    2
Z    1
dtype: int64

In [13]:
# A groupby object is iterable: every step yields the group key together
# with the sub-frame of rows belonging to that key.
for k1_value, rows in dframe.groupby(by='k1'):
    print('this is the %s group' % k1_value)
    print(rows)
    print()  # blank line between groups

this is the X group
  k1     k2  dataset1  dataset2
0  X  alpha -0.151759  0.837681
1  X   beta  0.909495 -0.190872

this is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha  0.730644  0.766154
3  Y   beta -1.346827  1.799850

this is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha  0.133993 -1.004785



In [15]:
# Same iteration, but with two keys: the group label is now a (k1, k2)
# tuple, which is fed straight into the two %s placeholders.
for key_pair, rows in dframe.groupby(by=['k1', 'k2']):
    print('Key1 = %s Key2 = %s' % key_pair)
    print(rows)
    print()  # blank line between groups

Key1 = X Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha -0.151759  0.837681

Key1 = X Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta  0.909495 -0.190872

Key1 = Y Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha  0.730644  0.766154

Key1 = Y Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta -1.346827   1.79985

Key1 = Z Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha  0.133993 -1.004785



In [19]:
# Materialise the groups as a dict that maps each k1 value to its sub-frame,
# then look up the rows for key 'X'.
# (The assignment target was previously misspelled, which left group_dict
# undefined on a fresh kernel.)
group_dict = dict(list(dframe.groupby('k1')))
group_dict['X']

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.151759,0.837681
1,X,beta,0.909495,-0.190872


In [21]:
# Grouping can also run across the columns instead of the rows.
# As an illustration, split the columns by dtype: the result is a dict that
# maps each dtype to the sub-frame holding the columns of that dtype.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so the
# columns are picked with a boolean mask on dframe.dtypes instead.
group_dict_axis1 = {
    dtype: dframe.loc[:, dframe.dtypes == dtype]
    for dtype in dframe.dtypes.unique()
}

# show
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -0.151759  0.837681
 1  0.909495 -0.190872
 2  0.730644  0.766154
 3 -1.346827  1.799850
 4  0.133993 -1.004785,
 dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [24]:
# Column selection works on a groupby object too: group by (k1, k2) but keep
# only dataset2. The double brackets select a list of columns, so the
# aggregated result is a one-column DataFrame.
dataset2_group = dframe.groupby(by=['k1', 'k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,0.837681
X,beta,-0.190872
Y,alpha,0.766154
Y,beta,1.79985
Z,alpha,-1.004785
