In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Build a small demo frame: two categorical key columns (k1, k2) and two
# numeric columns of standard-normal draws to aggregate over.
# The draws come from a seeded generator so the numbers are identical on
# every Restart & Run All.
rng = np.random.default_rng(0)
dframe = DataFrame({'k1': ['X', 'X', 'Y', 'Y', 'Z'],
                    'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                    'dataset1': rng.standard_normal(5),
                    'dataset2': rng.standard_normal(5)})
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.900769,0.109646
1,X,beta,1.687715,-0.211419
2,Y,alpha,0.019073,0.623248
3,Y,beta,1.504111,1.011077
4,Z,alpha,-1.545625,-0.455573


In [5]:
# Group the dataset1 values by the labels in the k1 column. The cell evaluates
# to the SeriesGroupBy object itself; aggregation happens in a later cell.
values = dframe['dataset1']
k1_labels = dframe['k1']
group1 = values.groupby(k1_labels)
group1


<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001468DA461D0>

In [6]:
# Mean of dataset1 within each k1 group.
k1_means = group1.mean()
k1_means

k1
X    0.393473
Y    0.761592
Z   -1.545625
Name: dataset1, dtype: float64

In [7]:
# Group keys do not have to be columns of the frame: arrays holding one label
# per row can be used as keys as well.
city_labels = ['NY', 'LA', 'LA', 'NY', 'NY']
month_labels = ['JAN', 'FEB', 'JAN', 'FEB', 'JAN']

cities = np.array(city_labels)
month = np.array(month_labels)

In [8]:
# Mean of dataset1 for every (city, month) combination. The two arrays are
# passed together as a list of keys, which yields a two-level result index.
external_keys = [cities, month]
dframe['dataset1'].groupby(external_keys).mean()

LA  FEB    1.687715
    JAN    0.019073
NY  FEB    1.504111
    JAN   -1.223197
Name: dataset1, dtype: float64

In [27]:
# Display the full frame again for reference.
dframe


Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.900769,0.109646
1,X,beta,1.687715,-0.211419
2,Y,alpha,0.019073,0.623248
3,Y,beta,1.504111,1.011077
4,Z,alpha,-1.545625,-0.455573


In [10]:
# Pass a column name as the group key.
# numeric_only=True restricts the mean to dataset1/dataset2. Without it,
# pandas >= 2.0 also tries to average the string column k2 and raises a
# TypeError; older releases silently dropped that column instead.

dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.393473,-0.050886
Y,0.761592,0.817163
Z,-1.545625,-0.455573


In [11]:
# Group on the (k1, k2) pair and average the remaining columns.
key_columns = ['k1', 'k2']
dframe.groupby(key_columns).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.900769,0.109646
X,beta,1.687715,-0.211419
Y,alpha,0.019073,0.623248
Y,beta,1.504111,1.011077
Z,alpha,-1.545625,-0.455573


In [12]:
# size() reports the number of rows in each group, much like value_counts().

rows_per_k1 = dframe.groupby(['k1']).size()
rows_per_k1

k1
X    2
Y    2
Z    1
dtype: int64

In [19]:
# Iterating over a GroupBy yields (group label, sub-frame) pairs.

by_k1 = dframe.groupby('k1')
for label, members in by_k1:
    print("This is the %s group" % label)
    print(members)

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha -0.900769  0.109646
1  X   beta  1.687715 -0.211419
This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha  0.019073  0.623248
3  Y   beta  1.504111  1.011077
This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha -1.545625 -0.455573


In [21]:
# With several keys, each group label is a tuple holding one entry per key.

by_key_pair = dframe.groupby(['k1', 'k2'])
for keys, members in by_key_pair:
    first_key, second_key = keys
    print("Key1 = %s, Key2 = %s" % (first_key, second_key))
    print(members)


Key1 = X, Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha -0.900769  0.109646
Key1 = X, Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta  1.687715 -0.211419
Key1 = Y, Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha  0.019073  0.623248
Key1 = Y, Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta  1.504111  1.011077
Key1 = Z, Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha -1.545625 -0.455573


In [24]:
# Materialise the groups as a plain dict mapping each k1 label to its sub-frame.

group_dict = {label: members for label, members in dframe.groupby('k1')}
group_dict


{'X':   k1     k2  dataset1  dataset2
 0  X  alpha -0.900769  0.109646
 1  X   beta  1.687715 -0.211419,
 'Y':   k1     k2  dataset1  dataset2
 2  Y  alpha  0.019073  0.623248
 3  Y   beta  1.504111  1.011077,
 'Z':   k1     k2  dataset1  dataset2
 4  Z  alpha -1.545625 -0.455573}

In [25]:
# Look up the sub-frame stored under the 'X' key.
x_rows = group_dict['X']
x_rows

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.900769,0.109646
1,X,beta,1.687715,-0.211419


In [28]:
# Split the columns (rather than the rows) by dtype: {dtype -> sub-frame}.
# groupby(..., axis=1) is deprecated since pandas 2.1, so the columns of each
# dtype are selected explicitly instead. Sorting the dtypes by their string
# form keeps float64 ahead of object, the key order the groupby version gave.

column_dtypes = dframe.dtypes
group_dict_axis1 = {
    dtype: dframe[[col for col, col_dtype in column_dtypes.items()
                   if col_dtype == dtype]]
    for dtype in sorted(set(column_dtypes), key=str)
}
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -0.900769  0.109646
 1  1.687715 -0.211419
 2  0.019073  0.623248
 3  1.504111  1.011077
 4 -1.545625 -0.455573,
 dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [29]:
# Index the GroupBy with a list of column names so that only those columns are
# aggregated; here only dataset2 is averaged per (k1, k2) pair.

pair_groups = dframe.groupby(['k1', 'k2'])
dataset2_group = pair_groups[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,0.109646
X,beta,-0.211419
Y,alpha,0.623248
Y,beta,1.011077
Z,alpha,-0.455573


In [30]:
# Display the frame once more.
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-0.900769,0.109646
1,X,beta,1.687715,-0.211419
2,Y,alpha,0.019073,0.623248
3,Y,beta,1.504111,1.011077
4,Z,alpha,-1.545625,-0.455573
