Grouping by the DataFrame itself (e.g. by one or more of its columns) builds on grouping by a Series (see the notes in the previous section).

In [3]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [4]:
# Build a small demo frame: two text key columns (k1, k2) and two numeric data columns.
# The random values come from a generator with a fixed seed, so restarting the kernel and
# re-running the notebook reproduces the same numbers instead of drawing a new sample each time.
rng = np.random.RandomState(42)  # arbitrary fixed seed
df = DataFrame({'k1':['X','X','Y','Y','Z'],
                'k2':['alpha','beta','alpha','beta','alpha'],
                'dataset1':rng.randn(5),
                'dataset2':rng.randn(5)},
               columns=['k1','k2','dataset1','dataset2'])

In [5]:
# Display the full 5-row frame so the grouped results below can be checked against it
df

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,1.615926,-0.716651
1,X,beta,-2.090121,-2.950341
2,Y,alpha,0.272635,-0.606586
3,Y,beta,-0.086821,0.724526
4,Z,alpha,-0.157,1.50264


In [6]:
# Select the dataset1 column (a Series) and group it by the k1 column, passed in as a Series key.
# This only builds the groupby object; the per-group values are computed later, e.g. by .mean().
group1 = df['dataset1'].groupby(df['k1'])

In [7]:
# Printing the groupby object shows only its type and memory address, not the grouped data
print(group1)

<pandas.core.groupby.SeriesGroupBy object at 0x000002628E19F278>


In [8]:
# Iterating over the groupby object yields one (group name, values) pair per group:
# the name is the k1 key (e.g. X, Y, Z) and the values are that group's dataset1 Series.
for name, values in group1:
    print(name, values)

X 0    1.615926
1   -2.090121
Name: dataset1, dtype: float64
Y 2    0.272635
3   -0.086821
Name: dataset1, dtype: float64
Z 4   -0.157
Name: dataset1, dtype: float64


In [9]:
# Average dataset1 within each k1 group (X, Y, Z); the result is a Series indexed by the group key
group1.mean()

k1
X   -0.237097
Y    0.092907
Z   -0.157000
Name: dataset1, dtype: float64

In [10]:
# Group keys do not have to be columns of df: arrays work too, as long as they hold
# one entry per row of df (5 here). Build two such arrays to use as keys below.
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['Oct','Jun','Jan','Feb','Sep'])

In [12]:
# Group the rows of df by the external city and month arrays (two keys -> two-level index)
# and average the numeric columns within each (city, month) group.
# numeric_only=True restricts the mean to dataset1 and dataset2. The text columns k1 and k2
# have no mean: older pandas dropped them silently, but pandas 2.0+ raises a TypeError
# unless they are excluded explicitly.
df.groupby([cities,month]).mean(numeric_only=True)

Unnamed: 0,Unnamed: 1,dataset1,dataset2
LA,Jan,0.272635,-0.606586
LA,Jun,-2.090121,-2.950341
NY,Feb,-0.086821,0.724526
NY,Oct,1.615926,-0.716651
NY,Sep,-0.157,1.50264


In [13]:
# Pass a column name as the group key and average the numeric columns within each k1 group.
# numeric_only=True restricts the mean to dataset1 and dataset2. The remaining text column k2
# has no mean: older pandas dropped it silently, but pandas 2.0+ raises a TypeError
# unless it is excluded explicitly.
df.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.237097,-1.833496
Y,0.092907,0.05897
Z,-0.157,1.50264


In [14]:
# Or pass a list of column names to group by several columns at once.
# Both text columns are used as keys here, so only the numeric columns are left to average.
df.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,1.615926,-0.716651
X,beta,-2.090121,-2.950341
Y,alpha,0.272635,-0.606586
Y,beta,-0.086821,0.724526
Z,alpha,-0.157,1.50264


In [15]:
# Another useful groupby method is size(), which returns the number of rows in each group
df.groupby(['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [16]:
# A grouped DataFrame can be iterated too: each step yields the group name
# together with the rows of df that belong to that group.
grouped_by_k1 = df.groupby('k1')
for group_name, group in grouped_by_k1:
    print("This is the %s group" % group_name)
    print(group)
    print('\n')

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha  1.615926 -0.716651
1  X   beta -2.090121 -2.950341


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha  0.272635 -0.606586
3  Y   beta -0.086821  0.724526


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha    -0.157   1.50264




In [17]:
# Equivalent loop without tuple unpacking: each item g is a (group name, group DataFrame) tuple,
# so g[0] is the name and g[1] holds the group's rows
for g in df.groupby('k1'):
    print ("This is the %s group" %g[0])
    print (g[1])
    print ('\n')

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha  1.615926 -0.716651
1  X   beta -2.090121 -2.950341


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha  0.272635 -0.606586
3  Y   beta -0.086821  0.724526


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha    -0.157   1.50264




In [19]:
# Show df again for reference before iterating over groups built from two keys
df

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,1.615926,-0.716651
1,X,beta,-2.090121,-2.950341
2,Y,alpha,0.272635,-0.606586
3,Y,beta,-0.086821,0.724526
4,Z,alpha,-0.157,1.50264


In [20]:
# With several keys, the group name yielded by iteration is a tuple holding one value per key
# (here the k1 value followed by the k2 value), which feeds the two %s placeholders directly.
for keys, group in df.groupby(['k1','k2']):
    print("Key1 = %s Key2 = %s" % keys)
    print(group)
    print('\n')

Key1 = X Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha  1.615926 -0.716651


Key1 = X Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta -2.090121 -2.950341


Key1 = Y Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha  0.272635 -0.606586


Key1 = Y Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta -0.086821  0.724526


Key1 = Z Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha    -0.157   1.50264




In [21]:
# Show df once more for reference before converting the groups to a list and a dictionary
df

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,1.615926,-0.716651
1,X,beta,-2.090121,-2.950341
2,Y,alpha,0.272635,-0.606586
3,Y,beta,-0.086821,0.724526
4,Z,alpha,-0.157,1.50264


In [23]:
# list() materialises the groupby object as a list of tuples, one per group:
# tuple[0] is the group name (e.g. 'X') and tuple[1] is that group's DataFrame
list(df.groupby('k1'))

[('X',   k1     k2  dataset1  dataset2
  0  X  alpha  1.615926 -0.716651
  1  X   beta -2.090121 -2.950341), ('Y',   k1     k2  dataset1  dataset2
  2  Y  alpha  0.272635 -0.606586
  3  Y   beta -0.086821  0.724526), ('Z',   k1     k2  dataset1  dataset2
  4  Z  alpha    -0.157   1.50264)]

In [24]:
# A handy trick: collect the groups in a dictionary keyed by group name,
# so that the DataFrame of any group can be looked up directly by its name.
group_dict = {name: frame for name, frame in df.groupby('k1')}

group_dict

{'X':   k1     k2  dataset1  dataset2
 0  X  alpha  1.615926 -0.716651
 1  X   beta -2.090121 -2.950341, 'Y':   k1     k2  dataset1  dataset2
 2  Y  alpha  0.272635 -0.606586
 3  Y   beta -0.086821  0.724526, 'Z':   k1     k2  dataset1  dataset2
 4  Z  alpha    -0.157   1.50264}

In [25]:
# Look up the rows of group X by its key
group_dict['X']

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,1.615926,-0.716651
1,X,beta,-2.090121,-2.950341


In [26]:
# Same lookup for group Z, which contains a single row
group_dict['Z']

Unnamed: 0,k1,k2,dataset1,dataset2
4,Z,alpha,-0.157,1.50264


In [27]:
# Group the COLUMNS (instead of the rows) by their dtype, giving one sub-DataFrame per dtype.
# groupby(..., axis=1) is deprecated in recent pandas releases (2.1+), so the columns of
# each dtype are selected directly with a boolean column mask instead.
# Keys are sorted by dtype name to give the dictionary a deterministic order.
group_dict_axis1 = {
    dtype: df.loc[:, [col_dtype == dtype for col_dtype in df.dtypes]]
    for dtype in sorted(df.dtypes.unique(), key=str)
}

In [28]:
# One entry per dtype: the float64 (numeric) columns and the object (text) columns
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  1.615926 -0.716651
 1 -2.090121 -2.950341
 2  0.272635 -0.606586
 3 -0.086821  0.724526
 4 -0.157000  1.502640, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}