In [3]:
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so that Restart & Run All rebuilds exactly
# the same example frame (re-run the notebook to refresh the saved outputs).
SEED = 42
np.random.seed(SEED)

# Example data: two string key columns and two columns of standard-normal noise.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})

df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.001919,-1.322323
1,a,two,-0.752714,1.203118
2,b,one,1.566095,-1.625977
3,b,two,0.236824,-0.63799
4,a,one,-0.649708,-1.507464


In [4]:
# Goal: the mean of data1 within each group defined by the labels in key1. One of
# several ways to get it is to select data1 and call groupby on it, passing the
# key1 column (a Series) as the key.
grouped = df['data1'].groupby(df['key1'])

# The data (a Series) is aggregated according to the group key, giving a new Series
# indexed by the unique values of key1. The result index is named 'key1' because
# the DataFrame column df['key1'] carried that name.
grouped.mean()

key1
a   -0.466834
b    0.901460
Name: data1, dtype: float64

In [6]:
# Passing several arrays as a list of keys gives a different kind of result:
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one    -0.323894
      two    -0.752714
b     one     1.566095
      two     0.236824
Name: data1, dtype: float64

In [8]:
# In this case, we grouped the data using two keys, and the resulting Series now has a
# hierarchical index consisting of the unique pairs of keys observed.
# unstack() pivots the inner index level (key2) into columns, leaving key1 as the rows.

means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.323894,-0.752714
b,1.566095,0.236824


In [9]:
# The group keys do not have to be columns of df: here two standalone NumPy arrays,
# each holding one label per row of df, serve as the keys.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

(
    df['data1']
    .groupby([states, years])
    .mean()
)

California  2005   -0.752714
            2006    1.566095
Ohio        2005    0.119372
            2006   -0.649708
Name: data1, dtype: float64

In [10]:
# Frequently the grouping information is found in the same DataFrame as the data
# you want to work on. In that case, you can pass column names as the group keys.
# numeric_only=True makes the exclusion of the non-numeric key2 column explicit:
# older pandas dropped it silently, while pandas 2.0+ raises a TypeError when asked
# to average a string column.
df.groupby(['key1']).mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.466834,-0.542223
b,0.90146,-1.131984


In [11]:
# Group on both key columns; only data1 and data2 remain to be averaged.
df.groupby(['key1', 'key2']).mean()

# You may have noticed that in the first case, df.groupby('key1').mean(), there is no
# key2 column in the result. Because df['key2'] is not numeric data, it is said to be a
# nuisance column, and in the pandas version that produced these outputs it is
# excluded from the result automatically.
# NOTE(review): pandas 2.0+ no longer drops nuisance columns silently; it raises a
# TypeError unless numeric_only=True is passed -- confirm against the pandas version in use.
# All of the numeric columns are aggregated here, though it is possible to filter
# down to a subset, as you'll see soon.

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.323894,-1.414893
a,two,-0.752714,1.203118
b,one,1.566095,-1.625977
b,two,0.236824,-0.63799


In [12]:
# Whatever the objective of a groupby, a generally useful GroupBy method is size,
# which returns a Series containing the number of rows in each group:
df.groupby(['key1', 'key2']).size()


key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [14]:
# Iterating Over Groups
#
# A GroupBy object can be iterated directly; each step yields a 2-tuple holding
# the group name and the matching chunk of the data.
for name, group in df.groupby(df['key1']):
    # One call prints the name and then the chunk, each on its own line.
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.001919 -1.322323
1    a  two -0.752714  1.203118
4    a  one -0.649708 -1.507464
b
  key1 key2     data1     data2
2    b  one  1.566095 -1.625977
3    b  two  0.236824 -0.637990


In [16]:
# With multiple keys, the first element of each yielded tuple is itself a tuple of
# key values, which can be unpacked directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    header = '{} {}'.format(k1, k2)
    print(header, group, sep='\n')


a one
  key1 key2     data1     data2
0    a  one  0.001919 -1.322323
4    a  one -0.649708 -1.507464
a two
  key1 key2     data1     data2
1    a  two -0.752714  1.203118
b one
  key1 key2     data1     data2
2    b  one  1.566095 -1.625977
b two
  key1 key2     data1    data2
3    b  two  0.236824 -0.63799


In [19]:
# Of course, you can choose to do whatever you want with the pieces of data. A recipe
# you may find useful is computing a dict of the data pieces as a one-liner:
pieces = dict(list(df.groupby(df['key1'])))

# Only the last expression of a cell is displayed, so show a single group here.
# (A bare pieces['a'] placed before this line was evaluated and then discarded;
# look it up in its own cell if you want to see it.)
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,1.566095,-1.625977
3,b,two,0.236824,-0.63799


In [20]:
# By default groupby groups on axis=0 (the rows), but the columns can be grouped as
# well. For example, we could group the columns of our example df by dtype.
# df.dtypes is a Series mapping each column name to its dtype; the next cell uses it
# as the grouping key.
# NOTE(review): recent pandas releases deprecate groupby(axis=1) -- confirm against
# the pandas version in use.

df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [21]:
# Group the columns of df by dtype. groupby(..., axis=1) is deprecated in recent
# pandas releases, so group the transposed frame instead: its rows are the original
# columns, and df.dtypes (indexed by column name) aligns with them as the key.
grouped = df.T.groupby(df.dtypes)

# Transposing a mixed-dtype frame turns every value into object dtype, so rebuild
# each piece by selecting the group's column labels from the original df; this
# keeps the original column dtypes in the resulting dict.
{dtype: df[chunk.index] for dtype, chunk in grouped}

{dtype('float64'):       data1     data2
 0  0.001919 -1.322323
 1 -0.752714  1.203118
 2  1.566095 -1.625977
 3  0.236824 -0.637990
 4 -0.649708 -1.507464,
 dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [None]:
# Selecting a Column or Subset of Columns