In [2]:
import numpy as np
import pandas as pd

# Grouping Overview

In [3]:
df = pd.DataFrame([('bird', 'Falconiformes', 389.0),
   ...:                    ('bird', 'Psittaciformes', 24.0),
   ...:                    ('mammal', 'Carnivora', 80.2),
   ...:                    ('mammal', 'Primates', np.nan),
   ...:                    ('mammal', 'Carnivora', 58)],
   ...:                   index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'],
   ...:                   columns=('class', 'order', 'max_speed'))

df

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


Groupby doesn't actually group until needed, initially only creates mapping

In [4]:
grouped=df.groupby('class')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017CBC673048>

Once the functions are invoked on groups grouping will take place

In [5]:
grouped.sum()

Unnamed: 0_level_0,max_speed
class,Unnamed: 1_level_1
bird,413.0
mammal,138.2


In [6]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
   ...:                          'foo', 'bar', 'foo', 'foo'],
   ...:                    'B': ['one', 'one', 'two', 'three',
   ...:                          'two', 'two', 'one', 'three'],
   ...:                    'C': np.random.randn(8),
   ...:                    'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.287035,-0.149983
1,bar,one,-0.669688,0.431451
2,foo,two,-0.562985,-0.819954
3,bar,three,-0.808934,1.706345
4,foo,two,-1.05618,-1.099547
5,bar,two,-0.82821,-1.045286
6,foo,one,-0.839219,-0.279954
7,foo,three,0.245065,-1.31497


In [7]:
grouped=df.groupby(['A','B'])

In [8]:
grouped.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.669688,0.431451
bar,three,-0.808934,1.706345
bar,two,-0.82821,-1.045286
foo,one,-2.126255,-0.429937
foo,three,0.245065,-1.31497
foo,two,-1.619165,-1.919502


Grouping can be done in many ways.  We can group by rows (index) instead of columns

In [9]:
df2=df.set_index(['A','B'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,-1.287035,-0.149983
bar,one,-0.669688,0.431451
foo,two,-0.562985,-0.819954
bar,three,-0.808934,1.706345
foo,two,-1.05618,-1.099547
bar,two,-0.82821,-1.045286
foo,one,-0.839219,-0.279954
foo,three,0.245065,-1.31497


In [10]:
display('Level 1',df2.groupby(level=1).sum(), 'Level 0',df2.groupby(level=0).sum())

'Level 1'

Unnamed: 0_level_0,C,D
B,Unnamed: 1_level_1,Unnamed: 2_level_1
one,-2.795942,0.001514
three,-0.563868,0.391375
two,-2.447376,-2.964788


'Level 0'

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-2.306832,1.09251
foo,-3.500354,-3.664408


In [11]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.287035,-0.149983
1,bar,one,-0.669688,0.431451
2,foo,two,-0.562985,-0.819954
3,bar,three,-0.808934,1.706345
4,foo,two,-1.05618,-1.099547
5,bar,two,-0.82821,-1.045286
6,foo,one,-0.839219,-0.279954
7,foo,three,0.245065,-1.31497


Grouping with functions

In [12]:
def get_letter_type(letter):
   ....:     if letter.lower() in 'aeiou':
   ....:         return 'vowel'
   ....:     else:
   ....:         return 'consonant'

grouped = df.groupby(get_letter_type, axis=1)
grouped.sum()

Unnamed: 0,consonant,vowel
0,-1.437018,foo
1,-0.238237,bar
2,-1.38294,foo
3,0.897411,bar
4,-2.155728,foo
5,-1.873496,bar
6,-1.119174,foo
7,-1.069904,foo


In [13]:
df[['C','D']].sum(axis=1)

0   -1.437018
1   -0.238237
2   -1.382940
3    0.897411
4   -2.155728
5   -1.873496
6   -1.119174
7   -1.069904
dtype: float64

Accessing Groups

In [14]:
df.groupby('A').groups

{'bar': Int64Index([1, 3, 5], dtype='int64'),
 'foo': Int64Index([0, 2, 4, 6, 7], dtype='int64')}

In [15]:
grouped.groups

{'consonant': Index(['B', 'C', 'D'], dtype='object'),
 'vowel': Index(['A'], dtype='object')}

In [16]:
len(grouped)

2

# MultiIndex Gruping

In [17]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
   ....:           ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
s = pd.Series(np.random.randn(8), index=index)
s

first  second
bar    one       0.146448
       two       0.272812
baz    one      -1.570817
       two      -0.094236
foo    one       1.484639
       two       0.586301
qux    one      -1.182386
       two       0.308330
dtype: float64

In [18]:
grouped=s.groupby(level=0)
grouped.sum()

first
bar    0.419261
baz   -1.665053
foo    2.070940
qux   -0.874056
dtype: float64

In [19]:
s.groupby(level=1).sum()

second
one   -1.122116
two    1.073208
dtype: float64

Can access levels by level names

In [20]:
s.groupby(level='second').sum()

second
one   -1.122116
two    1.073208
dtype: float64

Can group by a combination of rows and columns with pd.Grouper

In [21]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
   ....:           ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

index=pd.MultiIndex.from_arrays(arrays,names=['first','second'])

df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3],
   ....:                    'B': np.arange(8)},
   ....:                   index=index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [22]:
df.groupby([pd.Grouper(level=1),'A']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


Or can reference levels and columns as is

In [23]:
df.groupby(['second','A']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


Selecting columns is easy after grouping

In [24]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [25]:
grouped_A=df.groupby(level='first')['A']
grouped.sum()

first
bar    0.419261
baz   -1.665053
foo    2.070940
qux   -0.874056
dtype: float64

You can select a group if desired to look at certain groupings

In [26]:
grouped.get_group('bar')

first  second
bar    one       0.146448
       two       0.272812
dtype: float64

# Aggregation

Aggregation functions can be applied with agg API call

In [27]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [28]:
df.reset_index(inplace=True)
df

Unnamed: 0,first,second,A,B
0,bar,one,1,0
1,bar,two,1,1
2,baz,one,1,2
3,baz,two,1,3
4,foo,one,2,4
5,foo,two,2,5
6,qux,one,3,6
7,qux,two,3,7


In [29]:
grouped=df.groupby('first')
grouped.agg(np.sum)

Unnamed: 0_level_0,A,B
first,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,2,1
baz,2,5
foo,4,9
qux,6,13


Columns don't have to be set as index after grouping

In [30]:
grouped=df.groupby('first',as_index=False)
grouped.sum()

Unnamed: 0,first,A,B
0,bar,2,1
1,baz,2,5
2,foo,4,9
3,qux,6,13


similar results can be achieved as following

In [31]:
df.groupby('first').sum().reset_index()

Unnamed: 0,first,A,B
0,bar,2,1
1,baz,2,5
2,foo,4,9
3,qux,6,13


# Applying multiple functions at once

In [32]:
grouped=df.groupby('first')
grouped.agg([np.sum, 'describe'])

Unnamed: 0_level_0,second,second,second,second,second,A,A,A,A,A,A,A,B,B,B,B,B,B,B,B,B
Unnamed: 0_level_1,sum,describe,describe,describe,describe,sum,describe,describe,describe,describe,describe,describe,sum,describe,describe,describe,describe,describe,describe,describe,describe
Unnamed: 0_level_2,second,count,unique,top,freq,A,count,mean,std,min,...,max,B,count,mean,std,min,25%,50%,75%,max
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
bar,onetwo,2,2,two,1,2,2.0,1.0,0.0,1.0,...,1.0,1,2.0,0.5,0.707107,0.0,0.25,0.5,0.75,1.0
baz,onetwo,2,2,two,1,2,2.0,1.0,0.0,1.0,...,1.0,5,2.0,2.5,0.707107,2.0,2.25,2.5,2.75,3.0
foo,onetwo,2,2,two,1,4,2.0,2.0,0.0,2.0,...,2.0,9,2.0,4.5,0.707107,4.0,4.25,4.5,4.75,5.0
qux,onetwo,2,2,two,1,6,2.0,3.0,0.0,3.0,...,3.0,13,2.0,6.5,0.707107,6.0,6.25,6.5,6.75,7.0


In [33]:
grouped.agg([np.sum,np.mean])

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,sum,mean,sum,mean
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,2,1,1,0.5
baz,2,1,5,2.5
foo,4,2,9,4.5
qux,6,3,13,6.5


Resulting column names are named after functions, that can be changed with Named Functions or simply renaming the columns

In [34]:
df.groupby('first').agg([np.mean,np.sum]).rename({'mean':'New Mean','sum':'New Sum'},axis=1)

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,New Mean,New Sum,New Mean,New Sum
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1,2,0.5,1
baz,1,2,2.5,5
foo,2,4,4.5,9
qux,3,6,6.5,13


Similarly using different rename arguements

In [35]:
df.groupby('first').agg([np.mean,np.sum]).rename(columns={'mean':'New Mean','sum':'New Sum'})

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,New Mean,New Sum,New Mean,New Sum
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1,2,0.5,1
baz,1,2,2.5,5
foo,2,4,4.5,9
qux,3,6,6.5,13


User defined functions can also be applied to aggregate

In [36]:
df

Unnamed: 0,first,second,A,B
0,bar,one,1,0
1,bar,two,1,1
2,baz,one,1,2
3,baz,two,1,3
4,foo,one,2,4
5,foo,two,2,5
6,qux,one,3,6
7,qux,two,3,7


In [37]:
np.random.seed(100)
df['A']=np.random.randint(10,size=(df.shape[0],1))
df

Unnamed: 0,first,second,A,B
0,bar,one,8,0
1,bar,two,8,1
2,baz,one,3,2
3,baz,two,7,3
4,foo,one,7,4
5,foo,two,0,5
6,qux,one,4,6
7,qux,two,2,7


When grouping and applying aggregate functions each column is passed as series

In [38]:
df.groupby('second').agg([lambda x: print(x)])

Series([], Name: first, dtype: object)
0    bar
2    baz
4    foo
6    qux
Name: first, dtype: object
1    bar
3    baz
5    foo
7    qux
Name: first, dtype: object
Series([], Name: A, dtype: int32)
0    8
2    3
4    7
6    4
Name: A, dtype: int32
1    8
3    7
5    0
7    2
Name: A, dtype: int32
Series([], Name: B, dtype: int32)
0    0
2    2
4    4
6    6
Name: B, dtype: int32
1    1
3    3
5    5
7    7
Name: B, dtype: int32


Unnamed: 0_level_0,first,A,B
Unnamed: 0_level_1,<lambda>,<lambda>,<lambda>
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
one,,,
two,,,


In [39]:
df.groupby('second').agg([lambda x: x.max() - x.min(),lambda x: x.mean(), lambda x: x.mean()-x.max()])

Unnamed: 0_level_0,A,A,A,B,B,B
Unnamed: 0_level_1,<lambda_0>,<lambda_1>,<lambda_2>,<lambda_0>,<lambda_1>,<lambda_2>
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
one,5,5.5,-2.5,6,3,-3
two,8,4.25,-3.75,6,4,-3


Column names as assigned based on function name

In [40]:
animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
   ....:                         'height': [9.1, 6.0, 9.5, 34.0],
   ....:                         'weight': [7.9, 7.5, 9.9, 198.0]})

animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [41]:
animals.groupby('kind').agg(min_h=pd.NamedAgg(column='height',aggfunc='min'),
                            max_h=pd.NamedAgg(column='height',aggfunc='max'))

Unnamed: 0_level_0,min_h,max_h
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.1,9.5
dog,6.0,34.0


pd.NamedAgg is a named tuple, similarly we can pass a tuple with function name and col name to achieve the same results

In [42]:
animals.groupby('kind').agg(min_h=('height','min'),max_h=('height','max'))

Unnamed: 0_level_0,min_h,max_h
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.1,9.5
dog,6.0,34.0


Similarly can pass a tuple with name of the column and custom function

In [43]:
animals.groupby('kind').agg(spread=('height',lambda x: x.max()-x.min()))

Unnamed: 0_level_0,spread
kind,Unnamed: 1_level_1
cat,0.4
dog,28.0


Different functions can be applied to difference columns at once as well

In [44]:
animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [45]:
animals.groupby('kind').agg({'height':np.sum,'weight':"mean"})

Unnamed: 0_level_0,height,weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,18.6,8.9
dog,40.0,102.75


# Transformation

Transformation is similar to agg, but the resulting table is the same size as the original table

In [46]:
df

Unnamed: 0,first,second,A,B
0,bar,one,8,0
1,bar,two,8,1
2,baz,one,3,2
3,baz,two,7,3
4,foo,one,7,4
5,foo,two,0,5
6,qux,one,4,6
7,qux,two,2,7


In [47]:
df.groupby('first').sum()

Unnamed: 0_level_0,A,B
first,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,16,1
baz,10,5
foo,7,9
qux,6,13


In [48]:
df.groupby('first').transform(lambda x: x.sum())

Unnamed: 0,second,A,B
0,onetwo,16,1
1,onetwo,16,1
2,onetwo,10,5
3,onetwo,10,5
4,onetwo,7,9
5,onetwo,7,9
6,onetwo,6,13
7,onetwo,6,13


It's possible to filter the data based on group properties

In [49]:
dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')})
dff

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,b
5,5,b
6,6,c
7,7,c


In [50]:
dff.groupby('B').filter(lambda x: len(x)>2)

Unnamed: 0,A,B
2,2,b
3,3,b
4,4,b
5,5,b


The function has to be applicable across the entire slice, including column you're grouping by.

The following will throw and error because column B isn't an integer

In [52]:
dff.groupby('B').filter(lambda x: x.sum()>2)

TypeError: '>' not supported between instances of 'str' and 'int'

Since function input for filter function isn't a series but a slice of DF this can be fixed with specifying column

In [54]:
dff.groupby('B').filter(lambda x: x['A'].sum()>2)

Unnamed: 0,A,B
2,2,b
3,3,b
4,4,b
5,5,b
6,6,c
7,7,c


Similar results can be achieved with .loc but returned values will be grouped before returning unlike filter

In [55]:
dff.groupby('B').sum().loc[lambda df: df['A']>2,:]

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
b,14
c,13


# Apply

Some operations on the grouped data might not fit into either the aggregate or transform categories. Or, you may simply want GroupBy to infer how to combine the results. For these, use the apply function, which can be substituted for both aggregate and transform in many standard use cases. However, apply can handle some exceptional use cases, for example:

In [56]:
df

Unnamed: 0,first,second,A,B
0,bar,one,8,0
1,bar,two,8,1
2,baz,one,3,2
3,baz,two,7,3
4,foo,one,7,4
5,foo,two,0,5
6,qux,one,4,6
7,qux,two,2,7


In [57]:
df.set_axis(list('ABCD'), axis=1, inplace=True)
df

Unnamed: 0,A,B,C,D
0,bar,one,8,0
1,bar,two,8,1
2,baz,one,3,2
3,baz,two,7,3
4,foo,one,7,4
5,foo,two,0,5
6,qux,one,4,6
7,qux,two,2,7


Note that apply functions take in a slice of df, while agg takes in each column separately

In [58]:
df.groupby('A').apply(lambda x: print(x))

     A    B  C  D
0  bar  one  8  0
1  bar  two  8  1
     A    B  C  D
2  baz  one  3  2
3  baz  two  7  3
     A    B  C  D
4  foo  one  7  4
5  foo  two  0  5
     A    B  C  D
6  qux  one  4  6
7  qux  two  2  7


In [59]:
df.head(3).groupby('A').agg(lambda x: print(x))

Series([], Name: B, dtype: object)
0    one
1    two
Name: B, dtype: object
2    one
Name: B, dtype: object
Series([], Name: C, dtype: int32)
0    8
1    8
Name: C, dtype: int32
2    3
Name: C, dtype: int32
Series([], Name: D, dtype: int32)
0    0
1    1
Name: D, dtype: int32
2    2
Name: D, dtype: int32


Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,,,
baz,,,


In [60]:
df

Unnamed: 0,A,B,C,D
0,bar,one,8,0
1,bar,two,8,1
2,baz,one,3,2
3,baz,two,7,3
4,foo,one,7,4
5,foo,two,0,5
6,qux,one,4,6
7,qux,two,2,7


In [61]:
df[list('ACD')].groupby('A').apply(lambda x: x.sum())

Unnamed: 0_level_0,A,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,barbar,16,1
baz,bazbaz,10,5
foo,foofoo,7,9
qux,quxqux,6,13


In [62]:
df[list('ACD')].groupby('A').agg(lambda x: x.sum())

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,16,1
baz,10,5
foo,7,9
qux,6,13


Note the differences between agg and apply results.  Apply will perform function on all of the columns, similar to filter after groupby

Apply can even return a new table

In [63]:
df

Unnamed: 0,A,B,C,D
0,bar,one,8,0
1,bar,two,8,1
2,baz,one,3,2
3,baz,two,7,3
4,foo,one,7,4
5,foo,two,0,5
6,qux,one,4,6
7,qux,two,2,7


In [64]:
grouped=df.groupby('A')['C']

def f(group):
    return pd.DataFrame({'original':group, 'mean': group.mean()})

grouped.apply(f)

Unnamed: 0,original,mean
0,8,8.0
1,8,8.0
2,3,5.0
3,7,5.0
4,7,3.5
5,0,3.5
6,4,3.0
7,2,3.0


In [65]:
def f(x):
    return pd.Series([x, x ** 2], index=['x', 'x^2'])

s = pd.Series(np.random.rand(5))

s

0    0.186467
1    0.210108
2    0.452740
3    0.870143
4    0.063681
dtype: float64

The series will get upcasted to the DF

In [66]:
s.apply(f)

Unnamed: 0,x,x^2
0,0.186467,0.03477
1,0.210108,0.044145
2,0.45274,0.204973
3,0.870143,0.757148
4,0.063681,0.004055
