# Pandas groupby

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy data: five rows of integers in [0, 10) under columns A, B, C,
# plus a string 'key' column that the rest of the notebook groups on.
# A seeded generator makes the frame identical on every Restart & Run All.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randint(0, 10, (5, 3)), columns=['A', 'B', 'C'])
df['key'] = ['Wood', 'Stone', 'Marble', 'Wood', 'Marble']

In [3]:
# Show the whole frame; it only has five rows, so no .head() is needed.
df

Unnamed: 0,A,B,C,key
0,3,8,6,Wood
1,1,6,5,Stone
2,1,8,3,Marble
3,9,7,5,Wood
4,6,6,9,Marble


In [4]:
# groupby() is lazy: it hands back a GroupBy object and computes nothing
# until an aggregation, filter, transform or apply is requested.
df.groupby(by='key')

<pandas.core.groupby.DataFrameGroupBy object at 0x0000028F5482E550>

In [5]:
# Reductions are available directly on the GroupBy object;
# here every remaining column is summed within each 'key' group.
df.groupby(by='key').sum()

Unnamed: 0_level_0,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Marble,7,14,12
Stone,1,6,5
Wood,12,15,11


In [6]:
# Per-group median of every remaining column.
df.groupby(by='key').median()

Unnamed: 0_level_0,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Marble,3.5,7.0,6.0
Stone,1.0,6.0,5.0
Wood,6.0,7.5,5.5


In [7]:
# Indexing a GroupBy object with a column label narrows it to that column,
# giving a SeriesGroupBy.
df.groupby(by='key')['A']

<pandas.core.groupby.SeriesGroupBy object at 0x0000028F548733C8>

In [8]:
# Sum of column A within each 'key' group; the result is a Series indexed by key.
df.groupby(by='key')['A'].sum()

key
Marble     7
Stone      1
Wood      12
Name: A, dtype: int32

In [9]:
# describe() on a SeriesGroupBy already returns one row per group and one
# column per statistic (count, mean, std, min, quartiles, max) on
# pandas >= 0.20. A trailing .unstack() would flatten that table into a
# long Series, so it is not applied.
df.groupby('key')['A'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Marble,2.0,3.5,3.535534,1.0,2.25,3.5,4.75,6.0
Stone,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
Wood,2.0,6.0,4.24264,3.0,4.5,6.0,7.5,9.0


In [10]:
# aggregate() takes a list of reductions and applies each one to every column,
# producing a two-level column index (column, statistic).
# The reductions are named as strings so pandas uses its own groupby
# implementations; passing callables such as the builtin max or np.std
# raises a FutureWarning on pandas >= 2.1.
df.groupby('key').aggregate(['min', 'median', 'max', 'std'])

Unnamed: 0_level_0,A,A,A,A,B,B,B,B,C,C,C,C
Unnamed: 0_level_1,min,median,max,std,min,median,max,std,min,median,max,std
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Marble,1,3.5,6,3.535534,6,7.0,8,1.414214,3,6.0,9,4.242641
Stone,1,1.0,1,,6,6.0,6,,5,5.0,5,
Wood,3,6.0,9,4.242641,7,7.5,8,0.707107,5,5.5,6,0.707107


It is possible to pass a dictionary mapping column names to operations to be applied to that column: 

In [11]:
# One reduction per column: minimum of A, maximum of B, median of C.
# String names are used instead of the builtin min/max and np.median
# callables, which raise a FutureWarning on pandas >= 2.1.
df.groupby('key').aggregate({'A': 'min', 'B': 'max', 'C': 'median'})

Unnamed: 0_level_0,C,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Marble,6.0,1,8
Stone,5.0,1,6
Wood,5.5,3,8


### Filtering

In [12]:
# A filter function receives one group at a time and returns a single
# True/False saying whether that whole group is kept.
def filter_function(x):
    """Keep a group only when the standard deviation of its 'A' column exceeds 1.

    A group whose standard deviation is NaN compares as False and is dropped.
    """
    spread_of_a = x['A'].std()
    return spread_of_a > 1

In [13]:
# Rows of every group for which filter_function returns True are kept.
df.groupby(by='key').filter(filter_function)

Unnamed: 0,A,B,C,key
0,3,8,6,Wood
2,1,8,3,Marble
3,9,7,5,Wood
4,6,6,9,Marble


In [14]:
def filter_functwo(x):
    """Keep a group only when the total of its 'A' column is below 6."""
    total_of_a = x['A'].sum()
    return total_of_a < 6

In [15]:
# Rows of every group for which filter_functwo returns True are kept.
df.groupby(by='key').filter(filter_functwo)

Unnamed: 0,A,B,C,key
1,1,6,5,Stone


### Transformation

In [16]:
# transform() returns a result with the same shape as the input:
# each value has the mean of its own group subtracted, centring every group on zero.
df.groupby(by='key').transform(lambda col: col - col.mean())

Unnamed: 0,A,B,C
0,-3.0,0.5,0.5
1,0.0,0.0,0.0
2,-2.5,1.0,-3.0
3,3.0,-0.5,-0.5
4,2.5,-1.0,3.0


### Apply()

In [17]:
def normalise_by_B(x):
    """Return a copy of ``x`` with column 'A' divided by the sum of column 'B'.

    Parameters
    ----------
    x : DataFrame
        Frame with numeric columns 'A' and 'B'.

    Returns
    -------
    DataFrame
        A new frame with the same columns in the same order; ``x`` itself
        is left unmodified.
    """
    # assign() builds a new frame rather than writing into the object that
    # was passed in, so the caller's data is never altered as a side effect.
    return x.assign(A=x['A'] / x['B'].sum())

In [18]:
# apply() calls normalise_by_B once per group and stitches the pieces together.
# group_keys=False keeps the original row index on the result; since
# pandas 2.0 the default (True) would prepend the group key as an extra
# index level for like-indexed results such as this one.
df.groupby('key', group_keys=False).apply(normalise_by_B)

Unnamed: 0,A,B,C,key
0,0.2,8,6,Wood
1,0.166667,6,5,Stone
2,0.071429,8,3,Marble
3,0.6,7,5,Wood
4,0.428571,6,9,Marble


The apply method is flexible: the only requirement is that the function takes a DataFrame and returns a pandas object or a scalar.

### Passing lists as keys 

In [19]:
# A list with one label per row can serve as the grouping key directly.
# numeric_only=True restricts the sum to A, B and C; without it pandas >= 2.0
# would also "sum" the string 'key' column by concatenating its values.
df.groupby([1, 2, 2, 3, 3]).sum(numeric_only=True)

Unnamed: 0,A,B,C
1,3,8,6
2,2,14,8
3,15,13,14


In [20]:
# The per-row labels may be strings as well as integers.
# numeric_only=True restricts the sum to A, B and C; without it pandas >= 2.0
# would also "sum" the string 'key' column by concatenating its values.
df.groupby(['a', 'b', 'b', 'c', 'c']).sum(numeric_only=True)

Unnamed: 0,A,B,C
a,3,8,6
b,2,14,8
c,15,13,14


In [21]:
# A dictionary can be used as the grouping key: each index label is looked up
# in the dictionary and rows are grouped by the value it maps to.
# The lookup is against the index, so 'key' is moved there first. The guard
# makes the cell safe to re-run: once 'key' is the index it is no longer a
# column, and an unconditional set_index('key') would raise KeyError.
if 'key' in df.columns:
    df = df.set_index('key')
mapping = {'Wood': 'Cheap', 'Stone': 'Cheap', 'Marble': 'Expensive'}
df.groupby(mapping).sum()

Unnamed: 0,A,B,C
Cheap,13,21,16
Expensive,7,14,12


In [22]:
# Any callable works as a key: it is called on each index label and
# the value it returns names the group. Here labels are lower-cased.
df.groupby(by=str.lower).sum()

Unnamed: 0,A,B,C
marble,7,14,12
stone,1,6,5
wood,12,15,11


In [23]:
# Group rows by the final character of their index label.
df.groupby(by=lambda label: label[-1:]).sum()

Unnamed: 0,A,B,C
d,12,15,11
e,8,20,17


In [24]:
# Several keys may be supplied together in a list; the result is indexed by
# a MultiIndex with one level per key (lower-cased label, then last character).
df.groupby(by=[str.lower, lambda label: label[-1:]]).sum()

Unnamed: 0,Unnamed: 1,A,B,C
marble,e,7,14,12
stone,e,1,6,5
wood,d,12,15,11
