df.groupby(['list','of','group','columns'])  
df.groupby('single_column')

In [1]:
import pandas as pd
import numpy as np

# Defining an aggregation

In [9]:
# Load the flights dataset and preview its first five rows.
flights = pd.read_csv(filepath_or_buffer='./data/flights.csv')
flights.head(n=5)

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [4]:
# Dictionary form of agg: map the aggregating column to its function.
(
    flights
    .groupby('AIRLINE')
    .agg({'ARR_DELAY': 'mean'})
    .head()
)

Unnamed: 0_level_0,ARR_DELAY
AIRLINE,Unnamed: 1_level_1
AA,5.542661
AS,-0.833333
B6,8.692593
DL,0.339691
EV,7.03458


In [5]:
# Select the aggregating column first, then name the function as a string.
(
    flights
    .groupby('AIRLINE')['ARR_DELAY']
    .agg('mean')
    .head()
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [7]:
# Same aggregation, passing the NumPy function object instead of its name.
(
    flights
    .groupby('AIRLINE')['ARR_DELAY']
    .agg(np.mean)
    .head()
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [8]:
# Same aggregation, calling the method directly instead of going through agg.
(
    flights
    .groupby('AIRLINE')['ARR_DELAY']
    .mean()
    .head()
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

# Grouping and aggregating with multiple columns and functions

> As usual with any kind of grouping operation, it helps to identify the three components: **the grouping columns**, **aggregating columns**, and **aggregating functions**.

In [16]:
# Grouping columns: AIRLINE and WEEKDAY; aggregating column: CANCELLED;
# aggregating function: sum.
(
    flights
    .groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED']
    .agg('sum')
    .head(14)
)

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
         6          21
         7          29
AS       1           0
         2           0
         3           0
         4           0
         5           0
         6           0
         7           0
Name: CANCELLED, dtype: int64

In [18]:
# Apply every function in the list to every aggregating column.
# Several aggregating columns must be selected with a list (double brackets);
# selecting them with a bare tuple of keys, ['CANCELLED', 'DIVERTED'], was
# deprecated and raises an error in pandas 2.0 and later.
(
    flights
    .groupby(['AIRLINE', 'WEEKDAY'])[['CANCELLED', 'DIVERTED']]
    .agg(['sum', 'mean'])
    .head(14)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786
AA,6,21,0.018667,9,0.008
AA,7,29,0.021837,1,0.000753
AS,1,0,0.0,0,0.0
AS,2,0,0.0,0,0.0
AS,3,0,0.0,0,0.0


In [20]:
# Functions to compute for each aggregating column.
agg_dict = dict(
    CANCELLED=['sum', 'mean', 'size'],
    AIR_TIME=['mean', 'var'],
)
# Columns that define the groups.
group_cols = ['ORG_AIR', 'DEST_AIR']

flights.groupby(group_cols).agg(agg_dict)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.000000,31,96.387097,45.778495
ATL,ABQ,0,0.000000,16,170.500000,87.866667
ATL,ABY,0,0.000000,19,28.578947,6.590643
ATL,ACY,0,0.000000,6,91.333333,11.466667
ATL,AEX,0,0.000000,40,78.725000,47.332692
ATL,AGS,0,0.000000,83,28.819277,9.393770
ATL,ALB,0,0.000000,33,108.181818,41.903409
ATL,ANC,0,0.000000,2,438.500000,40.500000
ATL,ASE,0,0.000000,1,192.000000,
ATL,ATW,0,0.000000,10,106.400000,61.377778


There are four main syntaxes:
* Using **agg** with a dictionary is the most flexible and allows you to specify the aggregating function for each column:  

> **df.groupby(['grouping','columns']).agg({
    'agg_cols_1': ['list', 'of', 'functions'],
    'agg_cols_2': ['other','functions']
})**

* Using agg with a list of aggregating functions applies each of the functions to each of the aggregating columns:  

> **df.groupby(['grouping','columns'])[['aggregating', 'columns']].agg(['aggregating', 'functions'])**

* Calling a method directly on the aggregating columns, instead of using agg, applies just that method to each aggregating column. This form does not allow multiple aggregating functions:  

> **df.groupby(['grouping','columns'])[['aggregating', 'columns']].aggregating_method()**

* If you do not specify the aggregating columns, then the aggregating method will be applied to all the non-grouping columns:  

> **df.groupby(['grouping','columns']).aggregating_method()**

# Removing the MultiIndex after grouping