In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

  import pandas.util.testing as tm


In [2]:
# Load seaborn's "tips" example dataset into a DataFrame.
df = sns.load_dataset('tips')
# Preview five random rows. A fixed random_state keeps the preview identical
# across kernel restarts instead of changing on every run.
df.sample(5, random_state=42)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
55,19.49,3.51,Male,No,Sun,Dinner,2
27,12.69,2.0,Male,No,Sat,Dinner,2
143,27.05,5.0,Female,No,Thur,Lunch,6
172,7.25,5.15,Male,Yes,Sun,Dinner,2
140,17.47,3.5,Female,No,Thur,Lunch,2


In [3]:
# Per-sex mean of the numeric columns only. `numeric_only=True` is needed on
# pandas >= 2.0, where the reduction no longer silently drops non-numeric
# columns (smoker, day, time) and raises a TypeError instead.
df.groupby('sex').mean(numeric_only=True)

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,20.744076,3.089618,2.630573
Female,18.056897,2.833448,2.45977


In [4]:
# Tip expressed as a fraction of the total bill, stored as a new column.
tip_ratio = df['tip'].div(df['total_bill'])
df['prot_tip'] = tip_ratio
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,prot_tip
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587


In [5]:
# Mean and median of the tip ratio for each sex; the dict maps a column name
# to the list of reducers applied to it.
ratio_reducers = {'prot_tip': ['mean', 'median']}
df.groupby('sex').aggregate(ratio_reducers)

Unnamed: 0_level_0,prot_tip,prot_tip
Unnamed: 0_level_1,mean,median
sex,Unnamed: 1_level_2,Unnamed: 2_level_2
Male,0.157651,0.153492
Female,0.166491,0.155581


In [6]:
# Summary statistics (count, mean, std, quartiles, ...) of the tip ratio per sex.
ratio_by_sex = df.groupby('sex')[['prot_tip']]
ratio_by_sex.describe()

Unnamed: 0_level_0,prot_tip,prot_tip,prot_tip,prot_tip,prot_tip,prot_tip,prot_tip,prot_tip
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Male,157.0,0.157651,0.064778,0.035638,0.121389,0.153492,0.18624,0.710345
Female,87.0,0.166491,0.053632,0.056433,0.140416,0.155581,0.194266,0.416667


In [7]:
def mean_eur2usd(x, rate=1.12):
    """Return the mean of ``x`` multiplied by a conversion rate.

    Parameters
    ----------
    x : array-like, pandas.Series or pandas.DataFrame
        Values to average.
    rate : float, default 1.12
        Multiplier applied to the mean (the EUR -> USD rate used in this
        notebook).

    Returns
    -------
    scalar or pandas.Series
        A scalar for 1-D input; for a DataFrame, one converted mean per column.
    """
    if isinstance(x, pd.DataFrame):
        # np.mean(DataFrame) forwards axis=None, which pandas >= 2.0 treats as
        # "reduce over both axes" (a single scalar). Pin the per-column
        # reduction so group-wise apply keeps returning one value per column.
        return x.mean(axis=0) * rate
    return np.mean(x) * rate

# Run the custom reducer on the selected columns of each sex group.
cols_by_sex = df.groupby('sex')[['total_bill', 'prot_tip']]
cols_by_sex.apply(mean_eur2usd)

Unnamed: 0_level_0,total_bill,prot_tip
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,23.233366,0.176569
Female,20.223724,0.18647


In [8]:
# Column -> list of reducers. Built-in reductions are referenced by name:
# passing the Python builtins min/max or np.mean is deprecated in
# pandas >= 2.1 (they are currently mapped to the pandas reducers anyway), and
# the string form yields the same 'min' / 'max' / 'mean' output labels.
dict_agg = {
    'tip': ['min', 'max'],
    'total_bill': ['mean', mean_eur2usd],
}

# .agg is an alias of .aggregate
df.groupby(['sex', 'time']).agg(dict_agg)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,mean_eur2usd
sex,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Male,Lunch,1.44,6.7,18.048485,20.214303
Male,Dinner,1.0,10.0,21.461452,24.036826
Female,Lunch,1.25,5.17,16.339143,18.29984
Female,Dinner,1.0,6.5,19.213077,21.518646


In [9]:
def f_filter(x):
    """Group predicate: True when the converted mean of ``total_bill`` exceeds 20.

    ``x`` is indexed with ``'total_bill'`` and the result is passed to
    ``mean_eur2usd``.
    """
    converted_mean_bill = mean_eur2usd(x['total_bill'])
    return converted_mean_bill > 20

# Keep only the rows belonging to (sex, time) groups that pass f_filter,
# then show the first five of them.
rows_in_kept_groups = df.groupby(['sex', 'time']).filter(f_filter)
rows_in_kept_groups.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,prot_tip
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


## Pivot
---

In [10]:
# Mean total bill per (sex, time), flattened back to ordinary columns.
df_gp = (
    df.groupby(['sex', 'time'])[['total_bill']]
    .mean()
    .reset_index()
)
df_gp

Unnamed: 0,sex,time,total_bill
0,Male,Lunch,18.048485
1,Male,Dinner,21.461452
2,Female,Lunch,16.339143
3,Female,Dinner,19.213077


In [11]:
# Each (sex, time) pair appears exactly once in df_gp, so every pivot cell
# holds a unique value and no real aggregation takes place.
pivot_spec = {'index': 'sex', 'columns': 'time', 'values': 'total_bill'}
df_gp.pivot_table(**pivot_spec)

time,Lunch,Dinner
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,18.048485,21.461452
Female,16.339143,19.213077


In [13]:
# When several rows share the same index/column pair, aggfunc combines them.
# Reducers are given by name: passing np.std is deprecated in pandas >= 2.1
# (it is currently mapped to the pandas 'std' reducer, so 'std' keeps the
# same values and the same 'std' column label).
df_pivot = df.pivot_table(index='sex', columns='time', values='total_bill', aggfunc=['median', 'std'])
df_pivot

Unnamed: 0_level_0,median,median,std,std
time,Lunch,Dinner,Lunch,Dinner
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Male,16.58,19.63,7.953435,9.460974
Female,13.42,17.19,7.500803,8.202085


In [21]:
# Convert the wide pivot to long form: one row per (aggfunc, time, sex) value.
pivot_long = df_pivot.unstack()
pivot_long.reset_index()

Unnamed: 0,level_0,time,sex,0
0,median,Lunch,Male,16.58
1,median,Lunch,Female,13.42
2,median,Dinner,Male,19.63
3,median,Dinner,Female,17.19
4,std,Lunch,Male,7.953435
5,std,Lunch,Female,7.500803
6,std,Dinner,Male,9.460974
7,std,Dinner,Female,8.202085
