In [18]:
import seaborn as sns
# Load seaborn's bundled 'planets' example dataset; the cells below treat
# the result as a pandas DataFrame (head / groupby).
planets = sns.load_dataset('planets')


In [19]:
import pandas as pd
import numpy as np


In [20]:
# Preview the first rows to see the available columns.
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [21]:
# Iterating a GroupBy yields (key, sub-frame) pairs; print one line per
# detection method with the shape of its sub-frame, name padded to 30 chars.
for method_name, method_rows in planets.groupby('method'):
    print(f"{method_name:30s} shape = {method_rows.shape}")

Astrometry                     shape = (2, 6)
Eclipse Timing Variations      shape = (9, 6)
Imaging                        shape = (38, 6)
Microlensing                   shape = (23, 6)
Orbital Brightness Modulation  shape = (3, 6)
Pulsar Timing                  shape = (5, 6)
Pulsation Timing Variations    shape = (1, 6)
Radial Velocity                shape = (553, 6)
Transit                        shape = (397, 6)
Transit Timing Variations      shape = (4, 6)


In [22]:
# Summary statistics of 'year' for each method, then unstacked into a single
# Series indexed by (statistic, method).
(
    planets
    .groupby('method')['year']
    .describe()
    .unstack()
)

       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64

In [23]:
# Small demo frame for the groupby examples: three keys repeated twice,
# a running counter, and seeded random integers drawn from [0, 10).
rng = np.random.RandomState(0)
df = pd.DataFrame(
    dict(
        key=list('ABCABC'),
        data1=np.arange(6),
        data2=rng.randint(0, 10, size=6),
    ),
    columns=['key', 'data1', 'data2'],
)

In [24]:
# GroupBy operations:
#   aggregate() takes a string, a function, or a list thereof
#   filter() takes a function
#   transform()
#   apply()
def norm_by_data2(x):
    """Return a copy of ``x`` with ``data1`` divided by the sum of ``data2``.

    Intended for use with ``DataFrame.groupby(...).apply``, where ``x`` is the
    sub-frame for one group.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing at least the columns ``'data1'`` and ``'data2'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``x``; ``data1`` holds
        the normalized values. If ``data2`` sums to zero the division
        produces ``inf``/``NaN`` rather than raising.
    """
    # Work on a copy: mutating the frame handed in by groupby.apply is unsafe
    # and would leak side effects to the caller's data.
    normalized = x.copy()
    normalized['data1'] = normalized['data1'] / normalized['data2'].sum()
    return normalized
    
# Run norm_by_data2 on each 'key' group of df and display the combined result.
df.groupby('key').apply(norm_by_data2)


  df.groupby('key').apply(norm_by_data2)


Unnamed: 0_level_0,Unnamed: 1_level_0,key,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0,A,0.0,5
A,3,A,0.375,3
B,1,B,0.142857,0
B,4,B,0.571429,7
C,2,C,0.166667,3
C,5,C,0.416667,9


In [25]:
# Build a label Series such as '2000s' from each row's year: floor the year
# to its decade, stringify, append 's', and name the Series 'decade' so it
# can serve as a grouping key.
decade = ((10 * (planets['year'] // 10)).astype(str) + 's').rename('decade')

In [26]:
# Sum of 'number' per (method, decade), pivoted so decades become columns;
# method/decade combinations with no rows are shown as 0 instead of NaN.
(
    planets
    .groupby(['method', decade])['number']
    .sum()
    .unstack()
    .fillna(0)
)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0
