In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed NumPy's global RNG so the random columns are identical on every
# Restart & Run All.  Outputs recorded before the seed was added will differ
# until the notebook is re-executed.
np.random.seed(12345)

# Demo frame: two string key columns plus two columns of standard-normal draws.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})
df

Unnamed: 0,data1,data2,key1,key2
0,1.645841,-0.262437,a,one
1,0.964498,1.586054,a,two
2,0.217608,0.893267,b,one
3,0.053414,-1.63335,b,two
4,0.669832,-1.989426,a,one


In [3]:
# Split the data1 values by the labels found in key1.  This only builds the
# GroupBy object; no aggregate is computed in this cell.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f5047446390>

In [4]:
# Reduce each key1 group of data1 to its mean; the result is indexed by the group labels.
grouped.mean()

key1
a    1.093390
b    0.135511
Name: data1, dtype: float64

In [5]:
# Passing two key Series groups on every observed (key1, key2) pair, so the
# resulting means carry a two-level row index.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one     1.157836
      two     0.964498
b     one     0.217608
      two     0.053414
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns, giving a key1-by-key2 table of means.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.157836,0.964498
b,0.217608,0.053414


In [7]:
# Group keys do not have to be columns of the frame: here two free-standing
# arrays, one entry per row of df, supply the labels.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
(
    df['data1']
    .groupby([states, years])
    .mean()
)

California  2005    0.964498
            2006    0.217608
Ohio        2005    0.849627
            2006    0.669832
Name: data1, dtype: float64

In [8]:
# Mean of the numeric columns for each key1 group.  The numeric columns are
# selected explicitly: key2 holds strings, and relying on groupby to drop it
# silently fails on pandas 2.x, where mean() no longer skips non-numeric
# columns by default.
df.groupby('key1')[['data1', 'data2']].mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.09339,-0.221936
b,0.135511,-0.370041


In [9]:
# Group by both key columns; the result carries a two-level (key1, key2) row index.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1.157836,-1.125931
a,two,0.964498,1.586054
b,one,0.217608,0.893267
b,two,0.053414,-1.63335


In [10]:
# size() reports how many rows fall into each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [11]:
# Iterating a GroupBy yields (group label, sub-frame) pairs.  Each label is
# printed on its own line, followed by the rows belonging to it.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
      data1     data2 key1 key2
0  1.645841 -0.262437    a  one
1  0.964498  1.586054    a  two
4  0.669832 -1.989426    a  one
b
      data1     data2 key1 key2
2  0.217608  0.893267    b  one
3  0.053414 -1.633350    b  two


In [12]:
# With several keys the group label is a tuple, unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print('{} {}'.format(k1, k2), group, sep='\n')

a one
      data1     data2 key1 key2
0  1.645841 -0.262437    a  one
4  0.669832 -1.989426    a  one
a two
      data1     data2 key1 key2
1  0.964498  1.586054    a  two
b one
      data1     data2 key1 key2
2  0.217608  0.893267    b  one
b two
      data1    data2 key1 key2
3  0.053414 -1.63335    b  two


In [13]:
# Materialise the groups as a plain dict keyed by key1 value so that a single
# group can be looked up directly.
pieces = {label: rows for label, rows in df.groupby('key1')}
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,0.217608,0.893267,b,one
3,0.053414,-1.63335,b,two


In [14]:
# Per-column dtypes: the data columns are float64, the key columns are object.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [15]:
# Split the columns of df by dtype.  groupby(..., axis=1) is deprecated in
# current pandas, so the frame is transposed and grouped along its rows.
# Transposing a mixed-dtype frame yields object columns, so each piece is
# flipped back and cast to the dtype it was grouped under.
grouped = df.T.groupby(df.dtypes)
{dtype: piece.T.astype(dtype) for dtype, piece in grouped}

{dtype('float64'):       data1     data2
 0  1.645841 -0.262437
 1  0.964498  1.586054
 2  0.217608  0.893267
 3  0.053414 -1.633350
 4  0.669832 -1.989426, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [16]:
# Indexing the GroupBy with a list of column names restricts the aggregation
# to those columns and keeps the result a DataFrame.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-1.125931
a,two,1.586054
b,one,0.893267
b,two,-1.63335


In [17]:
# Indexing with a single column name (not a list) yields a grouped Series,
# so the aggregate below is a Series named data2.
s_groupby = df.groupby(['key1', 'key2'])['data2']
s_groupby.mean()

key1  key2
a     one    -1.125931
      two     1.586054
b     one     0.893267
      two    -1.633350
Name: data2, dtype: float64

In [18]:
# 5x5 frame of standard-normal draws with named rows, used by the mapping and
# function-based grouping examples.
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Blank out b and c for the third row.  The index holds string labels, so an
# integer slice passed to .loc is not a valid label slice (current pandas
# raises TypeError).  Slice the index positionally and pass the resulting
# label to .loc instead.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.586358,-1.009562,2.175293,0.214429,-1.133004
Steve,0.041974,-0.276482,-0.992938,-1.373828,0.622313
Wes,1.767687,,,-0.675155,0.191531
Jim,1.433809,-0.666006,-1.898428,-0.231877,-0.65956
Travis,-1.147788,1.136415,0.116793,-0.288277,1.02555


In [19]:
# Column -> colour lookup.  'f' has no matching column in people and is simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
# Sum the columns that share a colour.  groupby(..., axis=1) is deprecated in
# current pandas, so the frame is transposed, grouped along its rows, and the
# aggregate is transposed back to one row per person.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,2.389722,-2.728924
Steve,-2.366766,0.387805
Wes,-0.675155,1.959219
Jim,-2.130304,0.108243
Travis,-0.171485,1.014177


In [20]:
# The same column -> colour lookup as a Series: the dict keys become the index
# and the colours become the values.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [21]:
# Count the non-missing values per colour group for each person.
# groupby(..., axis=1) is deprecated in current pandas, so group the rows of
# the transposed frame and transpose the counts back.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [23]:
# A function passed as the key is called on each index label; here the rows
# are grouped by the length of the person's name.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,2.615139,-1.675567,0.276866,-0.692603,-1.601033
5,0.041974,-0.276482,-0.992938,-1.373828,0.622313
6,-1.147788,1.136415,0.116793,-0.288277,1.02555


In [24]:
# Functions and plain lists can be mixed as keys: rows are grouped by name
# length and by the matching entry of key_list (one entry per row).
key_list = ['one'] * 3 + ['two'] * 2
(
    people
    .groupby([len, key_list])
    .min()
)

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.586358,-1.009562,2.175293,-0.675155,-1.133004
3,two,1.433809,-0.666006,-1.898428,-0.231877,-0.65956
5,one,0.041974,-0.276482,-0.992938,-1.373828,0.622313
6,two,-1.147788,1.136415,0.116793,-0.288277,1.02555


In [26]:
# Two-level column index: outer level 'city', inner level 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)
# Four rows of standard-normal draws laid out under that column hierarchy.
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.297253,0.131586,-0.61004,-0.439291,-1.760348
1,-0.758424,1.331428,1.950107,-0.07685,1.435789
2,-0.723882,0.691902,-0.718217,-0.766972,0.313093
3,0.971588,-2.363349,0.537432,-0.759252,-0.816794


In [27]:
# Count the columns under each 'city' value of the column MultiIndex.
# groupby(..., axis=1) is deprecated in current pandas, so group the rows of
# the transposed frame by that level and transpose the counts back.
hier_df.T.groupby(level='city').count().T

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
