In [1]:
import pandas as pd
import numpy as np

In [2]:
# Group operations are described by the following process
# (known as "split-apply-combine"):
# 1. key, data
# 2. split
# 3. apply
# 4. combine
# groupby carries out these steps.
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,data1,data2,key1,key2
0,-0.5449,-0.471997,a,one
1,-1.322584,0.529195,a,two
2,-1.836574,0.384372,b,one
3,-1.484546,1.291436,b,two
4,0.304821,-0.126411,a,one


In [3]:
# Group data1 by key1, passing the key column itself as a Series.
# This runs the process only up to step 2 (split): the result is a GroupBy
# object, not an aggregated value.
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f27889416a0>

In [4]:
# Calling an aggregation such as mean() runs the process through step 4
# (apply and combine).
grouped.mean()

key1
a   -0.520888
b   -1.660560
Name: data1, dtype: float64

In [5]:
# Run the group operation with two keys (key1 and key2), each passed as a Series.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one    -0.120039
      two    -1.322584
b     one    -1.836574
      two    -1.484546
Name: data1, dtype: float64

In [6]:
# unstack() pivots an index level into columns (here key2 becomes the columns).
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.120039,-1.322584
b,-1.836574,-1.484546


In [7]:
# groupby also accepts arrays of the same length as the data; the rows are
# grouped according to the values in the arrays passed.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

California  2005   -1.322584
            2006   -1.836574
Ohio        2005   -1.014723
            2006    0.304821
Name: data1, dtype: float64

In [8]:
# A column name (string) can be given as the grouping key.
# numeric_only=True restricts the mean to the numeric columns: the remaining
# non-key column key2 holds strings, and pandas >= 2.0 raises TypeError for it
# instead of silently dropping it as older versions did.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.520888,-0.023071
b,-1.66056,0.837904


In [9]:
# A list of column names can also be given to group by several keys.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.120039,-0.299204
a,two,-1.322584,0.529195
b,one,-1.836574,0.384372
b,two,-1.484546,1.291436


In [10]:
# size() counts the number of rows in each group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [11]:
# A GroupBy object is iterable: each iteration yields the group name and the
# rows belonging to that group.
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -0.544900 -0.471997    a  one
1 -1.322584  0.529195    a  two
4  0.304821 -0.126411    a  one
b
      data1     data2 key1 key2
2 -1.836574  0.384372    b  one
3 -1.484546  1.291436    b  two


In [12]:
# When grouping by several keys, each group name is a tuple of the key values,
# unpacked here as (k1, k2).
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(k1, k2)
    print(group)

a one
      data1     data2 key1 key2
0 -0.544900 -0.471997    a  one
4  0.304821 -0.126411    a  one
a two
      data1     data2 key1 key2
1 -1.322584  0.529195    a  two
b one
      data1     data2 key1 key2
2 -1.836574  0.384372    b  one
b two
      data1     data2 key1 key2
3 -1.484546  1.291436    b  two


In [13]:
# Prepare a sample dataset: 5 people x 5 columns of random values.
people = pd.DataFrame(np.random.randn(5, 5),
                     columns=['a', 'b', 'c', 'd', 'e'],
                     index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Inject missing values into columns b and c of the third row ('Wes').
# The index holds names, so an integer slice is not a valid .loc label slice
# (current pandas raises TypeError); pick the row by position via people.index.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,1.880981,0.607738,0.105643,0.190023,1.31829
Steve,-0.020335,0.958504,0.808935,0.937205,0.328506
Wes,0.422935,,,-0.820019,0.476678
Jim,0.919037,0.376603,0.9034,0.547168,-0.517137
Travis,-2.026556,0.829632,-0.8435,1.025287,0.604142


In [14]:
# A dict can also be used as the grouping key.
# Here the columns of people are grouped together by the colour they map to;
# the extra key 'f' matches no column and does not appear in the result.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the rows of the
# transposed frame instead and transpose the aggregated result back.
# NOTE: by_column is therefore a GroupBy over people.T (columns become rows).
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,0.295666,3.807009
Steve,1.746139,1.266674
Wes,-0.820019,0.899613
Jim,1.450568,0.778504
Travis,0.181788,-0.592782


In [15]:
# Convert the mapping dict into a Series (index: dict keys, values: group names).
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [16]:
# A pd.Series can be used as the grouping key as well.
# The columns are grouped by transposing, grouping the rows, and transposing
# back, because groupby(..., axis=1) is deprecated since pandas 2.1.
# count() reports the number of non-NA values per group for each person.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [17]:
# A function can be used as the grouping key too.
# Here len is applied to the person names used as the index of people,
# so rows are grouped by name length.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,3.222953,0.984341,1.009043,-0.082828,1.277831
5,-0.020335,0.958504,0.808935,0.937205,0.328506
6,-2.026556,0.829632,-0.8435,1.025287,0.604142


In [18]:
# A function key can be combined with array keys in the same list.
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.422935,0.607738,0.105643,-0.820019,0.476678
3,two,0.919037,0.376603,0.9034,0.547168,-0.517137
5,one,-0.020335,0.958504,0.808935,0.937205,0.328506
6,two,-2.026556,0.829632,-0.8435,1.025287,0.604142


In [19]:
# pd.MultiIndex builds a hierarchical (multi-level) index:
# one array per level, with a name for each level.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)
columns

MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
           labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=['city', 'tenor'])

In [20]:
# Build a 4 x 5 frame of random values that uses the `columns` MultiIndex
# as its column labels.
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.39873,0.149884,-0.582494,0.060863,0.316979
1,-0.27968,-1.28187,-1.579368,2.313284,0.419654
2,1.164734,0.847995,-0.22787,0.434497,0.400231
3,-0.787115,0.209232,0.833277,0.152645,0.829361


In [21]:
# Group the columns by the 'city' level of the column MultiIndex and count the
# values in each group per row. groupby(..., axis=1) is deprecated since
# pandas 2.1, so transpose, group the rows by level, and transpose back.
hier_df.T.groupby(level='city').count().T

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
