# 資料聚合和分組

## Data Aggregation and Group Operations Part I

### 分組Group是由分裂-套用-合併 split-apply-combine 一系列的動作而成。

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Demo frame: two label columns (key1, key2) to group on, plus two
# columns of five random standard-normal draws each to aggregate.
df = pd.DataFrame(
    {
        "key1": list("aabba"),
        "key2": ["one", "two", "one", "two", "one"],
        "data1": np.random.randn(5),
        "data2": np.random.randn(5),
    }
)

In [3]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.209082,-1.063916
1,a,two,0.946712,0.605243
2,b,one,-1.098662,0.223535
3,b,two,0.783335,-1.293517
4,a,one,-0.956635,0.492003


### groupby(): 可以用來建立一個 GroupBy 物件；此時尚未進行任何計算，要呼叫 mean() 等彙總方法才會得到結果，如下所示。

In [4]:
# Split the data1 values into groups keyed by the matching key1 label.
grouped = df["data1"].groupby(df["key1"])

In [5]:
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f86958dbd90>

In [6]:
grouped.mean()

key1
a   -0.406335
b   -0.157663
Name: data1, dtype: float64

### groupby(list) 如果有2個以上的key，可以放入list。

In [7]:
# Passing a list of key Series groups on each (key1, key2) combination.
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()

In [8]:
means

key1  key2
a     one    -1.082858
      two     0.946712
b     one    -1.098662
      two     0.783335
Name: data1, dtype: float64

### 2個以上的key值會進行階層式索引，unstack(): 將index轉為column。

In [9]:
means.unstack() 

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.082858,0.946712
b,-1.098662,0.783335


### 分組的內容也可以從其他array輸入

In [10]:
# Group labels held outside the frame; one entry per row of df.
states = np.array("Ohio California California Ohio Ohio".split())

In [11]:
# Second external key array, aligned position-by-position with states.
years = np.array(
    [2005, 2005, 2006, 2005, 2006],
)

In [12]:
# Keys may be plain arrays of the right length rather than columns of df.
df["data1"].groupby([states, years]).mean()

California  2005    0.946712
            2006   -1.098662
Ohio        2005   -0.212873
            2006   -0.956635
Name: data1, dtype: float64

### 如果要分組的資料來自於同一個DataFrame，直接在groupby內寫入欄位名稱即可。

In [13]:
# Mean of every numeric column per key1 group. numeric_only=True keeps the
# string column key2 out of the aggregation; pandas 2.0+ raises TypeError
# otherwise, while older versions dropped such columns silently.
df.groupby("key1").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.406335,0.01111
b,-0.157663,-0.534991


In [14]:
# Group on both key columns at once; the result is indexed by (key1, key2).
df.groupby(by=["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-1.082858,-0.285957
a,two,0.946712,0.605243
b,one,-1.098662,0.223535
b,two,0.783335,-1.293517


### size(): 可以用來計算每組有多少筆資料（列數）。注意：分組鍵（group key）中的遺失值NaN不會被列入結果；但資料欄位中的NaN仍會被算進列數（若要排除資料中的NaN，請改用count()）。

In [15]:
# Number of rows that fall into each (key1, key2) group.
df.groupby(by=["key1", "key2"]).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 疊代分組

In [16]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.209082,-1.063916
1,a,two,0.946712,0.605243
2,b,one,-1.098662,0.223535
3,b,two,0.783335,-1.293517
4,a,one,-0.956635,0.492003


### 如下會顯示groupby()功能運作的原理，會先找到key1內的a,b。

In [17]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; only the key is shown here.
for label, _chunk in df.groupby("key1"):
    print(label)

a
b


### 再依據a,b的順序，把df的資料列分成各組（每組是一個子DataFrame）；這是「分割」而不是排序，組內仍維持原本的列順序。

In [18]:
# Same iteration as before, this time showing the sub-frame that belongs to each key.
for _label, chunk in df.groupby("key1"):
    print(chunk)

  key1 key2     data1     data2
0    a  one -1.209082 -1.063916
1    a  two  0.946712  0.605243
4    a  one -0.956635  0.492003
  key1 key2     data1     data2
2    b  one -1.098662  0.223535
3    b  two  0.783335 -1.293517


### 先找到(k1,k2)，再依據k1,k2做資料重組。

In [19]:
# With two grouping columns every group key is a (key1, key2) tuple.
for keys, chunk in df.groupby(["key1", "key2"]):
    print(*keys)
    print(chunk)

a one
  key1 key2     data1     data2
0    a  one -1.209082 -1.063916
4    a  one -0.956635  0.492003
a two
  key1 key2     data1     data2
1    a  two  0.946712  0.605243
b one
  key1 key2     data1     data2
2    b  one -1.098662  0.223535
b two
  key1 key2     data1     data2
3    b  two  0.783335 -1.293517


### 可以把groupby的物件做list, dict的格式轉換。

In [20]:
# Materialize the groups as a {group key: sub-frame} lookup table.
pieces = {label: chunk for label, chunk in df.groupby("key1")}

In [21]:
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-1.098662,0.223535
3,b,two,0.783335,-1.293517


In [22]:
# Unpack the GroupBy iterator into a list of (group key, sub-frame) tuples.
a = [*df.groupby("key1")]
a

[('a',
    key1 key2     data1     data2
  0    a  one -1.209082 -1.063916
  1    a  two  0.946712  0.605243
  4    a  one -0.956635  0.492003),
 ('b',
    key1 key2     data1     data2
  2    b  one -1.098662  0.223535
  3    b  two  0.783335 -1.293517)]

In [23]:
# Turn the (key, sub-frame) pairs into a dict keyed by group label.
b = {label: chunk for label, chunk in a}
b

{'a':   key1 key2     data1     data2
 0    a  one -1.209082 -1.063916
 1    a  two  0.946712  0.605243
 4    a  one -0.956635  0.492003,
 'b':   key1 key2     data1     data2
 2    b  one -1.098662  0.223535
 3    b  two  0.783335 -1.293517}

In [24]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

### axis=1 以欄為主做資料分群。故float64包含data1,data2 ; object包含key1, key2。

In [25]:
# Group the columns (axis=1) by their dtype, so each group is a sub-frame of same-typed columns.
# NOTE(review): pandas 2.1 deprecated groupby(axis=1) in favor of frame.T.groupby(...);
# migrating would also change how the loop in the next cell has to print each group,
# so confirm the target pandas version before changing this line.
grouped=df.groupby(df.dtypes,axis=1)

In [26]:
# Show each dtype followed by the columns that share it.
for dtype, group in grouped:
    print(dtype, group, sep="\n")

float64
      data1     data2
0 -1.209082 -1.063916
1  0.946712  0.605243
2 -1.098662  0.223535
3  0.783335 -1.293517
4 -0.956635  0.492003
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### 選取一個或多個欄

In [27]:
# Comparing two GroupBy objects with == only tests object identity (always False here),
# so compare what they compute instead: both spellings give the same per-group means.
df.groupby('key1')['data1'].mean().equals(df['data1'].groupby(df['key1']).mean())

True

In [28]:
df.groupby('key1')['data1'].mean()

key1
a   -0.406335
b   -0.157663
Name: data1, dtype: float64

In [29]:
df['data1'].groupby(df['key1']).mean()

key1
a   -0.406335
b   -0.157663
Name: data1, dtype: float64

In [30]:
# == between two GroupBy objects is an identity check and is always False for distinct objects;
# comparing the aggregated frames shows the two spellings are equivalent.
df.groupby('key1')[['data2']].mean().equals(df[['data2']].groupby(df['key1']).mean())

True

In [31]:
df.groupby('key1')[['data2']].mean()

Unnamed: 0_level_0,data2
key1,Unnamed: 1_level_1
a,0.01111
b,-0.534991


In [32]:
df['data2'].groupby(df['key1']).mean()

key1
a    0.011110
b   -0.534991
Name: data2, dtype: float64

### 將data2進行分組平均。

In [33]:
# Double brackets select a list of columns, so the aggregate stays a DataFrame.
df.groupby(by=["key1", "key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.285957
a,two,0.605243
b,one,0.223535
b,two,-1.293517


### 如果用欄位名稱的list或array來選取（例如[['data2']]），得到的是DataFrameGroupBy，彙總結果為DataFrame；如果只用單一欄位名稱來選取（例如['data2']），得到的是SeriesGroupBy，彙總結果為Series。

In [34]:
# Single-bracket selection of one column yields a SeriesGroupBy.
s_grouped = df.groupby(by=["key1", "key2"])["data2"]

In [35]:
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f8695a7c2b0>

In [36]:
s_grouped.mean()

key1  key2
a     one    -0.285957
      two     0.605243
b     one     0.223535
      two    -1.293517
Name: data2, dtype: float64

### 用Dict和Series進行分組

In [37]:
# 5x5 frame of random standard-normal values: rows are people, columns a-e.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=["Joe", "Steve", "Wes", "Jim", "Tres"],
    columns=list("abcde"),
)

In [38]:
# Blank out columns b and c (positions 1 and 2) of the third row to introduce missing values.
people.iloc[2:3, 1:3] = np.nan

In [39]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.027396,-0.258384,0.652083,-1.612787,0.293035
Steve,1.355604,-1.86845,-1.300112,-1.268887,1.936118
Wes,-1.387468,,,-0.668607,-0.914216
Jim,-1.530624,0.457796,0.447499,-1.600257,-0.087642
Tres,-0.68635,1.170687,0.372309,-0.443858,-0.746222


In [40]:
# Column name -> color group. 'f' has no matching column in people.
mapping = dict(a="red", b="red", c="blue", d="blue", e="red", f="orange")

In [41]:
# Group the columns of people by the color each column name maps to.
# NOTE(review): pandas 2.1 deprecated groupby(axis=1); people.T.groupby(mapping) is the
# forward-compatible form, but the later by_column.sum() cell would then need a trailing .T
# to keep its layout — migrate both together.
by_column=people.groupby(mapping, axis=1)

In [42]:
# Group the columns by the color each column name maps to and show every group.
# groupby(axis=1) is deprecated since pandas 2.1, so the frame is transposed,
# grouped along its rows, and each piece is transposed back for display.
for name, group in people.T.groupby(mapping):
    print(name)
    print(group.T)

blue
              c         d
Joe    0.652083 -1.612787
Steve -1.300112 -1.268887
Wes         NaN -0.668607
Jim    0.447499 -1.600257
Tres   0.372309 -0.443858
red
              a         b         e
Joe   -1.027396 -0.258384  0.293035
Steve  1.355604 -1.868450  1.936118
Wes   -1.387468       NaN -0.914216
Jim   -1.530624  0.457796 -0.087642
Tres  -0.686350  1.170687 -0.746222


### dict的key=df的column name(axis=1)
### dict的value會直接變為groupby分組名稱；mapping中多出來、people沒有對應欄位的key（例如'f'）會被忽略，不會出現在結果中。
### sum()預設會略過NaN，所以即使people內有NaN也能正常加總。

In [43]:
by_column.sum()

Unnamed: 0,blue,red
Joe,-0.960705,-0.992746
Steve,-2.568999,1.423272
Wes,-0.668607,-2.301684
Jim,-1.152758,-1.16047
Tres,-0.071548,-0.261884


In [44]:
# Same column -> color mapping, expressed as a Series indexed by column name.
map_series = pd.Series(data=mapping)

In [45]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

### 將mapping從原先的dict改為series，也不會影響groupby的值。

In [46]:
# Count the non-missing values per color group in each row.
# groupby(axis=1) is deprecated since pandas 2.1: transpose, group the rows
# by the Series, then transpose back so the result keeps one row per person.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Tres,2,3


### 用函式分組

### groupby()內也可以放入函式。
### 函式會對每個索引值（這裡是人名）各呼叫一次，回傳值就是分組名稱；len() 在這裡回傳的是名字的字元數。
### 由於groupby()預設axis=0，所以是依列索引來分組，再對各組的列做運算。

In [47]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.027396,-0.258384,0.652083,-1.612787,0.293035
Steve,1.355604,-1.86845,-1.300112,-1.268887,1.936118
Wes,-1.387468,,,-0.668607,-0.914216
Jim,-1.530624,0.457796,0.447499,-1.600257,-0.087642
Tres,-0.68635,1.170687,0.372309,-0.443858,-0.746222


In [48]:
# A callable key is applied to each index label; here rows are grouped by name length.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
3,-3.945489,0.199412,1.099582,-3.881651,-0.708823
4,-0.68635,1.170687,0.372309,-0.443858,-0.746222
5,1.355604,-1.86845,-1.300112,-1.268887,1.936118


In [49]:
# One label per row of people, to be combined with the function key in the next cell.
key_list = ["one"] * 3 + ["two"] * 2

### groupby內放入list，表示有2個分組依據。
### 由於預設值是axis=0，所以key_list直接對應到每一列。

In [50]:
# Mix a function key (name length) with a list key; the result is indexed by both.
people.groupby(by=[len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.387468,-0.258384,0.652083,-1.612787,-0.914216
3,two,-1.530624,0.457796,0.447499,-1.600257,-0.087642
4,two,-0.68635,1.170687,0.372309,-0.443858,-0.746222
5,one,1.355604,-1.86845,-1.300112,-1.268887,1.936118


### 用索引層級分組

In [51]:
# Two-level column index: a country level and a number level.
columns = pd.MultiIndex.from_arrays(
    [["US"] * 3 + ["JP"] * 2, [1, 3, 5, 1, 3]],
    names=["country", "number"],
)

In [52]:
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['country', 'number'])

In [53]:
# 4x5 random frame whose columns carry the two-level (country, number) index.
hier_df = pd.DataFrame(data=np.random.randn(4, 5), columns=columns)

In [54]:
hier_df

country,US,US,US,JP,JP
number,1,3,5,1,3
0,0.38904,0.584932,1.046823,-1.077229,0.609573
1,-0.424024,-0.799183,-0.764295,-0.843841,1.67226
2,0.318307,-1.267935,-0.495333,1.061411,-1.118211
3,-0.530639,1.337145,-1.118584,1.334281,0.560864


In [55]:
# Count, per row, how many columns fall under each country level.
# groupby(axis=1) is deprecated since pandas 2.1: transpose so the column
# MultiIndex becomes the row index, group on its 'country' level, transpose back.
hier_df.T.groupby(level="country").count().T

country,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
