![jupyter](./data/ch6/Pandas_groupby1.png)

In [1]:
import pandas as pd 
import numpy as np

In [2]:
#groupby
df=pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],
                 'data1':np.random.randn(5),'data2':np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.944903,-0.169174
1,a,two,0.717081,-0.685001
2,b,one,-0.17331,0.082423
3,b,two,-0.968154,0.662044
4,a,one,-0.084131,1.350971


In [8]:
grouped=df['data1'].groupby(df['key1']) # 輸出為一個groupby物件
print(grouped.mean()) # 每組平均

key1
a   -0.103984
b   -0.570732
Name: data1, dtype: float64


In [10]:
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one    -0.514517
      two     0.717081
b     one    -0.173310
      two    -0.968154
Name: data1, dtype: float64

In [11]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.514517,0.717081
b,-0.17331,-0.968154


In [13]:
# 可以用兩個相同長度的陣列做groupby
states=['ohio','texas','texas','ohio','ohio']
average_income=[13699,23658,45218,15632,33256]
df['data1'].groupby([states,average_income]).mean()

ohio   13699   -0.944903
       15632   -0.968154
       33256   -0.084131
texas  23658    0.717081
       45218   -0.173310
Name: data1, dtype: float64

In [15]:
# 傳入dataframe的欄位名稱當作分組的key
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.103984,0.165599
b,-0.570732,0.372234


In [17]:
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.514517,0.590899
a,two,0.717081,-0.685001
b,one,-0.17331,0.082423
b,two,-0.968154,0.662044


In [19]:
# 計算分組的大小
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [20]:
# 疊代分組(使用單一key分組)
for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one -0.944903 -0.169174
1    a  two  0.717081 -0.685001
4    a  one -0.084131  1.350971
b
  key1 key2     data1     data2
2    b  one -0.173310  0.082423
3    b  two -0.968154  0.662044


In [23]:
# 使用多個key做分組，要用tuple傳入
for (k1,k2),group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)


('a', 'one')
  key1 key2     data1     data2
0    a  one -0.944903 -0.169174
4    a  one -0.084131  1.350971
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.717081 -0.685001
('b', 'one')
  key1 key2    data1     data2
2    b  one -0.17331  0.082423
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.968154  0.662044


In [32]:
# axis用來為超過一維的數組定義的屬性，二維數據擁有兩個axis：axis=0沿著行的垂直往下，axis=1沿著列的方向水平延伸
grouped=df.groupby(df.dtypes,axis=1)
for dtype,group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.944903 -0.169174
1  0.717081 -0.685001
2 -0.173310  0.082423
3 -0.968154  0.662044
4 -0.084131  1.350971
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [38]:
# 選取一個或多個欄
# 只把data1做分組平均數的計算,return dataframe
df.groupby(['key1','key2'])[['data1']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key1,key2,Unnamed: 2_level_1
a,one,-0.514517
a,two,0.717081
b,one,-0.17331
b,two,-0.968154


In [39]:
# return series
df.groupby(['key1','key2'])['data2'].mean()

key1  key2
a     one     0.590899
      two    -0.685001
b     one     0.082423
      two     0.662044
Name: data2, dtype: float64

In [49]:
# 用dict,series分組
players=pd.DataFrame(np.random.randn(5,5),columns=list('abcde'),index=['james','kobe','curry','jokic','green'])
players

Unnamed: 0,a,b,c,d,e
james,-0.604764,0.635977,-0.204906,-1.504221,0.059712
kobe,-0.897232,0.585009,-0.696538,-0.86012,1.118853
curry,1.508082,-0.023367,0.253561,-1.722062,0.047908
jokic,-1.902776,0.551617,0.560112,1.344095,0.677907
green,1.756939,-1.085372,-0.390131,0.318596,1.2471


In [51]:
mapping={'a':'PG','b':'SG','c':'SF','d':'PF','e':'C'}
players.groupby(mapping,axis=1).sum()

Unnamed: 0,C,PF,PG,SF,SG
james,0.059712,-1.504221,-0.604764,-0.204906,0.635977
kobe,1.118853,-0.86012,-0.897232,-0.696538,0.585009
curry,0.047908,-1.722062,1.508082,0.253561,-0.023367
jokic,0.677907,1.344095,-1.902776,0.560112,0.551617
green,1.2471,0.318596,1.756939,-0.390131,-1.085372


In [56]:
map_series=pd.Series(mapping)
players.groupby(map_series,axis=1).sum()

Unnamed: 0,C,PF,PG,SF,SG
james,0.059712,-1.504221,-0.604764,-0.204906,0.635977
kobe,1.118853,-0.86012,-0.897232,-0.696538,0.585009
curry,0.047908,-1.722062,1.508082,0.253561,-0.023367
jokic,0.677907,1.344095,-1.902776,0.560112,0.551617
green,1.2471,0.318596,1.756939,-0.390131,-1.085372


In [57]:
# 用function分組
# 用名字(index)長度做分組
players.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
4,-0.897232,0.585009,-0.696538,-0.86012,1.118853
5,0.757481,0.078855,0.218636,-1.563594,2.032626


In [70]:
# 混合list,dict,series
key_list=['james','kobe','curry','jokic','green']
players.groupby([len,key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
4,kobe,-0.897232,0.585009,-0.696538,-0.86012,1.118853
5,curry,1.508082,-0.023367,0.253561,-1.722062,0.047908
5,green,1.756939,-1.085372,-0.390131,0.318596,1.2471
5,james,-0.604764,0.635977,-0.204906,-1.504221,0.059712
5,jokic,-1.902776,0.551617,0.560112,1.344095,0.677907


In [76]:
# 用索引層級分組
columns=pd.MultiIndex.from_arrays([['us','us','us','jp','jp'],[19,23,98,57,65]],names=['country','age'])
columns

MultiIndex(levels=[['jp', 'us'], [19, 23, 57, 65, 98]],
           codes=[[1, 1, 1, 0, 0], [0, 1, 4, 2, 3]],
           names=['country', 'age'])

In [99]:
h_df=pd.DataFrame(np.random.randn(4,5),columns=columns)
h_df.iloc[1:3,[1,2]]=np.nan
h_df

country,us,us,us,jp,jp
age,19,23,98,57,65
0,0.713485,1.019068,-1.28133,1.715216,0.416657
1,-0.367662,,,-0.205146,1.291427
2,-1.219214,,,0.20495,-0.597382
3,-0.186802,-0.56341,0.375594,1.860513,1.346304


In [106]:
# level傳入要進行分組的編號or該層名稱
# h_df.groupby(level='country',axis=1).count()
print(h_df.groupby(level=0,axis=1).count())
h_df.groupby(level=1,axis=1).count()

country  jp  us
0         2   3
1         2   1
2         2   1
3         2   3


age,19,23,57,65,98
0,1,1,1,1,1
1,1,0,1,1,0
2,1,0,1,1,0
3,1,1,1,1,1


In [107]:
# 對資料做轉換得到常數的動作稱為資料聚合
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.944903,-0.169174
1,a,two,0.717081,-0.685001
2,b,one,-0.17331,0.082423
3,b,two,-0.968154,0.662044
4,a,one,-0.084131,1.350971


In [120]:
grouped=df.groupby('key1') 
grouped['data1'].quantile() # 預設中位數

key1
a   -0.084131
b   -0.570732
Name: data1, dtype: float64

In [132]:
# 使用自訂的聚合函式
def peak_to_peak(arr):
    return arr.max()-arr.min()

grouped.agg(peak_to_peak) # 把自訂的聚合函式當agg()的參數，才可使用

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,1,2.0,0.067349,10.29
Fri,Yes,3,3.73,0.159925,34.42
Sat,No,3,8.0,0.235193,41.08
Sat,Yes,4,9.0,0.290095,47.74
Sun,No,4,4.99,0.193226,39.4
Sun,Yes,3,5.0,0.644685,38.1
Thur,No,5,5.45,0.19335,33.68
Thur,Yes,2,3.0,0.15124,32.77


In [122]:
# 簡易描述資料分布
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.103984,0.83117,-0.944903,-0.514517,-0.084131,0.316475,0.717081,3.0,0.165599,1.058466,-0.685001,-0.427087,-0.169174,0.590899,1.350971
b,2.0,-0.570732,0.56204,-0.968154,-0.769443,-0.570732,-0.372021,-0.17331,2.0,0.372234,0.409854,0.082423,0.227328,0.372234,0.517139,0.662044


In [3]:
# 欄方向的多功能應用
tips=pd.read_csv('./examples/tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']
tips[:5]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [155]:
# 不同時間和抽不抽菸給小費的關係
grouped=tips.groupby(['day','smoker'])
grouped_pct=grouped['tip_pct']
grouped_pct.agg('mean') # 傳入函式名稱　mean()

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [135]:
# 傳入多個函式['mean','sum','min']，欄位會顯示函式名稱，自訂函式不用加''
grouped_pct.agg(['mean','std',peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [148]:
# 可以使用tuple傳入，欄位名稱可以自訂
grouped_pct.agg([('average','mean'),('standard_diviation','std')])

Unnamed: 0_level_0,Unnamed: 1_level_0,average,standard_diviation
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [149]:
functions=[('size','count'),('average','mean'),('maximum','max')]
result=grouped['tip_pct','total_bill'].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,size,average,maximum,size,average,maximum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [150]:
# 輸出結果已經concat()
result['total_bill']

Unnamed: 0_level_0,Unnamed: 1_level_0,size,average,maximum
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,18.42,22.75
Fri,Yes,15,16.813333,40.17
Sat,No,45,19.661778,48.33
Sat,Yes,42,21.276667,50.81
Sun,No,57,20.506667,48.17
Sun,Yes,19,24.12,45.35
Thur,No,45,17.113111,41.19
Thur,Yes,17,19.190588,43.11


In [153]:
# 把多個不同的函式套用到指定的欄位，要傳入dict{'欄位':'函式'}
grouped.agg({'tip':np.max,'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [156]:
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


In [163]:
tips.groupby(['day','smoker']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,18.42,2.8125,2.25,0.15165
Fri,Yes,16.813333,2.714,2.066667,0.174783
Sat,No,19.661778,3.102889,2.555556,0.158048
Sat,Yes,21.276667,2.875476,2.47619,0.147906
Sun,No,20.506667,3.167895,2.929825,0.160113
Sun,Yes,24.12,3.516842,2.578947,0.18725
Thur,No,17.113111,2.673778,2.488889,0.160298
Thur,Yes,19.190588,3.03,2.352941,0.163863


In [164]:
# 回傳聚合資料不要列出索引
tips.groupby(['day','smoker'],as_index=False).mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


In [7]:
# apply:分裂－套用－合併
# 依條件分組後的tip_pct值前五名
def return_top5(df,n=5,columns='tip_pct'):
    return df.sort_values(by=columns)[:n] # sort_values()由小到大
return_top5(tips)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
237,32.83,1.17,Yes,Sat,Dinner,2,0.035638
102,44.3,2.5,Yes,Sat,Dinner,3,0.056433
57,26.41,1.5,No,Sat,Dinner,2,0.056797
0,16.99,1.01,No,Sun,Dinner,2,0.059447
187,30.46,2.0,Yes,Sun,Dinner,5,0.06566


In [180]:
# 用smoker做分組，並把先前自訂的函式用apply()傳入
tips.groupby('smoker').apply(return_top5)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,57,26.41,1.5,No,Sat,Dinner,2,0.056797
No,0,16.99,1.01,No,Sun,Dinner,2,0.059447
No,48,28.55,2.05,No,Sun,Dinner,3,0.071804
No,146,18.64,1.36,No,Thur,Lunch,3,0.072961
No,130,19.08,1.5,No,Thur,Lunch,2,0.078616
Yes,237,32.83,1.17,Yes,Sat,Dinner,2,0.035638
Yes,102,44.3,2.5,Yes,Sat,Dinner,3,0.056433
Yes,187,30.46,2.0,Yes,Sun,Dinner,5,0.06566
Yes,210,30.06,2.0,Yes,Sat,Dinner,3,0.066534
Yes,240,27.18,2.0,Yes,Sat,Dinner,2,0.073584


In [182]:
tips.groupby('smoker').apply(return_top5,n=1,columns='total_bill') # 若要傳入其他參數，可以寫在apply函式的後面

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,111,7.25,1.0,No,Sat,Dinner,1,0.137931
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733


In [5]:
tips.groupby('smoker')['tip_pct'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [6]:
tips.groupby('smoker')['tip_pct'].describe().unstack()

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [9]:
# 消除分組key索引
tips.groupby('smoker',group_keys=False).apply(return_top5) # 前面smoker分組，yes/no消除

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
57,26.41,1.5,No,Sat,Dinner,2,0.056797
0,16.99,1.01,No,Sun,Dinner,2,0.059447
48,28.55,2.05,No,Sun,Dinner,3,0.071804
146,18.64,1.36,No,Thur,Lunch,3,0.072961
130,19.08,1.5,No,Thur,Lunch,2,0.078616
237,32.83,1.17,Yes,Sat,Dinner,2,0.035638
102,44.3,2.5,Yes,Sat,Dinner,3,0.056433
187,30.46,2.0,Yes,Sun,Dinner,5,0.06566
210,30.06,2.0,Yes,Sat,Dinner,3,0.066534
240,27.18,2.0,Yes,Sat,Dinner,2,0.073584


In [16]:
# 分位數/欄分析
frame=pd.DataFrame(({'data1':np.random.randn(1000),'data2':np.random.randn(1000)}))
frame[:5]

Unnamed: 0,data1,data2
0,0.058144,-0.352244
1,-0.852917,-2.266391
2,-1.721194,-1.499813
3,1.002994,-0.672706
4,0.512679,1.52447


In [19]:
quarter=pd.cut(frame.data1,4) # 資料長度切4等分，按值域大小切分
quarter[:5]

0     (-0.0581, 1.519]
1    (-1.635, -0.0581]
2     (-3.219, -1.635]
3     (-0.0581, 1.519]
4     (-0.0581, 1.519]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.219, -1.635] < (-1.635, -0.0581] < (-0.0581, 1.519] < (1.519, 3.096]]

In [44]:
# 配合函式
def return_min_max_count_mean(df):
    return {'min':df.min(),'max':df.max(),'count':df.count(),'mean':df.mean()}
df=frame.data1.groupby(quarter)
df.apply(return_min_max_count_mean).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.219, -1.635]",55.0,-1.64309,-2.079854,-3.212512
"(-1.635, -0.0581]",431.0,-0.058887,-0.707844,-1.633801
"(-0.0581, 1.519]",455.0,1.505456,0.585341,-0.056459
"(1.519, 3.096]",59.0,3.09637,1.948871,1.53449


In [45]:
# qcut()，按資料個數切分
grouping=pd.qcut(frame.data1,4)
temp=frame.data1.groupby(grouping)
temp.apply(return_min_max_count_mean).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.214, -0.694]",250.0,-0.704286,-1.329862,-3.212512
"(-0.694, -0.0209]",250.0,-0.021044,-0.350331,-0.690447
"(-0.0209, 0.698]",250.0,0.697952,0.292173,-0.020826
"(0.698, 3.096]",250.0,3.09637,1.235383,0.698742


In [36]:
# 依分組指定填入遺失值
s=pd.Series(np.random.randn(6))
s[::2]=np.NaN
s

0         NaN
1   -0.226089
2         NaN
3   -0.712044
4         NaN
5    0.147425
dtype: float64

In [37]:
s.fillna(s.mean())

0   -0.263569
1   -0.226089
2   -0.263569
3   -0.712044
4   -0.263569
5    0.147425
dtype: float64

In [47]:
states=['ohio','new york','vermont','florida','oregon','nevada','california','idaho']
group_key=['EAST']*4 + ['WEST']*4
data=pd.Series(np.random.randn(8),index=states)
data[::2]=np.NaN
data


ohio               NaN
new york      0.932222
vermont            NaN
florida       3.058547
oregon             NaN
nevada       -0.088917
california         NaN
idaho        -0.498178
dtype: float64

In [49]:
data.groupby(group_key).mean()

EAST    1.995384
WEST   -0.293548
dtype: float64

In [50]:
# 用函式填入遺失值
fill_mean=lambda x: x.fillna(x.mean())
data.groupby(group_key).apply(fill_mean)


ohio          1.995384
new york      0.932222
vermont       1.995384
florida       3.058547
oregon       -0.293548
nevada       -0.088917
california   -0.293548
idaho        -0.498178
dtype: float64

In [51]:
# 填入預設的填充值(利用分組時內部產生的name屬性來做對應)
fill_value={'EAST':11,'WEST':22}
fill_func=lambda x: x.fillna(fill_value[x.name])
data.groupby(group_key).apply(fill_func)


ohio          11.000000
new york       0.932222
vermont       11.000000
florida        3.058547
oregon        22.000000
nevada        -0.088917
california    22.000000
idaho         -0.498178
dtype: float64

In [65]:
# 隨機取樣、排列
# 建立撲克牌
suits=['紅心','黑桃','梅花','方塊'] # 4種花色
card_value=list(range(1,11)) # 點數1-10
above_ten=list(range(10,14))
# above_ten

# 待補...............................

In [67]:
# 加權平均和關聯性
df=pd.DataFrame({'category':['a','a','a','a','b','b','c','c','c','c'],
                'data':np.random.randn(10),'weights':np.random.randn(10)})
df

Unnamed: 0,category,data,weights
0,a,-0.314667,0.541061
1,a,-1.522915,0.204473
2,a,-0.860578,0.546867
3,a,0.258643,-0.011502
4,b,1.87836,-0.198773
5,b,-0.124118,0.043415
6,c,-1.811147,0.479509
7,c,0.69411,-1.093841
8,c,-0.339665,-1.869753
9,c,-0.197709,0.543463


In [73]:
# df.groupby('category').count()
def weight_func(df):
    return np.average(df['data'],weights=df['weights'])
# weight_func=lambda x: np.average(x['data'],weights=x['weights'])
get_weight=df.groupby('category').apply(weight_func)
get_weight


category
a   -0.745762
b    2.437947
c    0.566862
dtype: float64

In [78]:
# example stock
close_px=pd.read_csv('examples/stock_px_2.csv',parse_dates=True,index_col=0)
close_px[:3]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-02,7.4,21.11,29.22,909.03
2003-01-03,7.45,21.14,29.24,908.59
2003-01-06,7.45,21.52,29.96,929.01


In [77]:
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL    2214 non-null float64
MSFT    2214 non-null float64
XOM     2214 non-null float64
SPX     2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB


In [82]:
# corrwith()找兩個DataFrame對象之間的相關性(axis=0 or 1)
spx_corrwith=lambda x : x.corrwith(x['SPX'])
pct=close_px.pct_change().dropna() # 用第一筆資料做基準，進行百分比變化轉換，並刪除空值
pct[:3]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-03,0.006757,0.001421,0.000684,-0.000484
2003-01-06,0.0,0.017975,0.024624,0.022474
2003-01-07,-0.002685,0.019052,-0.033712,-0.006545


In [86]:
# 把資料整合為以年份區分
get_year=lambda x : x.year
pct.groupby(get_year).apply(spx_corrwith)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [95]:
# 分組線性回歸
import statsmodels.api as sm

def regress(data,y_var,x_var):
    y=data[y_var]
    x=data[x_var]
    x['intercept']=1
    res=sm.OLS(y,x).fit()
    return res.params

pct.groupby(get_year).apply(regress,['AAPL'],['SPX'])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


In [3]:
# 樞紐關係表(pivot table),用多個key聚合一張表格資料
# 以tips為例子
tips=pd.read_csv('examples/tips.csv')
tips[:3]

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3


In [101]:
# 指定用來分組的欄位
tips.pivot_table(index=['day','time'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,total_bill
day,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,Dinner,2.166667,2.94,19.663333
Fri,Lunch,2.0,2.382857,12.845714
Sat,Dinner,2.517241,2.993103,20.441379
Sun,Dinner,2.842105,3.255132,21.41
Thur,Dinner,2.0,3.0,18.78
Thur,Lunch,2.459016,2.767705,17.664754


In [104]:
# 
tips.pivot_table(['tip','size'],index=['time','day'],columns='smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip,tip
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,2.75,3.003333
Dinner,Sat,2.555556,2.47619,3.102889,2.875476
Dinner,Sun,2.929825,2.578947,3.167895,3.516842
Dinner,Thur,2.0,,3.0,
Lunch,Fri,3.0,1.833333,3.0,2.28
Lunch,Thur,2.5,2.352941,2.666364,3.03


In [5]:
# margin,幫列和欄加上ALL標籤(no/yes的加總平均)
tips.pivot_table(['tip','size'],index=['time','day'],columns='smoker',margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip,tip,tip
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,2.75,3.003333,2.94
Dinner,Sat,2.555556,2.47619,2.517241,3.102889,2.875476,2.993103
Dinner,Sun,2.929825,2.578947,2.842105,3.167895,3.516842,3.255132
Dinner,Thur,2.0,,2.0,3.0,,3.0
Lunch,Fri,3.0,1.833333,2.0,3.0,2.28,2.382857
Lunch,Thur,2.5,2.352941,2.459016,2.666364,3.03,2.767705
All,,2.668874,2.408602,2.569672,2.991854,3.00871,2.998279


In [None]:
# 待補p349,350

In [93]:
# crosstab,計算分組的頻率
data=pd.DataFrame(index=np.arange(10),columns=['SAMPLE','NATIONALITY','HANDDENESS'])
data['SAMPLE']=range(10)
NATIONALITY=['usa','japan']
HANDDENESS=['left','right']
# data.iloc[:,1]=['aaa']
data

Unnamed: 0,SAMPLE,NATIONALITY,HANDDENESS
0,0,,
1,1,,
2,2,,
3,3,,
4,4,,
5,5,,
6,6,,
7,7,,
8,8,,
9,9,,


In [95]:
# 建立隨機資料
for i in range(10):
    step=np.random.randint(0,2)
    data.iloc[i,1]=NATIONALITY[step]
    
for i in range(10):
    step=np.random.randint(0,2)
    data.iloc[i,2]=HANDDENESS[step]
data

Unnamed: 0,SAMPLE,NATIONALITY,HANDDENESS
0,0,japan,left
1,1,japan,right
2,2,usa,left
3,3,usa,right
4,4,usa,left
5,5,japan,left
6,6,japan,right
7,7,usa,left
8,8,japan,right
9,9,usa,left


In [96]:
# 用國籍和慣用手區分
pd.crosstab(data.NATIONALITY,data.HANDDENESS,margins=True)

HANDDENESS,left,right,All
NATIONALITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
japan,2,3,5
usa,4,1,5
All,6,4,10


In [97]:
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
