# Chapter10 資料聚合和分組

## Data Aggregation and Group Operations III

### Apply: 分裂-套用-合併 （Split-Apply-Combine)

In [1]:
import pandas as pd
import numpy as np

In [2]:
tips=pd.read_csv('examples/tips.csv')

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [4]:
tips['tip_pct']=tips['tip']/tips['total_bill']-tips['tip']

In [5]:
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,-0.950553
1,10.34,1.66,No,Sun,Dinner,3,-1.499458
2,21.01,3.5,No,Sun,Dinner,3,-3.333413
3,23.68,3.31,No,Sun,Dinner,2,-3.17022
4,24.59,3.61,No,Sun,Dinner,4,-3.463192
5,25.29,4.71,No,Sun,Dinner,4,-4.52376


In [6]:
tips.sort_values(by='tip_pct')

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
170,50.81,10.00,Yes,Sat,Dinner,3,-9.803188
212,48.33,9.00,No,Sat,Dinner,4,-8.813780
23,39.42,7.58,No,Sat,Dinner,4,-7.387712
59,48.27,6.73,No,Sat,Dinner,4,-6.590576
141,34.30,6.70,No,Thur,Lunch,6,-6.504665
...,...,...,...,...,...,...,...
0,16.99,1.01,No,Sun,Dinner,2,-0.950553
236,12.60,1.00,Yes,Sat,Dinner,2,-0.920635
111,7.25,1.00,No,Sat,Dinner,1,-0.862069
92,5.75,1.00,Yes,Fri,Dinner,2,-0.826087


### 自定義一個函式叫做top，依據tip_pct的值由小到大排序，取出倒數５個數值。

In [7]:
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]

### 原先top定義的n=5，可以換成6。

In [8]:
top(tips,6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
215,12.9,1.1,Yes,Sat,Dinner,2,-1.014729
0,16.99,1.01,No,Sun,Dinner,2,-0.950553
236,12.6,1.0,Yes,Sat,Dinner,2,-0.920635
111,7.25,1.0,No,Sat,Dinner,1,-0.862069
92,5.75,1.0,Yes,Fri,Dinner,2,-0.826087
67,3.07,1.0,Yes,Sat,Dinner,1,-0.674267


### apply(函式)：執行函式的功能。
### 如下的運作方式，會先找到smoker=yea,no進行分組，再就各別的2組，找到tip_pct倒數5名的資料。

In [9]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,75,10.51,1.25,No,Sat,Dinner,2,-1.131066
No,235,10.07,1.25,No,Sat,Dinner,2,-1.125869
No,135,8.51,1.25,No,Thur,Lunch,2,-1.103114
No,0,16.99,1.01,No,Sun,Dinner,2,-0.950553
No,111,7.25,1.0,No,Sat,Dinner,1,-0.862069
Yes,237,32.83,1.17,Yes,Sat,Dinner,2,-1.134362
Yes,215,12.9,1.1,Yes,Sat,Dinner,2,-1.014729
Yes,236,12.6,1.0,Yes,Sat,Dinner,2,-0.920635
Yes,92,5.75,1.0,Yes,Fri,Dinner,2,-0.826087
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,-0.674267


### 先按照smoker, day將資料分組，再進行top函式的運作。
### 如果要替換參數值，可以直接寫在apply()內。

In [10]:
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,-3.107143
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,-8.81378
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,-4.896201
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,-4.878611
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,-4.61225
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,-9.803188
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,-3.422822
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,-4.884018


In [11]:
result=tips.groupby('smoker')['tip_pct'].describe()

In [12]:
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,-2.832526,1.363754,-8.81378,-3.341225,-2.553262,-1.849044,-0.862069
Yes,93.0,-2.845514,1.371587,-9.803188,-3.554423,-2.828669,-1.861878,-0.674267


In [13]:
result.unstack() #將欄轉為列。

       smoker
count  No        151.000000
       Yes        93.000000
mean   No         -2.832526
       Yes        -2.845514
std    No          1.363754
       Yes         1.371587
min    No         -8.813780
       Yes        -9.803188
25%    No         -3.341225
       Yes        -3.554423
50%    No         -2.553262
       Yes        -2.828669
75%    No         -1.849044
       Yes        -1.861878
max    No         -0.862069
       Yes        -0.674267
dtype: float64

### 消除分組key索引

In [14]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,75,10.51,1.25,No,Sat,Dinner,2,-1.131066
No,235,10.07,1.25,No,Sat,Dinner,2,-1.125869
No,135,8.51,1.25,No,Thur,Lunch,2,-1.103114
No,0,16.99,1.01,No,Sun,Dinner,2,-0.950553
No,111,7.25,1.0,No,Sat,Dinner,1,-0.862069
Yes,237,32.83,1.17,Yes,Sat,Dinner,2,-1.134362
Yes,215,12.9,1.1,Yes,Sat,Dinner,2,-1.014729
Yes,236,12.6,1.0,Yes,Sat,Dinner,2,-0.920635
Yes,92,5.75,1.0,Yes,Fri,Dinner,2,-0.826087
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,-0.674267


### 預設值smoker會變成index，如果要取消，可以寫上group_keys=False

In [15]:
tips.groupby('smoker',group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
75,10.51,1.25,No,Sat,Dinner,2,-1.131066
235,10.07,1.25,No,Sat,Dinner,2,-1.125869
135,8.51,1.25,No,Thur,Lunch,2,-1.103114
0,16.99,1.01,No,Sun,Dinner,2,-0.950553
111,7.25,1.0,No,Sat,Dinner,1,-0.862069
237,32.83,1.17,Yes,Sat,Dinner,2,-1.134362
215,12.9,1.1,Yes,Sat,Dinner,2,-1.014729
236,12.6,1.0,Yes,Sat,Dinner,2,-0.920635
92,5.75,1.0,Yes,Fri,Dinner,2,-0.826087
67,3.07,1.0,Yes,Sat,Dinner,1,-0.674267


### 分位數與欄分析

In [16]:
frame=pd.DataFrame({'data1':np.random.randn(1000),
                    'data2':np.random.randn(1000)})

In [17]:
frame

Unnamed: 0,data1,data2
0,0.160440,-0.548098
1,0.672628,-0.439868
2,-0.235320,1.255635
3,0.028489,-0.441929
4,-0.551960,-0.186347
...,...,...
995,0.310051,-1.407302
996,-0.743511,-1.126863
997,-0.387277,-0.710601
998,-0.813141,-1.081517


### pd.cut(data, n): 將資料分成4組。

In [18]:
quartiles=pd.cut(frame.data1,4)

In [19]:
quartiles[:10]

0    (-1.262, 0.197]
1     (0.197, 1.657]
2    (-1.262, 0.197]
3    (-1.262, 0.197]
4    (-1.262, 0.197]
5    (-1.262, 0.197]
6     (0.197, 1.657]
7    (-1.262, 0.197]
8     (1.657, 3.116]
9    (-1.262, 0.197]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-2.727, -1.262] < (-1.262, 0.197] < (0.197, 1.657] < (1.657, 3.116]]

### 定義函式：將一組資料進行min, max, count, mean的運算。

In [20]:
def get_stats(group):
    return {'min':group.min(), 'max':group.max(),
            'count':group.count(), 'mean':group.mean()}

### frame.data2: 先取出data2的資料
### groupby(quartiles): 再以quartiles的Cateogries進行分群。

In [21]:
grouped=frame.data2.groupby(quartiles)

### apply(get_stats): 使用get_stats這個函數。
### unstacked(): 原本min, max, count, mean從列轉成欄。

In [22]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.727, -1.262]",-2.204393,2.522899,104.0,-0.071123
"(-1.262, 0.197]",-3.514321,2.924395,493.0,-0.052354
"(0.197, 1.657]",-2.743756,2.615454,355.0,0.040518
"(1.657, 3.116]",-1.945896,2.072024,48.0,-0.086249


### pd.qcut: 可以將資料分成10組，每組的數字數量一樣。

In [23]:
grouping=pd.qcut(frame.data1,10)

In [24]:
grouping[:10]

0     (-0.064, 0.203]
1      (0.466, 0.821]
2    (-0.306, -0.064]
3     (-0.064, 0.203]
4    (-0.845, -0.536]
5     (-0.064, 0.203]
6      (0.203, 0.466]
7    (-0.306, -0.064]
8      (1.241, 3.116]
9    (-1.276, -0.845]
Name: data1, dtype: category
Categories (10, interval[float64, right]): [(-2.722, -1.276] < (-1.276, -0.845] < (-0.845, -0.536] < (-0.536, -0.306] ... (0.203, 0.466] < (0.466, 0.821] < (0.821, 1.241] < (1.241, 3.116]]

### labels=False 原本資料會顯示每一個數值分在哪一組，如果寫上False，系統會以2代表(-0.831, -0.525]

In [25]:
grouping=pd.qcut(frame.data1,10, labels=False)

In [26]:
grouping[:10]

0    5
1    7
2    4
3    5
4    2
5    5
6    6
7    4
8    9
9    1
Name: data1, dtype: int64

### groupby(grouping): 以grouping的categorical為分群依據。
### 因為grouping內有寫labels=False，所以categorical變成編碼。

In [27]:
grouped=frame.data2.groupby(grouping)

In [28]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-2.204393,2.522899,100.0,-0.052593
1,-1.985962,1.589922,100.0,-0.054943
2,-3.514321,2.321946,100.0,-0.232312
3,-2.714034,2.38334,100.0,0.084747
4,-2.249615,2.108032,100.0,-0.028537
5,-2.105599,2.924395,100.0,-0.042001
6,-2.559197,2.233081,100.0,-0.10674
7,-2.19784,2.507791,100.0,0.00568
8,-2.743756,2.504378,100.0,0.165522
9,-2.186748,2.615454,100.0,0.031543


### 範例：依分組指定填充遺失值。

In [29]:
s=pd.Series(np.random.randn(6))

In [30]:
s[::2]=np.nan

In [31]:
s

0         NaN
1    0.138327
2         NaN
3   -0.566047
4         NaN
5   -1.172529
dtype: float64

### fillna(s.mean()): 將NAN用s的平均值填上。

In [32]:
s.fillna(s.mean())

0   -0.533416
1    0.138327
2   -0.533416
3   -0.566047
4   -0.533416
5   -1.172529
dtype: float64

In [33]:
states=['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']

In [34]:
group_key=['East']*4+['West']*4

In [35]:
group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [36]:
data=pd.Series(np.random.randn(8),index=states)

In [37]:
data

Ohio         -1.410549
New York      0.113832
Vermont      -0.098002
Florida      -0.657715
Oregon        0.434203
Nevada        1.947943
California    1.619800
Idaho        -0.563303
dtype: float64

### 將data部分值變成遺失值。

In [38]:
data[['Vermont','Nevada','Idaho']]=np.nan

In [39]:
data

Ohio         -1.410549
New York      0.113832
Vermont            NaN
Florida      -0.657715
Oregon        0.434203
Nevada             NaN
California    1.619800
Idaho              NaN
dtype: float64

### 定義一個函式：如果遇到遺失值，填入該組資料的平均數。

In [40]:
fill_mean=lambda g:g.fillna(g.mean())

In [41]:
data.groupby(group_key).apply(fill_mean)

Ohio         -1.410549
New York      0.113832
Vermont      -0.651477
Florida      -0.657715
Oregon        0.434203
Nevada        1.027002
California    1.619800
Idaho         1.027002
dtype: float64

In [42]:
fill_values={'East':0.5, 'West':-1}

### fill_values[g.name]: 可以從g.name找到east，在從fill_values['East']=0.5，遇到East就填上0.5。

In [43]:
fill_func= lambda g:g.fillna(fill_values[g.name])

In [44]:
for name, group in data.groupby(group_key):
    print(name)
    print(group)

East
Ohio       -1.410549
New York    0.113832
Vermont          NaN
Florida    -0.657715
dtype: float64
West
Oregon        0.434203
Nevada             NaN
California    1.619800
Idaho              NaN
dtype: float64


### groupby(group_key): 以Ease, West做資料分群。
### apply(fill_func): 執行fill_func的函數。

In [45]:
data.groupby(group_key).apply(fill_func)

Ohio         -1.410549
New York      0.113832
Vermont       0.500000
Florida      -0.657715
Oregon        0.434203
Nevada       -1.000000
California    1.619800
Idaho        -1.000000
dtype: float64

### 範例：隨機取樣和排列

### 建立英文版本的一疊鋪克牌。

In [46]:
suits=['H','S','C','D'] # 紅心、黑桃、梅花和方塊

card_val=(list(range(1,11))+[10]*3)*4

base_names=['A']+list(range(2,11))+['J','Q','K']
cards=[]
for suit in ['H','S','C','D']:
    cards.extend(str(num)+suit for num in base_names)

deck=pd.Series(card_val,index=cards)

In [47]:
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
QH     10
KH     10
dtype: int64

In [48]:
base_names

['A', 2, 3, 4, 5, 6, 7, 8, 9, 10, 'J', 'Q', 'K']

### data.sample(n): 從data內隨機抽n個數字。

In [49]:
def draw(deck,n=5):
    return deck.sample(n)

In [50]:
draw(deck)

7C     7
5H     5
AD     1
JS    10
4C     4
dtype: int64

### 定義一個函式，card[-1]: 可以提出最後一個字母，也就是花色。

In [51]:
get_suit=lambda card:card[-1]

### 從每個花色各抽出2張牌。

In [52]:
deck.groupby(get_suit).apply(draw,n=2)

C  AC     1
   7C     7
D  8D     8
   JD    10
H  AH     1
   KH    10
S  6S     6
   4S     4
dtype: int64

### 每個花色各抽出兩張牌，按照花色排序，但是花色並不成為index。

In [53]:
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)

2C      2
10C    10
5D      5
KD     10
QH     10
5H      5
2S      2
4S      4
dtype: int64

### 範例：加權平均和關聯性

### 加權平均值 = (100*4+200*3+300*2+400*1)/(4+3+2+1)

In [54]:
df=pd.DataFrame({'category':['a','a','a','a','b','b','b','b'],
                 'data':np.arange(2,10)*8,
                 'weights':np.arange(2,10)})

In [55]:
df

Unnamed: 0,category,data,weights
0,a,16,2
1,a,24,3
2,a,32,4
3,a,40,5
4,b,48,6
5,b,56,7
6,b,64,8
7,b,72,9


### 先將df依據a,b進行分組。

In [56]:
grouped=df.groupby('category')

### np.average(data, weights=[], returned=True)
### data: 被加權的資料。
### weights: 要加權的比重。
### returned=True 是否要顯示加權比重的總和。例如1+2+3+4=10

In [57]:
get_wavg=lambda g:np.average(g['data'],weights=g['weights'],returned=True)

### 計算a,b兩組的加權平均數以及加權的總和。

In [58]:
grouped.apply(get_wavg)

category
a    (30.857142857142858, 14.0)
b    (61.333333333333336, 30.0)
dtype: object

In [59]:
close_px=pd.read_csv('examples/stock_px_2.csv',parse_dates=True)

In [60]:
close_px.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2214 entries, 0 to 2213
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2214 non-null   object 
 1   AAPL        2214 non-null   float64
 2   MSFT        2214 non-null   float64
 3   XOM         2214 non-null   float64
 4   SPX         2214 non-null   float64
dtypes: float64(4), object(1)
memory usage: 86.6+ KB


In [61]:
close_px[-4:]

Unnamed: 0.1,Unnamed: 0,AAPL,MSFT,XOM,SPX
2210,2011/10/11 0:00,400.29,27.0,76.27,1195.54
2211,2011/10/12 0:00,402.19,26.96,77.16,1207.25
2212,2011/10/13 0:00,408.43,27.18,76.37,1203.66
2213,2011/10/14 0:00,422.0,27.27,78.11,1224.58


In [62]:
spx_corr=lambda x:x.corrwith(x['SPX'])

In [63]:
a=pd.DataFrame([[10,20,30],[40,50,60],[70,80,90]])
a

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90


In [64]:
a.pct_change()

Unnamed: 0,0,1,2
0,,,
1,3.0,1.5,1.0
2,0.75,0.6,0.5


In [65]:
### rets=close_px.pct_change().dropna() 無法運作

### 範例：分組線性回歸

In [66]:
import statsmodels.api as sm

In [67]:
def regress(data, yvar, xvars):
    Y=data[yvar]
    X=data[xvars]
    X['intercept']=1
    result=sm.OLS(Y,X).fit()
    return result.params

In [68]:
### by_year.apply(regress, 'AAPL', ['SPX']) 不能用

### 樞紐分析表和交叉表

In [69]:
tips=pd.read_csv('examples/tips.csv')

In [70]:
tips['tip_pct']=tips['tip']/(tips['total_bill']-tips['tip'])

### pivot_table(index=) 可以將欄位改為index。

In [71]:
tips.pivot_table(index=['day','smoker']) 

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.17974,18.42
Fri,Yes,2.066667,2.714,0.216293,16.813333
Sat,No,2.555556,3.102889,0.190412,19.661778
Sat,Yes,2.47619,2.875476,0.179833,21.276667
Sun,No,2.929825,3.167895,0.193617,20.506667
Sun,Yes,2.578947,3.516842,0.322021,24.12
Thur,No,2.488889,2.673778,0.193424,17.113111
Thur,Yes,2.352941,3.03,0.198508,19.190588


### pivot_table(值, index, columns)

In [72]:
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.162612,0.202545
Dinner,Sat,2.555556,2.47619,0.190412,0.179833
Dinner,Sun,2.929825,2.578947,0.193617,0.322021
Dinner,Thur,2.0,,0.190114,
Lunch,Fri,3.0,1.833333,0.231125,0.236915
Lunch,Thur,2.5,2.352941,0.193499,0.198508


### margin=True 可以加入部份合計，表格會出現ALL。

In [73]:
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker',margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.162612,0.202545,0.192562
Dinner,Sat,2.555556,2.47619,2.517241,0.190412,0.179833,0.185305
Dinner,Sun,2.929825,2.578947,2.842105,0.193617,0.322021,0.225718
Dinner,Thur,2.0,,2.0,0.190114,,0.190114
Lunch,Fri,3.0,1.833333,2.0,0.231125,0.236915,0.236088
Lunch,Thur,2.5,2.352941,2.459016,0.193499,0.198508,0.194895
All,,2.668874,2.408602,2.569672,0.192237,0.218176,0.202123


### aggfunc='count','len' 可以讓maring=True 執行聚合函式。

### len可以計算分組的數量。

In [74]:
tips.pivot_table('tip_pct',index=['time','smoker'],columns='day',aggfunc=len,margins=True)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,3.0,45.0,57.0,1.0,106
Dinner,Yes,9.0,42.0,19.0,,70
Lunch,No,1.0,,,44.0,45
Lunch,Yes,6.0,,,17.0,23
All,,19.0,87.0,76.0,62.0,244


### fill_value=0 可以對遺失值補上0。

In [75]:
tips.pivot_table('tip_pct',index=['time','size','smoker'],columns='day',aggfunc='mean',fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,1,No,0.0,0.16,0.0,0.0
Dinner,1,Yes,0.0,0.483092,0.0,0.0
Dinner,2,No,0.162612,0.198319,0.206535,0.190114
Dinner,2,Yes,0.21118,0.178877,0.400522,0.0
Dinner,3,No,0.0,0.18387,0.182962,0.0
Dinner,3,Yes,0.0,0.176599,0.183278,0.0
Dinner,4,No,0.0,0.177734,0.175289,0.0
Dinner,4,Yes,0.133465,0.147074,0.254373,0.0
Dinner,5,No,0.0,0.0,0.263344,0.0
Dinner,5,Yes,0.0,0.119284,0.070274,0.0


In [76]:
### dropna=True 捨棄整欄都是NA的欄。

In [77]:
tips.pivot_table('tip_pct',index=['time','size','smoker'],columns='day',aggfunc='mean', dropna=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,1,No,,0.16,,
Dinner,1,Yes,,0.483092,,
Dinner,2,No,0.162612,0.198319,0.206535,0.190114
Dinner,2,Yes,0.21118,0.178877,0.400522,
Dinner,3,No,,0.18387,0.182962,
Dinner,3,Yes,,0.176599,0.183278,
Dinner,4,No,,0.177734,0.175289,
Dinner,4,Yes,0.133465,0.147074,0.254373,
Dinner,5,No,,,0.263344,
Dinner,5,Yes,,0.119284,0.070274,


### 交叉表 Crosstab: 用於計算分組頻率。

In [78]:
data=pd.DataFrame({'Sample':np.arange(1,11),
                   'Nationality':['USA','Japan','USA','Japan','Japan','Japan','USA','USA','Japan','USA'],
                   'Handedness':['Right','Left','Right','Right','Left','Right','Right','Left','Right','Right']})

In [79]:
data

Unnamed: 0,Sample,Nationality,Handedness
0,1,USA,Right
1,2,Japan,Left
2,3,USA,Right
3,4,Japan,Right
4,5,Japan,Left
5,6,Japan,Right
6,7,USA,Right
7,8,USA,Left
8,9,Japan,Right
9,10,USA,Right


### pd.crosstab(index, columns, margins=True) 

In [80]:
pd.crosstab(data.Nationality, data.Handedness, margins=True)

Handedness,Left,Right,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10


In [81]:
pd.crosstab([tips.time,tips.day],tips.smoker, margins=True)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
