In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
# Explore group operations that go beyond plain aggregation.
# Demo frame: two grouping-key columns plus two columns of random normal draws.
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two'] * 2 + ['one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df

Unnamed: 0,data1,data2,key1,key2
0,0.723273,0.510807,a,one
1,-0.728093,-0.831354,a,two
2,0.48093,-0.717987,b,one
3,1.697216,-0.545516,b,two
4,-0.635222,-2.353577,a,one


In [3]:
# Per-group means with a prefix added to every column label via add_prefix().
# The numeric columns are selected explicitly: 'key2' holds strings, and
# pandas >= 2.0 raises TypeError instead of silently dropping such columns
# when mean() is asked to aggregate them.
k1_means = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.213347,-0.891375
b,1.089073,-0.631752


In [4]:
# Merge the aggregated frame produced by groupby back onto the original rows:
# df's 'key1' column is matched against the index of k1_means.
pd.merge(df, k1_means, left_on='key1', right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,0.723273,0.510807,a,one,-0.213347,-0.891375
1,-0.728093,-0.831354,a,two,-0.213347,-0.891375
4,-0.635222,-2.353577,a,one,-0.213347,-0.891375
2,0.48093,-0.717987,b,one,1.089073,-0.631752
3,1.697216,-0.545516,b,two,1.089073,-0.631752


In [5]:
# Set up a small frame for exploring transform().
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Blank out columns 'b' and 'c' of the third row ('Wes').  The selection is
# positional, so it must go through .iloc; .loc is label-based and an integer
# slice is not a valid label slice on this string index.
people.iloc[2:3, [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.645685,0.190492,0.732909,-0.52525,0.796726
Steve,-1.301664,-0.85142,-1.269515,-1.53946,-0.74843
Wes,0.355509,,,0.648569,-1.000122
Jim,-0.414742,0.089521,-0.841492,-0.596895,0.705896
Travis,0.054515,-0.615809,1.151158,0.024074,1.771124


In [6]:
# Plain groupby + mean: one result row per distinct value in `key`.
# `key` is a list of group labels that is passed to groupby in place of a column name.
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-0.078554,-0.212658,0.942033,0.049131,0.522576
two,-0.858203,-0.38095,-1.055503,-1.068177,-0.021267


In [7]:
# groupby + transform: each group's mean is broadcast back to its rows,
# so the result keeps the shape and index of the original DataFrame.
# The aggregation is passed by name; recent pandas deprecates handing NumPy
# reducers such as np.mean to transform().
people.groupby(key).transform('mean')

Unnamed: 0,a,b,c,d,e
Joe,-0.078554,-0.212658,0.942033,0.049131,0.522576
Steve,-0.858203,-0.38095,-1.055503,-1.068177,-0.021267
Wes,-0.078554,-0.212658,0.942033,0.049131,0.522576
Jim,-0.858203,-0.38095,-1.055503,-1.068177,-0.021267
Travis,-0.078554,-0.212658,0.942033,0.049131,0.522576


In [8]:
# Helper for transform(): subtract the mean from every value.
def demean(arr):
    """Return `arr` shifted so that its mean is removed from each element."""
    center = arr.mean()
    return arr - center

# Apply demean within each group of `key`; the result is indexed like `people`.
demeaned = people.groupby(key).transform(demean)
demeaned

Unnamed: 0,a,b,c,d,e
Joe,-0.567132,0.40315,-0.209125,-0.574381,0.27415
Steve,-0.443461,-0.470471,-0.214011,-0.471283,-0.727163
Wes,0.434063,,,0.599438,-1.522698
Jim,0.443461,0.470471,0.214011,0.471283,0.727163
Travis,0.133069,-0.40315,0.209125,-0.025057,1.248548


In [9]:
# Sanity check: the group mean was subtracted from every value, so the
# group means of the result should be zero (up to floating-point error).
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-1.850372e-17,0.0,5.5511150000000004e-17,1.619075e-17,0.0
two,-2.775558e-17,0.0,-5.5511150000000004e-17,0.0,0.0


In [10]:
# Data for exploring apply(): read the tips CSV from a path relative to the notebook.
tips = pd.read_csv('../plot/tips.csv')
# Derived column: tip as a fraction of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head(6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
5,25.29,4.71,Male,No,Sun,Dinner,4,0.18624


In [11]:
# Select the rows with the largest values of a column (tip_pct by default).
def top(df, n=5, column='tip_pct'):
    """Return the `n` rows of `df` with the largest values in `column`.

    Parameters
    ----------
    df : DataFrame to select from.
    n : number of rows to keep.
    column : label of the column to sort by.

    Returns
    -------
    DataFrame sorted ascending by `column`, so the largest value is last.
    """
    # tail(n) instead of [-n:]: with n == 0 the slice [-0:] is [0:] and would
    # return every row, whereas tail(0) correctly returns none.
    return df.sort_values(by=column).tail(n)

# First a plain call on the whole table: the six rows with the largest tip_pct.
top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [12]:
# groupby + apply: top() is called once per 'smoker' group
# and the per-group results are combined into one frame.
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [13]:
# Group by two keys and apply top().
# Extra arguments given to apply() (here n and column) are forwarded to top().
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


In [14]:
# group_keys=False suppresses the extra index level built from the group keys.
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [15]:
# Binning by value:    cut()
# Binning by quantity: qcut()
# Bucket analysis with cut(): the value range is divided into equal parts, so
# every bin has the same width.  Handy for counting observations per range.
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000),
})
factor = pd.cut(frame['data1'], 4)
factor.head(10)

0     (0.242, 1.656]
1     (0.242, 1.656]
2    (-1.172, 0.242]
3    (-1.172, 0.242]
4    (-1.172, 0.242]
5    (-1.172, 0.242]
6    (-1.172, 0.242]
7     (0.242, 1.656]
8    (-1.172, 0.242]
9     (0.242, 1.656]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.592, -1.172] < (-1.172, 0.242] < (0.242, 1.656] < (1.656, 3.071]]

In [16]:
# Summary statistics to be computed for each of the four bins.
def get_stats(group):
    """Return a dict with the min, max, count and mean of `group`."""
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}

# Group data2 by the bin that data1 falls into, compute the statistics per
# bin, then unstack so each statistic becomes a column.
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.592, -1.172]",131.0,2.896351,0.045834,-2.053351
"(-1.172, 0.242]",491.0,3.321153,0.025626,-2.708158
"(0.242, 1.656]",326.0,2.629771,0.000551,-2.255963
"(1.656, 3.071]",52.0,1.98294,-0.009942,-2.462772


In [17]:
# Quantile-based bucket analysis with qcut().
# qcut() divides the observations by count, so every bucket holds the same
# number of items.  Because it splits the sorted data into n equal parts, its
# behaviour becomes questionable when the data contains duplicate values.
# labels=False returns integer bucket numbers instead of interval labels.
grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.179855,0.063204,-2.053351
1,100.0,3.321153,0.00339,-2.058195
2,100.0,2.032475,0.197911,-2.194489
3,100.0,1.989513,0.035805,-2.708158
4,100.0,1.977984,-0.002078,-2.650298
5,100.0,2.091296,-0.118984,-2.6562
6,100.0,2.042168,0.053452,-2.011548
7,100.0,2.629771,0.04884,-2.255963
8,100.0,2.468363,-0.008327,-2.047205
9,100.0,1.98294,-0.09072,-2.462772


In [18]:
# Filling missing values with a group-specific value.
# Warm-up without any grouping: a six-element random Series in which every
# other entry (positions 0, 2, 4) is blanked out.
s = pd.Series(np.random.randn(6))
s.iloc[::2] = np.nan
s

0         NaN
1    0.995419
2         NaN
3    1.128391
4         NaN
5    0.689766
dtype: float64

In [19]:
# Replace every NaN with the mean of the Series.
s.fillna(s.mean())

0    0.937859
1    0.995419
2    0.937859
3    1.128391
4    0.937859
5    0.689766
dtype: float64

In [20]:
# Filling missing values with a group-specific value.
# Random data labelled by state, with three states set to missing.
states = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]
data = pd.Series(np.random.randn(len(states)), index=states)
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -0.795571
New York     -0.953406
Vermont            NaN
Florida       2.313142
Oregon       -0.517925
Nevada             NaN
California   -0.665244
Idaho              NaN
dtype: float64

In [21]:
# Mean of each group: the first four entries of `data` are labelled 'East',
# the last four 'West'.
group_key = ['East'] * 4 + ['West'] * 4
data.groupby(group_key).mean()

East    0.188055
West   -0.591584
dtype: float64

In [22]:
# Fill each missing value with the mean of its own group.
def fill_mean(g):
    """Return `g` with its NaN entries replaced by the mean of `g`."""
    return g.fillna(g.mean())

# group_keys=False keeps the original labels as the result index.  Without it,
# pandas >= 2.0 prepends the group key as an extra index level even though
# fill_mean returns an object indexed like its input.
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio         -0.795571
New York     -0.953406
Vermont       0.188055
Florida       2.313142
Oregon       -0.517925
Nevada       -0.591584
California   -0.665244
Idaho        -0.591584
dtype: float64

In [23]:
# Random sampling, illustrated with a deck of cards scored as in blackjack.
# Suits
suits = ['H', 'S', 'C', 'D']
# Point value of each card: 1-10 for the first ten ranks, then 10 for each of
# the three face cards, repeated once per suit.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
# Card ranks
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Join rank and suit to build the card labels.  `suits` is iterated in the
# outer loop so the labels line up with the per-suit repetition in card_val.
cards = [str(num) + suit for suit in suits for num in base_names]

# Card label as the index, point value as the data.
deck = pd.Series(card_val, index=cards)
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [24]:
# Randomly sample a given number of cards.
def draw(deck, n=5):
    """Return `n` entries of `deck` chosen at random without replacement.

    A random permutation of all positions is generated with NumPy's global
    random state and the first `n` of them are taken.
    """
    shuffled_positions = np.random.permutation(len(deck))
    chosen = shuffled_positions[:n]
    return deck.take(chosen)

# Draw a hand using the default size (five cards).
draw(deck)

10D    10
5H      5
6S      6
9S      9
8C      8
dtype: int64

In [25]:
# Draw two random cards from each suit.
def get_suit(card):
    """Return the suit of a card label, i.e. its last character."""
    return card[-1]

deck.groupby(get_suit).apply(draw, n=2)

C  AC     1
   QC    10
D  7D     7
   8D     8
H  QH    10
   9H     9
S  9S     9
   JS    10
dtype: int64

In [26]:
# Draw two random cards from each suit again, this time with
# group_keys=False so the suit is not added as an outer index level.
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

KC    10
QC    10
6D     6
4D     4
5H     5
7H     7
4S     4
7S     7
dtype: int64

In [27]:
# Group-wise weighted average and correlation.
# Demo frame: two categories of four rows each, with random data and random weights.
df = pd.DataFrame({
    'category': ['a'] * 4 + ['b'] * 4,
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})
df

Unnamed: 0,category,data,weights
0,a,-0.150586,0.976334
1,a,-0.945706,0.426617
2,a,0.513303,0.508867
3,a,1.159027,0.763709
4,b,-1.149819,0.566295
5,b,0.398166,0.867049
6,b,-1.774646,0.271546
7,b,-0.584717,0.508683


In [28]:
# Compute the weighted average per category.
def get_wavg(g):
    """Return the average of g['data'] weighted by g['weights']."""
    return np.average(g['data'], weights=g['weights'])

grouped = df.groupby('category')
grouped.apply(get_wavg)

category
a    0.222717
b   -0.490266
dtype: float64

In [29]:
# Try group-wise computations on a somewhat larger dataset.
# The first CSV column becomes the index and is parsed as dates.
close_px = pd.read_csv('stock_px.csv', parse_dates=True, index_col=0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL    2214 non-null float64
MSFT    2214 non-null float64
XOM     2214 non-null float64
SPX     2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB


In [30]:
# Show the last four rows.
close_px.tail(4)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [31]:
# Yearly correlation between the index (SPX) and each company column
# (Apple, Microsoft, ExxonMobil).
def spx_corr(x):
    """Return the correlation of every column of `x` with its 'SPX' column."""
    return x.corrwith(x['SPX'])

# Period-over-period percentage changes; the first row has no predecessor
# and is dropped as NaN.
rets = close_px.pct_change().dropna()
# Group rows by the year of their index label.
by_year = rets.groupby(lambda x: x.year)
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [32]:
# Yearly correlation between the AAPL and MSFT columns.
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

In [33]:
# Linear regression using the statsmodels library.
def regress(data, yvar, xvars):
    """Fit an ordinary-least-squares regression of `yvar` on `xvars` plus a constant.

    Parameters
    ----------
    data : DataFrame holding the dependent and explanatory columns.
    yvar : label of the dependent-variable column.
    xvars : label, or list of labels, of the explanatory columns.

    Returns
    -------
    The `params` attribute of the fitted statsmodels OLS result: one entry
    per explanatory column followed by 'intercept'.
    """
    # Accept a bare column name as well as a sequence of names, so that X is
    # always a DataFrame.
    if isinstance(xvars, str):
        xvars = [xvars]
    Y = data[yvar]
    # assign() returns a new frame, so the constant column is never written
    # into an object derived from `data` (no SettingWithCopyWarning, and the
    # caller's frame is left untouched).
    X = data[list(xvars)].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    return result.params

# Per-year regression of AAPL on SPX; the extra positional arguments are
# forwarded to regress() as yvar and xvars.
by_year.apply(regress, 'AAPL', ['SPX'])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514
