In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse

In [2]:
# Try out the resample method.
# Start with a Series at daily frequency.
rng = pd.date_range('1/1/2000', periods=100, freq='D')
# Draw the values from a locally seeded generator so that Restart & Run All
# reproduces the same numbers without touching NumPy's global random state.
ts = pd.Series(np.random.RandomState(0).randn(len(rng)), index=rng)
ts[:5]

2000-01-01   -0.404929
2000-01-02   -0.673073
2000-01-03    0.436685
2000-01-04   -0.494372
2000-01-05   -0.732519
Freq: D, dtype: float64

In [3]:
# Downsample to monthly frequency by averaging the values within each month.
ts.resample(rule='M').mean()

2000-01-31   -0.269904
2000-02-29    0.210614
2000-03-31    0.114985
2000-04-30    0.388600
Freq: M, dtype: float64

In [4]:
# Aggregate by month and label the result with periods instead of timestamps.
# The `kind=` argument of resample is deprecated in newer pandas, so the
# timestamp-labelled result is converted with to_period after aggregating.
ts.resample('M').mean().to_period('M')

2000-01   -0.269904
2000-02    0.210614
2000-03    0.114985
2000-04    0.388600
Freq: M, dtype: float64

In [5]:
# Downsampling.
# Build a series with one observation per minute. The 'min' alias is used
# instead of the single-letter 'T' (deprecated in newer pandas), which also
# matches the '5min' rules used when resampling this series.
rng = pd.date_range('1/1/2000', periods=12, freq='min')
ts = pd.Series(np.arange(12), index=rng)
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int64

In [6]:
# Convert to five-minute frequency by summing the values that fall in each bin.
ts.resample(rule='5min').sum()

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [7]:
# closed: selects which edge of each bin is inclusive (here the right edge).
ts.resample(rule='5min', closed='right').sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int64

In [8]:
# label='right': label each bin with its right edge.
ts.resample(rule='5min', label='right', closed='right').sum()

2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int64

In [9]:
# Shift the result labels back by one second.
# resample's `loffset` argument is deprecated (and removed in pandas 2.0);
# adding the offset to the index after aggregating produces the same labels.
right_closed_sum = ts.resample('5min', closed='right', label='right').sum()
right_closed_sum.index = right_closed_sum.index + pd.Timedelta(seconds=-1)
right_closed_sum

1999-12-31 23:59:59     0
2000-01-01 00:04:59    15
2000-01-01 00:09:59    40
2000-01-01 00:14:59    11
Freq: 5T, dtype: int64

In [10]:
# OHLC (open-high-low-close): for each bin, output the first (open), maximum (high), minimum (low) and last (close) value.
ts.resample(rule='5min').ohlc()

Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,0,4,0,4
2000-01-01 00:05:00,5,9,5,9
2000-01-01 00:10:00,10,11,10,11


In [11]:
# Resampling expressed with groupby.
# The same aggregation can also be reproduced with the resample method.
rng = pd.date_range(start='1/1/2000', periods=100, freq='D')
ts = pd.Series(data=np.arange(100), index=rng)
ts.groupby(lambda stamp: stamp.month).mean()

1    15
2    45
3    75
4    95
dtype: int64

In [12]:
# Group by day of the week (Monday=0 ... Sunday=6).
# The day numbers are read from the index directly: Timestamp.weekday is a
# method, so a lambda returning `x.weekday` only yields numbers when pandas
# happens to apply it to the whole DatetimeIndex rather than to each element.
ts.groupby(ts.index.weekday).mean()

0    47.5
1    48.5
2    49.5
3    50.5
4    51.5
5    49.0
6    50.0
dtype: float64

In [13]:
# Upsampling and filling.
# See what happens when converting from a lower to a higher frequency.
# Values come from a locally seeded generator so reruns reproduce the frame.
frame = pd.DataFrame(
    np.random.RandomState(0).randn(2, 4),
    index=pd.date_range('1/1/2000', periods=2, freq='W-WED'),
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
)
frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.355836,0.866101,1.21085,-0.999384
2000-01-12,2.453208,-0.474035,0.29092,-0.678849


In [14]:
# Going from a lower to a higher frequency leaves missing values by default.
df_daily = frame.resample(rule='D').asfreq()
df_daily

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.355836,0.866101,1.21085,-0.999384
2000-01-06,,,,
2000-01-07,,,,
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,2.453208,-0.474035,0.29092,-0.678849


In [15]:
# ffill forward-fills the gaps with the last available values.
frame.resample(rule='D').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.355836,0.866101,1.21085,-0.999384
2000-01-06,0.355836,0.866101,1.21085,-0.999384
2000-01-07,0.355836,0.866101,1.21085,-0.999384
2000-01-08,0.355836,0.866101,1.21085,-0.999384
2000-01-09,0.355836,0.866101,1.21085,-0.999384
2000-01-10,0.355836,0.866101,1.21085,-0.999384
2000-01-11,0.355836,0.866101,1.21085,-0.999384
2000-01-12,2.453208,-0.474035,0.29092,-0.678849


In [16]:
# Limit forward filling to at most two consecutive rows.
frame.resample(rule='D').ffill(limit=2)

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.355836,0.866101,1.21085,-0.999384
2000-01-06,0.355836,0.866101,1.21085,-0.999384
2000-01-07,0.355836,0.866101,1.21085,-0.999384
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,2.453208,-0.474035,0.29092,-0.678849


In [17]:
# The new date index does not need to overlap with the dates before resampling.
frame.resample(rule='W-THU').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-06,0.355836,0.866101,1.21085,-0.999384
2000-01-13,2.453208,-0.474035,0.29092,-0.678849


In [18]:
# Resampling with periods.
# Values come from a locally seeded generator so reruns reproduce the frame.
frame = pd.DataFrame(
    np.random.RandomState(0).randn(24, 4),
    index=pd.period_range('1-2000', '12-2001', freq='M'),
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
)
frame.head(5)

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01,1.366474,-0.316824,-0.237376,-0.34405
2000-02,-1.247976,-0.504781,-0.623667,1.024422
2000-03,-0.115523,-0.54858,0.712019,0.12833
2000-04,0.73274,0.455265,-1.513973,0.330788
2000-05,-0.582111,0.747779,-1.431323,-0.24928


In [19]:
# Downsampling: average the monthly periods into years ending in December.
annual_frame = frame.resample(rule='A-DEC').mean()
annual_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000,-0.467154,-0.011654,-0.364344,0.485621
2001,-0.185651,0.106992,0.555875,-0.514216


In [20]:
# Upsampling: expand the annual values to quarters and forward-fill them.
annual_frame.resample(rule='Q-DEC').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,-0.467154,-0.011654,-0.364344,0.485621
2000Q2,-0.467154,-0.011654,-0.364344,0.485621
2000Q3,-0.467154,-0.011654,-0.364344,0.485621
2000Q4,-0.467154,-0.011654,-0.364344,0.485621
2001Q1,-0.185651,0.106992,0.555875,-0.514216
2001Q2,-0.185651,0.106992,0.555875,-0.514216
2001Q3,-0.185651,0.106992,0.555875,-0.514216
2001Q4,-0.185651,0.106992,0.555875,-0.514216


In [21]:
# convention controls which side of the period boundary each value is assigned to (here the end).
annual_frame.resample(rule='Q-DEC', convention='end').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,-0.467154,-0.011654,-0.364344,0.485621
2001Q1,-0.467154,-0.011654,-0.364344,0.485621
2001Q2,-0.467154,-0.011654,-0.364344,0.485621
2001Q3,-0.467154,-0.011654,-0.364344,0.485621
2001Q4,-0.185651,0.106992,0.555875,-0.514216


In [22]:
# Same upsampling, this time onto the 'Q-MAR' quarterly frequency.
annual_frame.resample(rule='Q-MAR').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,-0.467154,-0.011654,-0.364344,0.485621
2001Q1,-0.467154,-0.011654,-0.364344,0.485621
2001Q2,-0.467154,-0.011654,-0.364344,0.485621
2001Q3,-0.467154,-0.011654,-0.364344,0.485621
2001Q4,-0.185651,0.106992,0.555875,-0.514216
2002Q1,-0.185651,0.106992,0.555875,-0.514216
2002Q2,-0.185651,0.106992,0.555875,-0.514216
2002Q3,-0.185651,0.106992,0.555875,-0.514216
