In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import FinanceDataReader as fdr

from pandas import DataFrame, Series
from initializer import init

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Font used for plot text. NOTE(review): 'AppleGothic' is presumably chosen for Hangul labels and
# may not be installed on non-macOS machines — confirm before running elsewhere.
plt.rcParams['font.family'] = 'AppleGothic'
# Disable matplotlib's Unicode minus glyph for negative tick labels.
mpl.rcParams['axes.unicode_minus'] = False

# Project-local setup imported from `initializer`; its effect is not visible in this notebook.
# NOTE(review): the cells below echo every bare expression (not only the last one), so this
# presumably adjusts IPython/pandas display settings — confirm against initializer.py.
init()

In [2]:
# Price history for ticker "005930" between 2009-09-01 and 2010-12-31,
# with the "Change" column removed.
df: DataFrame = (
    fdr.DataReader("005930", "2009-09-01", "2010-12-31")
    .drop(columns=["Change"])
)
df.tail(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-12-24,18780,18780,18500,18500,169733
2010-12-27,18500,18600,18300,18440,210673
2010-12-28,18560,19000,18520,18760,313323
2010-12-29,18700,19020,18640,18900,238527
2010-12-30,18840,19040,18840,18980,183831


> index는 `DatetimeIndex` type이고 sorting 되어 있어야 함

In [3]:
# Small working sample: the first three rows of the price frame.
sample = df.head(3)
sample

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-01,15600,16000,15340,15980,511854
2009-09-02,15699,15820,15600,15680,558282
2009-09-03,15660,15700,15480,15500,321269


# 1. `asfreq()`

- sampling 대상 index에 mapping 되어 있는 value는 그대로 유지

In [4]:
# Re-index the sample onto an hourly grid; timestamps that did not exist stay NaN.
sample.asfreq(freq="H")

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-01 00:00:00,15600.0,16000.0,15340.0,15980.0,511854.0
2009-09-01 01:00:00,,,,,
2009-09-01 02:00:00,,,,,
2009-09-01 03:00:00,,,,,
2009-09-01 04:00:00,,,,,
2009-09-01 05:00:00,,,,,
2009-09-01 06:00:00,,,,,
2009-09-01 07:00:00,,,,,
2009-09-01 08:00:00,,,,,
2009-09-01 09:00:00,,,,,


In [9]:
# Same hourly grid, but each new slot carries the previous observed row forward.
sample.asfreq(freq="H", method="ffill").head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-01 00:00:00,15600,16000,15340,15980,511854
2009-09-01 01:00:00,15600,16000,15340,15980,511854
2009-09-01 02:00:00,15600,16000,15340,15980,511854
2009-09-01 03:00:00,15600,16000,15340,15980,511854
2009-09-01 04:00:00,15600,16000,15340,15980,511854


In [10]:
# Month-end grid over a three-day sample: no month-end date falls inside it, so the result is empty.
sample.asfreq(freq="M")

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [12]:
"""월 말일이 거래일이 아닐 경우 다음과 같이 NaN으로 값이 채워짐"""
# (When the calendar month-end is not a trading day, that row comes back as NaN.)
df.asfreq(freq="M")

'월 말일이 거래일이 아닐 경우 다음과 같이 NaN으로 값이 채워짐'

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-30,16140.0,16500.0,16120.0,16300.0,479813.0
2009-10-31,,,,,
2009-11-30,14460.0,14640.0,14380.0,14400.0,376335.0
2009-12-31,,,,,
2010-01-31,,,,,
2010-02-28,,,,,
2010-03-31,16280.0,16680.0,16260.0,16360.0,434300.0
2010-04-30,16560.0,17040.0,16540.0,16980.0,454668.0
2010-05-31,15440.0,15620.0,15360.0,15520.0,255822.0
2010-06-30,15399.0,15600.0,15399.0,15480.0,536406.0


# 2. `resample()`

- (date)time-based groupby
- aggregation function과 같이 쓰인다.
- 사실 `asfreq()`는 잘 사용하지 않는다.

In [13]:
# Keep "Close" as a one-column DataFrame (not a Series).
close_df = df.loc[:, ["Close"]]

# Daily log return: log(close_t / close_{t-1}); the first row has no predecessor, so it is set to 0.
log_rtn_df: DataFrame = np.log(
    close_df / close_df.shift(periods=1)
).fillna(value=0)

log_rtn_df.head(5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-09-01,0.0
2009-09-02,-0.019
2009-09-03,-0.012
2009-09-04,-0.01
2009-09-07,0.008


In [14]:
# resample() alone only builds the grouping object; nothing is aggregated yet.
log_rtn_df.resample(rule="M")

<pandas.core.resample.DatetimeIndexResampler object at 0x12067cd00>

In [20]:
"""resample의 단점 - 데이터 내 연말의 값을 대표값으로 설정해버림"""
"""그래도 날짜는 Business Day 여부 상관없이 12월 31일로 고정함"""
# (Drawback: the last value present in each year becomes that year's representative,
#  while the label is still pinned to Dec 31 whether or not it is a business day.)

# Annual bins, taking the last row of each bin.
log_rtn_df.resample(rule="A").last()

"""2009-12-30 value 확인"""
# (Check the 2009-12-30 value.) Every row from 2009-12-27 onward.
log_rtn_df.loc[slice("2009-12-27", None)]

'resample의 단점 - 데이터 내 연말의 값을 대표값으로 설정해버림'

'그래도 날짜는 Business Day 여부 상관없이 12월 31일로 고정함'

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-12-31,0.016
2010-12-31,0.004


'2009-12-30 value 확인'

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-12-28,-0.004
2009-12-29,-0.001
2009-12-30,0.016
2010-01-04,0.012
2010-01-05,0.016
...,...
2010-12-24,-0.012
2010-12-27,-0.003
2010-12-28,0.017
2010-12-29,0.007


In [29]:
"""사실 이렇게 쓸 일은 없지만, 작동 방식을 보는 것으로 하자."""
# (Rarely useful in practice; shown only to illustrate the mechanics.)
# Two spellings of the same hourly mean: the direct method and a per-column mapping.
log_rtn_df.resample(rule="H").mean().head(n=5)
log_rtn_df.resample(rule="H").aggregate({"Close": "mean"}).head(n=5)

'사실 이렇게 쓸 일은 없지만, 작동 방식을 보는 것으로 하자.'

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-09-01 00:00:00,0.0
2009-09-01 01:00:00,
2009-09-01 02:00:00,
2009-09-01 03:00:00,
2009-09-01 04:00:00,


Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-09-01 00:00:00,0.0
2009-09-01 01:00:00,
2009-09-01 02:00:00,
2009-09-01 03:00:00,
2009-09-01 04:00:00,


## 2.1. Upsampling & Downsampling

In [30]:
# Upsampling: daily rows spread over hourly bins; bins without data average to NaN.
by_hour = sample.resample(rule="H").mean()
by_hour.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-01 00:00:00,15600.0,16000.0,15340.0,15980.0,511854.0
2009-09-01 01:00:00,,,,,
2009-09-01 02:00:00,,,,,
2009-09-01 03:00:00,,,,,
2009-09-01 04:00:00,,,,,


In [31]:
"""Downsampling"""
# Collapse the daily sample into annual bins, averaging each column.
sample.resample(rule="A").mean()

'Downsampling'

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-12-31,15653.0,15840.0,15473.333,15720.0,463801.667


### 2.1.1. Filling `NaN`

1. `fillna`: `bfill`, `ffill`
2. `interpolate()`: linear interpolation

In [33]:
"""이전 값으로 NaN 채우기"""
# (Fill each NaN with the most recent preceding value.)
# DataFrame.ffill() is the dedicated forward-fill method; it replaces
# fillna(method="ffill"), whose `method` argument is deprecated in recent pandas.
by_hour.ffill().head()

'이전 값으로 NaN 채우기'

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-01 00:00:00,15600.0,16000.0,15340.0,15980.0,511854.0
2009-09-01 01:00:00,15600.0,16000.0,15340.0,15980.0,511854.0
2009-09-01 02:00:00,15600.0,16000.0,15340.0,15980.0,511854.0
2009-09-01 03:00:00,15600.0,16000.0,15340.0,15980.0,511854.0
2009-09-01 04:00:00,15600.0,16000.0,15340.0,15980.0,511854.0


In [37]:
"""interpolate() - NaN 양 끝 데이터를 등분해서 값을 채움"""
"""[1, NaN, NaN, 4] 라고 할 때 [1, 2, 3, 4]로 만듦"""
# (Fills a NaN run with evenly spaced steps between its two known endpoints,
#  e.g. [1, NaN, NaN, 4] becomes [1, 2, 3, 4].)
by_hour.interpolate(method="linear").head(n=5)

"""실제로 값 차이가 동일한지 확인하려면 - (df - df.shift(1)) or diff()"""
# (To confirm the steps are equal, inspect the row-to-row differences.)
by_hour.interpolate(method="linear").diff(periods=1).head(n=5)

'interpolate() - NaN 양 끝 데이터를 등분해서 값을 채움'

'[1, NaN, NaN, 4] 라고 할 때 [1, 2, 3, 4]로 만듦'

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-01 00:00:00,15600.0,16000.0,15340.0,15980.0,511854.0
2009-09-01 01:00:00,15604.125,15992.5,15350.833,15967.5,513788.5
2009-09-01 02:00:00,15608.25,15985.0,15361.667,15955.0,515723.0
2009-09-01 03:00:00,15612.375,15977.5,15372.5,15942.5,517657.5
2009-09-01 04:00:00,15616.5,15970.0,15383.333,15930.0,519592.0


'실제로 값 차이가 동일한지 확인하려면 - (df - df.shift(1)) or diff()'

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-01 00:00:00,,,,,
2009-09-01 01:00:00,4.125,-7.5,10.833,-12.5,1934.5
2009-09-01 02:00:00,4.125,-7.5,10.833,-12.5,1934.5
2009-09-01 03:00:00,4.125,-7.5,10.833,-12.5,1934.5
2009-09-01 04:00:00,4.125,-7.5,10.833,-12.5,1934.5


## 2.2. `kind` argument

In [41]:
"""인덱스의 종류를 지정한다"""
# (`kind` selects the type of index on the result.)
"""Period"""
df.resample(rule="M", kind="period").mean()
"""Monthly Last Day"""
df.resample(rule="M", kind="timestamp").mean()

'인덱스의 종류를 지정한다'

'Period'

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09,15814.455,16009.955,15633.455,15865.455,464460.273
2009-10,14999.857,15171.143,14721.762,14897.143,533470.381
2009-11,14692.952,14838.095,14507.476,14651.429,374873.143
2009-12,15435.952,15590.381,15321.524,15496.19,334084.905
2010-01,16438.0,16607.9,16199.0,16402.0,378501.05
2010-02,15305.053,15399.947,15098.842,15232.632,317400.0
2010-03,15844.318,15983.591,15708.136,15853.636,274398.773
2010-04,16891.818,17017.273,16688.182,16882.727,332281.682
2010-05,15800.895,15932.526,15587.368,15744.211,396018.053
2010-06,15964.619,16094.19,15813.238,15953.333,299446.048


'Monthly Last Day'

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-30,15814.455,16009.955,15633.455,15865.455,464460.273
2009-10-31,14999.857,15171.143,14721.762,14897.143,533470.381
2009-11-30,14692.952,14838.095,14507.476,14651.429,374873.143
2009-12-31,15435.952,15590.381,15321.524,15496.19,334084.905
2010-01-31,16438.0,16607.9,16199.0,16402.0,378501.05
2010-02-28,15305.053,15399.947,15098.842,15232.632,317400.0
2010-03-31,15844.318,15983.591,15708.136,15853.636,274398.773
2010-04-30,16891.818,17017.273,16688.182,16882.727,332281.682
2010-05-31,15800.895,15932.526,15587.368,15744.211,396018.053
2010-06-30,15964.619,16094.19,15813.238,15953.333,299446.048


In [42]:
# Compare the resulting index types: PeriodIndex versus DatetimeIndex.
df.resample(rule="M", kind="period").mean().index
df.resample(rule="M", kind="timestamp").mean().index  # same index as when `kind` is omitted

PeriodIndex(['2009-09', '2009-10', '2009-11', '2009-12', '2010-01', '2010-02',
             '2010-03', '2010-04', '2010-05', '2010-06', '2010-07', '2010-08',
             '2010-09', '2010-10', '2010-11', '2010-12'],
            dtype='period[M]', name='Date')

DatetimeIndex(['2009-09-30', '2009-10-31', '2009-11-30', '2009-12-31',
               '2010-01-31', '2010-02-28', '2010-03-31', '2010-04-30',
               '2010-05-31', '2010-06-30', '2010-07-31', '2010-08-31',
               '2010-09-30', '2010-10-31', '2010-11-30', '2010-12-31'],
              dtype='datetime64[ns]', name='Date', freq='M')

## 2.3. `ohlc()`

In [46]:
"""월봉?!"""
# (Monthly candles?!) Open/high/low/close of the daily "Close" series within each month.
df.loc[:, "Close"].resample(rule="M").ohlc()

'월봉?!'

Unnamed: 0_level_0,open,high,low,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-09-30,15980,16500,15280,16300
2009-10-31,15840,15840,14360,14460
2009-11-30,14360,15180,14240,14400
2009-12-31,14740,15980,14740,15980
2010-01-31,16180,17000,15680,15680
2010-02-28,15540,15580,14720,14880
2010-03-31,15400,16400,15220,16360
2010-04-30,16900,17400,16500,16980
2010-05-31,16580,16680,14820,15520
2010-06-30,15360,16600,15360,15480


## 2.4. vs `asfreq()`

In [48]:
# asfreq() hands back a DataFrame directly ...
df.asfreq(freq="M")
# ... whereas resample() hands back a lazy grouping object (DatetimeIndexResampler).
df.resample(rule="M")

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-30,16140.0,16500.0,16120.0,16300.0,479813.0
2009-10-31,,,,,
2009-11-30,14460.0,14640.0,14380.0,14400.0,376335.0
2009-12-31,,,,,
2010-01-31,,,,,
2010-02-28,,,,,
2010-03-31,16280.0,16680.0,16260.0,16360.0,434300.0
2010-04-30,16560.0,17040.0,16540.0,16980.0,454668.0
2010-05-31,15440.0,15620.0,15360.0,15520.0,255822.0
2010-06-30,15399.0,15600.0,15399.0,15480.0,536406.0


<pandas.core.resample.DatetimeIndexResampler object at 0x169ccb7f0>

In [51]:
# asfreq: always looks up the calendar month-end date itself.
df.asfreq(freq="M").head(n=5)
# resample + last: the last available row in each month, though the label is still pinned to the calendar month-end.
df.resample(rule="M").last().head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-30,16140.0,16500.0,16120.0,16300.0,479813.0
2009-10-31,,,,,
2009-11-30,14460.0,14640.0,14380.0,14400.0,376335.0
2009-12-31,,,,,
2010-01-31,,,,,


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-30,16140,16500,16120,16300,479813
2009-10-31,14660,14799,14440,14460,557328
2009-11-30,14460,14640,14380,14400,376335
2009-12-31,15760,15980,15740,15980,295625
2010-01-31,16000,16019,15600,15680,457285


In [52]:
# One mean per column, taken over the month-end rows only (NaN rows are skipped).
df.asfreq(freq="M").mean()
# One mean per column per month, taken over every row in that month.
df.resample(rule="M").mean()

Open      15742.111
High      16002.111
Low       15670.889
Close     15802.222
Volume   415651.222
dtype: float64

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-30,15814.455,16009.955,15633.455,15865.455,464460.273
2009-10-31,14999.857,15171.143,14721.762,14897.143,533470.381
2009-11-30,14692.952,14838.095,14507.476,14651.429,374873.143
2009-12-31,15435.952,15590.381,15321.524,15496.19,334084.905
2010-01-31,16438.0,16607.9,16199.0,16402.0,378501.05
2010-02-28,15305.053,15399.947,15098.842,15232.632,317400.0
2010-03-31,15844.318,15983.591,15708.136,15853.636,274398.773
2010-04-30,16891.818,17017.273,16688.182,16882.727,332281.682
2010-05-31,15800.895,15932.526,15587.368,15744.211,396018.053
2010-06-30,15964.619,16094.19,15813.238,15953.333,299446.048


# 3. Exercise - 월별 수익률 구하기

- 월마다 발생하는 총수익을 월별로 산출

In [53]:
# Render floats with exactly five decimal places for the rest of the notebook.
pd.set_option("display.float_format", "{:.5f}".format)

## 3.1. Using `resample()`

In [54]:
# Daily log returns again, now shown with five decimals.
log_rtn_df.head(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-09-01,0.0
2009-09-02,-0.01895
2009-09-03,-0.01155
2009-09-04,-0.01038
2009-09-07,0.00779


In [60]:
"""로그 수익률"""
# (Log return.) Summing the daily log returns inside each month gives the monthly log return.
log_rtn_df.resample(rule="M").sum()

'로그 수익률'

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-09-30,0.01983
2009-10-31,-0.11978
2009-11-30,-0.00416
2009-12-31,0.10411
2010-01-31,-0.01895
2010-02-28,-0.05237
2010-03-31,0.09482
2010-04-30,0.0372
2010-05-31,-0.08991
2010-06-30,-0.00258


In [61]:
"""실제 수익률"""
# (Actual return.) exp(monthly log return) - 1 converts back to a simple return.
np.exp(
    log_rtn_df.resample(rule="M").sum()
) - 1

'실제 수익률'

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-09-30,0.02003
2009-10-31,-0.11288
2009-11-30,-0.00415
2009-12-31,0.10972
2010-01-31,-0.01877
2010-02-28,-0.05102
2010-03-31,0.09946
2010-04-30,0.0379
2010-05-31,-0.08598
2010-06-30,-0.00258


In [64]:
"""월별 수익률"""
# (Monthly return.) Simple return per month, rebuilt from the summed daily log returns.
month_cum_rtn_df: DataFrame = np.exp(
    log_rtn_df.resample(rule="M").sum()
) - 1

month_cum_rtn_df.head(n=5)
# Single month looked up by its month-end label.
month_cum_rtn_df.loc["2010-01-31", :]

'월별 수익률'

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-09-30,0.02003
2009-10-31,-0.11288
2009-11-30,-0.00415
2009-12-31,0.10972
2010-01-31,-0.01877


Close   -0.01877
Name: 2010-01-31 00:00:00, dtype: float64

In [82]:
"""10월 31일 데이터가 없어서 30일 데이터로 계산함"""
# (There is no row for Oct 31, so the Oct 30 close is used instead.)
# October's simple return straight from prices: close(2009-10-30) / close(2009-09-30) - 1.
oct_rtn_from_close = (df.loc["2009-10-30", "Close"] / df.loc["2009-09-30", "Close"]) - 1
oct_rtn_from_close

# Cross-check against the resample-based monthly return. Use an explicit absolute
# tolerance rather than round(...) == round(...): two values a hair apart can land on
# opposite sides of a rounding boundary and fail spuriously.
assert np.isclose(
    month_cum_rtn_df.loc["2009-10-31", "Close"],
    oct_rtn_from_close,
    rtol=0.0,
    atol=1e-12,
), "resample-based October return disagrees with the close-to-close return"

'10월 31일 데이터가 없어서 30일 데이터로 계산함'

-0.1128834355828221

## 3.2. `drop_duplicates()`

- 정확하게 "월말 데이터"를 내가 가지고 있는 데이터의 "월 말 날짜"에 가져오기

In [83]:
# Reminder of the raw daily frame before extracting month-end rows.
df.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-01,15600,16000,15340,15980,511854
2009-09-02,15699,15820,15600,15680,558282
2009-09-03,15660,15700,15480,15500,321269
2009-09-04,15620,15660,15220,15340,456530
2009-09-07,15420,15600,15219,15460,339857


In [84]:
# Keep the last available row of every calendar month, labelled with its own
# (actual) date rather than the calendar month-end.
#
# The helper "year"/"month" columns are attached with assign(), which returns a new
# frame, so the shared `df` is left untouched. Writing them into `df` itself would
# leak two extra columns into every earlier cell when it is re-run
# (e.g. df.resample("M").mean()).
monthly_df = (
    df.assign(year=df.index.year, month=df.index.month)
    .drop_duplicates(subset=["year", "month"], keep="last")  # last row per (year, month)
    .drop(columns=["year", "month"])                         # helper columns no longer needed
)
monthly_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-30,16140,16500,16120,16300,479813
2009-10-30,14660,14799,14440,14460,557328
2009-11-30,14460,14640,14380,14400,376335
2009-12-30,15760,15980,15740,15980,295625
2010-01-29,16000,16019,15600,15680,457285


In [87]:
"""첫 달은 수익률이 0이 되어버리는..."""
# (The first month ends up with a return of 0, because it has no previous month-end to compare to.)
monthly_df.loc[:, ["Close"]].pct_change(periods=1).fillna(value=0)

'첫 달은 수익률이 0이 되어버리는...'

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2009-09-30,0.0
2009-10-30,-0.11288
2009-11-30,-0.00415
2009-12-30,0.10972
2010-01-29,-0.01877
2010-02-26,-0.05102
2010-03-31,0.09946
2010-04-30,0.0379
2010-05-31,-0.08598
2010-06-30,-0.00258
