## create_datasets를 먼저 돌리고 실행

In [1]:
import pandas as pd

# Fama - French 5요인
- 참고 논문: http://kiss.kstudy.com/thesis/thesis-view.asp?key=3450399

## Size 요소 가져오기

In [2]:
idx = pd.IndexSlice

In [3]:
stock_data = pd.read_hdf('assets.h5', 'finance_datareader/prices')
market_cap_rank_data = stock_data[['marcap', 'rank']]

In [4]:
total_asset = pd.read_csv('IFRS/TotalAssets.csv', encoding='CP949')

In [5]:
# 종목코드를 6자리로 맞춰줌 -> 파이썬 고질병 '000660'을 숫자로 자동 인식하면 앞의 000을 날려버림 이거를 string으로 변경해서 복구
ticker = total_asset['거래소코드'].apply("{0:0>6}".format)

In [6]:
# 금융업 제외한 재무제표랑 market cap 데이터랑 겹치는 종목 추출
intersect_ticker = market_cap_rank_data.unstack('date').index.intersection(ticker)

In [7]:
size_factor = (market_cap_rank_data
                .unstack('date')
                .loc[intersect_ticker,:].stack('date')
                .reorder_levels(['date','ticker'])
                .sort_index())

In [8]:
size_factor = size_factor.astype('int64')

In [9]:
# 월별로 시가총액 순위를 구하고 6월달 시가총액 순위만 출력
month_rank = size_factor['marcap'].unstack('ticker').resample('M').last().rank(axis=1)
six_month_rank = month_rank[month_rank.index.month == 6].stack('ticker').to_frame()
six_month_rank = six_month_rank.rename(columns={0:'rank'})
six_month_rank = six_month_rank.astype('int32')

In [10]:
# Monthly Period로 변경
six_month_rank.index = six_month_rank.index.set_levels(six_month_rank.index.levels[0].to_period('M'), level=0)

In [11]:
# 일자별 시가총액 순위 데이터
size_factor.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,marcap,rank
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
1995-05-02,20,76796000000,191
1995-05-02,40,24826762500,363
1995-05-02,50,84854000000,179
1995-05-02,70,247896000000,58
1995-05-02,80,166793251500,81


In [12]:
six_month_rank.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
date,ticker,Unnamed: 2_level_1
1995-06,20,230
1995-06,40,72
1995-06,50,233
1995-06,70,309
1995-06,80,293


## B/M Book to Market Ratio 구하기
- 시가총액은 연도 말을 기준으로 한다.
- 재무제표는 사업보고서이고 공시 날짜는 6월 1일로 통일한다.

In [13]:
def get_ifrs_data(data_path: str):
    data = pd.read_csv(data_path, encoding='CP949')
    data.fillna(0, inplace=True)
    data['거래소코드'] = data['거래소코드'].apply("{0:0>6}".format)
    data['회계년도'] = pd.to_datetime(data['회계년도'])
    data = data.set_index(['회계년도','거래소코드']).sort_index()
    # 회계년도 안맞는 것들 resample을 이용해 연말로 회계년도 통일
    data = data.unstack('거래소코드').resample('Y').last().stack('거래소코드')
    data.index = data.index.set_levels(data.index.levels[0].to_period('M'), level=0)
    col_list = data.columns
    before_ifrs = data.loc[idx[:'2006',:],:][col_list[-2]]
    after_ifrs = data.loc[idx['2007':,:],:][col_list[-1]]
    data = pd.concat([before_ifrs, after_ifrs]).to_frame()
    data.columns = [col_list[-2]]
    data.index.names = ['date','ticker']
    data.sort_index(inplace=True)
    return data.copy()

In [14]:
idx = pd.IndexSlice

In [15]:
common_stock_capital = get_ifrs_data('IFRS/CommonStock.csv')

In [16]:
common_stock_capital

Unnamed: 0_level_0,Unnamed: 1_level_0,보통주자본금(천원)
date,ticker,Unnamed: 2_level_1
1981-12,000020,2808000.0
1981-12,000040,3450000.0
1981-12,000050,2400000.0
1981-12,000070,9500000.0
1981-12,000080,3000000.0
...,...,...
2020-12,363280,18750451.0
2020-12,375500,0.0
2020-12,378850,0.0
2020-12,900140,0.0


In [17]:
capital_surplus = get_ifrs_data('IFRS/CapitalSurplus.csv')

In [18]:
capital_surplus

Unnamed: 0_level_0,Unnamed: 1_level_0,자본잉여금(*)(천원)
date,ticker,Unnamed: 2_level_1
1981-12,000020,4410.0
1981-12,000040,1903860.0
1981-12,000050,12797820.0
1981-12,000070,9538010.0
1981-12,000080,8837667.0
...,...,...
2020-12,363280,528596133.0
2020-12,375500,0.0
2020-12,378850,0.0
2020-12,900140,0.0


In [19]:
retained_earnings = get_ifrs_data('IFRS/RetainedEarnings.csv')

In [20]:
retained_earnings

Unnamed: 0_level_0,Unnamed: 1_level_0,이익잉여금(*)(천원)
date,ticker,Unnamed: 2_level_1
1981-12,000020,2567147.0
1981-12,000040,343796.0
1981-12,000050,0.0
1981-12,000070,6452616.0
1981-12,000080,1002994.0
...,...,...
2020-12,363280,-2390485.0
2020-12,375500,0.0
2020-12,378850,0.0
2020-12,900140,0.0


In [21]:
deferred_tax_liabilities = get_ifrs_data('IFRS/DeferredTaxLiabilities.csv')

In [22]:
market_cap_data = stock_data['marcap'].to_frame()

In [23]:
market_cap_data = market_cap_data.unstack('ticker').resample('Y').last().stack('ticker')

In [24]:
market_cap_data.index = market_cap_data.index.set_levels(market_cap_data.index.levels[0].to_period('M'), level=0)

In [25]:
market_cap_data

Unnamed: 0_level_0,Unnamed: 1_level_0,marcap
date,ticker,Unnamed: 2_level_1
1995-12,000020,6.390900e+10
1995-12,000030,1.317500e+12
1995-12,000040,2.919949e+10
1995-12,000050,6.760600e+10
1995-12,000060,2.385240e+11
...,...,...
2021-12,378850,1.194699e+11
2021-12,380440,1.077090e+11
2021-12,383220,3.160334e+12
2021-12,900140,3.540609e+11


In [26]:
book_to_market_data = pd.concat([common_stock_capital, capital_surplus, retained_earnings, deferred_tax_liabilities, market_cap_data], axis=1)
book_to_market_data.sort_index(inplace=True)

In [27]:
book_to_market_data

Unnamed: 0_level_0,Unnamed: 1_level_0,보통주자본금(천원),자본잉여금(*)(천원),이익잉여금(*)(천원),이연법인세부채(천원),marcap
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1981-12,000020,2808000.0,4410.0,2567147.0,0.0,
1981-12,000040,3450000.0,1903860.0,343796.0,0.0,
1981-12,000050,2400000.0,12797820.0,0.0,0.0,
1981-12,000070,9500000.0,9538010.0,6452616.0,0.0,
1981-12,000080,3000000.0,8837667.0,1002994.0,0.0,
...,...,...,...,...,...,...
2021-12,378850,,,,,1.194699e+11
2021-12,380440,,,,,1.077090e+11
2021-12,383220,,,,,3.160334e+12
2021-12,900140,,,,,3.540609e+11


In [28]:
# 시가총액이 있는 년도부터 시작
book_to_market_data = book_to_market_data.loc[idx['1995':'2021',:],:]

In [29]:
# 결측치 제거
book_to_market_data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_to_market_data.dropna(inplace=True)


### Book to Market Ratio 구하기

In [30]:
# B/M 계산
columns_list = book_to_market_data.columns
book_value = book_to_market_data.loc[:,columns_list[:-1]].sum(axis=1)
book_to_market_ratio = book_value.div(book_to_market_data['marcap']).to_frame()
book_to_market_ratio.columns = ['BM']


### 회계년도랑 공시년도가 차이가 나므로 회계년도에서 6개월 뒤로 미룬다.
- 년말 12월 기준의 회계가 있어도 공시하기까지 시간이 걸린다. 이부분을 고려

In [31]:
def offset_6_month(data: pd.DataFrame) -> pd.DataFrame:
    '''
        Input DataFrame E.g
        Multi Index DataFrame
                      price
        date  ticker
     1995-12  000660  1000
              005930  20000
              003229  3004000
              
        
        Output DataFrame E.g
                         price
        date  ticker
     1996-06  000660  1000
              005930  20000
              003229  3004000
        
    '''
    date_index = data.index.levels[0].to_timestamp() + pd.DateOffset(months=6)
    data.index = data.index.set_levels(date_index.to_period('M'), level=0)
    return data 

## B/M Ratio 공시날짜를 위해서 6개월 미루기

In [32]:
book_to_market_ratio = offset_6_month(book_to_market_ratio)

In [33]:
book_to_market_ratio

Unnamed: 0_level_0,Unnamed: 1_level_0,BM
date,ticker,Unnamed: 2_level_1
1996-06,000020,0.000646
1996-06,000040,0.001656
1996-06,000050,0.000926
1996-06,000070,0.001059
1996-06,000080,0.001073
...,...,...
2021-06,344820,0.002336
2021-06,352820,0.000212
2021-06,353200,0.001031
2021-06,363280,0.000537


## 수익성 지표 계산
- OP = 영업이익 / 자기자본 장부가치
    + 영업이익 = 매출액 - 매출원가 - 이자비용 - 판관비

#### 매출액

In [34]:
net_sales = get_ifrs_data('IFRS/NetSales.csv')

In [35]:
net_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,매출액(영업수익)(*)(천원)
date,ticker,Unnamed: 2_level_1
1981-12,000020,26964281.0
1981-12,000040,48075570.0
1981-12,000050,52868914.0
1981-12,000070,223628100.0
1981-12,000080,93182984.0
...,...,...
2020-12,363280,0.0
2020-12,375500,0.0
2020-12,378850,0.0
2020-12,900140,0.0


#### 매출원가

In [36]:
cost_of_sales = get_ifrs_data('IFRS/CostOfSales.csv')

In [37]:
cost_of_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,매출원가(*)(천원)
date,ticker,Unnamed: 2_level_1
1981-12,000020,16814161.0
1981-12,000040,42802676.0
1981-12,000050,46724109.0
1981-12,000070,189372232.0
1981-12,000080,75451679.0
...,...,...
2020-12,363280,0.0
2020-12,375500,0.0
2020-12,378850,0.0
2020-12,900140,0.0


#### 이자비용

In [38]:
interest_expense = get_ifrs_data('IFRS/InterestExpenses.csv')

In [39]:
interest_expense

Unnamed: 0_level_0,Unnamed: 1_level_0,이자비용(천원)
date,ticker,Unnamed: 2_level_1
1981-12,000020,0.0
1981-12,000040,0.0
1981-12,000050,0.0
1981-12,000070,0.0
1981-12,000080,0.0
...,...,...
2020-12,363280,0.0
2020-12,375500,0.0
2020-12,378850,0.0
2020-12,900140,0.0


### 판매비와 관리비

In [40]:
selling_and_admin_expense = get_ifrs_data('IFRS/Selling_and_administrative_expenses.csv')

In [41]:
selling_and_admin_expense

Unnamed: 0_level_0,Unnamed: 1_level_0,판매비와관리비(영업비용)(*)(천원)
date,ticker,Unnamed: 2_level_1
1981-12,000020,6385484.0
1981-12,000040,2264367.0
1981-12,000050,2939748.0
1981-12,000070,10461231.0
1981-12,000080,11557635.0
...,...,...
2020-12,363280,2265873.0
2020-12,375500,0.0
2020-12,378850,0.0
2020-12,900140,0.0


In [42]:
op_data = pd.concat([net_sales, cost_of_sales, interest_expense, selling_and_admin_expense], axis=1)

In [43]:
op_data.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,매출액(영업수익)(*)(천원),매출원가(*)(천원),이자비용(천원),판매비와관리비(영업비용)(*)(천원)
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1981-12,20,26964281.0,16814161.0,0.0,6385484.0
1981-12,40,48075570.0,42802676.0,0.0,2264367.0
1981-12,50,52868914.0,46724109.0,0.0,2939748.0
1981-12,70,223628100.0,189372232.0,0.0,10461231.0
1981-12,80,93182984.0,75451679.0,0.0,11557635.0


In [44]:
col_list = op_data.columns

In [45]:
# 위의 식대로 계산
op = (op_data[col_list[0]] - op_data[col_list[1:]].sum(axis=1)).div(book_value).to_frame()

In [46]:
op.columns = ['OP']
op.dropna(inplace=True)

In [47]:
op = offset_6_month(op)

In [48]:
op 

Unnamed: 0_level_0,Unnamed: 1_level_0,OP
date,ticker,Unnamed: 2_level_1
1996-06,000020,0.615225
1996-06,000040,-0.132015
1996-06,000050,0.121340
1996-06,000070,0.229497
1996-06,000080,0.263004
...,...,...
2021-06,339770,0.198420
2021-06,344820,0.026108
2021-06,352820,0.071987
2021-06,353200,0.018592


## 자본투자
- t-1년 12월 말의 총자산에서 t-2년 12월 말의 총 자산을 차감한 총자산증가액
    + inv = TotalAsset(t-1) / TotalAsset(t-2) - 1

### 총 자산

In [49]:
total_asset = get_ifrs_data('IFRS/TotalAssets.csv')

In [50]:
total_assett_unstack = total_asset.unstack('ticker')

In [51]:
inv = (total_assett_unstack / total_assett_unstack.shift(1)).stack('ticker') - 1

In [52]:
inv = offset_6_month(inv)

In [53]:
inv

Unnamed: 0_level_0,Unnamed: 1_level_0,자산(*)(천원)
date,ticker,Unnamed: 2_level_1
1983-06,000040,0.150404
1983-06,000050,0.033943
1983-06,000100,0.524996
1983-06,000120,0.108065
1983-06,000140,0.154147
...,...,...
2021-06,336260,0.593889
2021-06,336370,0.508118
2021-06,339770,0.519012
2021-06,352820,5.267460


## Size-B/M, Size-OP, Size-Inv Portfolio 구성
- 가격 데이터가 1995년부터 있으므로 1995년부터 Portfolio 구성
- Value weight portfolio

기업규모
- 하위 50%, 상위 50%


B/M
- 상위 33%, 중위 33%, 하위 33%


수익성
- 상위 33%, 중위 33%, 하위 33%


자본투자
- 하위 33%, 중위 33%, 상위 33%

#### Size - B/M 포트폴리오
|Size/ BM|Small|Big|
|---|---|---|
|High BM|SH|BH|
|2|SN|BN|
|Low BM|SL|BL|

#### Size - OP 포트폴리오
|Size/ OP|Small|Big|
|---|---|---|
|High OP|SR|BR|
|2|SN|BN|
|Low OP|SW|BW|

#### Size - Inv 포트폴리오
|Size/ Inv|Small|Big|
|---|---|---|
|High Inv|SC|BC|
|2|SN|BN|
|Low Inv|SA|BA|

In [54]:
period_index = book_to_market_ratio.unstack('ticker').index

### 1년 Size - B/M Portfolio 수익률 먼저 만들어 보기

In [55]:
# 처음 월들을 기준으로 정하기
first_month = period_index[0]
print(first_month)

1996-06


In [56]:
rank = six_month_rank.loc[idx[first_month,:],:]['rank']

In [57]:
rank

date     ticker
1996-06  000020    264
         000040    119
         000050    217
         000070    326
         000080    306
                  ... 
         021050     20
         025000    263
         025620    165
         025820    113
         025860    335
Name: rank, Length: 371, dtype: int32

#### 2분위수로 Size Factor 나누기

In [58]:
rank_qauntile = pd.qcut(rank, q=2, labels=['small','big']).to_frame()

In [59]:
big_marcap_ticker = rank_qauntile[rank_qauntile['rank'].astype(str) == 'big'].index.get_level_values('ticker')

In [60]:
small_marcap_ticker = rank_qauntile[rank_qauntile['rank'].astype(str) == 'small'].index.get_level_values('ticker')

#### B/M을 3분위수로 나누기

#### Size - B/M 포트폴리오
|Size/ BM|Small|Big|
|---|---|---|
|High BM|SH|BH|
|2|SN|BN|
|Low BM|SL|BL|

- 표에서 SL, SM, SB 이런식으로 나누기

In [61]:
big_marcap_bm = book_to_market_ratio.loc[idx[first_month, big_marcap_ticker],:]

In [62]:
big_marcap_bm_quantile = pd.qcut(big_marcap_bm['BM'], q=3, labels=['BL','BN','BH']).to_frame()

In [63]:
small_marcap_bm = book_to_market_ratio.loc[idx[first_month, small_marcap_ticker],:]

In [64]:
small_marcap_bm_quantile = pd.qcut(small_marcap_bm['BM'], q=3, labels=['SL','SN','SH']).to_frame()