In [1]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import pandas_ta
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw per-day stock table and index it by (date, ticker).
df = pd.read_csv('/Users/lydiretsai/Documents/quant/AlgorithmicTradingMLQuant/stock_list_added.csv')
# Parse the date column to timestamps before it becomes an index level.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index(['date', 'ticker'])

In [4]:
# Monthly dollar-volume table: mean dollar volume per ticker for each calendar month.
# Select the single column before pivoting so only that column is unstacked,
# instead of unstacking every column of `df` and discarding all but one.
dollar_volume_df = df['dollar volume'].unstack()
# Average within each month, then return to long (date, ticker) form.
dollar_volume_df = dollar_volume_df.resample('M').mean().stack('ticker').to_frame('dollar volume')

In [20]:
# Select the most liquid stocks in bulk.
excluded_cols = ['symbol', 'turn over', 'volatility', 'industry', 'dollar volume', 'volume',
                 'open', 'high', 'low', 'percentage change', 'price change']
last_cols = [col for col in df.columns.unique(0) if col not in excluded_cols]

# Last value of each kept column within every month, back in long (date, ticker) form,
# placed side by side with the monthly mean dollar volume; incomplete rows are dropped.
month_end_values = df.unstack()[last_cols].resample('M').last().stack('ticker')
data = pd.concat([dollar_volume_df, month_end_values], axis=1).dropna()

# Replace each stock's dollar volume with its 60-month (5-year) rolling mean before filtering.
rolling_dollar_volume = data['dollar volume'].unstack('ticker').rolling(5 * 12).mean().stack()
data['dollar volume'] = rolling_dollar_volume

# Rank within each date; ranks start at 1 for the largest dollar volume.
data['dollar_vol_rank'] = data.groupby('date')['dollar volume'].rank(ascending=False)

# Keep rows whose rank is strictly below 150, then drop the two helper columns.
is_liquid = data['dollar_vol_rank'] < 150
data = data.loc[is_liquid].drop(columns=['dollar volume', 'dollar_vol_rank'])

In [14]:
def calculate_returns(df, outlier_cutoff=0.005, lags=(1, 2, 3, 6, 9, 12)):
    """Return a copy of ``df`` with one ``return_{lag}m`` column per lag.

    For every lag the ``lag``-row percentage change of ``close`` is clipped to
    its own ``[outlier_cutoff, 1 - outlier_cutoff]`` quantile range and then
    converted to a geometric per-row average: ``(1 + r) ** (1 / lag) - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``close`` column. The ``m`` suffix in the output
        column names presumes one row per month -- verify against the caller.
    outlier_cutoff : float, default 0.005
        Quantile used for the lower clip bound; the upper bound uses
        ``1 - outlier_cutoff``.
    lags : iterable of int, default (1, 2, 3, 6, 9, 12)
        Look-back lengths, in rows, for which a return column is produced.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added return columns. The input is not modified,
        which keeps the function safe to use as a ``groupby(...).apply``
        callback (pandas does not support callbacks that mutate their input).
    """
    result = df.copy()
    close = result['close']
    for lag in lags:
        raw_return = close.pct_change(lag)
        # Clip against this series' own quantiles to tame extreme observations.
        lower_bound = raw_return.quantile(outlier_cutoff)
        upper_bound = raw_return.quantile(1 - outlier_cutoff)
        clipped = raw_return.clip(lower=lower_bound, upper=upper_bound)
        # Normalise the multi-row return to a per-row geometric average.
        result[f'return_{lag}m'] = clipped.add(1).pow(1 / lag).sub(1)
    return result
# Compute the return columns separately for each value of index level 1,
# then discard rows that still contain missing values.
returns_by_ticker = data.groupby(level=1, group_keys=False).apply(calculate_returns)
data = returns_by_ticker.dropna()

In [15]:
# Download the five-factor tables; element [0] is used and its 'RF' column is dropped.
# NOTE(review): confirm this factor set is the intended benchmark for the tickers in `data`.
ff5_tables = web.DataReader('F-F_Research_Data_5_Factors_2x3', 'famafrench', start='2010')
monthly_factors = ff5_tables[0].drop(columns='RF')

# Convert the period index to timestamps and name it so it lines up with `data`'s 'date' level.
monthly_factors.index = monthly_factors.index.to_timestamp()
monthly_factors = monthly_factors.rename_axis('date')

# Re-stamp to the 'M' resample frequency and divide by 100.
monthly_factors = monthly_factors.resample('M').last().div(100)

# Attach each stock's 1-month return to the factor rows and order by index.
factor_data = monthly_factors.join(data['return_1m']).sort_index()

In [18]:
# Spot-check the joined factors and 1-month return for a single ticker.
single_ticker_factors = factor_data.xs('顺丰控股', level=1)
single_ticker_factors.head()

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-02-29,-0.0007,0.0088,-0.0057,0.0325,0.0202,-0.014806
2016-03-31,0.0696,0.0107,0.0119,0.0077,-0.0008,0.411561
2016-04-30,0.0091,0.0123,0.0328,-0.0297,0.019,0.000819
2016-05-31,0.0178,-0.0061,-0.0166,-0.0109,-0.0248,0.11293
2016-06-30,-0.0005,0.0045,-0.0148,0.014,0.0194,1.468564


In [23]:
# Keep only tickers that have at least 10 rows (months) in factor_data.
observations = factor_data.groupby(level=1).size()
valid_stocks = observations.loc[observations.ge(10)]
ticker_labels = factor_data.index.get_level_values('ticker')
factor_data = factor_data.loc[ticker_labels.isin(valid_stocks.index)]
factor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Mkt-RF,SMB,HML,RMW,CMA,return_1m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-31,TCL中环,-0.0577,-0.0347,0.0209,0.0282,0.0308,-0.340158
2016-01-31,TCL科技,-0.0577,-0.0347,0.0209,0.0282,0.0308,-0.180685
2016-01-31,一汽解放,-0.0577,-0.0347,0.0209,0.0282,0.0308,-0.254461
2016-01-31,三一重工,-0.0577,-0.0347,0.0209,0.0282,0.0308,-0.344594
2016-01-31,上海机场,-0.0577,-0.0347,0.0209,0.0282,0.0308,-0.137491
...,...,...,...,...,...,...,...
2023-11-30,阳光电源,0.0884,-0.0010,0.0165,-0.0389,-0.0099,-0.011289
2023-11-30,陕西煤业,0.0884,-0.0010,0.0165,-0.0389,-0.0099,0.081021
2023-11-30,隆基绿能,0.0884,-0.0010,0.0165,-0.0389,-0.0099,-0.120182
2023-11-30,青岛啤酒,0.0884,-0.0010,0.0165,-0.0389,-0.0099,-0.107814


In [19]:
# calculate rolling factor betas

Unnamed: 0_level_0,Unnamed: 1_level_0,atr,bb_high,bb_low,bb_mid,close,garman_klass_vol,macd,rsi,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-01-31,TCL中环,-0.298897,2.245430,1.907973,2.076701,6.21,0.001774,-1.183499,34.177321,-0.340158,-0.222771,-0.150225,-0.120853,-0.115281,-0.023393
2016-01-31,TCL科技,0.293504,1.388639,1.227750,1.308195,2.63,0.000768,-1.227463,43.835326,-0.180685,-0.039372,-0.038636,-0.056574,-0.068599,-0.005206
2016-01-31,一汽解放,0.925643,2.682087,2.392827,2.537457,11.28,0.001207,-1.770083,39.486200,-0.254461,-0.094919,-0.111097,-0.048495,-0.072793,-0.031344
2016-01-31,三一重工,-0.570688,1.687553,1.298428,1.492990,2.72,0.001636,-0.887111,30.296696,-0.344594,-0.207425,-0.161875,-0.111266,-0.115871,-0.063801
2016-01-31,上海机场,-0.238820,3.305052,3.150265,3.227658,22.96,0.000332,-1.024047,37.524839,-0.137491,-0.054809,-0.056308,-0.014153,-0.006646,0.024661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-31,阳光电源,0.749335,4.543287,4.392325,4.467806,81.75,0.000990,0.339835,45.216842,-0.066674,-0.008752,-0.009599,-0.050411,-0.034874,-0.037782
2024-01-31,陕西煤业,0.933320,3.222411,3.105961,3.164186,23.82,0.000141,2.632033,73.587947,0.140258,0.105800,0.097478,0.065923,0.035577,0.026664
2024-01-31,隆基绿能,0.113113,3.170717,3.097985,3.134351,20.87,0.000594,-0.082597,43.629484,-0.088646,-0.008515,-0.047229,-0.057901,-0.054227,-0.066647
2024-01-31,青岛啤酒,0.314785,4.319882,4.231403,4.275643,75.08,0.000361,-0.102707,56.113750,0.004415,0.003481,-0.035080,-0.044500,-0.046271,-0.022981
