In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set(color_codes=True)
import scipy.stats as stats
import pickle
import gzip

In [13]:
def get_resampled_data(path):
    """Load intraday bars from a gzip-compressed pickle and build a daily table.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip-compressed pickle containing a DataFrame with a
        ``datetime`` column plus ``open``, ``high``, ``low``, ``close``,
        ``volume`` and ``amount`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per business day (``'B'`` frequency) on a default integer
        index. Columns, in order: ``date``, ``open`` (first), ``high`` (max),
        ``low`` (min), ``close`` (last), ``volume`` (sum), ``amount`` (sum),
        ``daily_return_ctc`` (close-to-close change), ``daily_return_otc``
        (open-to-close change) and ``realized_volatility`` (sample standard
        deviation of that calendar day's 5-minute log returns).

    Notes
    -----
    Business days that contain no bars are kept as rows. Their close is
    forward-filled for ``daily_return_ctc`` only, so that column is 0.0 on
    such days while the price columns themselves stay NaN.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    # The context manager guarantees the file handle is closed.
    with gzip.open(path, 'rb') as fh:
        bars = pickle.load(fh)
    bars = bars.set_index('datetime')

    daily_data = bars.resample('B').agg({'open': 'first',
                                         'high': 'max',
                                         'low': 'min',
                                         'close': 'last',
                                         'volume': 'sum',
                                         'amount': 'sum'})
    # Forward-fill explicitly instead of relying on pct_change's implicit
    # (deprecated) padding, so the result is the same across pandas versions.
    daily_data['daily_return_ctc'] = daily_data['close'].ffill().pct_change()
    daily_data['daily_return_otc'] = (
        (daily_data['close'] - daily_data['open']) / daily_data['open']
    )

    # '5min' is the non-deprecated spelling of the '5T' offset alias.
    five_min_close = bars['close'].resample('5min').last()
    # Empty 5-minute bins (e.g. overnight) are NaN, so the first bar after a
    # gap has a NaN log return and is skipped by std().
    log_return = np.log(five_min_close / five_min_close.shift())

    # Aggregate to ONE value per calendar day, keyed by that day's midnight
    # timestamp, which is the same key the business-day index uses. A per-row
    # transform would only line up with the daily index where a 00:00 bin
    # happens to exist, which leaves the first day in the file without a value.
    realized_vol = (
        log_return
        .groupby(log_return.index.normalize())
        .std()
        .rename('realized_volatility')
    )

    daily_data = daily_data.join(realized_vol, how='left')
    # Expose the day as a leading 'date' column on a default integer index.
    return daily_data.rename_axis('date').reset_index()

In [14]:
# Build the daily table from the intraday bar file of the chosen underlying.
underlying_code = 'SH510050'
path = f'data/{underlying_code}_index_hh.pkl.gz'
data = get_resampled_data(path)

In [17]:
# Load the factor table stored for the chosen underlying.
# NOTE(security): unpickling can execute arbitrary code; only open trusted files.
underlying_code = 'SH510050'
factors_path = 'data/' + underlying_code + '_factors.pkl.gz'
# A context manager closes the file handle as soon as the load finishes,
# instead of leaving it open until garbage collection.
with gzip.open(factors_path, 'rb') as fh:
    df = pickle.load(fh)
df.columns

Index(['date', 'daily_return_ctc', 'daily_return_otc', 'realized_volatility',
       'KMID', 'KLEN', 'KMID2', 'KUP', 'KUP2', 'KLOW',
       ...
       'IMIN60', 'IMXD60', 'CORR60', 'CORD60', 'CNTP60', 'SUMP60', 'VMA60',
       'VSTD60', 'WVMA60', 'VSUMP60'],
      dtype='object', length=132)

In [8]:
# (n_rows, n_columns) of `df`.
# NOTE(review): this cell's execution count is lower than the cells above it,
# so the stored output may predate the current `df`; re-run top to bottom to confirm.
df.shape

(4860, 132)