# Ch10 Model

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 


from adv_finance import bars, labeling, utils, features, stats

In [2]:
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils import shuffle

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [36]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
plt.style.use('seaborn-talk')
plt.style.use('bmh')
pd.set_option('display.max_rows', 100)


In [7]:
data = pd.read_parquet('/nfs/data/interim_2018/TRADE_A233740_DB.parq')
data = data.loc[~data.index.duplicated()]


In [8]:
data.head() 

Unnamed: 0_level_0,open,high,low,close,vol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02 10:00:03.348,19800,19980,19800,19980,215968
2018-01-02 10:00:15.776,19980,20015,19980,20005,51025
2018-01-02 10:00:39.990,20005,20065,20005,20045,49957
2018-01-02 10:01:11.518,20045,20075,20040,20050,50140
2018-01-02 10:01:30.166,20050,20080,20045,20080,54775


## Primary Model

In [9]:
# Compute sides 
# data['side'] = np.nan 

In [10]:
%%time 

# CUSUM Filter 
daily_vol = stats.get_daily_vol(data['close'], lookback=100)
threshold = daily_vol.mean() * 0.2
t_events = labeling.cusum_filter(data['close'], threshold)
v_barriers = labeling.add_vertical_barrier(t_events=t_events, close=data['close'], num_days=1)


# Side Decision by Cusum Filter & Triple Barrier 
pt_sl = [1, 1]
min_ret = 0.01 
t_side_events = labeling.get_events(close=data['close'],
                                            t_events=t_events,
                                            pt_sl=pt_sl,
                                            target=daily_vol,
                                            min_ret=min_ret,
                                            num_threads=8,
                                            vertical_barrier_times=v_barriers,
                                            side_prediction=None)


side_labels = labeling.get_bins(t_side_events, data['close'])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  target = target.loc[t_events]
2019-06-15 17:32:23.052601 100.0 apply_pt_sl_on_t1 done after 0.07 minutes. Remaining 0.0 minutes.


CPU times: user 3.41 s, sys: 52.6 ms, total: 3.46 s
Wall time: 7.65 s


## Metal Label

In [11]:
%%time
pt_sl = [1, 2]
min_ret = 0.02
t_barrier_events = labeling.get_events(close=data['close'],
                                            t_events=t_events,
                                            pt_sl=pt_sl,
                                            target=daily_vol,
                                            min_ret=min_ret,
                                            num_threads=8,
                                            vertical_barrier_times=v_barriers,
                                           side_prediction=side_labels['bin'])


labels = labeling.get_bins(t_barrier_events, data['close'])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  target = target.loc[t_events]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  side_ = side_prediction.loc[target.index]


CPU times: user 147 ms, sys: 35.2 ms, total: 182 ms
Wall time: 1.17 s


2019-06-15 17:32:24.414711 100.0 apply_pt_sl_on_t1 done after 0.02 minutes. Remaining 0.0 minutes.


## Features

In [12]:
%%time

raw_data = data.copy()

# Log Returns
raw_data['log_ret'] = np.log(raw_data['close']).diff()

# Momentum
raw_data['mom1'] = raw_data['close'].pct_change(periods=1)
raw_data['mom3'] = raw_data['close'].pct_change(periods=3)
raw_data['mom5'] = raw_data['close'].pct_change(periods=5)

# Volatility
raw_data['volatility_15'] = raw_data['log_ret'].rolling(window=15, min_periods=15, center=False).std()
raw_data['volatility_50'] = raw_data['log_ret'].rolling(window=50, min_periods=50, center=False).std()



# Serial Correlation (Takes about 4 minutes)
window_autocorr = 30
raw_data['autocorr_1'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=1), raw=False)
raw_data['autocorr_3'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=3), raw=False)
raw_data['autocorr_5'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=5), raw=False)

# Get the various log -t returns
raw_data['log_t1'] = raw_data['log_ret'].shift(1)
raw_data['log_t3'] = raw_data['log_ret'].shift(3)
raw_data['log_t5'] = raw_data['log_ret'].shift(5)

# Remove look ahead bias
raw_data = raw_data.shift(1)

CPU times: user 1min 7s, sys: 38.7 ms, total: 1min 7s
Wall time: 1min 7s


In [13]:
# Get features at event dates 
X = raw_data 

# Drop unwanted columns 
try: 
    X.drop(['open', 'high', 'low', 'close', 'vol'], axis=1, inplace=True)

except Exception as e: 
    print(e)

    
y = labels.loc[X.index, 'bin']

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


In [21]:
y = y.dropna()
X = X.dropna()
com_idx = y.index.join(X.index).join(labels.index)


In [22]:
X = X.loc[com_idx]
y = y.loc[com_idx]



## Balance Classes 

In [23]:
# Split data into training, validation and test sets
X_training_validation = X['2018-01-01':'2018-10-31']
y_training_validation = y['2018-01-01':'2018-10-31']
X_train, X_validate, y_train, y_validate = train_test_split(X_training_validation, y_training_validation, test_size=0.2, shuffle=False)

In [24]:
train_df = pd.concat([y_train, X_train], axis=1, join='inner')
train_df['bin'].value_counts()

1.0    1329
0.0     684
Name: bin, dtype: int64

In [25]:
# Upsample the training data to have a 50 - 50 split
# https://elitedatascience.com/imbalanced-classes
majority = train_df[train_df['bin'] == 0]
minority = train_df[train_df['bin'] == 1]

new_minority = resample(minority, 
                   replace=True,     # sample with replacement
                   n_samples=majority.shape[0],    # to match majority class
                   random_state=42)

train_df = pd.concat([majority, new_minority])
train_df = shuffle(train_df, random_state=42)

train_df['bin'].value_counts()

1.0    684
0.0    684
Name: bin, dtype: int64

## Fit a Model

In [26]:
%%time 

depth = 2
n_estimator = 10


# Refit a new model with best params, so we can see feature importance
rf2 = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator, criterion='entropy', random_state=42)
rf2.fit(X_train, y_train.values.ravel())

CPU times: user 20.3 ms, sys: 4 µs, total: 20.3 ms
Wall time: 19.8 ms


## Bet Sizing

In [79]:
from adv_finance.labeling import get_signal

In [27]:
y_prob = rf2.predict_proba(X_train)

In [31]:
y_df = pd.Series(y_prob[:, 1], index=X_train.index)

In [33]:
t_barrier_events.head()

Unnamed: 0,side,t1,trgt
2018-01-04 15:12:34.986,0.0,2018-01-05 15:13:36.604,0.02006
2018-01-04 15:13:16.432,1.0,2018-01-05 09:01:17.944,0.020855
2018-01-04 15:14:53.715,1.0,2018-01-05 09:01:03.808,0.022589
2018-01-04 15:15:40.118,1.0,2018-01-05 09:01:03.808,0.023663
2018-01-04 15:16:46.975,1.0,2018-01-05 09:01:17.944,0.025145


In [46]:
import pickle

In [51]:
with open('t_barrier_events.pickle', 'wb') as f:
    pickle.dump(t_barrier_events.to_dict(), f)


In [52]:
with open('y_prob.pickle', 'wb') as f:
    pickle.dump(y_df.to_dict(), f)


In [80]:
bet_size = get_signal(y_df, t_barrier_events0)

In [97]:
df_betsize = pd.DataFrame(y_df, columns=['Prob'])
df_betsize['BetSize'] = bet_size


# APPENDIX