# Experiments with Machine Learning

**APPROACH:** Predict the price using Machine Learning models, then decide to go long or short.

First, import necessary libraries

In [1]:
import pandas as pd 
import yfinance as yf
import math
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, accuracy_score
from backtesting import Backtest
from business_logic.decision_making.data_prepration import get_OHLC_df, label_OHLC_df, split_train_test, prepare_data_train_model
from business_logic.decision_making.strategies.BinaryClassificationStrategy import BinaryClassificationStrategy
from business_logic.models.portfolio import Portfolio
from business_logic.models.stock import Stock
from enums import Position

## Prepare data
3-year data from 2017-2019, and test with data of 2020 and the first half of 2021

In [2]:
aapl = yf.Ticker('AAPL')
orig_data = aapl.history(start='2018-04-02', end='2021-03-31') 
orig_data.shape

(755, 7)

In [3]:
orig_data.index

DatetimeIndex(['2018-04-02', '2018-04-03', '2018-04-04', '2018-04-05',
               '2018-04-06', '2018-04-09', '2018-04-10', '2018-04-11',
               '2018-04-12', '2018-04-13',
               ...
               '2021-03-17', '2021-03-18', '2021-03-19', '2021-03-22',
               '2021-03-23', '2021-03-24', '2021-03-25', '2021-03-26',
               '2021-03-29', '2021-03-30'],
              dtype='datetime64[ns]', name='Date', length=755, freq=None)

As can be seen from above, the data fetched from Yahoo Finance is a Dataframe, indexed and sorted by date, which is very convenient. The next step is to split the data into train and test set:

In [4]:
data = get_OHLC_df(orig_data)
data = label_OHLC_df(data, 2)
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-04,39.768909,41.488656,39.742377,1.0,138422000
2018-04-05,41.626139,42.024116,41.505539,1.0,107732800
2018-04-06,41.237811,41.602020,40.569688,-1.0,140021200
2018-04-09,40.974904,41.749151,40.967668,-1.0,116070800
2018-04-10,41.727450,41.968650,41.372887,1.0,113634400
...,...,...,...,...,...
2021-03-24,122.820000,122.900002,120.070000,-1.0,88530500
2021-03-25,119.540001,121.660004,119.000000,-1.0,98844700
2021-03-26,120.349998,121.480003,118.919998,1.0,93958900
2021-03-29,121.650002,122.580002,120.730003,1.0,80819200


In [5]:
data.Close.unique()

array([ 1., -1.])

In [6]:
split_date = np.datetime64('2020-03-31')
X_train, X_test, y_train, y_test = split_train_test(data, split_date)

## Build models and test their performance

First, I will create a Random Forest Classifier to predict if the price will go up or down. My strategy will then decide to go long or short accordingly. For experimenting, I dedcided to create a classifier with default values.

In [7]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
f1_score(y_test, y_pred)

0.6168224299065421

In [8]:
accuracy_score(y_test, y_pred)

0.5119047619047619

In [9]:
class TwoClassRandomForestStrategy(BinaryClassificationStrategy):
    price_delta = .004

    def init(self):
        self.clf = rfc

    def next(self):
        row = self.data.df.iloc[-1:]
        X = row[['Open', 'High', 'Low', 'Volume']]
        pred = self.clf.predict(X)[0]

        self.decide_trade(pred)

        # if position has been hold for more than 2 days => set stop-loss to be more aggressive
        current_time = self.data.index[-1]
        high, low = self.data.High, self.data.Low
        for trade in self.trades:
            if current_time - trade.entry_time > pd.Timedelta('2 days'):
                if trade.is_long:
                    trade.sl = max(trade.sl, low)
                else:
                    trade.sl = min(trade.sl, high)

In [10]:
test_data = orig_data[orig_data.index > split_date]
bt = Backtest(test_data, TwoClassRandomForestStrategy, commission=.0002, margin=.05)
bt.run()

Start                     2020-04-01 00:00:00
End                       2021-03-30 00:00:00
Duration                    363 days 00:00:00
Exposure Time [%]                   99.203187
Equity Final [$]                   397.524608
Equity Peak [$]                       10000.0
Return [%]                         -96.024754
Buy & Hold Return [%]              100.621644
Return (Ann.) [%]                  -96.075505
Volatility (Ann.) [%]                1.585817
Sharpe Ratio                              0.0
Sortino Ratio                             0.0
Calmar Ratio                              0.0
Max. Drawdown [%]                  -96.024754
Avg. Drawdown [%]                  -96.024754
Max. Drawdown Duration      362 days 00:00:00
Avg. Drawdown Duration      362 days 00:00:00
# Trades                                  250
Win Rate [%]                              6.0
Best Trade [%]                        0.58093
Worst Trade [%]                     -3.204015
Avg. Trade [%]                    

In the first attempt, this model lost us almost all of our money. This is understandable because this model uses only default values for hyperparameters, which results in only about 50% accuracy. This will need a lot of fine-tuning.
Also, the current strategy is very sensitive to price changes because even the slightest change is classified with either up or down. Therefore, if we are going long and the price experiences a small hiccup but the upward trend remains, our bot would just sell all the shares because of that hiccup.

## Save to Mongo DB

In [11]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..')))
from database.mongo_client import get_mongo_db_conn
from business_logic.model_crud import save_model_to_mongo
import pymongo
import time
import pickle

save_model_to_mongo(rfc, "RandomForestDefault")

# 3-class labelled data

In [12]:
data = get_OHLC_df(orig_data)
data = label_OHLC_df(data, 2, small_change_threshold=0.004)
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-04,39.768909,41.488656,39.742377,1.0,138422000
2018-04-05,41.626139,42.024116,41.505539,1.0,107732800
2018-04-06,41.237811,41.602020,40.569688,-1.0,140021200
2018-04-09,40.974904,41.749151,40.967668,-1.0,116070800
2018-04-10,41.727450,41.968650,41.372887,1.0,113634400
...,...,...,...,...,...
2021-03-24,122.820000,122.900002,120.070000,-1.0,88530500
2021-03-25,119.540001,121.660004,119.000000,-1.0,98844700
2021-03-26,120.349998,121.480003,118.919998,1.0,93958900
2021-03-29,121.650002,122.580002,120.730003,1.0,80819200


In [13]:
data.Close.value_counts()

 1.0    437
-1.0    257
 0.0     59
Name: Close, dtype: int64

Dataset is a bit skewed, so accuracy_score shouldn't be considered very seriously. 

In [14]:
X_train, X_test, y_train, y_test = split_train_test(data, split_date)

In [15]:
class ThreeClassRandomForestStrategy(BinaryClassificationStrategy):
    def init(self):
        self.clf = RandomForestClassifier()
        self.prepare_model(self.clf)

    def next(self):
        print(self.position)
        if self.data.df.index[-1] < self.split_date:
            return

        row = self.data.df.iloc[-1:]
        X = row[['Open', 'High', 'Low', 'Volume']]
        pred = self.clf.predict(X)[0]
        # print(f'Date: {str(row.index[0])} -- Pred: {pred} -- Actual: {row.Close.values[0]}')
        self.decide_trade(pred)

bt = Backtest(orig_data, ThreeClassRandomForestStrategy, commission=.0002, margin=.05)
stats = bt.run()

f1 score: 0.3243237807943691
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trades)>
<Position: 0 (0 trade

In [16]:
stats

Start                     2018-04-02 00:00:00
End                       2021-03-30 00:00:00
Duration                   1093 days 00:00:00
Exposure Time [%]                   32.715232
Equity Final [$]                   386.938765
Equity Peak [$]                       10000.0
Return [%]                         -96.130612
Buy & Hold Return [%]              198.235923
Return (Ann.) [%]                  -66.225376
Volatility (Ann.) [%]                8.604834
Sharpe Ratio                              0.0
Sortino Ratio                             0.0
Calmar Ratio                              0.0
Max. Drawdown [%]                  -96.130612
Avg. Drawdown [%]                  -96.130612
Max. Drawdown Duration      364 days 00:00:00
Avg. Drawdown Duration      364 days 00:00:00
# Trades                                  248
Win Rate [%]                         6.048387
Best Trade [%]                        0.58093
Worst Trade [%]                     -3.483085
Avg. Trade [%]                    

In [17]:
stat_df = stats['_trades']
stat_df

Unnamed: 0,Size,EntryBar,ExitBar,EntryPrice,ExitPrice,PnL,ReturnPct,EntryTime,ExitTime,Duration
0,-654,504,504,61.138760,61.150990,-7.998549,-0.000200,2020-04-01,2020-04-01,0 days
1,670,505,505,59.634753,59.525183,-73.411891,-0.001837,2020-04-02,2020-04-02,0 days
2,658,506,506,60.245149,60.233102,-7.926676,-0.000200,2020-04-03,2020-04-03,0 days
3,-637,507,507,62.230069,62.242517,-7.929697,-0.000200,2020-04-06,2020-04-06,0 days
4,589,508,508,67.192698,64.852319,-1378.483243,-0.034831,2020-04-07,2020-04-07,0 days
...,...,...,...,...,...,...,...,...,...,...
243,-13,751,751,119.516093,120.570356,-13.705424,-0.008821,2021-03-25,2021-03-25,0 days
244,13,752,752,120.374068,120.107636,-3.463618,-0.002213,2021-03-26,2021-03-26,0 days
245,-12,753,753,121.625672,121.694839,-0.830011,-0.000569,2021-03-29,2021-03-29,0 days
246,-13,754,754,120.085979,120.110001,-0.312286,-0.000200,2021-03-30,2021-03-30,0 days


In [18]:
portfolio = Portfolio()
symbol = 'AAPL'
split_date = np.datetime64('2020-03-31')

clf = RandomForestClassifier()
prepare_data_train_model(clf, orig_data, split_date)

for index, row in orig_data.iterrows():
    X = [row[['Open', 'High', 'Low', 'Volume']]]
    prediction = clf.predict(X)
    stock = portfolio.get_stock(symbol)
    if stock is None:
        current_price = row['Close']
        time = str(row.index[0])
        num_shares = math.floor(portfolio.max_per_stock/current_price)
        stock = None
        if prediction == 1:
            stock = Stock(symbol, current_price, num_shares, time, Position.IS_LONG, is_testing = True)
            portfolio.add_stock(stock)            
        elif prediction == -1:
            stock = Stock(symbol, current_price, -num_shares, time, Position.IS_SHORT, is_testing = True)
            portfolio.add_stock(stock)
        print(f"Pred: {prediction} -- Added stock: {str(stock)} -- Total balance: {portfolio.balance}")
    else:
        if (prediction == 1 and stock.position == Position.IS_SHORT) or (prediction == -1 and stock.position == Position.IS_LONG):
            portfolio.drop_stock(stock.symbol)
            print(f"Pred: {prediction} -- Dropped stock: {str(stock)} -- Total balance: {portfolio.balance}")
        

portfolio.drop_stock(symbol) # sell all shares by the end to see the total return
print(portfolio.balance)

Pred: [-1.] -- Added stock: AAPL, 40.20307159423828, -19 -- Total balance: 4763.858360290527
Pred: [1.] -- Dropped stock: AAPL, 40.20307159423828, -19 -- Total balance: 4000.0
Pred: [1.] -- Added stock: AAPL, 41.39217758178711, 19 -- Total balance: 3213.548625946045
Pred: [-1.] -- Dropped stock: AAPL, 41.39217758178711, 19 -- Total balance: 4000.0
Pred: [-1.] -- Added stock: AAPL, 41.015907287597656, -19 -- Total balance: 4779.3022384643555
Pred: [1.] -- Dropped stock: AAPL, 41.015907287597656, -19 -- Total balance: 4000.0
Pred: [1.] -- Added stock: AAPL, 41.592369079589844, 19 -- Total balance: 3209.744987487793
Pred: [-1.] -- Dropped stock: AAPL, 41.592369079589844, 19 -- Total balance: 4000.0
Pred: [-1.] -- Added stock: AAPL, 39.97152328491211, -20 -- Total balance: 4799.430465698242
Pred: [1.] -- Dropped stock: AAPL, 39.97152328491211, -20 -- Total balance: 4000.0
Pred: [-1.] -- Added stock: AAPL, 39.15144348144531, -20 -- Total balance: 4783.028869628906
Pred: [1.] -- Dropped stoc