## Imports

In [56]:
from sklearn.model_selection import train_test_split , ShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn import metrics
import xgboost as xgb

import pandas as pd
import numpy as np
import random

import backtrader as bt

import matplotlib.pyplot as plt

import tabulate

## Data loading from CSV
### Stockdata
* Load data from CSV

In [81]:
stock_list = pd.read_csv('csv/result_table.csv',sep=',')
u_symbol = stock_list['SYMBOL'].unique()
date = pd.Series(stock_list['Date'])
stock_list.drop(['Date'],axis=1,inplace=True)
stock_list['date'] = date.apply(lambda x: np.datetime64(x))

stock_list

Unnamed: 0,SYMBOL,1Day,1Week,1Month,3Months,6Months,1Year,2Years,Day1Prior,Day2Prior,...,Month1Prior,Month2Prior,Month3Prior,Month4Prior,Month5Prior,Month6Prior,Month7Prior,Year1Prior,Year2Prior,date
0,ALT,-0.006659,-0.087680,,,,,,0.065012,-0.133654,...,-0.249167,0.174707,2.817797,2.412879,2.085616,2.312500,1.815625,-0.272213,0.202937,2024-02-14
1,JOBY,0.008170,0.024510,-0.151961,0.004902,,,,-0.067073,-0.055556,...,-0.159341,-0.202086,-0.420455,-0.216389,0.313305,0.481840,0.614776,0.478261,-0.322259,2023-10-11
2,TLRY,0.004065,-0.024390,-0.223577,-0.195122,,,,-0.039062,-0.160410,...,-0.068182,0.464286,0.556962,-0.027668,0.069565,-0.075188,-0.115108,-0.236025,-0.797864,2023-09-19
3,CVNA,-0.004679,0.000425,0.602722,1.161208,0.680136,,,0.105833,0.232826,...,1.239048,1.620959,2.334752,1.133394,2.349003,3.867495,1.367573,0.117927,-0.914403,2023-06-13
4,UEC,0.007092,0.056738,-0.106383,0.209220,0.914894,,,0.032967,0.032967,...,-0.203390,-0.286076,-0.255937,-0.267532,-0.331754,-0.124224,-0.369128,-0.422131,-0.075410,2023-03-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,CCRC,-0.002417,-0.016116,-0.033038,-0.140210,-0.103143,-0.157937,-0.629331,0.001614,0.044613,...,0.035893,0.299476,0.453162,0.411832,-0.376068,-0.328100,-0.250151,-0.108477,-0.119233,2018-11-15
444,SRNEQ,0.119565,0.195652,0.369565,0.543478,-0.032609,0.039130,-0.606522,-0.269841,-0.342857,...,-0.480226,-0.406452,0.260274,1.044444,0.862348,1.628571,1.555556,0.150000,-0.065041,2018-03-28
445,MARK,-0.056402,0.292683,0.350610,-0.076220,-0.376524,-0.696646,-0.914634,-0.056115,-0.290811,...,-0.497318,-0.257919,0.935103,1.122977,1.411765,1.385455,1.095847,1.030960,0.623762,2018-02-07
446,IPO.L,-0.044976,-0.031098,-0.065756,-0.294122,-0.108625,-0.241517,-0.507230,-0.003475,-0.003475,...,0.039580,0.010499,0.151426,0.075150,0.004892,0.078351,0.010499,-0.085454,-0.324780,2017-12-18


In [92]:
non_test_variables = ['SYMBOL','1Day','1Week','1Month','3Months','6Months','1Year','2Years','date']
predict_variables = ['1Day','1Week','1Month','3Months','6Months','1Year','2Years']
headers = 

print("Number of rows before elimination of NaN: {0}".format(len(stock_list)))
stock_list_2y = stock_list.dropna(subset=["2Years"])
print("Number of rows after elimination of NaN only in 2Years: {0}".format(len(stock_list_2y)))
stock_list_2y = stock_list_2y.dropna(subset=non_test_variables)
print("Number of rows after elimination of NaN only in 2Years and other: {0}".format(len(stock_list_2y)))
stock_list_dropna = stock_list.dropna()
print("Number of rows after elimination of NaN: {0}".format(len(stock_list_dropna)))

Number of rows before elimination of NaN: 448
Number of rows after elimination of NaN only in 2Years: 337
Number of rows after elimination of NaN only in 2Years and other: 297
Number of rows after elimination of NaN: 219


## Create Testsplit
* We create a testsplit at a certain date which allows for roughly 70% of the data to be before it for training.
* We sort the data before hand and then split it such is guranteed that we dont have future data in our training data. 
* The other 30% will be after the date, we ensure this way that when we do out of sample testing that the training wasnt
"poisoned" with future data.

In [157]:
# We dont shuffle the array to prevent having future data in training set. 
sorted_stock_list = stock_list_dropna.sort_values(['date'])
train, test = train_test_split(stock_list_dropna, test_size=0.3, shuffle = False)
# Making sure we got no overlap
print("First date of \033[92mtraining\033[0m data: {0}".format(train['date'].iloc[0]))
print("Last date of \033[92mtraining\033[0m data:  {0}".format(train['date'].iloc[-1]))
print("First date of \x1b[31mtest\x1b[0m data:     {0}".format(test['date'].iloc[0]))
print("Last date of \x1b[31mtest\x1b[0m data:      {0}".format(test['date'].iloc[-1]))

First date of [92mtraining[0m data: 2021-12-15 00:00:00
Last date of [92mtraining[0m data:  2019-08-28 00:00:00
First date of [31mtest[0m data:     2019-08-13 00:00:00
Last date of [31mtest[0m data:      2017-06-15 00:00:00


In [59]:
x_train = np.array(train.drop(non_test_variables,axis=1))
y_train = {}
for variable in predict_variables:
    y_train[variable] = np.array(train[variable])

x_test = np.array(test.drop(non_test_variables,axis=1))
y_test ={}
for variable in predict_variables:
    y_test[variable] = np.array(test[variable])

## Train

In [60]:
regressors = {}
for variable in predict_variables: 
    # Define classifier
    clf = xgb.XGBRegressor()
    # Train on trainsplit
    clf.fit(x_train,y_train[variable])
    regressors[variable] = clf

## Predict In and Out-sample

In [61]:
# predict insample
for variable in predict_variables: 
    y_pred_insample = regressors[variable].predict(x_train)
    mse = mean_squared_error(y_pred_insample, y_train[variable])
    print("Mean squared error insample for variable:{0} = {1}".format(variable, mse))

print('\n')

# predict outsample
outsample_predictions = {}
for variable in predict_variables: 
    y_pred_outsample = regressors[variable].predict(x_test)
    y_base_truth = y_test[variable]
    outsample_predictions[variable] = y_pred_outsample
    mse = mean_squared_error(y_pred_outsample, y_test[variable])
    print("Mean squared error outsample for variable:{0} = {1}".format(variable, mse))
#%TODO strategie und plot fixen
#%TODO Strategie auf dem schreiben und testen

Mean squared error insample for variable:1Day = 6.564940709182439e-07
Mean squared error insample for variable:1Week = 5.499838541992013e-07
Mean squared error insample for variable:1Month = 5.591008867812544e-07
Mean squared error insample for variable:3Months = 3.7795278932067563e-07
Mean squared error insample for variable:6Months = 4.272477885464009e-07
Mean squared error insample for variable:1Year = 4.818931240968544e-07
Mean squared error insample for variable:2Years = 5.701602215980852e-07


Mean squared error outsample for variable:1Day = 0.007642666791922184
Mean squared error outsample for variable:1Week = 0.0207512885268989
Mean squared error outsample for variable:1Month = 0.05937085221636432
Mean squared error outsample for variable:3Months = 0.3384735353452747
Mean squared error outsample for variable:6Months = 0.4545161328974314
Mean squared error outsample for variable:1Year = 1.194387189044135
Mean squared error outsample for variable:2Years = 1.5837983600057832


## Predict direction
* We are testing if even tho the MSE isn't that great, we can at least predict the direction accuratly
* 1 represents falling 0 is rising

In [62]:
def classifier(x):
    if( x >= 0 ):
        return 0
    else:
        return 1

for variable in predict_variables: 
    classified_predictions = np.array(list(map(classifier, outsample_predictions[variable])))
    classified_y = np.array(list(map(classifier, y_test[variable])))
    print("Accuracy for {0} :".format(variable),metrics.accuracy_score(classified_y,classified_predictions))
    print("Precision for {0} :".format(variable),metrics.precision_score(classified_y, classified_predictions,average="macro"))
    print("Recall for {0} :".format(variable),metrics.recall_score(classified_y, classified_predictions,average="macro"))

# TODO Mathematisch berechnen. 
# TODO check if 1year and 2year predictions are equally accurate for one stock

Accuracy for 1Day : 0.48484848484848486
Precision for 1Day : 0.47609756097560973
Recall for 1Day : 0.4774193548387097
Accuracy for 1Week : 0.5606060606060606
Precision for 1Week : 0.5666666666666667
Recall for 1Week : 0.5671015843429636
Accuracy for 1Month : 0.6060606060606061
Precision for 1Month : 0.6153846153846154
Recall for 1Month : 0.6119815668202765
Accuracy for 3Months : 0.5151515151515151
Precision for 3Months : 0.5147058823529411
Recall for 3Months : 0.5147058823529411
Accuracy for 6Months : 0.5151515151515151
Precision for 6Months : 0.5027777777777778
Recall for 6Months : 0.5029761904761905
Accuracy for 1Year : 0.5757575757575758
Precision for 1Year : 0.5668202764976958
Recall for 1Year : 0.5733063700707786
Accuracy for 2Years : 0.6212121212121212
Precision for 2Years : 0.6286764705882353
Recall for 2Years : 0.6680672268907563


## Calculate possible returns
* Assuming we enter the market 1 Day after the short selling repport and either short or go long depending on the predictions over two years. 

In [63]:
# Data for the broker
clean_data = pd.read_csv('clean_data.csv',sep=',')
u_symbol = clean_data['symbol'].unique()
date = pd.Series(clean_data['datetime'])
clean_data.drop(['datetime'],axis=1,inplace=True)
clean_data['date'] = date.apply(lambda x: np.datetime64(x))

stock_data = {}
for symbol in u_symbol:
    stock_data[symbol] = clean_data[stock_list['symbol'] == symbol]

stock_data[list(stock_data.keys())[0]]

Unnamed: 0,open,high,low,close,volume,symbol,log_returns,date
0,32.30,33.58,32.30,32.84,7315000.0,1179.HK,0.056371,2020-09-23
1,34.08,34.20,33.50,33.73,2797160.0,1179.HK,0.026740,2020-09-24
2,33.12,33.12,32.30,32.62,3964430.0,1179.HK,-0.033462,2020-09-25
3,32.68,33.22,32.68,33.08,763500.0,1179.HK,0.014003,2020-09-28
4,33.58,34.18,33.58,33.99,2398130.0,1179.HK,0.027137,2020-09-29
...,...,...,...,...,...,...,...,...
838,29.20,29.20,28.30,28.85,2695218.0,1179.HK,-0.012059,2024-02-22
839,29.10,30.05,29.10,29.70,1548300.0,1179.HK,0.029037,2024-02-23
840,30.35,31.00,30.30,30.40,3111600.0,1179.HK,0.023296,2024-02-26
841,29.60,30.10,29.15,29.95,2316600.0,1179.HK,-0.014913,2024-02-27


In [64]:
# Method to find nearest date to a given pivot
def nearest(items, pivot):
    return min(items, key=lambda x: abs(x - pivot))

### Trade mit XGBOOST

In [219]:
def trade(test, regressor):
    returns = {}
    for index, row in test.iterrows():
        x = np.array(test[test['SYMBOL'] == row['SYMBOL']].drop(non_test_variables, axis=1))
        prediction_2y = regressors[regressor].predict(x)[0]
        # print(f"prediction 2 Years for stock{row['SYMBOL']} : {prediction_2y}")

        df = stock_data[row['SYMBOL']]
        buy_data = df[df['date'] == row['date']]
        entry_price = (buy_data['high'].values[0] + buy_data['low'].values[0])/2

        sell_date = row['date'] + np.timedelta64(730,'D')
        # Get the first date 2 years after entring the market
        sell_pivot = nearest(np.array(df['date']), sell_date)
        sell_data = df[df['date'] == sell_pivot]
        exit_price = (sell_data['high'].values[0] + sell_data['low'].values[0])/2

        profit = 0
        profit_percentage = 0
        result = {}
        if(prediction_2y > 0):
            # long
            profit = exit_price - entry_price
            profit_percentage = profit/ entry_price * 100
        else:
            # short
            profit = entry_price - exit_price
            profit_percentage = profit/ entry_price * 100
            

        result['entry_price'] = entry_price
        result['exit_price'] = exit_price
        result['strategy'] = 'long' if prediction_2y > 0 else 'short'
        result['profit'] = profit
        result['profit_percentage'] = profit_percentage
        # print(result)
        # returns.put(np.array(result))
        returns[row['SYMBOL']] = result
    return(returns)

{'RESN': {'entry_price': 3.15,
  'exit_price': 2.406,
  'strategy': 'short',
  'profit': 0.7439999999999998,
  'profit_percentage': 23.619047619047613},
 'AXDX': {'entry_price': 203.3,
  'exit_price': 60.4,
  'strategy': 'short',
  'profit': 142.9,
  'profit_percentage': 70.29021151008362},
 'UEEC': {'entry_price': 1.675,
  'exit_price': 0.96,
  'strategy': 'short',
  'profit': 0.7150000000000001,
  'profit_percentage': 42.68656716417911},
 'GRNF': {'entry_price': 11.94,
  'exit_price': 0.16,
  'strategy': 'short',
  'profit': 11.78,
  'profit_percentage': 98.65996649916248},
 'BSGM': {'entry_price': 69.32,
  'exit_price': 37.11,
  'strategy': 'short',
  'profit': 32.209999999999994,
  'profit_percentage': 46.465666474321985},
 'SOLY': {'entry_price': 6.74,
  'exit_price': 22.32,
  'strategy': 'short',
  'profit': -15.58,
  'profit_percentage': -231.15727002967355},
 'MMAT': {'entry_price': 277.0,
  'exit_price': 463.0,
  'strategy': 'long',
  'profit': 186.0,
  'profit_percentage': 67

### Random Trade

In [66]:
returns_rnd = {}
for index, row in test.iterrows():
    x = np.array(test[test['SYMBOL'] == row['SYMBOL']].drop(non_test_variables, axis=1))
    # prediction_2y = regressors['2Years'].predict(x)[0]
    rand = random.randint(0,1)
    # print(f"prediction 2 Years for stock{row['SYMBOL']} : {prediction_2y}")

    df = stock_data[row['SYMBOL']]
    buy_data = df[df['date'] == row['date']]
    entry_price = (buy_data['high'].values[0] + buy_data['low'].values[0])/2

    sell_date = row['date'] + np.timedelta64(730,'D')
    # Get the first date 2 years after entring the market
    sell_pivot = nearest(np.array(df['date']), sell_date)
    sell_data = df[df['date'] == sell_pivot]
    exit_price = (sell_data['high'].values[0] + sell_data['low'].values[0])/2

    profit = 0
    profit_percentage = 0
    result = {}
    if(rand == 0):
        # long
        profit = exit_price - entry_price
        profit_percentage = profit/ entry_price * 100
    else:
        # short
        profit = entry_price - exit_price
        profit_percentage = profit / entry_price * 100
        

    result['entry_price'] = entry_price
    result['exit_price'] = exit_price
    result['strategy'] = 'long' if prediction_2y > 0 else 'short'
    result['profit'] = profit
    result['profit_percentage'] = profit_percentage
    # print(result)
    # returns.put(np.array(result))
    returns_rnd[row['SYMBOL']] = result

{'GRNF': {'entry_price': 11.94, 'exit_price': 0.16, 'strategy': 'short', 'profit': 11.78, 'profit_percentage': 98.65996649916248}, 'BSGM': {'entry_price': 69.32, 'exit_price': 37.11, 'strategy': 'short', 'profit': 32.209999999999994, 'profit_percentage': 46.465666474321985}, 'MMAT': {'entry_price': 277.0, 'exit_price': 463.0, 'strategy': 'short', 'profit': -186.0, 'profit_percentage': -67.14801444043322}, 'CFMS': {'entry_price': 61.125, 'exit_price': 20.9125, 'strategy': 'short', 'profit': -40.2125, 'profit_percentage': -65.78732106339467}, 'AMSC': {'entry_price': 13.425, 'exit_price': 19.035, 'strategy': 'short', 'profit': -5.609999999999999, 'profit_percentage': -41.7877094972067}, 'TTOO': {'entry_price': 17175.0, 'exit_price': 7950.0, 'strategy': 'short', 'profit': -9225.0, 'profit_percentage': -53.7117903930131}, 'NBEVQ': {'entry_price': 6.63, 'exit_price': 3.575, 'strategy': 'short', 'profit': -3.0549999999999997, 'profit_percentage': -46.07843137254902}, 'HSDT': {'entry_price': 1

In [175]:
def evalute_overall_profit(df):
    total = 0
    positive_trades = 0
    negative_trades = 0
    for symbol in df: 
        total = total + df[symbol]['profit_percentage']
        if df[symbol]['profit_percentage'] > 0:
            positive_trades += 1
        else:
            negative_trades += 1

    return({'total': total, 'positive_trades':positive_trades, 'negative_trades':negative_trades})


In [214]:
rnd_res = list(evalute_overall_profit(returns_rnd).values())
xgb_res = list(evalute_overall_profit(returns_xgb).values())
print("Random profit: \033[91m{0}%\033[0m \nRandom postive trades: \033[91m{1}\033[0m\nRandom negative trades: \033[91m{2}\033[0m".format(round(rnd_res[0],1),rnd_res[1],rnd_res[2]))
print("XGB profit: \033[92m{0}%\033[0m \nXGB postive trades: \033[92m{1}\033[0m\nXGB negative trades: \033[92m{2}\033[0m".format(round(xgb_res[0],1),xgb_res[1],xgb_res[2]))

Random profit: [91m-651.4%[0m 
Random postive trades: [91m32[0m
Random negative trades: [91m32[0m
XGB profit: [92m2004.9%[0m 
XGB postive trades: [92m39[0m
XGB negative trades: [92m25[0m


# Train and test only on 2Years

## Create Testsplit

In [216]:
sorted_stock_list = stock_list_2y.sort_values(['date'])
train_2y, test_2y = train_test_split(stock_list_2y, test_size=0.3, shuffle = False)
# Making sure we got no overlap
print("First date of \033[92mtraining\033[0m data: {0}".format(train['date'].iloc[0]))
print("Last date of \033[92mtraining\033[0m data:  {0}".format(train['date'].iloc[-1]))
print("First date of \x1b[31mtest\x1b[0m data:     {0}".format(test['date'].iloc[0]))
print("Last date of \x1b[31mtest\x1b[0m data:      {0}".format(test['date'].iloc[-1]))

First date of [92mtraining[0m data: 2021-12-30 00:00:00
Last date of [92mtraining[0m data:  2019-11-20 00:00:00
First date of [31mtest[0m data:     2019-10-10 00:00:00
Last date of [31mtest[0m data:      2017-06-15 00:00:00


In [217]:
x_train = np.array(train_2y.drop(non_test_variables,axis=1))
y_train = np.array(train_2y['2Years'])

x_test = np.array(test_2y.drop(non_test_variables,axis=1))
y_test = np.array(test_2y['2Years'])

## Train
* Train only on 2 Year dataset

In [218]:
# Define classifier
regressor_2y = xgb.XGBRegressor()
# Train on trainsplit
regressor_2y.fit(x_train,y_train)

## Evaluate
* Evaluate performance of 2y Regressor

## Backtesting

In [69]:
# Create a Stratey
class TestStrategy(bt.Strategy):

    def log(self, txt, dt=None):
        ''' Logging function for this strategy'''
        dt = dt or self.datas[0].datetime.date(0)
        print(f'{dt} | {txt}')
        # print('%s, %s' % (dt.isoformat(), txt))

    def __init__(self):
        # Keep a reference to the "close" line in the data[0] dataseries
        self.dataclose = self.datas[0].close
        self.log_returns = self.datas[0].log_returns
        self.not_executed = True

        # To keep track of pending orders
        self.order = None

    def notify_order(self, order):
        if order.status in [order.Submitted, order.Accepted]:
            # Buy/Sell order submitted/accepted to/by broker - Nothing to do
            return

        # Check if an order has been completed
        # Attention: broker could reject order if not enough cash
        if order.status in [order.Completed]:
            if order.isbuy():
                self.log('BUY EXECUTED, %.2f' % order.executed.price)
            elif order.issell():
                self.log('SELL EXECUTED, %.2f' % order.executed.price)

            self.bar_executed = len(self)

        elif order.status in [order.Canceled, order.Margin, order.Rejected]:
            self.log('Order Canceled/Margin/Rejected')

        # Write down: no pending order
        self.order = None

    def next(self):
        # Simply log the closing price of the series from the reference
        # self.log('Close, %.2f' % self.dataclose[0]) 
        if self.order:
            return
        # trigger buy one day after  short selling repport released
        # TODO make it one day after not the day and remove the billion convertions
        if(np.datetime64(test.iloc[0]['date']) == np.datetime64(self.datas[0].datetime.date(0))):
            # use XGB to predict if it falls or raises in 2 Years
            prediction_2y = regressors['2Years'].predict(np.array(test[test['SYMBOL'] == test.iloc[0][0]].drop(non_test_variables,axis=1)))
            if(prediction_2y > 0):
                self.order = self.buy(size=1)
            else:
                self.order = self.sell(size=1)

        if(np.datetime64(self.datas[0].datetime.date(0)) >= (np.datetime64(test.iloc[0]['date']) + np.timedelta64(365,'D')) and self.not_executed):
            self.order = self.buy(size=1)
            self.not_executed = False

        # Check if an order is pending ... if yes, we cannot send a 2nd one
 
exec(TestStrategy, test.iloc[0][0])

  exec(TestStrategy, test.iloc[0][0])


TypeError: exec() globals must be a dict, not str

### Custom Data feed
We make a custom data feed to inculde the log_returns in our data. We need this to predict. 

In [None]:
class CustomFeed(bt.feeds.PandasData):
    lines = ('datetime', 'open', 'high', 'low',
             'close', 'volume', 'log_returns')
    params = (
        ('datetime', 'date'),
        ('openinterest',None),
        ('open', -1),
        ('high', -1),
        ('low', -1),
        ('close', -1),
        ('volume', -1),
        ('log_returns', -1)
    ) 

In [None]:
def exec(strategy, symbol):
    cerebro = bt.Cerebro()
    # Add a strategy
    cerebro.addstrategy(strategy)    
    data = CustomFeed(dataname=stock_data[symbol])
    
    # Add the Data Feed to Cerebro
    cerebro.adddata(data)

    cerebro.broker.setcash(10000.0)

    print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())

    cerebro.run()

    print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())

    plot = cerebro.plot()
    # plot[0].savefig("figure_{}.png".format("index")) 
    for index, figure in enumerate(plot[0]):
        # return figure
        figure.savefig("figure_{}.png".format(index)) 

# ---------------------------------
# for test stocks run strategy
# in strategy if stock predict in 2 years up then buy else short
# cash out after 2 years

exec(TestStrategy, test.iloc[0][0])
# print(test.iloc[0]['date'])

# for index, row in test.iterrows():
    # exec(TestStrategy, row['SYMBOL'])
# print(stock_data)
# stock_data['ABAT']['date'] = stock_data['ABAT']['date'].apply(lambda x: np.datetime64(x))
# exec(TestStrategy, stock_data['ABAT'])

  exec(TestStrategy, test.iloc[0][0])


Starting Portfolio Value: 10000.00


  prediction_2y = regressors['2Years'].predict(np.array(test[test['SYMBOL'] == test.iloc[0][0]].drop(non_test_variables,axis=1)))


2020-04-23 | SELL EXECUTED, 87.99
2021-04-23 | BUY EXECUTED, 161.71
Final Portfolio Value: 9926.28


<IPython.core.display.Javascript object>