## Imports

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn import metrics

import pandas as pd
import numpy as np

import backtrader as bt

## Data laoding from CSV
### Stockdata
* Load data from CSV

In [3]:
stock_list = pd.read_csv('csv/result_table.csv',sep=',')
u_symbol = stock_list['SYMBOL'].unique()
date = pd.Series(stock_list['Date'])
stock_list.drop(['Date'],axis=1,inplace=True)
stock_list['date'] = date.apply(lambda x: np.datetime64(x))

stock_list

Unnamed: 0,SYMBOL,1Day,1Week,1Month,3Months,6Months,1Year,2Years,Day1Prior,Day2Prior,...,Month1Prior,Month2Prior,Month3Prior,Month4Prior,Month5Prior,Month6Prior,Month7Prior,Year1Prior,Year2Prior,date
0,ALT,-0.006659,-0.087680,,,,,,0.065012,-0.133654,...,-0.249167,0.174707,2.817797,2.412879,2.085616,2.312500,1.815625,-0.272213,0.202937,2024-02-14
1,JOBY,0.008170,0.024510,-0.151961,0.004902,,,,-0.067073,-0.055556,...,-0.159341,-0.202086,-0.420455,-0.216389,0.313305,0.481840,0.614776,0.478261,-0.322259,2023-10-11
2,TLRY,0.004065,-0.024390,-0.223577,-0.195122,,,,-0.039062,-0.160410,...,-0.068182,0.464286,0.556962,-0.027668,0.069565,-0.075188,-0.115108,-0.236025,-0.797864,2023-09-19
3,CVNA,-0.004679,0.000425,0.602722,1.161208,0.680136,,,0.105833,0.232826,...,1.239048,1.620959,2.334752,1.133394,2.349003,3.867495,1.367573,0.117927,-0.914403,2023-06-13
4,UEC,0.007092,0.056738,-0.106383,0.209220,0.914894,,,0.032967,0.032967,...,-0.203390,-0.286076,-0.255937,-0.267532,-0.331754,-0.124224,-0.369128,-0.422131,-0.075410,2023-03-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,CCRC,-0.002417,-0.016116,-0.033038,-0.140210,-0.103143,-0.157937,-0.629331,0.001614,0.044613,...,0.035893,0.299476,0.453162,0.411832,-0.376068,-0.328100,-0.250151,-0.108477,-0.119233,2018-11-15
444,SRNEQ,0.119565,0.195652,0.369565,0.543478,-0.032609,0.039130,-0.606522,-0.269841,-0.342857,...,-0.480226,-0.406452,0.260274,1.044444,0.862348,1.628571,1.555556,0.150000,-0.065041,2018-03-28
445,MARK,-0.056402,0.292683,0.350610,-0.076220,-0.376524,-0.696646,-0.914634,-0.056115,-0.290811,...,-0.497318,-0.257919,0.935103,1.122977,1.411765,1.385455,1.095847,1.030960,0.623762,2018-02-07
446,IPO.L,-0.044976,-0.031098,-0.065756,-0.294122,-0.108625,-0.241517,-0.507230,-0.003475,-0.003475,...,0.039580,0.010499,0.151426,0.075150,0.004892,0.078351,0.010499,-0.085454,-0.324780,2017-12-18


In [4]:
print("Number of rows before elimination of NaN: {0}".format(len(stock_list)))
stock_list.dropna(inplace=True)
print("Number of rows after elimination of NaN: {0}".format(len(stock_list)))


Number of rows before elimination of NaN: 448
Number of rows after elimination of NaN: 219


## Create Testsplit
* We create a testsplit at a certain date which allows for roughly 70% of the data to be before it for training.
* The other 30% will be after the date, we ensure this way that when we do out of sample testing that the training wasnt
"poisoned" with future data.

In [71]:
train, test = train_test_split(stock_list, test_size=0.3)
non_test_variables = ['SYMBOL','1Day','1Week','1Month','3Months','6Months','1Year','2Years','date']
predict_variables = ['1Day','1Week','1Month','3Months','6Months','1Year','2Years']

x_train = np.array(train.drop(non_test_variables,axis=1))
y_train = {}
for variable in predict_variables:
    y_train[variable] = np.array(train[variable])

x_test = np.array(test.drop(non_test_variables,axis=1))
y_test ={}
for variable in predict_variables:
    y_test[variable] = np.array(test[variable])


<class 'numpy.ndarray'>


## Train

In [72]:
regressors = {}
for variable in predict_variables: 
    # Define classifier
    clf = xgb.XGBRegressor()
    # Train on trainsplit
    clf.fit(x_train,y_train[variable])
    regressors[variable] = clf

## Test

In [81]:
# predict outsample
for variable in predict_variables: 
    y_pred_insample = regressors[variable].predict(x_train)
    mse = mean_squared_error(y_pred_insample, y_train[variable])
    print("Mean squared error insample for variable:{0} = {1}".format(variable, mse))

print('\n')

# predict outsample
for variable in predict_variables: 
    y_pred_outsample = regressors[variable].predict(x_test)
    mse = mean_squared_error(y_pred_outsample, y_test[variable])
    print("Mean squared error outsample for variable:{0} = {1}".format(variable, mse))



Mean squared error insample for variable:1Day = 5.700264498381292e-07
Mean squared error insample for variable:1Week = 5.892107171331623e-07
Mean squared error insample for variable:1Month = 4.198577298302936e-07
Mean squared error insample for variable:3Months = 4.0407973417375916e-07
Mean squared error insample for variable:6Months = 5.354930338501452e-07
Mean squared error insample for variable:1Year = 4.831673234343952e-07
Mean squared error insample for variable:2Years = 6.006955387847057e-07


Mean squared error outsample for variable:1Day = 0.0038773734282723012
Mean squared error outsample for variable:1Week = 0.025937961998184884
Mean squared error outsample for variable:1Month = 0.061074249231457683
Mean squared error outsample for variable:3Months = 0.23455094453515885
Mean squared error outsample for variable:6Months = 0.43674831914709744
Mean squared error outsample for variable:1Year = 2.432436314221949
Mean squared error outsample for variable:2Years = 2.598652775005795
