# TOC:
* [Imports](#imports)
* [Util Methods & Constants](#utils)
* [Load Data](#load-data)
    * [Load pre-processed training data](#load-data)
        *   [Split data](#split-data)
    * [Load daily stock data](#load-daily-data)
* [Create test splits](#create-test-splits)


# Imports <a class="anchor" id="imports"></a>

In [50]:
from sklearn.model_selection import train_test_split , ShuffleSplit
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics
import xgboost as xgb
from xgboost import plot_importance

import pandas as pd
import numpy as np
import random

import backtrader as bt

import matplotlib.pyplot as plt
from tabulate import tabulate

# Util methods & Constants <a class="anchor" id="utils"></a>

In [153]:
# Constants
# Target columns, one per prediction horizon.
predict_variables = ['1Month', '3Months', '6Months', '1Year', '2Years']
# Metadata columns.
meta_variables = ['date', 'symbol', 'source']
# Targets followed by metadata.
non_test_variables = [*predict_variables, *meta_variables]
# "<Unit><n>Prior" columns: 6 daily, 3 weekly and 24 monthly, in that order.
train_variables = (
    [f"Day{n}Prior" for n in range(1, 7)]
    + [f"Week{n}Prior" for n in range(1, 4)]
    + [f"Month{n}Prior" for n in range(1, 25)]
)

# Labels returned by classify(): short for negative inputs, long otherwise.
short_value = -1
long_value = 0
# Fraction handed to train_test_split as test_size.
test_size = 0.4


In [154]:
# Util methods

# Find the item closest to a given pivot (e.g. the nearest date)
def nearest(items, pivot):
    """Return the element of *items* with the smallest ``abs(item - pivot)``.

    When several elements are equally close, the first one encountered wins.
    An empty *items* raises ``ValueError`` (propagated from ``min``).
    """
    def distance(candidate):
        return abs(candidate - pivot)

    return min(items, key=distance)

# Classify an input value as either short or long
def classify(x):
    """Return ``short_value`` when *x* is negative, otherwise ``long_value``."""
    return short_value if x < 0 else long_value

# Load data <a class="anchor" id="load-data"></a>
## Load pre-processed training data <a class="anchor" id="load-data"></a>


In [157]:
# Read the pre-processed, classified table.
classified_data = pd.read_csv('csv/table_all_class.csv', sep=',')
u_symbol = classified_data['SYMBOL'].unique()

# Replace the raw 'Date' column with a parsed 'date' column (appended as the
# last column) and rename 'SYMBOL' to 'symbol'.
date = pd.Series(classified_data['Date'])
classified_data = (
    classified_data
    .drop(columns=['Date'])
    .assign(date=date.apply(lambda raw: np.datetime64(raw)))
    .rename(columns={"SYMBOL": "symbol"})
)

# Preview the first two rows.
classified_data.head(2)

Unnamed: 0,symbol,1Month,3Months,6Months,1Year,2Years,Day1Prior,Day2Prior,Day3Prior,Day4Prior,...,Month17Prior,Month18Prior,Month19Prior,Month20Prior,Month21Prior,Month22Prior,Month23Prior,Month24Prior,source,date
0,ALT,,,,,,0.065012,-0.133654,-0.136973,-0.136973,...,-0.340892,-0.50221,-0.199111,-0.001109,0.794821,0.722753,0.371385,0.202937,Kerrisdale Capital,2024-02-14
1,JOBY,short,short,,,,-0.067073,-0.055556,-0.054096,-0.054096,...,0.148218,0.176923,0.256674,0.277662,0.071804,-0.098675,-0.337662,-0.322259,Kerrisdale Capital,2023-10-11


### Split data into individual datasets  <a class="anchor" id="split-data"></a>
* Splitting the data into individual datasets so that each prediction period can be trained on the maximum amount of data, excluding rows with NaN
* We also drop rows with NaN values in the columns we train on

In [158]:
# Row count of the full table, reported next to every per-variable count.
before_drop = len(classified_data)

# Build one dataset per predict variable, dropping only the rows that have NaN
# in that variable or in any of the training columns. A row with NaN for
# 2Years but a value for 6Months therefore still ends up in the 6Months set.
split_classification_data = {
    variable: classified_data.dropna(subset=[variable] + train_variables)
    for variable in predict_variables
}

# Row counts before and after the drop, keyed by predict variable.
classification_data_before_and_after_drop = {
    variable: {'before_drop': before_drop, 'after_drop': len(subset)}
    for variable, subset in split_classification_data.items()
}

for variable, counts in classification_data_before_and_after_drop.items():
    print("Variable: {0}  \t before drop: {1}\t after drop: {2}".format(
        variable, counts['before_drop'], counts['after_drop']))

Variable: 1Month  	 before drop: 448	 after drop: 306
Variable: 3Months  	 before drop: 448	 after drop: 302
Variable: 6Months  	 before drop: 448	 after drop: 288
Variable: 1Year  	 before drop: 448	 after drop: 266
Variable: 2Years  	 before drop: 448	 after drop: 225


## Load daily stock data <a class="anchor" id="load-daily-data"></a>

In [159]:
# Daily data
clean_data = pd.read_csv('csv/clean_data_new.csv',sep=',')
u_symbol = clean_data['symbol'].unique()
date = pd.Series(clean_data['datetime'])
clean_data.drop(['datetime'],axis=1,inplace=True)
clean_data['date'] = date.apply(lambda x: np.datetime64(x))

stock_data = {}
for symbol in u_symbol:
    stock_data[symbol] = clean_data[clean_data['symbol'] == symbol]

stock_data[list(stock_data.keys())[0]][0:2]

Unnamed: 0,open,high,low,close,volume,symbol,log_returns,date
0,32.3,33.58,32.3,32.84,7315000.0,1179.HK,0.056371,2020-09-23
1,34.08,34.2,33.5,33.73,2797160.0,1179.HK,0.02674,2020-09-24


# Create test splits <a class="anchor" id="create-test-splits"></a>
* We create a chronologically sorted train/test split for each prediction variable

In [166]:
classified_test_splits = {}
for variable in predict_variables:
    # We sort the array to prevent having future data in training set. 
    sorted_data = split_classification_data[variable].sort_values(['date'], ascending= True)
    train, test = train_test_split(sorted_data, test_size=test_size, shuffle = False)
    # Making sure we got no overlap
    if(train['date'].iloc[0] >= train['date'].iloc[-1]):
        raise Exception("First date of training data is higher than last date") 
    if(test['date'].iloc[0] >= test['date'].iloc[-1]):
        raise Exception("First date of test data is higher than last date") 
    if(train['date'].iloc[-1] >= test['date'].iloc[0]):
        raise Exception("Overlap of training and test data") 