In [7]:
# Remove Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message=".*use_label_encoder.*")

# General
import numpy as np

# Data Management
import pandas as pd
from sklearn.model_selection import train_test_split

# Machine Learning
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# binary classification for specific metrics
from sklearn.metrics import plot_roc_curve


# General Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix    
from sklearn.metrics import ConfusionMatrixDisplay

# Reporting
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams



# Data Management

In [8]:
# Load the BTC-USD CSV and index the rows by their "Date" column
df = pd.read_csv("data/BTC-USD.csv").set_index("Date")

# Target Algo

In [9]:
# Build the label from the NEXT row's Range versus this row's AVG_Range:
#   1 -> next Range is above AVG_Range
#   0 -> next Range is at or below AVG_Range
# Rows where either side is NaN (the last row has no successor) match
# neither mask, so their TARGET stays NaN.
next_range = df["Range"].shift(-1)
df.loc[next_range > df["AVG_Range"], "TARGET"] = 1
df.loc[next_range <= df["AVG_Range"], "TARGET"] = 0

# Clean Data

In [10]:
# Locate NaN cells as a (row_positions, column_positions) pair of arrays.
# np.isnan only works on numeric data, so this expects df to be all-numeric.
nan_location = np.where(np.isnan(df))

# Fill missing targets with 0 by assigning the filled column back to df.
# The previous form, df["TARGET"].fillna(0, inplace=True), is chained
# assignment through an inplace method: under pandas Copy-on-Write it updates
# a temporary Series and leaves df untouched. pandas 2.2 announces this with a
# FutureWarning, which this notebook silences in its import cell.
# NOTE(review): the NaN targets are rows whose next-row comparison was
# undefined (e.g. the final row); labelling them 0 is a modelling choice --
# consider dropping those rows instead.
df["TARGET"] = df["TARGET"].fillna(0)
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Returns,Range,Bench_C_Rets,RSI,RSI_Return,...,DOW,Rolling_rets,AVG_Range,Returns_T1,Range_T1,Rolling_rets_T1,Returns_T2,Range_T2,Rolling_rets_T2,TARGET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-05-23,0.003965,0.014381,0.010036,27225.726562,0.238807,0.013945,0.023065,3.144802,46.178235,1.112756,...,1,-0.009218,0.032783,0.003643,0.018682,-0.031291,-0.013851,0.020931,-0.015115,1.0
2023-05-24,0.013727,-0.007657,-0.026462,26334.818359,0.189959,-0.032723,0.042826,3.008356,38.323938,0.829913,...,2,-0.039547,0.033091,0.013945,0.023065,-0.009218,0.003643,0.018682,-0.031291,0.0
2023-05-25,-0.03288,-0.023254,-0.008273,26476.207031,-0.150191,0.005369,0.027073,3.029568,40.066179,1.045461,...,3,-0.062598,0.032569,-0.032723,0.042826,-0.039547,0.013945,0.023065,-0.009218,0.0
2023-05-26,0.005497,0.012228,0.01751,26719.291016,-0.082268,0.009181,0.02174,3.0667,43.045036,1.074348,...,4,-0.057483,0.030035,0.005369,0.027073,-0.062598,-0.032723,0.042826,-0.039547,0.0
2023-05-27,0.009292,-0.001032,0.010522,26868.353516,-0.37915,0.005579,0.010058,3.089815,44.855043,1.042049,...,5,-0.088884,0.028646,0.009181,0.02174,-0.057483,0.005369,0.027073,-0.062598,0.0


# Train Test Split (timeseries method)

In [13]:
# Keep only the model inputs plus the label. Don't let any forward-looking
# column through: the target is the only place future information belongs.
model_columns = ["Returns", "Range", "RSI", "RSI_Return", "DOW", "AVG_Range", "TARGET"]
df_tts = df.copy()[model_columns]

In [16]:
# Features (X) are every column except the last; the last column is the label (y)
label_position = df_tts.shape[1] - 1
X = df_tts.iloc[:, :label_position]
y = df_tts.iloc[:, label_position]

In [18]:
# Positional (time-series style) split: the first 70% of rows train the model
# and the remaining rows test it. Nothing is shuffled, so the test rows always
# follow the training rows in the frame's existing order.
train_size_rate = 0.7
train_size = int(len(X) * train_size_rate)
test_size = len(X) - train_size

X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

# Sanity check: both partitions together must account for every row
size_check = (len(y_test) + len(y_train)) == len(X)
print(size_check)

True
