# Feature Selection

In [24]:
# Remove Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# General
import numpy as np


# Data Management
import pandas as pd
from sklearn.model_selection import train_test_split


# Machine Learning
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


# BINARY CLASSIFICATION METRICS
from sklearn.metrics import RocCurveDisplay


# Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


# Reporting
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from xgboost import plot_tree

# Import Processed Data

In [25]:
# Load the processed BTC-USD table from disk and index it by its "Date" column
df = pd.read_csv("data/BTC-USD.csv").set_index("Date")

# Specify Prediction Target

In [26]:
# Specify Target
# TARGET is 1.0 when the NEXT row's Range is above this row's AVG_Range and 0.0 when it
# is at or below it. Rows where either side of the comparison is NaN stay NaN for now;
# the last row always is, because shift(-1) has no following row to pull from.
next_range = df["Range"].shift(-1)
comparable = next_range.notna() & df["AVG_Range"].notna()
df["TARGET"] = (next_range > df["AVG_Range"]).astype(float).where(comparable)

# Check for NAN
# Captured before the fill below so it still records the rows whose label is missing.
nan_location = np.where(np.isnan(df))

# Fill NaNs
# Assign the filled column back explicitly: calling fillna(..., inplace=True) on
# df["TARGET"] is chained assignment and does not update df under pandas Copy-on-Write.
# NOTE(review): the last row has no next-day Range, so its 0.0 label is a placeholder
# rather than an observed outcome — consider dropping that row before training.
df["TARGET"] = df["TARGET"].fillna(0)
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Returns,Range,Bench_C_Rets,RSI,RSI_Return,...,DOW,Rolling_rets,AVG_Range,Returns_T1,Range_T1,Rolling_rets_T1,Returns_T2,Range_T2,Rolling_rets_T2,TARGET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-08-28,0.003129,0.001269,-0.003254,26106.150391,0.591434,0.000631,0.012286,2.966457,27.514966,1.013479,...,0,-0.113528,0.021405,0.003123,0.007713,-0.112874,-0.001505,0.004753,-0.112281,1.0
2023-08-29,0.000493,0.07217,0.001238,27727.392578,1.669173,0.062102,0.084002,3.212583,52.713963,1.915829,...,1,-0.048646,0.023765,0.000631,0.012286,-0.113528,0.003123,0.007713,-0.112874,1.0
2023-08-30,0.062201,-0.011719,0.044634,27297.265625,-0.443495,-0.015513,0.025525,3.147943,47.951112,0.909647,...,2,-0.062615,0.024205,0.062102,0.084002,-0.048646,0.000631,0.012286,-0.113528,1.0
2023-08-31,-0.015298,-0.010954,-0.048626,25931.472656,0.234791,-0.050034,0.066134,2.940792,36.632746,0.76396,...,3,-0.127894,0.025225,-0.015513,0.025525,-0.062615,0.062102,0.084002,-0.048646,1.0
2023-09-01,-0.050103,-0.048449,-0.015156,25800.724609,-0.147571,-0.005042,0.030094,2.92095,35.762488,0.976244,...,4,-0.115286,0.025029,-0.050034,0.066134,-0.127894,-0.015513,0.025525,-0.062615,0.0


# TRAIN TEST SPLIT

In [27]:
# Keep only the model inputs plus the label. Do not include columns that carry
# information about the future, other than the target itself.
df_tts = df.loc[:, ["Returns", "Range", "RSI", "DOW", "AVG_Range", "TARGET"]].copy()

In [28]:
# Label (y): the final column of df_tts
y = df_tts[df_tts.columns[-1]]

# Features (X): every column before the final one
X = df_tts[df_tts.columns[:-1]]
df_tts

Unnamed: 0_level_0,Returns,Range,RSI,DOW,AVG_Range,TARGET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-02,-0.005165,0.027227,68.352715,6,0.033705,0.0
2020-02-03,-0.005441,0.031544,66.532818,0,0.034219,0.0
2020-02-04,-0.012112,0.023972,62.561591,1,0.034370,1.0
2020-02-05,0.047104,0.058666,69.976461,2,0.034649,0.0
2020-02-06,0.012106,0.029854,71.606200,3,0.033885,0.0
...,...,...,...,...,...,...
2023-08-28,0.000631,0.012286,27.514966,0,0.021405,1.0
2023-08-29,0.062102,0.084002,52.713963,1,0.023765,1.0
2023-08-30,-0.015513,0.025525,47.951112,2,0.024205,1.0
2023-08-31,-0.050034,0.066134,36.632746,3,0.025225,1.0


In [29]:
# Positional train/test split for time-series data: rows are cut by position, not shuffled

# Share of the rows, counted from the top of X, that goes to training the model
train_amount_percent = 0.7
train_size = int(len(X) * train_amount_percent)

# Whatever is left after the training rows is held out for testing
test_size = len(X) - train_size

# Leading `train_size` rows -> training set
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]

# Trailing `test_size` rows -> test set used to measure accuracy
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

In [30]:
# Report the shapes of both partitions, each pair followed by a blank line,
# then confirm that the two target partitions together account for every row of X.
size_check = len(X) == len(y_train) + len(y_test)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}", end="\n\n")

print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}", end="\n\n")

print(f"Testing data size check: {size_check}")

Shape of X_train: (915, 5)
Shape of y_train: (915,)

Shape of X_test: (393, 5)
Shape of y_test: (393,)

Testing data size check: True


# MODEL FITTING


In [None]:
# fit model
