# Feature Selection

In [42]:
import json


# Remove Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# General
import numpy as np


# Data Management
import pandas as pd
from sklearn.model_selection import train_test_split


# Machine Learning
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


# BINARY CLASSIFICATION METRICS
from sklearn.metrics import RocCurveDisplay


# Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


# Reporting
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from xgboost import plot_tree

# Import Processed Data

In [43]:
# Load the processed BTC-USD data and use its "Date" column as the row index.
df = pd.read_csv("data/BTC-USD.csv").set_index("Date")

# Specify Prediction Target

In [44]:
# Specify Target
# TARGET = 1 when the NEXT row's Range is above this row's AVG_Range, 0 when it
# is not. Rows where the comparison involves a NaN (for example the last row,
# which has no next row to shift in) match neither mask and stay NaN until the
# fill step below.
next_range = df["Range"].shift(-1)
df["TARGET"] = np.nan  # start from a clean column so re-running the cell gives the same result
df.loc[next_range > df["AVG_Range"], "TARGET"] = 1
df.loc[next_range <= df["AVG_Range"], "TARGET"] = 0

# Check for NaN: (row positions, column positions) of every missing value.
# DataFrame.isna() is used so the scan also works if a non-numeric column is present.
nan_location = np.where(df.isna())

# Fill NaNs
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df["TARGET"] selection: under pandas Copy-on-Write that chained in-place call
# acts on a temporary object and leaves df unchanged.
# NOTE(review): this labels the unlabeled rows (including the final row, whose
# outcome is not known yet) as class 0 - confirm that is intended.
df["TARGET"] = df["TARGET"].fillna(0)
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Returns,Range,Bench_C_Rets,RSI,RSI_Return,...,DOW,Rolling_rets,AVG_Range,Returns_T1,Range_T1,Rolling_rets_T1,Returns_T2,Range_T2,Rolling_rets_T2,TARGET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-09-22,-0.020855,-0.015721,0.004026,26579.568359,-0.208855,0.000449,0.008701,3.038692,50.019039,1.003172,...,4,0.010507,0.023995,-0.020801,0.028937,0.025423,-0.002907,0.019186,0.042684,0.0
2023-09-23,0.000546,-0.003438,0.000943,26579.390625,-0.30004,-7e-06,0.004286,3.038662,50.016509,0.999949,...,5,0.020687,0.023315,0.000449,0.008701,0.010507,-0.020801,0.028937,0.025423,0.0
2023-09-24,3.1e-05,0.003074,-0.011292,26256.826172,0.106441,-0.012136,0.018878,2.989186,45.516063,0.910021,...,6,0.012936,0.023347,-7e-06,0.004286,0.020687,0.000449,0.008701,0.010507,0.0
2023-09-25,-0.01225,-0.011025,-0.007993,26298.480469,0.464424,0.001586,0.015764,2.994949,46.189409,1.014794,...,0,0.016027,0.023715,-0.012136,0.018878,0.012936,-7e-06,0.004286,0.020687,0.0
2023-09-26,0.001561,-0.001197,0.003047,26217.25,-0.167725,-0.003089,0.011467,2.981431,45.020912,0.974702,...,1,0.009815,0.02384,0.001586,0.015764,0.016027,-0.012136,0.018878,0.012936,0.0


# TRAIN TEST SPLIT

In [45]:
# Read the feature names saved by the feature-selection step and add the
# prediction target as the final entry.
with open('./data/feature_selection_output.json', 'r') as labels_file:
    recommended_labels = json.load(labels_file) + ["TARGET"]
recommended_labels

['Range', 'RSI', 'RSI_Return', 'DOW', 'AVG_Range', 'TARGET']

In [46]:
# Keep only the recommended columns (TARGET is the last of them), working on an
# independent copy of df. Don't put information about the future in the data
# other than the target.
df_tts = df[recommended_labels].copy()

In [47]:
# The last column of df_tts is the target; every column before it is a feature.
n_feature_cols = df_tts.shape[1] - 1

# Features (X)
X = df_tts.iloc[:, :n_feature_cols]

# Target (y)
y = df_tts.iloc[:, n_feature_cols]
df_tts

Unnamed: 0_level_0,Range,RSI,RSI_Return,DOW,AVG_Range,TARGET
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-02,0.027227,68.352715,0.975766,6,0.033705,0.0
2020-02-03,0.031544,66.532818,0.973375,0,0.034219,0.0
2020-02-04,0.023972,62.561591,0.940312,1,0.034370,1.0
2020-02-05,0.058666,69.976461,1.118521,2,0.034649,0.0
2020-02-06,0.029854,71.606200,1.023290,3,0.033885,0.0
...,...,...,...,...,...,...
2023-09-22,0.008701,50.019039,1.003172,4,0.023995,0.0
2023-09-23,0.004286,50.016509,0.999949,5,0.023315,0.0
2023-09-24,0.018878,45.516063,0.910021,6,0.023347,0.0
2023-09-25,0.015764,46.189409,1.014794,0,0.023715,0.0


In [48]:
# TRAIN TEST SPLIT (time series)
# Rows are split by position, without shuffling: the leading block trains the
# model and the trailing block tests it.

# Share of the historical data used for training (70%)
train_amount_percent = 0.7
train_size = int(train_amount_percent * len(X))

# Everything after the training block is held out for testing
test_size = len(X) - train_size

X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

In [49]:
# Sanity check: the two target splits together must account for every row of X.
size_check = len(y_test) + len(y_train) == len(X)

# Collect the report lines first, then emit them in a single print call;
# the empty strings produce the blank separator lines.
report_lines = [
    f"Shape of X_train: {X_train.shape}",
    f"Shape of y_train: {y_train.shape}",
    "",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_test: {y_test.shape}",
    "",
    f"Testing data size check: {size_check}",
]
print("\n".join(report_lines))

Shape of X_train: (933, 5)
Shape of y_train: (933,)

Shape of X_test: (400, 5)
Shape of y_test: (400,)

Testing data size check: True


# BUILD MODEL

In [50]:
# Switches that choose which kind of model the following cells optimize for.
# is_optimize_for_precision: for a binary model, picks "aucpr"/"precision"
# rather than "auc"/"f1" as eval metric / scoring.
is_optimize_for_precision = True
# is_binary: binary objective and metrics when True, multi-class ones otherwise.
is_binary = True


In [51]:
# Pick the XGBoost objective and the evaluation metrics for the chosen model type.
# NOTE: eval_metric equals the second list entry at this point, so each list
# ends up naming its log-loss metric twice.
if not is_binary:
    objective, eval_metric = "multi:softmax", "mlogloss"
    eval_metric_list = ["merror", "mlogloss", eval_metric]
else:
    objective, eval_metric = "binary:logistic", "logloss"
    eval_metric_list = ["error", "logloss", eval_metric]

In [52]:
# Refine Eval Metric
# For a binary model, choose the evaluation metric and the scoring name that
# match the optimization goal; any other model is scored on accuracy and keeps
# the eval_metric chosen earlier.
if is_binary and is_optimize_for_precision:
    eval_metric = "aucpr"
    scoring = "precision"
elif is_binary:
    eval_metric = "auc"
    scoring = "f1"
else:
    scoring = "accuracy"

# eval_metric_list was assembled before the refinement above, so without this
# step the refined metric would never be among the metrics handed to the model.
# The membership guard keeps the cell safe to re-run.
if eval_metric not in eval_metric_list:
    eval_metric_list.append(eval_metric)

In [53]:
# Load the saved hyperparameters; they are passed to XGBClassifier below as
# n_estimators (ne), learning_rate (lr), max_depth (md) and gamma (gm).
with open('./data/hyperparametertuning.json', 'r') as f:
    data = json.load(f)

ne, lr, md, gm = data['ne'], data['lr'], data['md'], data['gm']



In [57]:
# Build First Classifier Model
# eval_metric is left unset in the constructor.
classifier = XGBClassifier(
    # Learning task and booster type
    objective=objective,
    booster="gbtree",
    # Hyperparameters loaded from the JSON file
    n_estimators=ne,
    learning_rate=lr,
    max_depth=md,
    gamma=gm,
    # Row / column sampling and reproducibility
    subsample=0.8,
    colsample_bytree=1,
    random_state=1,
    use_label_encoder=False,
)



In [58]:
# Fit the model
# Both the training and the test split are evaluated during fitting.
eval_set = [(X_train, y_train), (X_test, y_test)]

# Register the evaluation metrics on the estimator instead of passing
# eval_metric to fit(): the fit() argument is deprecated in XGBoost.
# NOTE(review): newer XGBoost releases appear to have removed it from fit()
# altogether - confirm against the installed version.
classifier.set_params(eval_metric=eval_metric_list)

classifier.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=False,
)


# MAKE PREDICTIONS