# Feature Selection

In [None]:
import json


# Remove Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# General
import numpy as np


# Data Management
import pandas as pd
from sklearn.model_selection import train_test_split


# Machine Learning
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


# BINARY CLASSIFICATION METRICS
from sklearn.metrics import RocCurveDisplay


# Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


# Reporting
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from xgboost import plot_tree

# Import Processed Data

In [None]:
# Load the BTC-USD dataset from disk and index the rows by their "Date" column.
df = pd.read_csv("data/BTC-USD.csv").set_index("Date")

# Specify Prediction Target

In [None]:
# Specify Target: 1 when the NEXT row's Range is greater than the current
# row's AVG_Range, otherwise 0. shift(-1) moves the following row's value onto
# the current row, so the label describes the upcoming period, not the
# previous one (next day, assuming one row per day in date order).
next_range = df["Range"].shift(-1)
df.loc[next_range > df["AVG_Range"], "TARGET"] = 1
df.loc[next_range <= df["AVG_Range"], "TARGET"] = 0

# Check for NaN: row / column positions of every missing value.
# DataFrame.isna() is used instead of np.isnan(df) so the check also works
# when the frame contains non-numeric columns.
nan_location = np.where(df.isna())

# Fill NaNs: rows where either side of the comparison is NaN (for example the
# last row, which has no following Range) match neither condition above and
# are labelled 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# df["TARGET"] selection: that chained in-place form is deprecated in pandas
# and no longer updates df once Copy-on-Write is enabled (the FutureWarning
# filter at the top of this notebook hides the deprecation message).
df["TARGET"] = df["TARGET"].fillna(0)
df.tail()

# TRAIN TEST SPLIT

In [None]:
# Read the feature names saved by the feature-selection step and put the
# label column at the end so the target travels with the selected features.
with open('./data/feature_selection_output.json', 'r') as f:
    recommended_labels = json.load(f) + ["TARGET"]
recommended_labels

In [None]:
# Keep only the recommended columns (features + TARGET) as an independent copy.
# Nothing describing the future may remain in this frame apart from the target.
df_tts = df[recommended_labels].copy()

In [None]:
# Split the frame by column position: the last column is the label,
# every column before it is a feature.
target_position = df_tts.shape[1] - 1

# FEATURES (X)
X = df_tts.iloc[:, :target_position]

# TARGET (y)
y = df_tts.iloc[:, target_position]
df_tts

In [None]:
# TRAIN TEST SPLIT (time series)
# Rows are split by position rather than shuffled: the leading block trains
# the model and the trailing block is held out for testing.

# Share of the rows used for training (70%)
train_amount_percent = 0.7
train_size = int(len(X) * train_amount_percent)
test_size = len(X) - train_size

# Leading rows -> training
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]

# Remaining rows -> testing the model for accuracy
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

In [None]:
# Sanity check: the two target partitions together must cover every row of X.
size_check = len(y_train) + len(y_test) == len(X)

# Report the shape of each partition, then the check result
# (the empty strings produce the blank separator lines).
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of y_train: {y_train.shape}",
    "",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_test: {y_test.shape}",
    "",
    f"Testing data size check: {size_check}",
    sep="\n",
)

# BUILD MODEL

In [None]:
# Model-type switches: binary vs multi-class target, and whether the binary
# model should be tuned for precision.
is_binary, is_optimize_for_precision = True, True


In [None]:
# Determine Objective and Eval Metrics from the model type.
# Each tuple is (objective, eval metric, error-rate metric).
objective, eval_metric, error_metric = (
    ("binary:logistic", "logloss", "error")
    if is_binary
    else ("multi:softmax", "mlogloss", "merror")
)

# Metrics tracked while fitting: error rate, log loss, then the selected eval
# metric (which at this point repeats the log-loss entry).
eval_metric_list = [error_metric, eval_metric, eval_metric]

In [None]:
# Refine Eval Metric and choose the matching scoring name.
# Multi-class models keep their eval metric and are scored on accuracy;
# binary models switch metric depending on the precision flag.
if not is_binary:
    scoring = "accuracy"
elif is_optimize_for_precision:
    eval_metric = "aucpr"
    scoring = "precision"
else:
    eval_metric = "auc"
    scoring = "f1"

In [None]:
# Load the tuned hyperparameters stored in hyperparametertuning.json
with open('./data/hyperparametertuning.json', 'r') as f:
    data = json.load(f)

# ne / lr / md / gm are handed to the classifier as n_estimators,
# learning_rate, max_depth and gamma respectively.
ne, lr, md, gm = (data[key] for key in ('ne', 'lr', 'md', 'gm'))



In [None]:
# Build First Classifier Model
# Tuned values loaded from the hyperparameter file
tuned_params = dict(
    n_estimators=ne,
    learning_rate=lr,
    max_depth=md,
    gamma=gm,
)

# Fixed settings plus the tuned values; eval_metric is not passed to the
# constructor here.
classifier = XGBClassifier(
    objective=objective,
    booster="gbtree",
    subsample=0.8,
    colsample_bytree=1,
    random_state=1,
    use_label_encoder=False,
    **tuned_params,
)

In [None]:
# Fit the model
# Evaluation sets scored during training: the training data first, then the test data.
eval_set = [(X_train, y_train), (X_test, y_test)]

# Register the evaluation metrics on the estimator instead of passing
# eval_metric to fit(): that fit() argument has been deprecated since
# XGBoost 1.6 and was removed in later releases, where passing it raises a
# TypeError. set_params() is accepted by both older and current versions.
classifier.set_params(eval_metric=eval_metric_list)

classifier.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=False,
)


# MAKE PREDICTIONS
- Binary prediction: 1 if the next day's range is expected to exceed the average range, otherwise 0

In [None]:
# Predictions on the training data: class labels and per-class probabilities
next_day_target_predictions = classifier.predict(X_train)
train_yhat_probability = classifier.predict_proba(X_train)

# OUTPUT: header, then the first ten (prediction, probabilities) pairs
print("Predcition \t|\t Binary Confidence (zero or one)")
print("___________________________________________________\n")
prediction_probabilities = zip(
    next_day_target_predictions[:10],
    train_yhat_probability[:10],
)
for prediction, probability_confidence in prediction_probabilities:
    print(prediction, "\t\t|\t", probability_confidence)

# TEST MODEL ACCURACY

In [None]:
# Predictions on the held-out test data: class labels and per-class probabilities
test_next_day_target_predictions = classifier.predict(X_test)
test_yhat_probability = classifier.predict_proba(X_test)

# OUTPUT: header, then the first ten (prediction, probabilities) pairs
print("Predcition \t|\t Binary Confidence (zero or one)")
print("___________________________________________________\n")
prediction_probabilities = zip(
    test_next_day_target_predictions[:10],
    test_yhat_probability[:10],
)
for prediction, probability_confidence in prediction_probabilities:
    print(prediction, "\t\t|\t", probability_confidence)

# K FOLD CROSS VALIDATION

In [None]:
# Cross-validation splitter: 5 stratified folds, a single repetition, fixed seed
cv = RepeatedStratifiedKFold(
    n_repeats=1,
    n_splits=5,
    random_state=1,
)

In [None]:
# Training results: cross-validated score of the classifier on the training data.
# The scoring name selected when the eval metric was refined ("precision",
# "f1" or "accuracy") is passed explicitly; without it cross_val_score uses the
# estimator's default score and the precision / f1 choice is silently ignored.
train_results = cross_val_score(
    classifier,
    X_train,
    y_train,
    scoring=scoring,
    cv=cv,
    n_jobs=1,
)

# EVALUATION METRICS - Loss & Overfitting