### Importing libraries and defining file paths

In [14]:
import os
import warnings
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): with no category given, this call ignores *every* warning
# category, so the FutureWarning filter above is redundant and genuine
# problems reported as warnings will also be hidden.
warnings.filterwarnings('ignore')

In [2]:
# Folder that holds every CSV used by this notebook. Set the
# BITCOIN_PRED_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset, the original local folder is used.
BASE_PATH = os.environ.get(
    "BITCOIN_PRED_DATA_DIR",
    r"C:\Users\micha\OneDrive\Pulpit\Project\Bitcoin_pred\data",
)

# All paths are built with os.path.join so that every one of them uses the
# same, platform-appropriate separator (previously "\\" and "/" were mixed).
PATH_TO_ORIGINAL_DATA = os.path.join(BASE_PATH, "Bitcoin.csv")
PATH_TO_CLEANED_DATA = os.path.join(BASE_PATH, "cleaned_data.csv")
PATH_TO_TARGET_DATA = os.path.join(BASE_PATH, "target_data.csv")
PATH_TO_TIME_FEATURES = os.path.join(BASE_PATH, "time_features.csv")
PATH_TO_SPECIFIC_FEATURES = os.path.join(BASE_PATH, "specific_features.csv")

# Destinations for the train / validation / test splits.
PATH_TO_TRAINING_DATA = os.path.join(BASE_PATH, "training_data.csv")
PATH_TO_VALIDATION_DATA = os.path.join(BASE_PATH, "validation_data.csv")
PATH_TO_TESTING_DATA = os.path.join(BASE_PATH, "testing_data.csv")

### Division of data into training, validation, and test sets





In [16]:
# Read each CSV and convert its 'date' column to datetimes so that all four
# frames share the same join-key type.
cleaned_df, target_df, time_df, features_df = (
    pd.read_csv(csv_path).assign(date=lambda frame: pd.to_datetime(frame['date']))
    for csv_path in (
        PATH_TO_CLEANED_DATA,
        PATH_TO_TARGET_DATA,
        PATH_TO_TIME_FEATURES,
        PATH_TO_SPECIFIC_FEATURES,
    )
)

# Inner-join the frames on 'date' from left to right, keeping only dates that
# are present in every frame. target_df goes last, so its columns land at the
# right-hand edge of the merged frame. The result is indexed by 'date'.
dfs = [cleaned_df, time_df, features_df, target_df]
training_df = reduce(
    lambda merged, incoming: merged.merge(incoming, on='date', how='inner'),
    dfs,
).set_index('date')

In [17]:
# Preview the merged frame; as the cell's last expression it is rendered by the notebook.
training_df

Unnamed: 0_level_0,price,total_volume,market_cap,year,month,day,dayofweek,quarter,dayofyear,weekofyear,...,total_volume_MACD,total_volume_MACD_Signal,total_volume_Bollinger_Upper,total_volume_Bollinger_Lower,market_cap_RSI,market_cap_MACD,market_cap_MACD_Signal,market_cap_Bollinger_Upper,market_cap_Bollinger_Lower,price_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-04,260.936000,2.055001e+08,3.571640e+09,2015,1,4,6,1,4,1,...,0.000000e+00,0.000000e+00,,,,0.000000e+00,0.000000e+00,,,313.992000
2015-01-05,273.220000,1.550381e+08,3.740880e+09,2015,1,5,0,1,5,2,...,-4.025461e+06,-8.050921e+05,,,100.000000,1.350068e+07,2.700136e+06,,,314.446000
2015-01-06,285.373800,9.700290e+07,3.908408e+09,2015,1,6,1,1,6,2,...,-1.176303e+07,-2.996679e+06,,,100.000000,3.728832e+07,9.617772e+06,,,286.572000
2015-01-07,295.872500,1.106742e+08,4.053239e+09,2015,1,7,2,1,7,2,...,-1.660058e+07,-5.717460e+06,,,100.000000,6.705385e+07,2.110499e+07,,,260.936000
2015-01-08,284.452500,8.657054e+07,3.897824e+09,2015,1,8,3,1,8,2,...,-2.212431e+07,-8.998830e+06,,,75.602625,7.721254e+07,3.232650e+07,,,273.220000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-10,46105.946078,3.988792e+10,9.021669e+11,2024,1,10,2,1,10,2,...,2.721649e+09,1.338238e+09,4.358349e+10,3.405973e+09,63.588921,1.896935e+10,1.538745e+10,9.046099e+11,8.075741e+11,43956.120717
2024-01-11,46632.313148,5.203006e+10,9.152593e+11,2024,1,11,3,1,11,2,...,4.472061e+09,1.965003e+09,4.876756e+10,1.230042e+09,63.066093,2.124470e+10,1.655890e+10,9.142188e+11,8.036599e+11,43883.743879
2024-01-12,46314.355542,4.919813e+10,9.098464e+11,2024,1,12,4,1,12,2,...,5.566594e+09,2.685321e+09,5.241930e+10,4.147187e+08,65.049323,2.235347e+10,1.771781e+10,9.211350e+11,8.015461e+11,46936.185561
2024-01-13,42893.929606,4.591330e+10,8.383835e+11,2024,1,13,5,1,13,2,...,6.098659e+09,3.367989e+09,5.438851e+10,2.049636e+09,52.545842,1.726669e+10,1.762759e+10,9.210678e+11,7.997667e+11,46105.946078


In [18]:
# Cumulative fractions of the date-ordered rows at which the training and
# validation windows end: 70 % train, next 15 % validation, last 15 % test.
TRAIN_END_FRACTION = 0.7
VALID_END_FRACTION = 0.85

# Name of the label column. Selecting it by name (instead of "last column")
# keeps the split correct even if the merge order of the inputs changes.
TARGET_COLUMN = 'price_target'
# NOTE(review): in the rendered frame, price_target on a given date equals the
# price from three rows earlier (e.g. 2015-01-07 target 260.936 == 2015-01-04
# price). Confirm the target is meant to be a past value, not a future one.

# Order rows chronologically by the 'date' index. Rebinding (rather than
# sorting in place) keeps this cell safe to re-run.
training_df = training_df.sort_index()

n_rows = len(training_df)
train_idx = int(n_rows * TRAIN_END_FRACTION)
valid_idx = int(n_rows * VALID_END_FRACTION)

# Contiguous, non-overlapping positional windows: the oldest rows train the
# model and the most recent rows are held out, so no shuffling is applied.
train_data = training_df.iloc[:train_idx]
valid_data = training_df.iloc[train_idx:valid_idx]
test_data = training_df.iloc[valid_idx:]

assert min(len(train_data), len(valid_data), len(test_data)) > 0, (
    "too few rows to build non-empty train/validation/test splits"
)

# Features are every column except the target; the target is a single Series.
X_train, y_train = train_data.drop(columns=TARGET_COLUMN), train_data[TARGET_COLUMN]
X_valid, y_valid = valid_data.drop(columns=TARGET_COLUMN), valid_data[TARGET_COLUMN]
X_test, y_test = test_data.drop(columns=TARGET_COLUMN), test_data[TARGET_COLUMN]