### Importing Modules

In [1]:
import numpy as np
import pandas as pd
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, f1_score
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.optimizers import AdamW # type: ignore
from sklearn.decomposition import PCA

<hr>

### Data preparation

##### Note: unzip btc_data.csv.zip in advance

In [2]:
btc_data = pd.read_csv('btc_data.csv', header=0, index_col=0)

In [3]:
btc_data_copy = btc_data.copy()

In [4]:
nan_counts = btc_data.isna().sum().sort_values(ascending=False)
print(nan_counts.head(15).index)

Index(['emv_ma', 'cmf', 'adl', 'emv', 'mfi', 'swi', 'vrc', 'cmo', 'vhf', 'cci',
       'wr', 'uosc', '%D', 'imi', '%K'],
      dtype='object')


Delete NaN before applying PCA

In [5]:
btc_data_copy = btc_data_copy.dropna()

In [6]:
btc_data_copy_targets = btc_data_copy.target_label
btc_data_copy_features = btc_data_copy.drop('target_label', axis=1)

In [7]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(btc_data_copy_features)

### PCA

In [8]:
pca = PCA(n_components=7)

principal_components = pca.fit_transform(df_scaled)

pca_df = pd.DataFrame(data=principal_components, index=btc_data_copy_features.index)

In [9]:
pca_df.shape

(1015072, 7)

In [10]:
btc_data_copy_targets.shape

(1015072,)

Will take into account class imbalance building models

In [11]:
btc_data.target_label.value_counts()

target_label
 1    2543652
-1    1223109
Name: count, dtype: int64

Simple split

In [12]:
split_index = int(len(pca_df) * 0.8)
X_train = pca_df.iloc[:split_index]
X_test = pca_df.iloc[split_index:]
y_train = btc_data_copy_targets.iloc[:split_index]
y_test = btc_data_copy_targets.iloc[split_index:]

Complex split

In [13]:
tscv = TimeSeriesSplit(n_splits=10)

# Models 

### lgb classifier

### Bayesian Optimization for hyperparameters tuning

In [14]:
def objective(trial):

    param = {
        "objective": "binary",
        "metric": "f1",
        "verbosity": -1,
        "n_jobs": -1,
        "random_state": 42,
        "is_unbalance": True,
        "subsample": 1.0,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        "boosting_type": 'gbdt',
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-1, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-1, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 10, 20),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 0.9),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.8),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 3),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_depth": trial.suggest_int('max_depth', 1, 5),
        'max_bin': trial.suggest_int('max_bin', 100, 150),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 150),
    }
    
    dtrain = lgb.Dataset(X_train, label=y_train)

    gbm = lgb.train(param, dtrain)
    y_pred = gbm.predict(X_test)
    y_pred = np.where(y_pred < 0.5, -1, 1)   
    f1 = f1_score(y_test, y_pred) # target metric
    return f1

Optimization for simple split

In [15]:
sampler = TPESampler(n_startup_trials=10, seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=20)

[I 2024-10-31 15:42:20,970] A new study created in memory with name: no-name-fd6e516e-dd69-41ac-a47f-03f7a6b4f215


[I 2024-10-31 15:42:23,390] Trial 0 finished with value: 0.7993542345435746 and parameters: {'learning_rate': 0.023688639503640783, 'lambda_l1': 7.969454818643936, 'lambda_l2': 2.9106359131330697, 'num_leaves': 16, 'feature_fraction': 0.22481491235394924, 'bagging_fraction': 0.2935967122017216, 'bagging_freq': 1, 'min_child_samples': 88, 'max_depth': 4, 'max_bin': 136, 'min_data_in_leaf': 22}. Best is trial 0 with value: 0.7993542345435746.
[I 2024-10-31 15:42:24,873] Trial 1 finished with value: 0.8299052188618398 and parameters: {'learning_rate': 0.09330606024425668, 'lambda_l1': 4.622589001020832, 'lambda_l2': 0.26587543983272705, 'num_leaves': 12, 'feature_fraction': 0.24672360788274705, 'bagging_fraction': 0.38254534577572263, 'bagging_freq': 2, 'min_child_samples': 46, 'max_depth': 2, 'max_bin': 131, 'min_data_in_leaf': 38}. Best is trial 1 with value: 0.8299052188618398.
[I 2024-10-31 15:42:26,997] Trial 2 finished with value: 0.7970873313525888 and parameters: {'learning_rate':

Optimization for complex split

In [16]:
for train_index, test_index in tscv.split(pca_df):
    X_train, X_test = pca_df.iloc[train_index], pca_df.iloc[test_index]
    y_train, y_test = btc_data_copy_targets.iloc[train_index], btc_data_copy_targets.iloc[test_index]
    sampler_complex = TPESampler(n_startup_trials=10, seed=42)
    study_complex = optuna.create_study(direction='maximize', sampler=sampler_complex)
    study_complex.optimize(objective, n_trials=20)

[I 2024-10-31 15:43:08,758] A new study created in memory with name: no-name-5a2e9d38-3023-47e1-a0f8-ab9f7abcc1f7
[I 2024-10-31 15:43:09,312] Trial 0 finished with value: 0.8384515075802068 and parameters: {'learning_rate': 0.023688639503640783, 'lambda_l1': 7.969454818643936, 'lambda_l2': 2.9106359131330697, 'num_leaves': 16, 'feature_fraction': 0.22481491235394924, 'bagging_fraction': 0.2935967122017216, 'bagging_freq': 1, 'min_child_samples': 88, 'max_depth': 4, 'max_bin': 136, 'min_data_in_leaf': 22}. Best is trial 0 with value: 0.8384515075802068.
[I 2024-10-31 15:43:09,605] Trial 1 finished with value: 0.8572641967319204 and parameters: {'learning_rate': 0.09330606024425668, 'lambda_l1': 4.622589001020832, 'lambda_l2': 0.26587543983272705, 'num_leaves': 12, 'feature_fraction': 0.24672360788274705, 'bagging_fraction': 0.38254534577572263, 'bagging_freq': 2, 'min_child_samples': 46, 'max_depth': 2, 'max_bin': 131, 'min_data_in_leaf': 38}. Best is trial 1 with value: 0.8572641967319

---

## Neural Networks (CHANGE THE DATA SPLITAGE)

In [91]:
X = combined_df.drop(['close', 'close_target'], axis=1)
y = combined_df["close_target"]
tscv = TimeSeriesSplit()

In [92]:
num = X.select_dtypes(include=['float64', 'int64']).columns

numeric = make_pipeline(SimpleImputer(strategy="median"),
                        StandardScaler())

preproccessing_pipeline = ColumnTransformer([
    ('num', numeric, num)
    ], remainder='passthrough')

In [104]:
# from sklearn.pipeline import make_pipeline, Pipeline

# preproccessing = ColumnTransformer([
#     ('num', Pipeline([
#         ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
#         ('scaler', StandardScaler())  # Scale numerical data
#     ]), num)
# ], remainder='passthrough')

# X_train_processed = preproccessing.fit_transform(X_train)
# X_test_processed = preproccessing.transform(X_test)

# tf.random.set_seed(42)
# nn_model = tf.keras.Sequential([
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(300, activation="relu"),
#     tf.keras.layers.Dense(1, activation="linear")  # Linear activation for regression
# ])

# adam_optimizer = AdamW(learning_rate=0.1)

# # Use mean squared error for regression
# nn_model.compile(loss="mean_squared_error",
#                  optimizer=adam_optimizer,
#                  metrics=[tf.keras.metrics.RootMeanSquaredError()])

# history = nn_model.fit(X_train_processed, y_train, epochs=30, batch_size=8,
#                        validation_data=(X_test_processed, y_test))


<hr>

In [105]:
# TimeSeriesSplit для кросс-валидации
tscv = TimeSeriesSplit(n_splits=5)

# Валидация на временных рядах
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Предобработка
    X_train_processed = preproccessing_pipeline.fit_transform(X_train)
    X_test_processed = preproccessing_pipeline.transform(X_test)
    
    # Построение и компиляция модели
    nn_model = tf.keras.Sequential([
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(400, activation="relu"),
        tf.keras.layers.Dense(1, activation="linear")
    ])
    
    adam_optimizer = AdamW(learning_rate=0.01)
    
    nn_model.compile(loss="mean_squared_error",
                     optimizer = adam_optimizer,
                     metrics=[tf.keras.metrics.RootMeanSquaredError()])
    
    # Обучение модели
    history = nn_model.fit(X_train_processed, y_train, epochs=30, batch_size=8,
                           validation_data=(X_test_processed, y_test))


Epoch 1/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 166ms/step - loss: 3650553600.0000 - root_mean_squared_error: 60418.8945 - val_loss: 3968795904.0000 - val_root_mean_squared_error: 62998.3789
Epoch 2/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 3649643008.0000 - root_mean_squared_error: 60411.3555 - val_loss: 3967686400.0000 - val_root_mean_squared_error: 62989.5742
Epoch 3/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 3648480256.0000 - root_mean_squared_error: 60401.7305 - val_loss: 3966282496.0000 - val_root_mean_squared_error: 62978.4297
Epoch 4/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 3646931712.0000 - root_mean_squared_error: 60388.9102 - val_loss: 3964512768.0000 - val_root_mean_squared_error: 62964.3789
Epoch 5/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 3644932608.0000 - root_mean_squared_error: 60372