# Importing Modules

In [7]:
import requests
import datetime
import numpy as np
import pandas as pd
import tti.indicators as ti  # Импорт всех индикаторов
import inspect
import warnings
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import make_scorer, root_mean_squared_error, mean_absolute_percentage_error
from scipy.stats import randint, uniform
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import tensorflow as tf
from sklearn.tree import DecisionTreeRegressor
from tensorflow.keras.optimizers import AdamW # type: ignore
from sklearn.model_selection import TimeSeriesSplit

<hr>

# Data

In [14]:
from dotenv import load_dotenv
import os

load_dotenv()  # Загружаем переменные из .env файла
api_key = os.getenv("API_KEY")


url = 'https://api.coingecko.com/api/v3/coins/bitcoin/market_chart'

headers = {"accept": "application/json",
    "x-cg-api-key": api_key}

# Параметры запроса
params = {
    'vs_currency': 'usd',  # Валюта для отображения цены (например, USD)
    'days': '365',         # Данные за последний год
    'interval': 'daily'    # Получение данных на ежедневной основе
}
response = requests.get(url, params=params, headers=headers)
data = response.json()


In [3]:
# Define the parameters
coin_id = 'bitcoin'
vs_currency = 'usd'
days = '365'

# Make the API request
url_ohlc = f'https://api.coingecko.com/api/v3/coins/{coin_id}/ohlc?vs_currency={vs_currency}&days={days}'
response = requests.get(url_ohlc, headers=headers)
ohlc_data = response.json()
ohlc_data[:5]

[[1697932800000, 29677.0, 30253.0, 29484.0, 29920.0],
 [1698278400000, 29920.0, 35066.0, 29741.0, 34472.0],
 [1698624000000, 34498.0, 34819.0, 33450.0, 34556.0],
 [1698969600000, 34525.0, 35878.0, 34108.0, 34924.0],
 [1699315200000, 34937.0, 35366.0, 34123.0, 35031.0]]

In [4]:
columns = ['timestamp', 'open', 'high', 'low', 'close']

df_ohlc = pd.DataFrame(ohlc_data, columns=columns)
df_ohlc.timestamp = pd.to_datetime(df_ohlc.timestamp, unit='ms')
df_ohlc.head()

Unnamed: 0,timestamp,open,high,low,close
0,2023-10-22,29677.0,30253.0,29484.0,29920.0
1,2023-10-26,29920.0,35066.0,29741.0,34472.0
2,2023-10-30,34498.0,34819.0,33450.0,34556.0
3,2023-11-03,34525.0,35878.0,34108.0,34924.0
4,2023-11-07,34937.0,35366.0,34123.0,35031.0


In [5]:
# Создаем список для хранения отформатированных данных
formatted_data = []

# Извлекаем данные
for i in range(len(data["prices"])):
    # Извлекаем timestamp (одинаковый для всех ключей)
    timestamp = data["prices"][i][0]
    
    # Переводим метку времени из миллисекунд в секунды
    timestamp_in_seconds = timestamp / 1000
    
    # Преобразуем timestamp в объект даты
    date = datetime.datetime.fromtimestamp(timestamp_in_seconds).strftime('%Y-%m-%d')
    
    # Извлекаем уникальные значения для каждого ключа
    price = data["prices"][i][1]
    market_cap = data["market_caps"][i][1]
    total_volumes = data["total_volumes"][i][1]
    
    # Добавляем отформатированные данные в список
    formatted_data.append([date, price, market_cap, total_volumes])

# Создаем DataFrame с колонками "timestamp", "price", "market_cap", "total_volumes"
df = pd.DataFrame(formatted_data, columns=['timestamp', 'prices', 'market_caps', 'total_volumes'])
df.timestamp = pd.to_datetime(df.timestamp)
df = df.iloc[:, [0, -1]]
# Выводим DataFrame
df.head()

Unnamed: 0,timestamp,total_volumes
0,2023-10-22,7452489000.0
1,2023-10-23,11330300000.0
2,2023-10-24,35558860000.0
3,2023-10-25,46464710000.0
4,2023-10-26,23840460000.0


In [6]:
df_final = pd.merge(df_ohlc, df[:365], on='timestamp', how='inner')
df_final = df_final.rename(columns={'timestamp':'date', 'total_volumes':'volume'}).set_index('date').sort_index(ascending=False)
df_final_copy = df_final.copy()
df_final_copy.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-10-20,67103.0,68970.0,66739.0,68389.0,14110890000.0
2024-10-16,62465.0,67803.0,62060.0,66962.0,51797930000.0
2024-10-12,62211.0,63362.0,58935.0,62392.0,32008660000.0
2024-10-08,60749.0,64500.0,60470.0,62287.0,33878880000.0
2024-10-04,65603.0,65603.0,59954.0,60728.0,37711140000.0


<hr>

# Feature engineering

## To-do
- try log 
- try data sampling 
- try forecast target

In [89]:
warnings.simplefilter(action='ignore', category=FutureWarning)

# Создадим новый DataFrame, который будет содержать ваши исходные данные OHLSW
combined_df = df_final.copy()

# Получаем все классы из tti.indicators, которые являются индикаторами
indicator_classes = [cls for _, cls in inspect.getmembers(ti, inspect.isclass)]

# Проходим по каждому индикатору и добавляем его данные в основной DataFrame
for indicator_class in indicator_classes:
    try:
        # Инициализируем индикатор с вашим OHLSW DataFrame
        indicator = indicator_class(input_data=df_final)
        
        # Получаем рассчитанные данные индикатора
        indicator_data = indicator.getTiData()

        # Присоединяем данные индикатора к основному DataFrame
        # Примечание: добавляем как новые колонки (проверяем пересечение по индексам)
        combined_df = combined_df.join(indicator_data, how='left')
        
        #print(f"Добавлен индикатор: {indicator_class.__name__}")
    except Exception as e:
        pass
        #print(f"Ошибка при вычислении {indicator_class.__name__}: {e}")

# Выводим объединённый DataFrame
combined_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,adl,middle_band,upper_band,lower_band,cmf,...,tp,uosc,vhf,vch,vosc,vrc,wc,ws,wad,wr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-10-16,62465.0,67803.0,62060.0,66962.0,51797930000.0,361902942122,61053.7,66929.4047,55177.9953,0.186,...,65608.3333,59.0712,0.4556,-11.2464,8234233000.0,98.5535,65946.75,62896.2072,4902.0,-9.4835
2024-10-12,62211.0,63362.0,58935.0,62392.0,32008660000.0,325275499076,61108.35,67218.5053,54998.1947,-0.112,...,61563.0,55.9873,0.5272,-19.7979,4416763000.0,147.3988,61770.25,61879.759,3457.0,-53.9312
2024-10-08,60749.0,64500.0,60470.0,62287.0,33878880000.0,307293674868,61285.85,67731.8764,54839.8236,-0.1993,...,62419.0,57.4873,0.3999,-24.7038,11082120000.0,-0.9134,62386.0,61751.6988,1817.0,-57.4512
2024-10-04,65603.0,65603.0,59954.0,60728.0,37711140000.0,310622715799,61506.0,68361.5014,54650.4986,-0.0346,...,62095.0,60.8172,0.4822,-20.0747,554574500.0,18.1436,61753.25,61617.8734,-4936.0,-64.3131
2024-09-30,63146.0,66439.0,62812.0,65664.0,12948710000.0,337999840733,61711.35,68705.9634,54716.7366,0.4622,...,64971.6667,65.5435,0.5498,-23.6821,-4098893000.0,-63.6838,65144.75,61840.3418,2852.0,-7.1323


In [90]:
combined_df['price_change'] = combined_df['close'].pct_change()
combined_df.sort_index(ascending=True)
combined_df['close_target'] = combined_df['close'].shift(-1)
combined_df = combined_df[:-1]
combined_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,adl,middle_band,upper_band,lower_band,cmf,...,vhf,vch,vosc,vrc,wc,ws,wad,wr,price_change,close_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-10-16,62465.0,67803.0,62060.0,66962.0,51797930000.0,361902942122,61053.7,66929.4047,55177.9953,0.186,...,0.4556,-11.2464,8234233000.0,98.5535,65946.75,62896.2072,4902.0,-9.4835,,62392.0
2024-10-12,62211.0,63362.0,58935.0,62392.0,32008660000.0,325275499076,61108.35,67218.5053,54998.1947,-0.112,...,0.5272,-19.7979,4416763000.0,147.3988,61770.25,61879.759,3457.0,-53.9312,-0.068248,62287.0
2024-10-08,60749.0,64500.0,60470.0,62287.0,33878880000.0,307293674868,61285.85,67731.8764,54839.8236,-0.1993,...,0.3999,-24.7038,11082120000.0,-0.9134,62386.0,61751.6988,1817.0,-57.4512,-0.001683,60728.0
2024-10-04,65603.0,65603.0,59954.0,60728.0,37711140000.0,310622715799,61506.0,68361.5014,54650.4986,-0.0346,...,0.4822,-20.0747,554574500.0,18.1436,61753.25,61617.8734,-4936.0,-64.3131,-0.025029,65664.0
2024-09-30,63146.0,66439.0,62812.0,65664.0,12948710000.0,337999840733,61711.35,68705.9634,54716.7366,0.4622,...,0.5498,-23.6821,-4098893000.0,-63.6838,65144.75,61840.3418,2852.0,-7.1323,0.08128,63152.0


<hr>

# Metrics

* RMSE
* RMSLE
* MAPE
* PnL metric (1 if profit, -1 if loss)

<hr>

# Models 

* Random Forest
* GB's
* NN's
- (try bayesian optimization)

## Preprocessing

In [91]:
X = combined_df.drop(['close', 'close_target'], axis=1)
y = combined_df["close_target"]
tscv = TimeSeriesSplit()

In [92]:
num = X.select_dtypes(include=['float64', 'int64']).columns

numeric = make_pipeline(SimpleImputer(strategy="median"),
                        StandardScaler())

preproccessing_pipeline = ColumnTransformer([
    ('num', numeric, num)
    ], remainder='passthrough')

## Random Forest Regressor test

In [93]:
errors_rf = []

rf_model = make_pipeline(preproccessing_pipeline,
                        RandomForestRegressor(random_state=42)
                        )

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_rf)
    errors_rf.append(error)

r_mean_error = np.mean(errors_rf)
print(f'Average RandomForestRegressor RMSE: {r_mean_error}')


Average RandomForestRegressor RMSE: 5965.883092888658


In [94]:
# rf_model = make_pipeline(preproccessing_pipeline,
#                         RandomForestRegressor(random_state=42)
#                         )
# rf_model.fit(X_train, y_train)
# y_pred_rf = rf_model.predict(X_test)
# print(root_mean_squared_error(y_test, y_pred_rf))

## XGBoost Regressor test

In [95]:
errors_xgb = []

for train_index, test_index in tscv.split(X):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    xgb_model = make_pipeline(preproccessing_pipeline,
                            xgb.XGBRegressor(
                                objective='reg:squarederror',
                                random_state=42
                            ))
    
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_xgb)
    errors_xgb.append(error)

r_mean_error = np.mean(errors_xgb)
print(f'Average XGBoost RMSE: {r_mean_error}')

Average XGBoost RMSE: 3730.7471875256233


In [96]:
# xgb_model = make_pipeline(preproccessing_pipeline,
#                           xgb.XGBRegressor(objective='reg:squarederror',
#                               random_state=42
#                           ))
# xgb_model.fit(X_train, y_train)
# y_pred_xgb = xgb_model.predict(X_test)
# print(root_mean_squared_error(y_test, y_pred_xgb))

## CatBoost Regression test

In [97]:
errors_cat = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    
    cat_model = make_pipeline(preproccessing_pipeline,
                            CatBoostRegressor(
                                loss_function='RMSE',
                                random_state=42,
                                verbose=0
                            ))
    
    cat_model.fit(X_train, y_train)
    y_pred_cat = cat_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_cat)
    errors_cat.append(error)

r_mean_error = np.mean(errors_cat)
print(f'Average CatBoost RMSE : {r_mean_error}')

Average CatBoost RMSE : 7670.239639801946


In [98]:
# cb_model = make_pipeline(preproccessing_pipeline,
#                           CatBoostRegressor(loss_function='RMSE',
#                               random_state=42,
#                               verbose=0
#                           ))
# cb_model.fit(X_train, y_train)
# y_pred_cb = cb_model.predict(X_test)
# print(root_mean_squared_error(y_test, y_pred_cb))

## LightGBM Regression test

In [99]:
errors_lgbm = []

for train_index, test_index in tscv.split(X):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    
    lgbm_model = make_pipeline(preproccessing_pipeline,
                                lgb.LGBMRegressor(
                                objective='regression', 
                                random_state=42,
                                max_bins=255,
                                min_data_in_bin= 1,
                                min_data_in_leaf=1,
                                  # уменьшите это значение
    ))
    
    lgbm_model.fit(X_train, y_train)
    y_pred_lgbm = lgbm_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_lgbm)
    errors_lgbm.append(error)

r_mean_error = np.mean(errors_lgbm)
print(f'Average LightGBM RMSE : {r_mean_error}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1110
[LightGBM] [Info] Number of data points in the train set: 16, number of used features: 70
[LightGBM] [Info] Start training from score 60597.937500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2077
[LightGBM] [Info] Number of data points in the train set: 31, number of used features: 70
[LightGBM] [Info] Start training from score 61710.096774
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3053
[LightGBM] [Info] Number of data points in the train s

In [100]:
# lgbm_model = make_pipeline(preproccessing_pipeline, 
#                            lgb.LGBMRegressor(
#     objective='regression', 
#     learning_rate=0.1, 
#     n_estimators=100, 
#     num_leaves=31, 
#     random_state=42))

# lgbm_model.fit(X_train, y_train)
# y_pred_lgbm = lgbm_model.predict(X_test)
# print(root_mean_squared_error(y_test, y_pred_lgbm))

## GradientBoostingRegressor

In [101]:
errors_grad_boost = []

X_new = X.fillna(1)

for train_index, test_index in tscv.split(X):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    
    grad_boost_model = make_pipeline(preproccessing_pipeline,
                                GradientBoostingRegressor(                         
                                  random_state=42,
    ))
    
  
    grad_boost_model.fit(X_train, y_train)
    y_pred_grad_boost = grad_boost_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_grad_boost)
    errors_grad_boost.append(error)

r_mean_error = np.mean(errors_grad_boost)
print(f'Average GradientBoosting RMSE: {r_mean_error}')

Average GradientBoosting RMSE: 5254.835064710971


## Ada Boost

In [102]:
errors_ada = []

for train_index, test_index in tscv.split(X):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    
    ada_model = make_pipeline(preproccessing_pipeline,
                                AdaBoostRegressor(
                                estimator = DecisionTreeRegressor(max_depth=8),
                                learning_rate = 0.1,
                                random_state = 42)
    )
    
    ada_model.fit(X_train, y_train)
    y_pred_ada = ada_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_ada)
    errors_ada.append(error)

r_mean_error = np.mean(errors_ada)
print(f'Average Ada RMSE: {r_mean_error}')

Average Ada RMSE: 5418.019110106931


## Neural Networks (CHANGE THE DATA SPLITAGE)

In [103]:
# X = combined_df.drop(['close', 'close_target'], axis=1)
# y = combined_df["close_target"]
# #tscv = TimeSeriesSplit()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [104]:
# from sklearn.pipeline import make_pipeline, Pipeline

# preproccessing = ColumnTransformer([
#     ('num', Pipeline([
#         ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
#         ('scaler', StandardScaler())  # Scale numerical data
#     ]), num)
# ], remainder='passthrough')

# X_train_processed = preproccessing.fit_transform(X_train)
# X_test_processed = preproccessing.transform(X_test)

# tf.random.set_seed(42)
# nn_model = tf.keras.Sequential([
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(300, activation="relu"),
#     tf.keras.layers.Dense(1, activation="linear")  # Linear activation for regression
# ])

# adam_optimizer = AdamW(learning_rate=0.1)

# # Use mean squared error for regression
# nn_model.compile(loss="mean_squared_error",
#                  optimizer=adam_optimizer,
#                  metrics=[tf.keras.metrics.RootMeanSquaredError()])

# history = nn_model.fit(X_train_processed, y_train, epochs=30, batch_size=8,
#                        validation_data=(X_test_processed, y_test))


<hr>

In [105]:
# TimeSeriesSplit для кросс-валидации
tscv = TimeSeriesSplit(n_splits=5)

# Валидация на временных рядах
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Предобработка
    X_train_processed = preproccessing_pipeline.fit_transform(X_train)
    X_test_processed = preproccessing_pipeline.transform(X_test)
    
    # Построение и компиляция модели
    nn_model = tf.keras.Sequential([
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(400, activation="relu"),
        tf.keras.layers.Dense(1, activation="linear")
    ])
    
    adam_optimizer = AdamW(learning_rate=0.01)
    
    nn_model.compile(loss="mean_squared_error",
                     optimizer = adam_optimizer,
                     metrics=[tf.keras.metrics.RootMeanSquaredError()])
    
    # Обучение модели
    history = nn_model.fit(X_train_processed, y_train, epochs=30, batch_size=8,
                           validation_data=(X_test_processed, y_test))


Epoch 1/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 166ms/step - loss: 3650553600.0000 - root_mean_squared_error: 60418.8945 - val_loss: 3968795904.0000 - val_root_mean_squared_error: 62998.3789
Epoch 2/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 3649643008.0000 - root_mean_squared_error: 60411.3555 - val_loss: 3967686400.0000 - val_root_mean_squared_error: 62989.5742
Epoch 3/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 3648480256.0000 - root_mean_squared_error: 60401.7305 - val_loss: 3966282496.0000 - val_root_mean_squared_error: 62978.4297
Epoch 4/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 3646931712.0000 - root_mean_squared_error: 60388.9102 - val_loss: 3964512768.0000 - val_root_mean_squared_error: 62964.3789
Epoch 5/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 3644932608.0000 - root_mean_squared_error: 60372

# TG integration

# CEX integration

# Deployment

# Security