In [19]:
import requests
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import make_scorer, root_mean_squared_error, mean_absolute_percentage_error
from scipy.stats import randint, uniform
import xgboost as xgb
from catboost import CatBoostRegressor
import lightgbm as lgb

In [20]:
api_key = 'CG-zcrDBGKQkMdhBSPJWtAijUdT'


url = 'https://api.coingecko.com/api/v3/coins/bitcoin/market_chart'

headers = {"accept": "application/json",
    "x-cg-api-key": api_key}

# Параметры запроса
params = {
    'vs_currency': 'usd',  # Валюта для отображения цены (например, USD)
    'days': '365',         # Данные за последний год
    'interval': 'daily'    # Получение данных на ежедневной основе
}
response = requests.get(url, params=params, headers=headers)
data = response.json()

In [21]:
# Define the parameters
coin_id = 'bitcoin'
vs_currency = 'usd'
days = '365'

# Make the API request
url_ohlc = f'https://api.coingecko.com/api/v3/coins/{coin_id}/ohlc?vs_currency={vs_currency}&days={days}'
response = requests.get(url_ohlc, headers=headers)
ohlc_data = response.json()
print(ohlc_data)

[[1697241600000, 26759.0, 27101.0, 26695.0, 26841.0], [1697587200000, 26861.0, 29483.0, 26816.0, 28418.0], [1697932800000, 28414.0, 30253.0, 28188.0, 29920.0], [1698278400000, 29920.0, 35066.0, 29741.0, 34472.0], [1698624000000, 34498.0, 34819.0, 33450.0, 34556.0], [1698969600000, 34525.0, 35878.0, 34108.0, 34924.0], [1699315200000, 34937.0, 35366.0, 34123.0, 35031.0], [1699660800000, 35058.0, 37936.0, 34546.0, 37344.0], [1700006400000, 37319.0, 37404.0, 35110.0, 35545.0], [1700352000000, 35541.0, 37904.0, 35389.0, 36582.0], [1700697600000, 36588.0, 37795.0, 35788.0, 37465.0], [1701043200000, 37427.0, 38349.0, 36972.0, 37492.0], [1701388800000, 37472.0, 38351.0, 36770.0, 37712.0], [1701734400000, 37720.0, 42403.0, 37632.0, 41974.0], [1702080000000, 41985.0, 44676.0, 41453.0, 44202.0], [1702425600000, 44158.0, 44346.0, 40322.0, 41451.0], [1702771200000, 41459.0, 43329.0, 40712.0, 42247.0], [1703116800000, 42248.0, 44201.0, 40571.0, 43634.0], [1703462400000, 43660.0, 44243.0, 42895.0, 43

In [22]:
columns = ['timestamp', 'open', 'high', 'low', 'close']

df_ohlc = pd.DataFrame(ohlc_data, columns=columns)
df_ohlc.timestamp = pd.to_datetime(df_ohlc.timestamp, unit='ms')
df_ohlc.head()

Unnamed: 0,timestamp,open,high,low,close
0,2023-10-14,26759.0,27101.0,26695.0,26841.0
1,2023-10-18,26861.0,29483.0,26816.0,28418.0
2,2023-10-22,28414.0,30253.0,28188.0,29920.0
3,2023-10-26,29920.0,35066.0,29741.0,34472.0
4,2023-10-30,34498.0,34819.0,33450.0,34556.0


In [23]:
# Создаем список для хранения отформатированных данных
formatted_data = []

# Извлекаем данные
for i in range(len(data["prices"])):
    # Извлекаем timestamp (одинаковый для всех ключей)
    timestamp = data["prices"][i][0]
    
    # Переводим метку времени из миллисекунд в секунды
    timestamp_in_seconds = timestamp / 1000
    
    # Преобразуем timestamp в объект даты
    date = datetime.datetime.fromtimestamp(timestamp_in_seconds).strftime('%Y-%m-%d')
    
    # Извлекаем уникальные значения для каждого ключа
    price = data["prices"][i][1]
    market_cap = data["market_caps"][i][1]
    total_volumes = data["total_volumes"][i][1]
    
    # Добавляем отформатированные данные в список
    formatted_data.append([date, price, market_cap, total_volumes])

# Создаем DataFrame с колонками "timestamp", "price", "market_cap", "total_volumes"
df = pd.DataFrame(formatted_data, columns=['timestamp', 'prices', 'market_caps', 'total_volumes'])
df.timestamp = pd.to_datetime(df.timestamp)
df = df.iloc[:, [0, -1]]
# Выводим DataFrame
print(df)

     timestamp  total_volumes
0   2023-10-16   6.604846e+09
1   2023-10-17   2.279280e+10
2   2023-10-18   1.040325e+10
3   2023-10-19   1.346103e+10
4   2023-10-20   1.119392e+10
..         ...            ...
361 2024-10-11   2.948197e+10
362 2024-10-12   3.200866e+10
363 2024-10-13   1.766596e+10
364 2024-10-14   1.678455e+10
365 2024-10-14   1.740380e+10

[366 rows x 2 columns]


In [24]:
df_final = pd.merge(df_ohlc, df, on='timestamp', how='inner')
df_final = df_final.rename(columns={'timestamp':'date', 'total_volumes':'volume'}).set_index('date').sort_index(ascending=False)
df_final
df_final_copy = df_final.copy()
df_final_copy

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-10-12,62211.0,63362.0,58935.0,62392.0,3.200866e+10
2024-10-08,60749.0,64500.0,60470.0,62287.0,3.387888e+10
2024-10-04,65603.0,65603.0,59954.0,60728.0,3.771114e+10
2024-09-30,63146.0,66439.0,62812.0,65664.0,1.294871e+10
2024-09-26,63406.0,64755.0,62479.0,63152.0,2.608764e+10
...,...,...,...,...,...
2023-11-03,34525.0,35878.0,34108.0,34924.0,2.069019e+10
2023-10-30,34498.0,34819.0,33450.0,34556.0,1.035901e+10
2023-10-26,29920.0,35066.0,29741.0,34472.0,2.384046e+10
2023-10-22,28414.0,30253.0,28188.0,29920.0,7.452489e+09


## Feature engineering

In [None]:
import tti.indicators as ti  # Импорт всех индикаторов
import inspect

# Предположим, ваш DataFrame называется df и содержит OHLSW данные (open, high, low, close, volume)
# Индекс дат уже установлен

# Создадим новый DataFrame, который будет содержать ваши исходные данные OHLSW
combined_df = df_final.copy()

# Получаем все классы из tti.indicators, которые являются индикаторами
indicator_classes = [cls for _, cls in inspect.getmembers(ti, inspect.isclass)]

# Проходим по каждому индикатору и добавляем его данные в основной DataFrame
for indicator_class in indicator_classes:
    try:
        # Инициализируем индикатор с вашим OHLSW DataFrame
        indicator = indicator_class(input_data=df_final)
        
        # Получаем рассчитанные данные индикатора
        indicator_data = indicator.getTiData()

        # Присоединяем данные индикатора к основному DataFrame
        # Примечание: добавляем как новые колонки (проверяем пересечение по индексам)
        combined_df = combined_df.join(indicator_data, how='left')
        
        #print(f"Добавлен индикатор: {indicator_class.__name__}")
    except Exception as e:
        print(f"Ошибка при вычислении {indicator_class.__name__}: {e}")

# Выводим объединённый DataFrame
#print(combined_df.head())

In [26]:
combined_df['price_change'] = combined_df['close'].pct_change()
combined_df.sort_index(ascending=True)
combined_df['close_target'] = combined_df['close'].shift(-1)
combined_df = combined_df[:-1]
combined_df

Unnamed: 0_level_0,open,high,low,close,volume,adl,middle_band,upper_band,lower_band,cmf,...,vhf,vch,vosc,vrc,wc,ws,wad,wr,price_change,close_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-10-12,62211.0,63362.0,58935.0,62392.0,3.200866e+10,325065280070,61108.35,67218.5053,54998.1947,-0.1120,...,0.5272,-19.7979,4.416763e+09,147.3988,61770.25,61879.7590,3457.0,-53.9312,,62287.0
2024-10-08,60749.0,64500.0,60470.0,62287.0,3.387888e+10,307083455862,61285.85,67731.8764,54839.8236,-0.1993,...,0.3999,-24.7038,1.108212e+10,-0.9134,62386.00,61751.6988,1817.0,-57.4512,-0.001683,60728.0
2024-10-04,65603.0,65603.0,59954.0,60728.0,3.771114e+10,310412496794,61506.00,68361.5014,54650.4986,-0.0346,...,0.4822,-20.0747,5.545745e+08,18.1436,61753.25,61617.8734,-4936.0,-64.3131,-0.025029,65664.0
2024-09-30,63146.0,66439.0,62812.0,65664.0,1.294871e+10,337789621727,61711.35,68705.9634,54716.7366,0.4622,...,0.5498,-23.6821,-4.098893e+09,-63.6838,65144.75,61840.3418,2852.0,-7.1323,0.081280,63152.0
2024-09-26,63406.0,64755.0,62479.0,63152.0,2.608764e+10,330374549717,61297.55,68287.0619,54308.0381,0.5058,...,0.7815,-27.3084,-8.645548e+09,-13.0579,63384.50,60884.4273,-1603.0,-13.2864,-0.038255,63403.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-07,34937.0,35366.0,34123.0,35031.0,1.271735e+10,36293488874,,,,0.4556,...,0.7729,,1.691867e+09,22.2440,34887.75,32972.6000,908.0,-11.0143,-0.061938,34924.0
2023-11-03,34525.0,35878.0,34108.0,34924.0,2.069019e+10,30431026694,,,,0.4183,...,,,9.755198e+08,,34958.50,32458.0000,816.0,-10.5275,-0.003054,34556.0
2023-10-30,34498.0,34819.0,33450.0,34556.0,1.035901e+10,32044159873,,,,,...,,,,,34345.25,,1106.0,,-0.010537,34472.0
2023-10-26,29920.0,35066.0,29741.0,34472.0,2.384046e+10,25665307947,,,,,...,,,,,33437.75,,4731.0,,-0.002431,29920.0


### try log
### try data sampling
### try forecast target

# Metrics

* RMSE
* RMSLE
* MAPE
* PnL metric (1 if profit, -1 if loss)

# Models (try bayesian optimization)

* Random Forest
* GB's
* NN's

In [27]:
X = combined_df.drop(['close', 'close_target'], axis=1)
y = combined_df.close_target
tscv = TimeSeriesSplit()
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [28]:
# num = X_train.select_dtypes(include=['float64', 'int64']).columns
# preproccessing_pipeline = ColumnTransformer([
#     ('num', StandardScaler(), num)
#     ],
#     remainder='passthrough')

## Random Forest Regressor test

In [45]:
errors_rf = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    num = X_train.select_dtypes(include=['float64', 'int64']).columns
    preproccessing_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num)
    ],
    remainder='passthrough')
    
    rf_model = make_pipeline(preproccessing_pipeline,
                        RandomForestRegressor(random_state=42)
                        )
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_rf)
    errors_rf.append(error)

r_mean_error = np.mean(errors_rf)
print(f'Average RMSE across all splits: {r_mean_error}')


Average RMSE across all splits: 5615.29884511445


In [30]:
# rf_model = make_pipeline(preproccessing_pipeline,
#                         RandomForestRegressor(random_state=42)
#                         )
# rf_model.fit(X_train, y_train)
# y_pred_rf = rf_model.predict(X_test)
# print(root_mean_squared_error(y_test, y_pred_rf))

## XGBoost Regressor test

In [46]:
errors_xgb = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # num = X_train.select_dtypes(include=['float64', 'int64']).columns
    # preproccessing_pipeline = ColumnTransformer([
    # ('num', StandardScaler(), num)
    # ],
    # remainder='passthrough')
    
    xgb_model = make_pipeline(#preproccessing_pipeline,

                            xgb.XGBRegressor(objective='reg:squarederror',
                            random_state=42
                            ))
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_xgb)
    errors_xgb.append(error)

r_mean_error = np.mean(errors_xgb)
print(f'Average RMSE across all splits: {r_mean_error}')

Average RMSE across all splits: 3646.654143185998


In [None]:
# xgb_model = make_pipeline(preproccessing_pipeline,
#                           xgb.XGBRegressor(objective='reg:squarederror',
#                               random_state=42
#                           ))
# xgb_model.fit(X_train, y_train)
# y_pred_xgb = xgb_model.predict(X_test)
# print(root_mean_squared_error(y_test, y_pred_xgb))

## CatBoost Regression test

In [47]:
errors_cat = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # num = X_train.select_dtypes(include=['float64', 'int64']).columns
    # preproccessing_pipeline = ColumnTransformer([
    # ('num', StandardScaler(), num)
    # ],
    # remainder='passthrough')
    
    cat_model = make_pipeline(#preproccessing_pipeline,
                        CatBoostRegressor(loss_function='RMSE',
                        random_state=42,
                        verbose=0
                        ))
    cat_model.fit(X_train, y_train)
    y_pred_cat = cat_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_cat)
    errors_cat.append(error)

r_mean_error = np.mean(errors_cat)
print(f'Average RMSE across all splits: {r_mean_error}')

Average RMSE across all splits: 7602.149872420082


In [None]:
# cb_model = make_pipeline(preproccessing_pipeline,
#                           CatBoostRegressor(loss_function='RMSE',
#                               random_state=42,
#                               verbose=0
#                           ))
# cb_model.fit(X_train, y_train)
# y_pred_cb = cb_model.predict(X_test)
# print(root_mean_squared_error(y_test, y_pred_cb))

## LightGBM Regression test

In [48]:
errors_lgbm = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # num = X_train.select_dtypes(include=['float64', 'int64']).columns
    # preproccessing_pipeline = ColumnTransformer([
    # ('num', StandardScaler(), num)
    # ],
    # remainder='passthrough')
    
    lgbm_model = make_pipeline(#preproccessing_pipeline,
                                lgb.LGBMRegressor(
                                objective='regression', 
                                random_state=42))
    lgbm_model.fit(X_train, y_train)
    y_pred_lgbm = lgbm_model.predict(X_test)
    error = root_mean_squared_error(y_test, y_pred_lgbm)
    errors_lgbm.append(error)

r_mean_error = np.mean(errors_lgbm)
print(f'Average RMSE across all splits: {r_mean_error}')

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 15, number of used features: 0
[LightGBM] [Info] Start training from score 60478.333333
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 30, number of used features: 0
[LightGBM] [Info] Start training from score 61687.366667
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000272 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1071
[LightGBM] [Info] Number of data points in the train set: 45, number of used features: 68
[LightGBM] [Info] Start training from score 63108.066667
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1412
[LightGBM] [Info] Number of data points in the train set: 60, number of used features: 70
[LightGBM] [Info]

In [None]:
# lgbm_model = make_pipeline(preproccessing_pipeline, 
#                            lgb.LGBMRegressor(
#     objective='regression', 
#     learning_rate=0.1, 
#     n_estimators=100, 
#     num_leaves=31, 
#     random_state=42))

# lgbm_model.fit(X_train, y_train)
# y_pred_lgbm = lgbm_model.predict(X_test)
# print(root_mean_squared_error(y_test, y_pred_lgbm))

# TG integration

# CEX integration

# Deployment

# Security