<a href="https://colab.research.google.com/github/Nekhaenko/test/blob/main/dataset_trd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
pip install ta lightautoml

Collecting lightautoml
  Downloading lightautoml-0.4.1-py3-none-any.whl.metadata (13 kB)
Collecting autowoe>=1.3.3 (from lightautoml)
  Downloading autowoe-1.3.3-py3-none-any.whl.metadata (2.9 kB)
Collecting catboost>=0.26.1 (from lightautoml)
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting cmaes (from lightautoml)
  Downloading cmaes-0.11.1-py3-none-any.whl.metadata (18 kB)
Collecting json2html (from lightautoml)
  Downloading json2html-1.3.0.tar.gz (7.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy (from ta)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna (from lightautoml)
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting poetry-core<2.0.0,>=1.0.0 (from lightautoml)
  Downloading poetry_core-

In [3]:
import pandas as pd
import numpy as np
from ta import add_all_ta_features
from ta.utils import dropna

In [2]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [4]:
pth = 'drive/MyDrive/datasets/traders/MNTUSDT.csv'

In [5]:
def create_pump_dataset(
    df: pd.DataFrame,
    window_size: int = 30,
    forecast_horizon: int = 7,
    pump_threshold: float = 0.10
) -> tuple[np.ndarray, np.ndarray, list]:
    """
    Формирует датасет для классификации пампов:
      - X: матрица признаков формы (n_samples, window_size * n_features)
      - y: вектор меток (0/1)
      - dates: список дат, соответствующих последнему дню каждого окна

    Параметры:
      df              – DataFrame с колонками ['date','open','high','low','close','volume']
      window_size     – длина истории (N) для формирования признаков
      forecast_horizon– горизонт прогноза (M) в днях
      pump_threshold  – относительный порог роста цены (например, 0.10 для +10 %)
    """
    # 1. Сортировка и удаление пропусков
    df = df.sort_values('date').reset_index(drop=True)
    df = dropna(df)  # удаляем строки с NaN, возникающие при вычислениях :contentReference[oaicite:3]{index=3}

    # 2. Генерация всех стандартных TA-признаков
    df = add_all_ta_features(
        df, open="open", high="high", low="low",
        close="close", volume="volume", fillna=True
    )  # включает SMA, EMA, RSI, MACD, BBANDS и др. :contentReference[oaicite:4]{index=4}

    # 3. Разметка «пампа»: наличие роста ≥ pump_threshold в пределах следующих M дней
    df['future_max'] = (
        df['close']
        .shift(-1)
        .rolling(window=forecast_horizon, min_periods=1)
        .max()
    )
    df['label'] = (
        df['future_max'] >= df['close'] * (1 + pump_threshold)
    ).astype(int)  # 1 – если условие выполнено :contentReference[oaicite:5]{index=5}

    # 4. Формирование выборки через скользящее окно
    feature_cols = [c for c in df.columns
                    if c not in ('date', 'future_max', 'label')]
    X, y, dates = [], [], []

    # Проходим по всем возможным окнам
    for start in range(len(df) - window_size - forecast_horizon + 1):
        end = start + window_size
        window = df.iloc[start:end]

        X.append(window[feature_cols].values.flatten())
        y.append(df['label'].iloc[end - 1])
        dates.append(df['date'].iloc[end - 1])  # дата последнего дня окна

    X = np.array(X)  # shape = (n_samples, window_size * n_features)
    y = np.array(y)  # метки 0/1
    return X, y, dates  # :contentReference[oaicite:6]{index=6}

In [6]:
# Загружаем CSV с колонками date,open,high,low,close,volume
df = pd.read_csv(pth,
                 parse_dates=["date"])
df[:3]

Unnamed: 0,date,open,high,low,close,volume
0,2023-10-02,0.41279,0.42457,0.40154,0.41257,12448969.0
1,2023-10-03,0.41257,0.41424,0.40877,0.41126,3789101.0
2,2023-10-04,0.41126,0.41162,0.40538,0.40702,4584710.0


In [7]:
X, y, dates = create_pump_dataset(df,
                                  window_size=30,
                                  forecast_horizon=7,
                                  pump_threshold=0.10)

print("Признаки:", X.shape)
print("Меток '1':", y.sum(), "/", len(y))

Признаки: (544, 2730)
Меток '1': 110 / 544


In [16]:
data = pd.DataFrame(X)
data.columns = [f'col_{col}' for col in data.columns]
data['target'] = y

In [17]:
data[:3]

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_2721,col_2722,col_2723,col_2724,col_2725,col_2726,col_2727,col_2728,col_2729,target
0,0.41279,0.42457,0.40154,0.41257,12448969.0,-524337.8,12448969.0,-0.042119,0.0,0.0,...,-0.94993,1.155894,-11.726028,-4.132885,-7.593143,0.362588,0.118437,0.118367,-7.797465,0
1,0.41257,0.41424,0.40877,0.41126,3789101.0,-863763.7,8659868.0,-0.053194,-4963.72231,-0.000224,...,-0.676263,1.094665,-9.832328,-5.272774,-4.559554,0.363563,1.101472,1.09545,-6.781879,0
2,0.41126,0.41162,0.40538,0.40702,4584710.0,-3038562.0,4075158.0,-0.145925,-7031.643466,-0.000409,...,-0.332835,1.373715,-2.812487,-4.780716,1.96823,0.369707,5.694376,5.53815,-1.473689,0


In [None]:
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2
TIMEOUT = 300
TARGET_NAME = 'TARGET'
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [18]:
train_data, test_data = train_test_split(data,
                                         test_size=TEST_SIZE,
                                        #  stratify=data[TARGET_NAME],
                                         random_state=RANDOM_STATE
                                         )

In [19]:
automl = TabularAutoML(task = Task(name = 'binary',
                                   metric = 'auc'))

oof_preds = automl.fit_predict(train_data,
                               roles = {'target': 'target'},
                               path_to_save='model.jbl').data

test_preds = automl.predict(test_data).data

INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO:lightautoml.automl.presets.base:Task: binary

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 3600.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 4 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (435, 2731)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 3566.59 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

In [20]:
from sklearn.metrics import roc_auc_score

In [25]:
roc_auc_score(test_data['target'].values, test_preds[:, 0])

0.9367647058823529

In [24]:
roc_auc_score(train_data['target'].values, oof_preds[:, 0])

0.8786233091224095