<a href="https://colab.research.google.com/github/Nekhaenko/test/blob/main/dataset_trd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=1b397732e15b16abcbd94517425bf4a1d54e8be162269988b2723f4458cb9ca4
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [3]:
import pandas as pd
import numpy as np
from ta import add_all_ta_features
from ta.utils import dropna

In [4]:
pth = 'drive/MyDrive/datasets/traders/MNTUSDT.csv'

In [5]:
def create_pump_dataset(
    df: pd.DataFrame,
    window_size: int = 30,
    forecast_horizon: int = 7,
    pump_threshold: float = 0.10
) -> tuple[np.ndarray, np.ndarray, list]:
    """
    Формирует датасет для классификации пампов:
      - X: матрица признаков формы (n_samples, window_size * n_features)
      - y: вектор меток (0/1)
      - dates: список дат, соответствующих последнему дню каждого окна

    Параметры:
      df              – DataFrame с колонками ['date','open','high','low','close','volume']
      window_size     – длина истории (N) для формирования признаков
      forecast_horizon– горизонт прогноза (M) в днях
      pump_threshold  – относительный порог роста цены (например, 0.10 для +10 %)
    """
    # 1. Сортировка и удаление пропусков
    df = df.sort_values('date').reset_index(drop=True)
    df = dropna(df)  # удаляем строки с NaN, возникающие при вычислениях :contentReference[oaicite:3]{index=3}

    # 2. Генерация всех стандартных TA-признаков
    df = add_all_ta_features(
        df, open="open", high="high", low="low",
        close="close", volume="volume", fillna=True
    )  # включает SMA, EMA, RSI, MACD, BBANDS и др. :contentReference[oaicite:4]{index=4}

    # 3. Разметка «пампа»: наличие роста ≥ pump_threshold в пределах следующих M дней
    df['future_max'] = (
        df['close']
        .shift(-1)
        .rolling(window=forecast_horizon, min_periods=1)
        .max()
    )
    df['label'] = (
        df['future_max'] >= df['close'] * (1 + pump_threshold)
    ).astype(int)  # 1 – если условие выполнено :contentReference[oaicite:5]{index=5}

    # 4. Формирование выборки через скользящее окно
    feature_cols = [c for c in df.columns
                    if c not in ('date', 'future_max', 'label')]
    X, y, dates = [], [], []

    # Проходим по всем возможным окнам
    for start in range(len(df) - window_size - forecast_horizon + 1):
        end = start + window_size
        window = df.iloc[start:end]

        X.append(window[feature_cols].values.flatten())
        y.append(df['label'].iloc[end - 1])
        dates.append(df['date'].iloc[end - 1])  # дата последнего дня окна

    X = np.array(X)  # shape = (n_samples, window_size * n_features)
    y = np.array(y)  # метки 0/1
    return X, y, dates  # :contentReference[oaicite:6]{index=6}

In [6]:
# Загружаем CSV с колонками date,open,high,low,close,volume
df = pd.read_csv(pth, parse_dates=["date"])
df[:3]

Unnamed: 0,date,open,high,low,close,volume
0,2023-10-02,0.41279,0.42457,0.40154,0.41257,12448969.0
1,2023-10-03,0.41257,0.41424,0.40877,0.41126,3789101.0
2,2023-10-04,0.41126,0.41162,0.40538,0.40702,4584710.0


In [7]:
X, y, dates = create_pump_dataset(df,
                                  window_size=30,
                                  forecast_horizon=7,
                                  pump_threshold=0.10)

print("Признаки:", X.shape)
print("Меток '1':", y.sum(), "/", len(y))

Признаки: (544, 2730)
Меток '1': 110 / 544


In [8]:
X

array([[ 0.41279   ,  0.42457   ,  0.40154   , ...,  0.11843664,
         0.11836656, -7.79746467],
       [ 0.41257   ,  0.41424   ,  0.40877   , ...,  1.10147213,
         1.09545011, -6.78187944],
       [ 0.41126   ,  0.41162   ,  0.40538   , ...,  5.69437583,
         5.53814967, -1.47368931],
       ...,
       [ 0.8438    ,  0.8532    ,  0.8343    , ...,  3.886121  ,
         3.81251228, 76.89119422],
       [ 0.8415    ,  0.8484    ,  0.8372    , ...,  1.4798575 ,
         1.46901445, 79.50893182],
       [ 0.8399    ,  0.8456    ,  0.8128    , ...,  0.09451796,
         0.09447332, 79.6786    ]])

In [9]:
dates

[Timestamp('2023-10-31 00:00:00'),
 Timestamp('2023-11-01 00:00:00'),
 Timestamp('2023-11-02 00:00:00'),
 Timestamp('2023-11-03 00:00:00'),
 Timestamp('2023-11-04 00:00:00'),
 Timestamp('2023-11-05 00:00:00'),
 Timestamp('2023-11-06 00:00:00'),
 Timestamp('2023-11-07 00:00:00'),
 Timestamp('2023-11-08 00:00:00'),
 Timestamp('2023-11-09 00:00:00'),
 Timestamp('2023-11-10 00:00:00'),
 Timestamp('2023-11-11 00:00:00'),
 Timestamp('2023-11-12 00:00:00'),
 Timestamp('2023-11-13 00:00:00'),
 Timestamp('2023-11-14 00:00:00'),
 Timestamp('2023-11-15 00:00:00'),
 Timestamp('2023-11-16 00:00:00'),
 Timestamp('2023-11-17 00:00:00'),
 Timestamp('2023-11-18 00:00:00'),
 Timestamp('2023-11-19 00:00:00'),
 Timestamp('2023-11-20 00:00:00'),
 Timestamp('2023-11-21 00:00:00'),
 Timestamp('2023-11-22 00:00:00'),
 Timestamp('2023-11-23 00:00:00'),
 Timestamp('2023-11-24 00:00:00'),
 Timestamp('2023-11-25 00:00:00'),
 Timestamp('2023-11-26 00:00:00'),
 Timestamp('2023-11-27 00:00:00'),
 Timestamp('2023-11-