<a href="https://colab.research.google.com/github/Nekhaenko/test/blob/main/dataset_trd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
pip install ta lightautoml

Collecting lightautoml
  Downloading lightautoml-0.4.1-py3-none-any.whl.metadata (13 kB)
Collecting autowoe>=1.3.3 (from lightautoml)
  Downloading autowoe-1.3.3-py3-none-any.whl.metadata (2.9 kB)
Collecting catboost>=0.26.1 (from lightautoml)
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting cmaes (from lightautoml)
  Downloading cmaes-0.11.1-py3-none-any.whl.metadata (18 kB)
Collecting json2html (from lightautoml)
  Downloading json2html-1.3.0.tar.gz (7.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy (from ta)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna (from lightautoml)
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting poetry-core<2.0.0,>=1.0.0 (from lightautoml)
  Downloading poetry_core-

In [3]:
import pandas as pd
import numpy as np
from ta import add_all_ta_features
from ta.utils import dropna

In [2]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [4]:
pth = 'drive/MyDrive/datasets/traders/MNTUSDT.csv'

In [5]:
def create_pump_dataset(
    df: pd.DataFrame,
    window_size: int = 30,
    forecast_horizon: int = 7,
    pump_threshold: float = 0.10
) -> tuple[np.ndarray, np.ndarray, list]:
    """
    Формирует датасет для классификации пампов:
      - X: матрица признаков формы (n_samples, window_size * n_features)
      - y: вектор меток (0/1)
      - dates: список дат, соответствующих последнему дню каждого окна

    Параметры:
      df              – DataFrame с колонками ['date','open','high','low','close','volume']
      window_size     – длина истории (N) для формирования признаков
      forecast_horizon– горизонт прогноза (M) в днях
      pump_threshold  – относительный порог роста цены (например, 0.10 для +10 %)
    """
    # 1. Сортировка и удаление пропусков
    df = df.sort_values('date').reset_index(drop=True)
    df = dropna(df)  # удаляем строки с NaN, возникающие при вычислениях :contentReference[oaicite:3]{index=3}

    # 2. Генерация всех стандартных TA-признаков
    df = add_all_ta_features(
        df, open="open", high="high", low="low",
        close="close", volume="volume", fillna=True
    )  # включает SMA, EMA, RSI, MACD, BBANDS и др. :contentReference[oaicite:4]{index=4}

    # 3. Разметка «пампа»: наличие роста ≥ pump_threshold в пределах следующих M дней
    df['future_max'] = (
        df['close']
        .shift(-1)
        .rolling(window=forecast_horizon, min_periods=1)
        .max()
    )
    df['label'] = (
        df['future_max'] >= df['close'] * (1 + pump_threshold)
    ).astype(int)  # 1 – если условие выполнено :contentReference[oaicite:5]{index=5}

    # 4. Формирование выборки через скользящее окно
    feature_cols = [c for c in df.columns
                    if c not in ('date', 'future_max', 'label')]
    X, y, dates = [], [], []

    # Проходим по всем возможным окнам
    for start in range(len(df) - window_size - forecast_horizon + 1):
        end = start + window_size
        window = df.iloc[start:end]

        X.append(window[feature_cols].values.flatten())
        y.append(df['label'].iloc[end - 1])
        dates.append(df['date'].iloc[end - 1])  # дата последнего дня окна

    X = np.array(X)  # shape = (n_samples, window_size * n_features)
    y = np.array(y)  # метки 0/1
    return X, y, dates  # :contentReference[oaicite:6]{index=6}

In [29]:
# Загружаем CSV с колонками date,open,high,low,close,volume
df = pd.read_csv(pth,
                 parse_dates=["date"])
df[:3]

Unnamed: 0,date,open,high,low,close,volume
0,2023-10-02,0.41279,0.42457,0.40154,0.41257,12448969.0
1,2023-10-03,0.41257,0.41424,0.40877,0.41126,3789101.0
2,2023-10-04,0.41126,0.41162,0.40538,0.40702,4584710.0


In [7]:
X, y, dates = create_pump_dataset(df,
                                  window_size=30,
                                  forecast_horizon=7,
                                  pump_threshold=0.10)

print("Признаки:", X.shape)
print("Меток '1':", y.sum(), "/", len(y))

Признаки: (544, 2730)
Меток '1': 110 / 544


In [28]:
import plotly.graph_objects as go

# Параметры пампа
pump_threshold = 0.10  # 10% рост
forecast_horizon = 7   # в течение следующих 7 дней

# Вычисление максимальной цены в будущем горизонте
df['future_max'] = df['close'].shift(-1).rolling(window=forecast_horizon, min_periods=1).max()

# Определение метки пампа
df['pump'] = (df['future_max'] >= df['close'] * (1 + pump_threshold)).astype(int)

# Фильтрация дней с пампом
pump_days = df[df['pump'] == 1]

# Создание свечного графика
fig = go.Figure(data=[go.Candlestick(x=df['date'],
                                     open=df['open'],
                                     high=df['high'],
                                     low=df['low'],
                                     close=df['close'],
                                     name='OHLC')])

# Добавление меток пампа
fig.add_trace(go.Scatter(x=pump_days['date'],
                         y=pump_days['high'],
                         mode='markers',
                         marker=dict(color='red', size=10, symbol='triangle-up'),
                         name='Памп'))

# Настройка макета графика
fig.update_layout(title='Свечной график с метками пампа',
                  xaxis_title='Дата',
                  yaxis_title='Цена',
                  xaxis_rangeslider_visible=False)

# Отображение графика
fig.show()

In [16]:
data = pd.DataFrame(X)
data.columns = [f'col_{col}' for col in data.columns]
data['target'] = y

In [17]:
data[:3]

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_2721,col_2722,col_2723,col_2724,col_2725,col_2726,col_2727,col_2728,col_2729,target
0,0.41279,0.42457,0.40154,0.41257,12448969.0,-524337.8,12448969.0,-0.042119,0.0,0.0,...,-0.94993,1.155894,-11.726028,-4.132885,-7.593143,0.362588,0.118437,0.118367,-7.797465,0
1,0.41257,0.41424,0.40877,0.41126,3789101.0,-863763.7,8659868.0,-0.053194,-4963.72231,-0.000224,...,-0.676263,1.094665,-9.832328,-5.272774,-4.559554,0.363563,1.101472,1.09545,-6.781879,0
2,0.41126,0.41162,0.40538,0.40702,4584710.0,-3038562.0,4075158.0,-0.145925,-7031.643466,-0.000409,...,-0.332835,1.373715,-2.812487,-4.780716,1.96823,0.369707,5.694376,5.53815,-1.473689,0


In [None]:
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2
TIMEOUT = 300
TARGET_NAME = 'TARGET'
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [18]:
train_data, test_data = train_test_split(data,
                                         test_size=TEST_SIZE,
                                        #  stratify=data[TARGET_NAME],
                                         random_state=RANDOM_STATE
                                         )

In [None]:
automl = TabularAutoML(
    task = task,
    timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS,
                     'cv': N_FOLDS,
                     'random_state': RANDOM_STATE},
)

In [None]:
automl = TabularAutoML(task = Task(name = 'binary',
                                   metric = 'auc'))

In [19]:
oof_preds = automl.fit_predict(train_data,
                               roles = {'target': 'target'},
                               path_to_save='model.jbl').data

test_preds = automl.predict(test_data).data

INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO:lightautoml.automl.presets.base:Task: binary

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 3600.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 4 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (435, 2731)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 3566.59 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

In [20]:
from sklearn.metrics import roc_auc_score

In [25]:
roc_auc_score(test_data['target'].values, test_preds[:, 0])

0.9367647058823529

In [24]:
roc_auc_score(train_data['target'].values, oof_preds[:, 0])

0.8786233091224095

In [None]:
%%time

# Fast feature importances calculation
fast_fi = automl_rd.model.get_feature_scores('fast')
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)

In [35]:
# Определение пампа: рост на 30% за 1 день
df["pump"] = 0
for i in range(len(df) - 1):
    if (df["high"].iloc[i + 1] / df["close"].iloc[i] - 1) >= 0.1:
        df["pump"].iloc[max(0, i - 3):i] = 1  # Метка за 3 дня до пампа


ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




In [37]:
df[df['pump'] == 1][-4:]

Unnamed: 0,date,open,high,low,close,volume,pump
537,2025-03-22,0.7811,0.7913,0.7762,0.7884,4036245.0,1
566,2025-04-20,0.6627,0.6724,0.6531,0.6579,5910595.0,1
567,2025-04-21,0.6579,0.6702,0.6562,0.6655,6844604.0,1
568,2025-04-22,0.6655,0.7,0.655,0.6951,12792868.0,1


**deepseek**

In [38]:
import pandas as pd
import numpy as np
from ta import add_all_ta_features
from ta.utils import dropna
from sklearn.model_selection import train_test_split

In [41]:
def load_and_preprocess_data(filepath):
    """Загрузка и предобработка сырых данных"""
    df = pd.read_csv(filepath, parse_dates=['date'], index_col='date')
    df = df.sort_index()
    df = dropna(df)  # Очистка от NaN
    return df

In [39]:
def create_target(df, pump_threshold=0.3, lookahead_days=3, buffer_days=5):
    """
    Создание целевой переменной с учетом:
    - pump_threshold: минимальный рост для определения пампа
    - lookahead_days: за сколько дней предсказывать памп
    - buffer_days: буфер между событиями
    """
    df = df.copy()
    close_pct = df['close'].pct_change(lookahead_days).shift(-lookahead_days)

    # Находим даты пампа
    pump_dates = df[close_pct >= pump_threshold].index

    # Создаем маску буферных периодов
    buffer_mask = np.zeros(len(df), dtype=bool)
    for date in pump_dates:
        start = df.index.get_loc(date) - buffer_days
        end = df.index.get_loc(date) + buffer_days
        buffer_mask[start:end] = True

    # Метка: 1 за N дней до пампа, исключая буферные зоны
    df['target'] = 0
    for date in pump_dates:
        idx = df.index.get_loc(date)
        label_start = max(0, idx - lookahead_days - 3)  # За 3 дня до события
        label_end = max(0, idx - lookahead_days)
        df.iloc[label_start:label_end, -1] = 1

    df['target'] = df['target'].where(~buffer_mask, 0)
    return df

def generate_features(df, lookback_windows=[3, 7, 14]):
    """Генерация признаков с временными окнами"""
    df = df.copy()

    # Базовые технические индикаторы
    df = add_all_ta_features(df, open="open", high="high", low="low", close="close", volume="volume")

    # Лаговые признаки
    for lag in [1, 2, 3, 5]:
        df[f'close_lag_{lag}'] = df['close'].shift(lag)
        df[f'volume_lag_{lag}'] = df['volume'].shift(lag)

    # Скользящие статистики
    for window in lookback_windows:
        df[f'close_ma_{window}'] = df['close'].rolling(window).mean()
        df[f'close_volatility_{window}'] = df['close'].rolling(window).std()
        df[f'volume_ma_{window}'] = df['volume'].rolling(window).mean()
        df[f'range_{window}'] = (df['high'] - df['low']).rolling(window).mean()

    # Производные признаки
    df['price_volume_ratio'] = df['volume'] / df['close']
    df['daily_return'] = df['close'].pct_change()

    # Удаление NaN из-за оконных функций
    df = df.dropna()
    return df

def time_aware_split(df, test_size=0.2):
    """Хронологическое разделение данных"""
    split_idx = int(len(df) * (1 - test_size))
    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]
    return train, test

In [42]:
# 1. Загрузка данных
df = load_and_preprocess_data(pth)
df[:3]

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-10-02,0.41279,0.42457,0.40154,0.41257,12448969.0
2023-10-03,0.41257,0.41424,0.40877,0.41126,3789101.0
2023-10-04,0.41126,0.41162,0.40538,0.40702,4584710.0


In [44]:
# 2. Создание целевой переменной
df = create_target(df, pump_threshold=0.3, lookahead_days=3)
df[df['target'] == 1][-4:]

Unnamed: 0_level_0,open,high,low,close,volume,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-01,0.6439,0.651,0.6352,0.6438,4665933.0,1
2024-03-18,0.8711,0.8714,0.8128,0.8332,12621248.0,1
2024-10-31,0.6056,0.61,0.5908,0.5975,14211113.0,1


In [None]:




# 3. Генерация признаков
df = generate_features(df)

# 4. Фильтрация фичей и таргета
features = df.columns.drop(['target', 'open', 'high', 'low', 'close', 'volume'])
X = df[features]
y = df['target']

# 5. Разделение данных
X_train, X_test, y_train, y_test = time_aware_split(X, y, test_size=0.2)

print(f"Dataset shape: {X.shape}")
print(f"Positive samples: {y.sum()} ({y.mean():.2%})")