ML

In [10]:
import os, glob, pandas as pd

FEATURES_OUT = "data/features/ml/SBER/1m"
LABELS_OUT   = "data/labels/SBER/1m"

# 1) читаем фичи и лейблы
feat_pq = sorted(glob.glob(os.path.join(FEATURES_OUT, "*.parquet")))[0]
lab_pq  = sorted(glob.glob(os.path.join(LABELS_OUT,   "*.parquet")))[0]

dfX = pd.read_parquet(feat_pq)
dfY = pd.read_parquet(lab_pq)

# 2) унифицируем ключи
keys = [c for c in ["time","bar_id","date"] if c in dfX.columns and c in dfY.columns]
assert keys, "нет общих ключей для merge"

# 3) полный ML-датасет
ml = dfX.merge(dfY, on=keys, how="inner").sort_values(keys)
ml.info()
ml.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14835 entries, 0 to 14834
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   time           14835 non-null  datetime64[ns, UTC]
 1   open_x         14835 non-null  float64            
 2   high_x         14835 non-null  float64            
 3   low_x          14835 non-null  float64            
 4   close_x        14835 non-null  float64            
 5   volume_x       14835 non-null  int64              
 6   ret_1_x        14834 non-null  float64            
 7   ret_5_x        14830 non-null  float64            
 8   rng_1_x        14834 non-null  float64            
 9   vol_20_x       14825 non-null  float64            
 10  vwap_20_x      14831 non-null  float64            
 11  dist_vwap_x    14831 non-null  float64            
 12  date           14835 non-null  object             
 13  bar_id         14835 non-null  int64          

Unnamed: 0,time,open_x,high_x,low_x,close_x,volume_x,ret_1_x,ret_5_x,rng_1_x,vol_20_x,...,ret_1_y,ret_5_y,rng_1_y,vol_20_y,vwap_20_y,dist_vwap_y,is_open_30_y,is_close_30_y,ret_fwd_5,y_tb_20
0,2025-09-20 00:00:00+00:00,296.13,296.13,296.13,296.13,117,,,,,...,,,,,,,1,0,0.000203,0
1,2025-09-20 00:01:00+00:00,296.13,296.13,296.13,296.13,1,0.0,,0.0,,...,0.0,,0.0,,,,1,0,0.000203,0
2,2025-09-20 00:03:00+00:00,296.12,296.13,296.12,296.13,21,0.0,,3.4e-05,,...,0.0,,3.4e-05,,,,1,0,0.000203,-1
3,2025-09-20 00:04:00+00:00,296.13,296.13,296.13,296.13,53,0.0,,0.0,,...,0.0,,0.0,,,,1,0,0.000203,-1
4,2025-09-20 00:05:00+00:00,296.19,296.19,296.19,296.19,5,0.000203,,0.0,,...,0.000203,,0.0,,296.131168,0.000199,1,0,-3.4e-05,-1


In [11]:
# только целевые столбцы
targets = [c for c in ml.columns if c.startswith("y_") or c.startswith("ret_fwd_")]
ml[targets].describe(include="all")

# баланс классов для triple-barrier
ml["y_tb_20"].value_counts(dropna=False)

# сохранить полный датасет
ml.to_parquet(os.path.join(FEATURES_OUT, "features_labeled_full.parquet"))


In [12]:
ml

Unnamed: 0,time,open_x,high_x,low_x,close_x,volume_x,ret_1_x,ret_5_x,rng_1_x,vol_20_x,...,ret_1_y,ret_5_y,rng_1_y,vol_20_y,vwap_20_y,dist_vwap_y,is_open_30_y,is_close_30_y,ret_fwd_5,y_tb_20
0,2025-09-20 00:00:00+00:00,296.13,296.13,296.13,296.13,117,,,,,...,,,,,,,1,0,0.000203,0
1,2025-09-20 00:01:00+00:00,296.13,296.13,296.13,296.13,1,0.000000,,0.000000,,...,0.000000,,0.000000,,,,1,0,0.000203,0
2,2025-09-20 00:03:00+00:00,296.12,296.13,296.12,296.13,21,0.000000,,0.000034,,...,0.000000,,0.000034,,,,1,0,0.000203,-1
3,2025-09-20 00:04:00+00:00,296.13,296.13,296.13,296.13,53,0.000000,,0.000000,,...,0.000000,,0.000000,,,,1,0,0.000203,-1
4,2025-09-20 00:05:00+00:00,296.19,296.19,296.19,296.19,5,0.000203,,0.000000,,...,0.000203,,0.000000,,296.131168,0.000199,1,0,-0.000034,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14830,2025-10-03 23:52:00+00:00,282.80,283.00,282.80,283.00,2530,0.000742,0.000566,0.000707,0.001236,...,0.000742,0.000566,0.000707,0.001236,282.433803,0.002001,0,1,0.045352,0
14831,2025-10-03 23:53:00+00:00,283.00,283.00,282.99,282.99,115,-0.000035,0.001910,0.000035,0.001227,...,-0.000035,0.001910,0.000035,0.001227,282.438960,0.001947,0,1,0.045387,0
14832,2025-10-03 23:54:00+00:00,282.99,282.99,282.99,282.99,11,0.000000,0.000566,0.000000,0.001215,...,0.000000,0.000566,0.000000,0.001215,282.436074,0.001957,0,1,0.045387,0
14833,2025-10-03 23:56:00+00:00,282.22,282.22,282.21,282.21,83,-0.002760,-0.002053,0.000035,0.001302,...,-0.002760,-0.002053,0.000035,0.001302,282.434593,-0.000796,0,1,0.048147,0


CV

In [17]:
import os
import glob
import pandas as pd

# ---- Параметры рендера (подставь свои при необходимости) ----
TICKER = "SBER"
TF     = "1m"
SIZE   = 64          # окно баров, с которым рендерили
STEP   = 16          # шаг окна при рендере

# ---- Пути согласно scripts/render_cv_images.py ----
IMAGES_DIR   = f"data/cv/images/{TICKER}/{TF}/win{SIZE}_step{STEP}"
MANIFEST_CSV = f"data/features/cv/{TICKER}/{TF}/manifest_win{SIZE}_step{STEP}.csv"

# 1) список изображений
imgs = sorted(glob.glob(os.path.join(IMAGES_DIR, "**", "*.png"), recursive=True))
print("images:", len(imgs))
print("sample:", imgs)

# 2) манифест (метаданные: time, путь, метка и т.д.)
cv_meta = None
if os.path.exists(MANIFEST_CSV):
    cv_meta = pd.read_csv(MANIFEST_CSV)
    print("manifest rows:", len(cv_meta))
    display(cv_meta.head())
else:
    print("manifest not found:", MANIFEST_CSV)

# 3) при необходимости — соединить с ML-фичами
# FEATURES_OUT = f"data/features/ml/{TICKER}/{TF}"
# feat_pq = sorted(glob.glob(os.path.join(FEATURES_OUT, "*.parquet")))[0]
# dfX = pd.read_parquet(feat_pq)
# df_full = cv_meta.merge(dfX, on="time", how="left") if cv_meta is not None else None
# display(df_full.head() if df_full is not None else "no merge performed")


images: 924
sample: ['data/cv/images/SBER/1m/win64_step16\\flat\\img_0000000.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000001.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000002.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000003.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000004.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000005.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000006.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000007.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000008.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000009.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000010.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000011.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000012.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000013.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0000014.png', 'data/cv/images/SBER/1m/win64_step16\\flat\\img_0

Unnamed: 0,path,ticker,tf,t_start,t_end,bars,ymin,ymax,vol_mode,vmin,vmax,label
0,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 00:00:00+00:00,2025-09-20 01:10:00+00:00,64,293.12,296.39,raw,1.0,40288.0,flat
1,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 00:20:00+00:00,2025-09-20 01:26:00+00:00,64,293.12,296.39,raw,1.0,40288.0,flat
2,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 00:38:00+00:00,2025-09-20 01:43:00+00:00,64,293.23,294.64,raw,1.0,7138.0,flat
3,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 00:54:00+00:00,2025-09-20 02:00:00+00:00,64,293.32,294.24,raw,1.0,7138.0,flat
4,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 01:11:00+00:00,2025-09-20 02:17:00+00:00,64,293.32,294.61,raw,1.0,7138.0,flat


In [18]:
cv_meta

Unnamed: 0,path,ticker,tf,t_start,t_end,bars,ymin,ymax,vol_mode,vmin,vmax,label
0,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 00:00:00+00:00,2025-09-20 01:10:00+00:00,64,293.12,296.39,raw,1.0,40288.0,flat
1,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 00:20:00+00:00,2025-09-20 01:26:00+00:00,64,293.12,296.39,raw,1.0,40288.0,flat
2,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 00:38:00+00:00,2025-09-20 01:43:00+00:00,64,293.23,294.64,raw,1.0,7138.0,flat
3,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 00:54:00+00:00,2025-09-20 02:00:00+00:00,64,293.32,294.24,raw,1.0,7138.0,flat
4,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-09-20 01:11:00+00:00,2025-09-20 02:17:00+00:00,64,293.32,294.61,raw,1.0,7138.0,flat
...,...,...,...,...,...,...,...,...,...,...,...,...
919,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-10-03 19:33:00+00:00,2025-10-03 20:36:00+00:00,64,282.82,283.50,raw,163.0,162658.0,flat
920,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-10-03 19:49:00+00:00,2025-10-03 23:02:00+00:00,64,281.00,284.07,raw,111.0,162658.0,flat
921,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-10-03 20:05:00+00:00,2025-10-03 23:18:00+00:00,64,281.00,284.07,raw,1.0,37535.0,flat
922,data\cv\images\SBER\1m\win64_step16\flat\img_0...,SBER,1m,2025-10-03 20:21:00+00:00,2025-10-03 23:37:00+00:00,64,281.00,284.07,raw,1.0,37535.0,flat


In [None]:
import pandas as pd, glob

for t in ["SBER","GAZP","LKOH","ROSN","GMKN"]:
    m = glob.glob(f"data/cv/images/{t}/1m/win64_step16/*/manifest_with_vol.csv")
    print(t, "classes:", m)
