In [1]:
import sys; sys.path.append("../")

import os
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.model_selection import train_test_split

In [2]:
from common.load_ticker import load_ticker

# Load the SPY history from the parquet store. The path is relative to the
# notebook's working directory. In the recorded run this logged 61 files and
# a (488041, 7) frame spanning 2020-08-21 to 2025-08-21.
spy_df = load_ticker(
    symbol="SPY",
    base_dir="../../../parquet_minute/",
    time_col="Date",
    symbol_col="Symbol",
    seed=42,
    verbose=True,
)

spy_df

[LOAD TICKER] Loaded SPY: 61 files -> shape (488041, 7)
[LOAD TICKER] Date range: 2020-08-21 13:30:00+00:00  to  2025-08-21 19:59:00+00:00


Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2020-08-21 13:30:00+00:00,SPY,337.920013,338.029999,337.839996,337.929993,530274
1,2020-08-21 13:31:00+00:00,SPY,337.920013,338.079987,337.920013,338.000000,280279
2,2020-08-21 13:32:00+00:00,SPY,338.019989,338.040009,337.875000,338.000000,180445
3,2020-08-21 13:33:00+00:00,SPY,338.010010,338.083405,337.910004,338.029999,303505
4,2020-08-21 13:34:00+00:00,SPY,338.019989,338.089996,337.940002,338.059998,132404
...,...,...,...,...,...,...,...
488036,2025-08-21 19:55:00+00:00,SPY,635.669983,635.889893,635.659973,635.850098,407279
488037,2025-08-21 19:56:00+00:00,SPY,635.854980,635.885010,635.669983,635.700012,194769
488038,2025-08-21 19:57:00+00:00,SPY,635.710022,635.775024,635.409973,635.409973,315516
488039,2025-08-21 19:58:00+00:00,SPY,635.414978,635.474976,635.145020,635.184998,375316


In [3]:
from common.clean_data import clean_data

# Run the shared cleaning routine over the raw SPY frame. The result is bound
# to a new name so the raw frame stays available for comparison.
spy_clean_df = clean_data(
    df=spy_df,
    # Identifier columns (the recorded log lists them as "Preserving").
    symbol_col="Symbol",
    timestamp_col="Date",
    # Structural clean-up toggles.
    drop_duplicate_cols=True,
    drop_duplicate_rows=True,
    drop_constant_columns=True,
    drop_constant_rows=True,
    # Strings to be treated as missing values.
    replace_placeholders=True,
    placeholders=("Null", "null", "NULL", "NaN", "nan", "NAN", "None", "none", "NONE"),
    # Ordering, gap filling and dtype conversion.
    # NOTE(review): "timestamp" looks like a mode keyword rather than a column
    # name (the column itself is "Date") — confirm against clean_data.
    sort_by="timestamp",
    fill_missing=True,
    convert_numeric=True,
    verbose=True,
)

spy_clean_df

[---CLEAN---] Starting Shape=(488041, 7)
[---CLEAN---] Preserving: Symbol and Date
[---CLEAN---] Step 1: Remove Duplicate Columns.
[---CLEAN---] ------- Original Column Count: 7, After: 7, Removed: 0 in 0.04093s
[---CLEAN---] Step 2: Remove Duplicate Rows.
[---CLEAN---] ------- Original Row Count: 488041, After: 488041, Removed: 0 in 0.07910s
[---CLEAN---] Step 3: Remove Constant Columns.
[---CLEAN---] ------- Original Column Count: 7, After: 7, Removed: 0 in 0.02794s
[---CLEAN---] Step 4: Remove Constant Rows.
[---CLEAN---] ------- Original Row Count: 488041, After: 488041, Removed: 0 in 0.10471s
[---CLEAN---] Step 5: Replacing Placeholder Values
[---CLEAN---] ------- Total Nulls After Replacement: 0 in 0.01385s
[---CLEAN---] Step 6: Sorting by Timestamp.
[---CLEAN---] ------- Sorted by timestamp in 0.01407s
[---CLEAN---] Step 7: Interpolating Missing and NaN Values.
[---CLEAN---] ------- Initial Nulls: 0, After Fill: 0, Filled: 0 in 0.031s
[---CLEAN---] Step 8: Converting Data to Num

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2020-08-21 13:30:00+00:00,SPY,337.920013,338.029999,337.839996,337.929993,530274
1,2020-08-21 13:31:00+00:00,SPY,337.920013,338.079987,337.920013,338.000000,280279
2,2020-08-21 13:32:00+00:00,SPY,338.019989,338.040009,337.875000,338.000000,180445
3,2020-08-21 13:33:00+00:00,SPY,338.010010,338.083405,337.910004,338.029999,303505
4,2020-08-21 13:34:00+00:00,SPY,338.019989,338.089996,337.940002,338.059998,132404
...,...,...,...,...,...,...,...
488036,2025-08-21 19:55:00+00:00,SPY,635.669983,635.889893,635.659973,635.850098,407279
488037,2025-08-21 19:56:00+00:00,SPY,635.854980,635.885010,635.669983,635.700012,194769
488038,2025-08-21 19:57:00+00:00,SPY,635.710022,635.775024,635.409973,635.409973,315516
488039,2025-08-21 19:58:00+00:00,SPY,635.414978,635.474976,635.145020,635.184998,375316


In [4]:
from iteration_003.charting.generate_features import generate_features

# Number of most-recent rows fed into feature generation.
N_RECENT_ROWS = 1000

# Slice into a NEW name instead of rebinding spy_clean_df to its own tail.
# Rebinding would make the frame displayed by the cleaning cell stale, and a
# later attempt to enlarge the window would silently keep operating on the
# already-truncated frame until the cleaning cell was re-run.
# The explicit copy keeps spy_clean_df untouched even if generate_features
# modifies its input (presumably it adds columns; not verifiable from here).
spy_recent_df = spy_clean_df.tail(N_RECENT_ROWS).copy()

gen_df = generate_features(df=spy_recent_df)

gen_df.head()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,CandleDirectionLabel,SwingTypeLabel,RangeHigh,RangeLow,ZoneLow,ZoneHigh,ZoneType,ZoneId,ZonePivotIdx,ZonePending,ZoneActive,ZoneInvalidated,ZoneLifecycle
487041,2025-08-19 16:20:00+00:00,SPY,640.559998,640.590027,640.450012,640.52002,79429,Medium-Low Bearish Sentiment,,,,,,,,,[],[],[],[]
487042,2025-08-19 16:21:00+00:00,SPY,640.539978,640.609924,640.48999,640.580017,45488,Medium-Low Bullish Sentiment,,,,,,,,,[],[],[],[]
487043,2025-08-19 16:22:00+00:00,SPY,640.575012,640.73999,640.544983,640.690002,48824,Neutral Bullish Sentiment,Accumulation,640.73999,,,,,,,[],[],[],[]
487044,2025-08-19 16:23:00+00:00,SPY,640.669983,640.869995,640.659973,640.744995,81730,Medium-Low Bullish Sentiment,Accumulation,640.869995,,640.659973,640.869995,Resistance,R1,3.0,"[{'id': 'R1', 'type': 'Resistance', 'low': 640...",[],[],"[{'id': 'R1', 'type': 'Resistance', 'low': 640..."
487045,2025-08-19 16:24:00+00:00,SPY,640.76001,640.77002,640.562012,640.630005,43538,Neutral Bearish Sentiment,Accumulation,640.869995,,640.390015,640.77002,Support,S1,4.0,"[{'id': 'R1', 'type': 'Resistance', 'low': 640...",[],[],"[{'id': 'R1', 'type': 'Resistance', 'low': 640..."


In [5]:
# Chronological train / validation / test split of gen_df by row position
# (no shuffling): the first 80% of rows train, the next 10% validate, and the
# final 10% test.
TRAIN_FRAC = 0.8       # cumulative fraction of rows ending the train split
TRAIN_VAL_FRAC = 0.9   # cumulative fraction of rows ending the validation split

# A positional split only keeps later data out of training if the rows are
# already in time order, so check that instead of assuming it.
assert gen_df["Date"].is_monotonic_increasing, "gen_df must be sorted by Date before splitting"

n = len(gen_df)
train_end = int(n * TRAIN_FRAC)
val_end = int(n * TRAIN_VAL_FRAC)

df_train = gen_df.iloc[:train_end].copy()
df_val   = gen_df.iloc[train_end:val_end].copy()
df_test  = gen_df.iloc[val_end:].copy()

# The three slices must cover every row exactly once.
assert len(df_train) + len(df_val) + len(df_test) == n

print(len(df_train), len(df_val), len(df_test))


800 100 100


In [6]:
from iteration_003.charting.generate_images import generate

# Render chart images for the training split only.
# In the recorded run the progress bar reports 700 items for the 800 training
# rows.
generate(df_train, image_dir="./dataset/images/train")

# TODO: the validation and test splits are not rendered yet. The disabled
# calls below use a different base directory than the train call above, and
# their (also disabled) label_dir / label_to_id arguments pointed at
# ../../iteration_002/dataset/labels/<split>. Align the paths before
# re-enabling them.
# generate(df_val,  image_dir="../../iteration_003/dataset/images/val")
# generate(df_test, image_dir="../../iteration_003/dataset/images/test")


Generating zone images: 100%|██████████| 700/700 [00:08<00:00, 85.72it/s]
