In [1]:
import sys; sys.path.append("../../")

import os
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.model_selection import train_test_split

In [2]:
from common.load_ticker import load_ticker

# Pull the SPY rows out of the sample data directory. "Date" and "Symbol" are
# the timestamp and ticker column names in that data (see the preview below).
# NOTE(review): what `seed` controls inside load_ticker is not visible from
# this notebook -- confirm against common/load_ticker.py.
spy_df = load_ticker(
    base_dir="../sample",
    time_col="Date",
    symbol_col="Symbol",
    seed=42,
    symbol="SPY",
    verbose=False,
)

spy_df

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2020-08-21 13:30:00+00:00,SPY,337.920013,338.029999,337.839996,337.929993,530274
1,2020-08-21 13:31:00+00:00,SPY,337.920013,338.079987,337.920013,338.000000,280279
2,2020-08-21 13:32:00+00:00,SPY,338.019989,338.040009,337.875000,338.000000,180445
3,2020-08-21 13:33:00+00:00,SPY,338.010010,338.083405,337.910004,338.029999,303505
4,2020-08-21 13:34:00+00:00,SPY,338.019989,338.089996,337.940002,338.059998,132404
...,...,...,...,...,...,...,...
488036,2025-08-21 19:55:00+00:00,SPY,635.669983,635.889893,635.659973,635.850098,407279
488037,2025-08-21 19:56:00+00:00,SPY,635.854980,635.885010,635.669983,635.700012,194769
488038,2025-08-21 19:57:00+00:00,SPY,635.710022,635.775024,635.409973,635.409973,315516
488039,2025-08-21 19:58:00+00:00,SPY,635.414978,635.474976,635.145020,635.184998,375316


In [3]:
from common.clean_data import clean_data

# Cleaning options, collected in one place so they are easy to scan and tweak.
# Each switch corresponds to one numbered step in the [---CLEAN---] log that
# the call prints when verbose=True.
clean_options = dict(
    timestamp_col="Date",
    symbol_col="Symbol",
    # Structural clean-up: duplicate / constant rows and columns
    drop_duplicate_rows=True,
    drop_duplicate_cols=True,
    drop_constant_columns=True,
    drop_constant_rows=True,
    # Textual null markers to replace before missing values are filled
    replace_placeholders=True,
    placeholders=("Null", "null", "NULL", "NaN", "nan", "NAN", "None", "none", "NONE"),
    fill_missing=True,
    convert_numeric=True,
    # NOTE(review): "timestamp" is presumably a keyword understood by
    # clean_data (the log reports "Sorted by timestamp"), not a column name.
    sort_by="timestamp",
    verbose=True,
)

spy_clean_df = clean_data(df=spy_df, **clean_options)

spy_clean_df

[---CLEAN---] Starting Shape=(488041, 7)
[---CLEAN---] Preserving: Symbol and Date
[---CLEAN---] Step 1: Remove Duplicate Columns.
[---CLEAN---] ------- Original Column Count: 7, After: 7, Removed: 0 in 0.03225s
[---CLEAN---] Step 2: Remove Duplicate Rows.
[---CLEAN---] ------- Original Row Count: 488041, After: 488041, Removed: 0 in 0.06295s
[---CLEAN---] Step 3: Remove Constant Columns.
[---CLEAN---] ------- Original Column Count: 7, After: 7, Removed: 0 in 0.01932s
[---CLEAN---] Step 4: Remove Constant Rows.
[---CLEAN---] ------- Original Row Count: 488041, After: 488041, Removed: 0 in 0.09392s
[---CLEAN---] Step 5: Replacing Placeholder Values
[---CLEAN---] ------- Total Nulls After Replacement: 0 in 0.01120s
[---CLEAN---] Step 6: Sorting by Timestamp.
[---CLEAN---] ------- Sorted by timestamp in 0.01593s
[---CLEAN---] Step 7: Interpolating Missing and NaN Values.
[---CLEAN---] ------- Initial Nulls: 0, After Fill: 0, Filled: 0 in 0.031s
[---CLEAN---] Step 8: Converting Data to Num

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2020-08-21 13:30:00+00:00,SPY,337.920013,338.029999,337.839996,337.929993,530274
1,2020-08-21 13:31:00+00:00,SPY,337.920013,338.079987,337.920013,338.000000,280279
2,2020-08-21 13:32:00+00:00,SPY,338.019989,338.040009,337.875000,338.000000,180445
3,2020-08-21 13:33:00+00:00,SPY,338.010010,338.083405,337.910004,338.029999,303505
4,2020-08-21 13:34:00+00:00,SPY,338.019989,338.089996,337.940002,338.059998,132404
...,...,...,...,...,...,...,...
488036,2025-08-21 19:55:00+00:00,SPY,635.669983,635.889893,635.659973,635.850098,407279
488037,2025-08-21 19:56:00+00:00,SPY,635.854980,635.885010,635.669983,635.700012,194769
488038,2025-08-21 19:57:00+00:00,SPY,635.710022,635.775024,635.409973,635.409973,315516
488039,2025-08-21 19:58:00+00:00,SPY,635.414978,635.474976,635.145020,635.184998,375316


In [4]:
from iteration_002.candlesticks.generate_labels import generate_labels
# Label every cleaned bar; the result carries an extra "Label" column
# (see the preview below).
# For a quick smoke test, pass spy_clean_df.tail(1000) here instead of the
# full frame rather than overwriting spy_clean_df.
gen_df = generate_labels(df=spy_clean_df)

gen_df.head()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,Label
0,2020-08-21 13:30:00+00:00,SPY,337.920013,338.029999,337.839996,337.929993,530274,Low Bullish Sentiment
1,2020-08-21 13:31:00+00:00,SPY,337.920013,338.079987,337.920013,338.0,280279,Neutral Bullish Sentiment
2,2020-08-21 13:32:00+00:00,SPY,338.019989,338.040009,337.875,338.0,180445,Low Bearish Sentiment
3,2020-08-21 13:33:00+00:00,SPY,338.01001,338.083405,337.910004,338.029999,303505,Low Bullish Sentiment
4,2020-08-21 13:34:00+00:00,SPY,338.019989,338.089996,337.940002,338.059998,132404,Medium-Low Bullish Sentiment


In [5]:
# Chronological 80/10/10 split of the labeled frame.
# The split is positional (no shuffling), so it is only leak-free when gen_df
# is ordered by time: validation and test must lie strictly after training.
TRAIN_FRAC = 0.8       # rows [0, 80%)      -> train
TRAIN_VAL_FRAC = 0.9   # rows [80%, 90%)    -> val; the remainder -> test

# Verify the ordering assumption instead of trusting it silently.
assert gen_df["Date"].is_monotonic_increasing, (
    "gen_df must be sorted by Date before a chronological split"
)

n = len(gen_df)
train_end = int(n * TRAIN_FRAC)
val_end = int(n * TRAIN_VAL_FRAC)

# .copy() detaches each split from gen_df so later edits cannot alias it.
df_train = gen_df.iloc[:train_end].copy()
df_val   = gen_df.iloc[train_end:val_end].copy()
df_test  = gen_df.iloc[val_end:].copy()

# The three slices must partition gen_df exactly: no gaps, no overlap.
assert len(df_train) + len(df_val) + len(df_test) == n, "splits do not cover gen_df"

print(len(df_train), len(df_val), len(df_test))


390432 48804 48805


In [6]:
# Installs OpenCV (imported as `cv2` in a later cell) into the kernel's
# environment via the %pip magic.
# NOTE(review): the version is unpinned and the install sits mid-notebook;
# consider pinning it and moving it to the top (or to a requirements file) so
# a fresh Restart & Run All is reproducible.
%pip install opencv-python


Note: you may need to restart the kernel to use updated packages.


In [7]:
from iteration_002.candlesticks.generate_images import generate

# Integer class id for each sentiment label string found in the "Label"
# column. The ids are passed to generate() below, so this mapping must stay in
# sync with whatever consumes the generated label files.
label_to_id = {
    "High Bearish Sentiment": 0,
    "Medium-High Bearish Sentiment": 1,
    "Neutral Bearish Sentiment": 2,
    "Medium-Low Bearish Sentiment": 3,
    "Low Bearish Sentiment": 4,
    "High Bullish Sentiment": 5,
    "Medium-High Bullish Sentiment": 6,
    "Neutral Bullish Sentiment": 7,
    "Medium-Low Bullish Sentiment": 8,
    "Low Bullish Sentiment": 9,
    "Doji": 10
}

# Every split is rendered with identical settings; only the frame and the
# split sub-directory differ, so drive all three from a single loop.
DATASET_ROOT = "../../iteration_002/dataset"
split_frames = {
    "train": df_train,
    "val": df_val,
    "test": df_test,
}

# NOTE(review): the progress bars report 5 fewer items than rows in each
# split -- presumably generate() needs a window of preceding bars; confirm in
# iteration_002/candlesticks/generate_images.py.
# WARNING: this is the slow step (roughly 17 minutes in the recorded run).
for split_name, split_df in split_frames.items():
    generate(
        split_df,
        image_dir=f"{DATASET_ROOT}/images/{split_name}",
        label_dir=f"{DATASET_ROOT}/labels/{split_name}",
        label_to_id=label_to_id,
    )


Generating images: 100%|██████████| 390427/390427 [13:31<00:00, 480.89it/s]
Generating images: 100%|██████████| 48799/48799 [01:42<00:00, 477.93it/s]
Generating images: 100%|██████████| 48800/48800 [01:44<00:00, 467.42it/s]


In [8]:
import numpy as np
import cv2

# Sanity check: draw one synthetic candle, compute its padded YOLO box, print
# the label line, and save the annotated picture for visual inspection.

# White canvas: H rows, W columns, 3 colour channels
H, W = 640, 640
img = np.full((H, W, 3), 255, dtype=np.uint8)

# Candle geometry in pixel space. Image y grows downward, so a smaller y
# value is drawn higher on the canvas.
x_center, body_width = 320, 20
open_px, close_px = 400, 300
high_px, low_px = 250, 450

# OpenCV colours are BGR: green when the close sits above the open, else red
if close_px < open_px:
    color = (0, 200, 0)
else:
    color = (0, 0, 200)

# Wick: thin vertical line from the high to the low
cv2.line(img, (x_center, high_px), (x_center, low_px), color, 2)

# Body: filled rectangle (thickness -1) spanning open..close
x0, x1 = x_center - body_width // 2, x_center + body_width // 2
y0, y1 = min(open_px, close_px), max(open_px, close_px)
cv2.rectangle(img, (x0, y0), (x1, y1), color, -1)

# --- YOLO box with padding ---
# The box covers the full wick vertically and the body horizontally, grown by
# `pad` pixels on every side.
pad = 5
x_min, x_max = min(x0, x1) - pad, max(x0, x1) + pad
y_min, y_max = high_px - pad, low_px + pad

# YOLO format: box centre and size, each normalised by the image dimensions
xc = (x_min + x_max) / 2 / W
yc = (y_min + y_max) / 2 / H
bw = (x_max - x_min) / W
bh = (y_max - y_min) / H

# The leading "0" is a fixed class id used only for this demo
label_str = f"0 {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}"
print("YOLO label with padding:", label_str)

# Overlay the padded box so it can be checked by eye against the candle
cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

# imwrite returns True on success; as the last expression it becomes the cell output
cv2.imwrite("test_candle_with_padding.png", img)


YOLO label with padding: 0 0.500000 0.546875 0.046875 0.328125


True