In [4]:
import pandas as pd
import json
import re

# ============================
# LOAD RETURN SERIES
# ============================

# Feature table: the first CSV column becomes the index and is parsed as dates.
df = pd.read_csv(
    "market_features_master.csv",
    parse_dates=True,
    index_col=0,
)

# Return columns used for LLM training (one entry per series)
return_cols = [
    "SP500_ret",
    "NASDAQ_ret",
    "SPY_ret",
    "QQQ_ret",
    "VTI_ret",
    "IVV_ret",
    "ARKK_ret",
]

# Archived alternative: combine all returns into a single long sequence
# (as in Gruver/Delphyne). Kept commented out because the archived
# single-dataset code at the end of this file reads `values`.
# series = df[return_cols].stack().dropna()
# values = series.values  # numpy array of floats

# ============================
# PARAMETERS
# ============================

WINDOW = 60   # each sample: 60 past timesteps as input → the next one as target


# ============================
# DIGIT-LEVEL TOKENIZER (GRUVER)
# ============================

def digit_tokenize(x):
    """
    Convert a numeric value into space-separated digit tokens.

    The value is rounded to exactly 6 decimal places. The decimal point is
    not emitted: the last six digit tokens are always the fractional part.
    A zero integer part is omitted; a non-zero integer part is emitted in
    front of the fractional digits. A negative input gets a leading "-"
    token of its own.

        0.0123    → "0 1 2 3 0 0"
        0.123456  → "1 2 3 4 5 6"
        1.5       → "1 5 0 0 0 0 0"
        -0.5      → "- 5 0 0 0 0 0"
        0.9999999 → "1 0 0 0 0 0 0"   (rounds up to 1.000000)

    The sign is taken from the unrounded input, so a tiny negative value
    such as -1e-9 yields "- 0 0 0 0 0 0".

    Raises:
        ValueError: if x is NaN or infinite (no digit representation).
    """
    neg = x < 0

    # Fixed-point formatting always yields "<integer>.<6 digits>" for
    # finite numbers; NaN/inf format as "nan"/"inf" with no decimal point.
    whole, point, frac = f"{abs(x):.6f}".partition(".")
    if not point:
        raise ValueError(f"digit_tokenize expects a finite number, got {x!r}")

    digits = list(frac)  # trailing zeros are kept: always exactly 6 digits

    # Keep the integer part when there is one. Dropping it would make 1.5
    # indistinguishable from 0.5, and 0.9999999 (rounds to 1.000000)
    # indistinguishable from 0.0.
    if whole != "0":
        digits = list(whole) + digits

    # negative sign should be its own token
    if neg:
        digits.insert(0, "-")

    return " ".join(digits)


# ============================
# SIMPLE NUMERIC TOKENIZER (DELPHYNE)
# ============================

def numeric_tokenize(x):
    """
    Render a number as one compact decimal string.

    The value is rounded to 6 decimal places, then trailing zeros (and a
    decimal point left dangling by that) are removed:
    0.0123 → "0.0123"
    100.0  → "100"

    The sign is part of the formatted text, so a tiny negative value that
    rounds to zero comes out as "-0".
    """
    text = f"{x:.6f}"
    # Zeros are only stripped up to the decimal point, so "100.000000"
    # becomes "100." here rather than "1".
    text = text.rstrip("0")
    if text.endswith("."):
        text = text[:-1]
    return text


# ============================
# MAIN LOOP – ONE OUTPUT PER SERIES
# ============================

def _window_samples(values, window, tokenize, joiner):
    """Build one {"input_text", "target_text"} record per sliding window.

    Record i uses values[i : i + window] as input and values[i + window] as
    target. Returns an empty list when there are not more than `window` values.
    """
    # Tokenise each observation once; every value is reused by up to
    # `window` overlapping inputs plus one target.
    tokens = [tokenize(v) for v in values]
    return [
        {
            "input_text": joiner.join(tokens[start : start + window]),
            "target_text": tokens[start + window],
        }
        for start in range(len(tokens) - window)
    ]


def _save_jsonl(path, records):
    """Write records to path as JSON Lines (one JSON object per line)."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)


for col in return_cols:

    print(f"Processing {col}...")

    # extract one series
    series = df[col].dropna().values

    # ---------------------------
    # BUILD GRUVER DATASET
    # ---------------------------
    # digit tokens per value, values separated by " | "
    gruver_samples = _window_samples(series, WINDOW, digit_tokenize, " | ")

    out_file = f"dataset_gruver_{col}.jsonl"
    _save_jsonl(out_file, gruver_samples)

    print(f"  ✓ Saved {out_file}  ({len(gruver_samples)} samples)")

    # ---------------------------
    # BUILD DELPHYNE DATASET
    # ---------------------------
    # one numeric string per value, values separated by a single space
    delphyne_samples = _window_samples(series, WINDOW, numeric_tokenize, " ")

    out_file = f"dataset_delphyne_{col}.jsonl"
    _save_jsonl(out_file, delphyne_samples)

    print(f"  ✓ Saved {out_file}  ({len(delphyne_samples)} samples)")

    # A series with at most WINDOW observations produces no samples.
    # Indexing [0] would then raise IndexError and abort the remaining
    # columns, so skip the preview instead.
    if not gruver_samples:
        print(
            f"\nNo example to show for {col}: "
            f"{len(series)} observations, need more than {WINDOW}."
        )
        continue

    print("\nExample Gruver-style sample for " + col + ":")
    print(json.dumps(gruver_samples[0], indent=2))

    print("\nExample Delphyne-style sample for " + col + ":")
    print(json.dumps(delphyne_samples[0], indent=2))

print("\nAll datasets built successfully.")






# ============================
# ARCHIVE: SINGLE DATASET FOR ALL SERIES
# ============================

# ============================
# BUILD DATASET 1 (GRUVER)
# ============================

# gruver_samples = []

# for i in range(len(values) - WINDOW):
#     input_seq = values[i : i + WINDOW]
#     target = values[i + WINDOW]

#     # Tokenize each element
#     tokenized_input = " | ".join(digit_tokenize(v) for v in input_seq)
#     tokenized_target = digit_tokenize(target)

#     gruver_samples.append({
#         "input_text": tokenized_input,
#         "target_text": tokenized_target
#     })

# # Save
# with open("dataset_gruver_digit.jsonl", "w") as f:
#     for item in gruver_samples:
#         f.write(json.dumps(item) + "\n")

# print(f"Saved Gruver-style dataset with {len(gruver_samples)} samples.")


# ============================
# BUILD DATASET 2 (DELPHYNE)
# ============================


# delphyne_samples = []

# for i in range(len(values) - WINDOW):
#     input_seq = values[i : i + WINDOW]
#     target = values[i + WINDOW]

#     # Tokenize numerically
#     input_str = " ".join(numeric_tokenize(v) for v in input_seq)
#     target_str = numeric_tokenize(target)

#     delphyne_samples.append({
#         "input_text": input_str,
#         "target_text": target_str
#     })

# # Save
# with open("dataset_delphyne_numeric.jsonl", "w") as f:
#     for item in delphyne_samples:
#         f.write(json.dumps(item) + "\n")

# print(f"Saved Delphyne-style dataset with {len(delphyne_samples)} samples.")


# ============================
# PREVIEW SAMPLES
# ============================

# print("\nExample Gruver-style sample:")
# print(json.dumps(gruver_samples[0], indent=2))

# print("\nExample Delphyne-style sample:")
# print(json.dumps(delphyne_samples[0], indent=2))


Processing SP500_ret...
  ✓ Saved dataset_gruver_SP500_ret.jsonl  (1702 samples)
  ✓ Saved dataset_delphyne_SP500_ret.jsonl  (1702 samples)

Example Gruver-style sample for SP500_ret:
{
  "input_text": "- 0 1 8 2 7 8 | - 0 0 8 8 9 3 | 0 1 1 6 3 0 | 0 1 7 8 8 8 | - 0 0 8 4 0 4 | - 0 0 8 0 9 4 | - 0 0 2 5 7 9 | - 0 0 5 8 1 3 | - 0 0 9 2 4 8 | 0 1 3 4 2 4 | 0 0 1 5 5 0 | 0 0 4 7 3 2 | 0 1 5 2 7 0 | - 0 0 5 4 9 2 | 0 0 2 5 6 8 | - 0 1 3 3 8 8 | - 0 1 3 4 9 6 | 0 0 9 5 3 5 | - 0 1 2 9 9 2 | 0 1 2 9 6 2 | 0 1 4 4 3 9 | - 0 0 4 1 5 6 | 0 1 0 2 9 1 | - 0 0 3 4 1 8 | - 0 0 4 2 4 7 | 0 1 0 6 7 6 | - 0 0 0 0 2 9 | 0 0 9 6 4 5 | 0 0 4 0 7 5 | 0 0 1 5 9 8 | - 0 0 0 3 1 4 | - 0 0 1 0 6 2 | 0 0 6 1 2 7 | - 0 0 0 3 0 3 | 0 0 2 7 5 9 | - 0 0 0 7 6 6 | - 0 0 1 4 7 6 | - 0 0 2 9 5 6 | 0 0 6 1 2 5 | - 0 0 4 5 3 9 | - 0 0 4 3 8 9 | 0 0 1 1 9 6 | - 0 1 4 1 7 4 | 0 0 3 9 4 4 | - 0 1 6 9 6 1 | - 0 0 1 9 1 8 | 0 1 2 6 0 1 | - 0 0 6 0 7 5 | 0 1 3 5 3 4 | - 0 0 3 3 2 0 | 0 1 2 1 5 8 | - 0 0 4 8 7 3 | 0 0 9 0 1 3