In [1]:
import pandas as pd
import json
import re

# ============================
# LOAD RETURN SERIES
# ============================

# Read the master feature table: the first CSV column becomes the index and
# pandas is asked to parse it as dates.
df = pd.read_csv("market_features_master.csv", index_col=0, parse_dates=True)

# Return columns that feed the LLM training text
return_cols = [
    f"{ticker}_ret"
    for ticker in ("SP500", "NASDAQ", "SPY", "QQQ", "VTI", "IVV", "ARKK")
]

# Flatten the selected columns into one long sequence (as in Gruver/Delphyne).
# NOTE(review): stack() walks the frame row by row, so consecutive elements
# are the different return columns of one index entry, then those of the next
# entry. The sequence therefore interleaves the assets instead of placing one
# per-asset series after another, and dropping NaNs can shift that pattern.
# Confirm this ordering is what the training setup expects.
series = (
    df[return_cols]
    .stack()
    .dropna()
)
values = series.values  # underlying numpy array of the stacked returns

# ============================
# PARAMETERS
# ============================

# Each sample takes WINDOW consecutive elements of `values` as its input and
# the element immediately after them as its target.
WINDOW = 60


# ============================
# DIGIT-LEVEL TOKENIZER (GRUVER)
# ============================

def digit_tokenize(x, decimals=6):
    """
    Render a number as space-separated single-character tokens.

    The value is formatted in fixed-point notation with `decimals` fractional
    digits, trailing fractional zeros are removed (at least one fractional
    digit is kept), and every remaining character becomes its own token:

        0.0123 → "0 . 0 1 2 3"
        -0.5   → "- 0 . 5"
        0.0    → "0 . 0"

    Parameters
    ----------
    x : float
        Value to tokenize (anything accepted by the ``f`` format code).
    decimals : int, default 6
        Number of fractional digits kept before trailing zeros are stripped.

    Returns
    -------
    str
        The characters of the formatted number joined by single spaces.
    """
    text = "{:.{prec}f}".format(x, prec=decimals)

    # Only strip zeros that belong to the fractional part; without this guard
    # a value formatted with no decimal point (decimals=0) would lose the
    # zeros of its integer part.
    if "." in text:
        text = text.rstrip("0")
        if text.endswith("."):
            text += "0"

    # Values that round to zero from below format as "-0.0"; drop the sign so
    # zero always has a single textual form.
    if text.startswith("-") and not text.strip("-0."):
        text = text[1:]

    # Every character (sign, digit, decimal point) is its own token.
    return " ".join(text)


# ============================
# BUILD DATASET 1 (GRUVER)
# ============================

# Tokenize every value exactly once. Each window below only joins a slice of
# these strings instead of re-tokenizing a value for every window that
# contains it; the resulting text is identical.
gruver_tokens = [digit_tokenize(v) for v in values]

# Sample i pairs elements i .. i+WINDOW-1 (joined with " | ") with element
# i+WINDOW as the target. The range is empty, and no samples are produced,
# when there are not more than WINDOW values.
gruver_samples = [
    {
        "input_text": " | ".join(gruver_tokens[i : i + WINDOW]),
        "target_text": gruver_tokens[i + WINDOW],
    }
    for i in range(len(gruver_tokens) - WINDOW)
]

# Save as JSON Lines: one sample object per line
with open("dataset_gruver_digit.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item) + "\n" for item in gruver_samples)

print(f"Saved Gruver-style dataset with {len(gruver_samples)} samples.")


# ============================
# BUILD DATASET 2 (DELPHYNE)
# ============================

def numeric_tokenize(x, decimals=6):
    """
    Render a number as one compact decimal string.

    The value is formatted in fixed-point notation with `decimals` fractional
    digits, then trailing fractional zeros and a dangling decimal point are
    removed:

        0.0123 → "0.0123"
        -0.5   → "-0.5"
        100.0  → "100"
        0.0    → "0"

    Parameters
    ----------
    x : float
        Value to render (anything accepted by the ``f`` format code).
    decimals : int, default 6
        Number of fractional digits kept before trailing zeros are stripped.

    Returns
    -------
    str
        The trimmed decimal representation of `x`.
    """
    text = "{:.{prec}f}".format(x, prec=decimals)

    # Only strip when a decimal point is present, so that the zeros of an
    # integer part (e.g. "100" with decimals=0) are never removed.
    if "." in text:
        text = text.rstrip("0").rstrip(".")

    # Values that round to zero from below end up as "-0"; emit plain "0" so
    # zero always has a single textual form.
    if text == "-0":
        text = "0"

    return text


# Render every value exactly once. Each window below only joins a slice of
# these strings instead of re-formatting a value for every window that
# contains it; the resulting text is identical.
delphyne_tokens = [numeric_tokenize(v) for v in values]

# Sample i pairs elements i .. i+WINDOW-1 (space-separated) with element
# i+WINDOW as the target. The range is empty, and no samples are produced,
# when there are not more than WINDOW values.
delphyne_samples = [
    {
        "input_text": " ".join(delphyne_tokens[i : i + WINDOW]),
        "target_text": delphyne_tokens[i + WINDOW],
    }
    for i in range(len(delphyne_tokens) - WINDOW)
]

# Save as JSON Lines: one sample object per line
with open("dataset_delphyne_numeric.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item) + "\n" for item in delphyne_samples)

print(f"Saved Delphyne-style dataset with {len(delphyne_samples)} samples.")


# ============================
# PREVIEW SAMPLES
# ============================

# Pretty-print the first sample of each dataset, Gruver first, then Delphyne.
for label, samples in (("Gruver", gruver_samples), ("Delphyne", delphyne_samples)):
    print(f"\nExample {label}-style sample:")
    print(json.dumps(samples[0], indent=2))


Saved Gruver-style dataset with 5064 samples.
Saved Delphyne-style dataset with 5064 samples.

Example Gruver-style sample:
{
  "input_text": "0 . 0 1 3 8 9 8 | 0 . 0 1 5 6 2 2 | 0 . 0 1 4 1 4 | 0 . 0 1 6 3 1 4 | 0 . 0 1 4 3 4 4 | 0 . 0 1 4 1 9 5 | 0 . 0 2 9 9 4 | 0 . 0 0 1 0 0 9 | - 0 . 0 0 0 1 6 5 | 0 . 0 0 0 7 8 6 | - 0 . 0 0 3 9 6 7 | 0 . 0 0 1 4 9 9 | 0 . 0 0 0 7 8 3 | - 0 . 0 0 5 3 1 | 0 . 0 1 0 8 5 3 | 0 . 0 1 2 2 8 5 | 0 . 0 1 1 3 6 6 | 0 . 0 1 1 8 2 7 | 0 . 0 1 2 6 7 3 | 0 . 0 1 1 3 2 4 | 0 . 0 1 5 3 3 1 | 0 . 0 0 3 8 9 7 | 0 . 0 0 5 7 0 2 | 0 . 0 0 3 9 3 6 | 0 . 0 0 3 3 9 2 | 0 . 0 0 5 6 1 7 | 0 . 0 0 3 9 2 1 | 0 . 0 0 7 4 8 2 | 0 . 0 0 7 3 9 9 | 0 . 0 0 9 4 7 9 | 0 . 0 0 7 2 2 2 | 0 . 0 0 6 6 9 9 | 0 . 0 0 8 9 6 6 | 0 . 0 0 7 5 3 | 0 . 0 2 5 7 5 9 | - 0 . 0 0 1 1 1 4 | 0 . 0 0 1 4 3 4 | - 0 . 0 0 0 6 6 6 | - 0 . 0 0 0 2 1 | 0 . 0 0 0 3 8 8 | - 0 . 0 0 0 8 4 2 | 0 . 0 1 0 5 6 7 | - 0 . 0 0 0 3 4 5 | - 0 . 0 0 2 5 1 1 | - 0 . 0 0 0 4 3 6 | - 0 . 0 0 2 2 7 9 | - 0 . 0 0 0 4 3 7