In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("market_features_master.csv", index_col=0, parse_dates=True)

# The return columns we will be using for LLM training
return_cols = [
    "SP500_ret", "NASDAQ_ret", "SPY_ret", 
    "QQQ_ret", "VTI_ret", "IVV_ret", "ARKK_ret"
]

In [13]:
# Actual Python internal representation of the floating point is ~17 meaningful digits
df["SPY_ret"].apply(repr).head()

Date
2015-01-05    -0.0180597287401987
2015-01-06    -0.0094187320900542
2015-01-07     0.0124613651012828
2015-01-08     0.0177451647955477
2015-01-09    -0.0080138261034526
Name: SPY_ret, dtype: object

In [14]:
df = df[return_cols]
df.head()
# Pandas displays up to 6 decimal places by default, which is the amount that gets stored during tokenization later

Unnamed: 0_level_0,SP500_ret,NASDAQ_ret,SPY_ret,QQQ_ret,VTI_ret,IVV_ret,ARKK_ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-05,-0.018278,-0.015706,-0.01806,-0.014669,-0.017182,-0.01756,-0.024095
2015-01-06,-0.008893,-0.012859,-0.009419,-0.013408,-0.009799,-0.00901,-0.003759
2015-01-07,0.01163,0.01257,0.012461,0.012891,0.011933,0.012371,0.004589
2015-01-08,0.017888,0.018432,0.017745,0.01914,0.01764,0.017864,0.022335
2015-01-09,-0.008404,-0.006782,-0.008014,-0.006583,-0.00829,-0.008438,-0.008441


In [15]:
df.shape

(1762, 7)

In [16]:
df.isna().sum() # No missing values

SP500_ret     0
NASDAQ_ret    0
SPY_ret       0
QQQ_ret       0
VTI_ret       0
IVV_ret       0
ARKK_ret      0
dtype: int64

In [17]:
# Count decimal digits for each value in the return columns
def count_decimal_digits(x):
    if pd.isna(x):
        return np.nan
    # Convert to string safely
    s = format(x, 'f')       # ensures no scientific notation
    if '.' in s:
        return len(s.split('.')[1])
    else:
        return 0  # integer-like
        

digit_counts = df.applymap(count_decimal_digits)

# Summarize the counts of the number returns with a certain number of decimal digits in each return column
# CONCLUSION: All returns have 6 decimal places of precision
precision_summary = {
    col: digit_counts[col].value_counts().sort_index()
    for col in return_cols
}

for col, counts in precision_summary.items():
    print(f"\n{col}:")
    print(counts)


SP500_ret:
SP500_ret
6    1762
Name: count, dtype: int64

NASDAQ_ret:
NASDAQ_ret
6    1762
Name: count, dtype: int64

SPY_ret:
SPY_ret
6    1762
Name: count, dtype: int64

QQQ_ret:
QQQ_ret
6    1762
Name: count, dtype: int64

VTI_ret:
VTI_ret
6    1762
Name: count, dtype: int64

IVV_ret:
IVV_ret
6    1762
Name: count, dtype: int64

ARKK_ret:
ARKK_ret
6    1762
Name: count, dtype: int64


  digit_counts = df.applymap(count_decimal_digits)


In [20]:
# Combine all returns into a single long sequence (as in Gruver/Delphyne)
series = df[return_cols].stack().dropna()
print(series.head(21))

Date                  
2015-01-05  SP500_ret    -0.018278
            NASDAQ_ret   -0.015706
            SPY_ret      -0.018060
            QQQ_ret      -0.014669
            VTI_ret      -0.017182
            IVV_ret      -0.017560
            ARKK_ret     -0.024095
2015-01-06  SP500_ret    -0.008893
            NASDAQ_ret   -0.012859
            SPY_ret      -0.009419
            QQQ_ret      -0.013408
            VTI_ret      -0.009799
            IVV_ret      -0.009010
            ARKK_ret     -0.003759
2015-01-07  SP500_ret     0.011630
            NASDAQ_ret    0.012570
            SPY_ret       0.012461
            QQQ_ret       0.012891
            VTI_ret       0.011933
            IVV_ret       0.012371
            ARKK_ret      0.004589
dtype: float64


In [21]:
values = series.values  # numpy array of floats
print(values)
print(values.shape)

[-0.01827811 -0.0157062  -0.01805973 ... -0.00317916 -0.00248867
 -0.0218201 ]
(12334,)


In [22]:
y = df[col].dropna()
print(y.head())

Date
2015-01-05   -0.024095
2015-01-06   -0.003759
2015-01-07    0.004589
2015-01-08    0.022335
2015-01-09   -0.008441
Name: ARKK_ret, dtype: float64


In [6]:
train_start = "2015-01-02"
train_end = "2020-01-01" # exclusive
test_start = "2021-01-04"
test_end = "2022-01-01" # exclusive 

In [10]:
train = df[df.index < train_end]
test = df[df.index >= test_start]

y_train = train["SP500_ret"].dropna()
y_test = test["SP500_ret"].dropna()

print(y_train.head())
print(y_train.tail())
print(y_test.head())
print(y_test.tail())

Date
2015-01-05   -0.018278
2015-01-06   -0.008893
2015-01-07    0.011630
2015-01-08    0.017888
2015-01-09   -0.008404
Name: SP500_ret, dtype: float64
Date
2019-12-24   -0.000195
2019-12-26    0.005128
2019-12-27    0.000034
2019-12-30   -0.005781
2019-12-31    0.002946
Name: SP500_ret, dtype: float64
Date
2021-01-04   -0.014755
2021-01-05    0.007083
2021-01-06    0.005710
2021-01-07    0.014847
2021-01-08    0.005492
Name: SP500_ret, dtype: float64
Date
2021-12-27    0.013839
2021-12-28   -0.001010
2021-12-29    0.001402
2021-12-30   -0.002990
2021-12-31   -0.002626
Name: SP500_ret, dtype: float64
