In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("market_features_master.csv", index_col=0, parse_dates=True)

# The return columns we will be using for LLM training
return_cols = [
    "SP500_ret", "NASDAQ_ret", "SPY_ret", 
    "QQQ_ret", "VTI_ret", "IVV_ret", "ARKK_ret"
]

In [None]:
# Actual Python internal representation of the floating point is ~17 meaningful digits
df["SPY_ret"].apply(repr).head()

Date
2021-02-02    0.0141402145018096
2021-02-03    0.0007863040259381
2021-02-04    0.0113658170146655
2021-02-05    0.0039358541278653
2021-02-08    0.0072218158210388
Name: SPY_ret, dtype: object

In [None]:
df = df[return_cols]
df.head()
# Pandas displays up to 6 decimal places by default, which is the amount that gets stored during tokenization later

Unnamed: 0_level_0,SP500_ret,NASDAQ_ret,SPY_ret,QQQ_ret,VTI_ret,IVV_ret,ARKK_ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-02-02,0.013898,0.015622,0.01414,0.016314,0.014344,0.014195,0.02994
2021-02-03,0.001009,-0.000165,0.000786,-0.003967,0.001499,0.000783,-0.00531
2021-02-04,0.010853,0.012285,0.011366,0.011827,0.012673,0.011324,0.015331
2021-02-05,0.003897,0.005702,0.003936,0.003392,0.005617,0.003921,0.007482
2021-02-08,0.007399,0.009479,0.007222,0.006699,0.008966,0.00753,0.025759


In [8]:
df.shape

(732, 7)

In [None]:
df.isna().sum() # No missing values

SP500_ret     0
NASDAQ_ret    0
SPY_ret       0
QQQ_ret       0
VTI_ret       0
IVV_ret       0
ARKK_ret      0
dtype: int64

In [None]:
# Count decimal digits for each value in the return columns
def count_decimal_digits(x):
    if pd.isna(x):
        return np.nan
    # Convert to string safely
    s = format(x, 'f')       # ensures no scientific notation
    if '.' in s:
        return len(s.split('.')[1])
    else:
        return 0  # integer-like
        

digit_counts = df.applymap(count_decimal_digits)

# Summarize the counts of the number returns with a certain number of decimal digits in each return column
# CONCLUSION: All returns have 6 decimal places of precision
precision_summary = {
    col: digit_counts[col].value_counts().sort_index()
    for col in return_cols
}

for col, counts in precision_summary.items():
    print(f"\n{col}:")
    print(counts)


SP500_ret:
SP500_ret
6    732
Name: count, dtype: int64

NASDAQ_ret:
NASDAQ_ret
6    732
Name: count, dtype: int64

SPY_ret:
SPY_ret
6    732
Name: count, dtype: int64

QQQ_ret:
QQQ_ret
6    732
Name: count, dtype: int64

VTI_ret:
VTI_ret
6    732
Name: count, dtype: int64

IVV_ret:
IVV_ret
6    732
Name: count, dtype: int64

ARKK_ret:
ARKK_ret
6    732
Name: count, dtype: int64


  digit_counts = df.applymap(count_decimal_digits)


In [13]:
# Combine all returns into a single long sequence (as in Gruver/Delphyne)
series = df[return_cols].stack().dropna()
print(series)

Date                  
2021-02-02  SP500_ret     0.013898
            NASDAQ_ret    0.015622
            SPY_ret       0.014140
            QQQ_ret       0.016314
            VTI_ret       0.014344
                            ...   
2023-12-28  SPY_ret       0.000378
            QQQ_ret      -0.000486
            VTI_ret      -0.000504
            IVV_ret       0.000459
            ARKK_ret     -0.002212
Length: 5124, dtype: float64


In [15]:
values = series.values  # numpy array of floats
print(values)
print(values.shape)

[ 0.01389822  0.01562221  0.01414021 ... -0.00050359  0.0004594
 -0.00221155]
(5124,)
