In [None]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

# ---------------------------------------------------------------------------
# Configuration: where the per-file inputs live and where the combined wide
# table is written.
# ---------------------------------------------------------------------------
INPUT_DIR = Path("../AlphathonDataSets/pre-elec/")
GLOB = "pre-elec-*.parquet"
OUTPUT_PATH = Path("pre-elec-pivot-all.parquet")

# Make sure the directory that will hold the output exists.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Per-(bucket, ticker) measurements that get spread into "<feature>_<ticker>"
# columns by the pivot step.
VALUE_COLS = [
    "log_mid", "quote_updates", "avg_rsprd",
    "pct_trades_iso", "pct_volume_iso",
    "total_flow_non_iso", "total_flow_iso",
    "num_trades", "num_trades_iso",
    "total_volume", "total_flow",
    "iso_flow_intensity",
]

# Input files, processed in sorted path order.
files = list(INPUT_DIR.glob(GLOB))
files.sort()
if len(files) == 0:
    raise FileNotFoundError(f"No parquet files matched {INPUT_DIR / GLOB}")

# The output is built by appending chunk after chunk, so a file left over from
# a previous run has to go first.
if OUTPUT_PATH.exists():
    OUTPUT_PATH.unlink()
import gc

# ---------------------------------------------------------------------------
# Pass 1: collect every ticker label that occurs in any input file.
#
# Each file is pivoted on its own, so without a shared schema a chunk would
# only contain the "<feature>_<ticker>" columns that happen to have data in
# that file (pivot_table additionally drops all-NaN columns by default).
# Appending to an existing parquet file needs the same columns as the chunks
# already written, so the full column set is fixed up front and every chunk is
# reindexed onto it below.
# ---------------------------------------------------------------------------
ticker_labels = set()
for fp in tqdm(files, desc="scanning tickers"):
    ticker_col = pd.read_parquet(fp, columns=["ticker"])["ticker"]
    # Same category -> str conversion as the pivot pass, so the label text is
    # identical; dropna() guards against string dtypes that keep missing
    # values missing instead of turning them into text.
    ticker_labels.update(
        ticker_col.drop_duplicates().astype("category").astype(str).dropna()
    )
    del ticker_col

# Sorted to match the lexicographic column order the per-file pivot used to
# get from sort_index(axis=1); the set removes any accidental duplicates.
wide_columns = pd.Index(
    sorted({f"{feature}_{ticker}" for feature in VALUE_COLS for ticker in ticker_labels}),
    name="feature_ticker",
)

# ---------------------------------------------------------------------------
# Pass 2: melt -> pivot each file and append it to the output file.
# ---------------------------------------------------------------------------
for i, fp in enumerate(tqdm(files, desc="pivoting")):
    df = pd.read_parquet(fp, columns=["bucket", "ticker"] + VALUE_COLS)
    # Category codes keep the ticker column small while melt repeats it once
    # per value column.
    df["ticker"] = df["ticker"].astype("category")

    melted = pd.melt(
        df,
        id_vars=["bucket", "ticker"],
        value_vars=VALUE_COLS,
        var_name="feature",
        value_name="value",
    )

    # Free memory as soon as we don't need df anymore
    del df
    gc.collect()

    melted["feature_ticker"] = melted["feature"] + "_" + melted["ticker"].astype(str)

    df_pivoted = (
        melted[["bucket", "feature_ticker", "value"]]
        .pivot_table(
            index="bucket", columns="feature_ticker", values="value", aggfunc="first"
        )
        # Force the shared schema: columns missing from this file become
        # all-NaN, and the order is identical for every chunk.
        .reindex(columns=wide_columns)
        .astype("float32")
        .reset_index()
    )

    # Free memory as soon as we don't need melted anymore
    del melted
    gc.collect()

    # The first chunk creates the file, every later chunk is appended to it.
    df_pivoted.to_parquet(
        OUTPUT_PATH,
        engine="fastparquet",
        compression="snappy",
        index=False,
        append=i > 0,
    )

    # Free memory after writing
    del df_pivoted
    gc.collect()

print(f"Wrote pivoted dataset to: {OUTPUT_PATH}")

# Read back the final result to display
df_pivoted = pd.read_parquet(OUTPUT_PATH)
df_pivoted

  0%|          | 0/20 [00:00<?, ?it/s]

Wrote pivoted dataset to: pre-elec-pivot-all.parquet


feature_ticker,bucket,avg_rsprd_AAPL,avg_rsprd_AMD,avg_rsprd_AMZN,avg_rsprd_AVGO,avg_rsprd_BRK.B,avg_rsprd_CEG,avg_rsprd_DIA,avg_rsprd_DJT,avg_rsprd_EEM,...,total_volume_SQQQ,total_volume_TLT,total_volume_TQQQ,total_volume_TSLA,total_volume_VOO,total_volume_VUG,total_volume_XLE,total_volume_XLF,total_volume_XLU,total_volume_XOM
0,2024-10-08 13:30:00+00:00,0.000230,0.000289,0.000205,0.002690,0.002601,0.008743,0.000108,0.003023,0.000208,...,761024.0,737962.0,234757.0,627978.0,123593.0,39008.0,323426.0,134287.0,109977.0,1628.0
1,2024-10-08 13:30:01+00:00,0.000246,0.000323,0.000139,0.001020,0.002027,0.002982,0.000127,0.002130,0.000220,...,165168.0,70536.0,32928.0,57099.0,3620.0,400.0,26465.0,9009.0,3423.0,2658.0
2,2024-10-08 13:30:02+00:00,0.000250,0.000271,0.000134,0.001139,0.002173,0.003041,0.000138,0.003505,0.000217,...,28011.0,28420.0,27935.0,20013.0,1619.0,18.0,5759.0,5900.0,1475.0,527.0
3,2024-10-08 13:30:03+00:00,0.000278,0.000254,0.000165,0.001357,0.002173,0.004956,0.000134,0.002526,0.000210,...,5116.0,13897.0,16091.0,10054.0,147.0,130.0,16926.0,31914.0,14076.0,2319.0
4,2024-10-08 13:30:04+00:00,0.000189,0.000405,0.000160,0.001523,0.001383,0.007588,0.000135,0.006125,0.000210,...,17730.0,12151.0,9239.0,11816.0,1633.0,216.0,13932.0,9963.0,3444.0,1008.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467995,2024-11-04 20:59:55+00:00,0.000054,0.000172,0.000088,0.000291,0.000439,0.000332,0.000086,0.000292,0.000224,...,28759.0,117815.0,3063.0,13611.0,465.0,,2449.0,1048.0,1844.0,12629.0
467996,2024-11-04 20:59:56+00:00,0.000085,0.000132,0.000075,0.000327,0.000446,0.000339,0.000078,0.000529,0.000224,...,38934.0,63500.0,7841.0,11511.0,581.0,52.0,20068.0,15725.0,6185.0,15397.0
467997,2024-11-04 20:59:57+00:00,0.000073,0.000155,0.000063,0.000384,0.000408,0.000267,0.000078,0.000575,0.000224,...,21962.0,37066.0,6777.0,13883.0,3447.0,337.0,1528.0,3621.0,110.0,7301.0
467998,2024-11-04 20:59:58+00:00,0.000072,0.000110,0.000067,0.000367,0.000452,0.000290,0.000066,0.001296,0.000224,...,49023.0,25008.0,11246.0,17063.0,421.0,,1023.0,4849.0,207.0,6073.0


: 

In [None]:
# Show the column labels of the frame currently bound to df_pivoted.
df_pivoted.columns

In [None]:
# NOTE(review): no visible cell creates a "delta_log_mid_*" column (the pivot
# above only produces "<feature>_<ticker>" for the names in VALUE_COLS), and
# the saved output below has 23400 rows while the frame displayed above is
# indexed up to 467998. Presumably df_pivoted was rebound or extended by a
# cell that is not shown here -- confirm this survives Restart & Run All.
df_pivoted["delta_log_mid_AAPL"]

0             NaN
1       -0.001070
2       -0.001295
3        0.000290
4        0.000223
           ...   
23395    0.000155
23396    0.000155
23397    0.000155
23398    0.000155
23399    0.000155
Name: delta_log_mid_AAPL, Length: 23400, dtype: float32