### Paths and imports

In [62]:
import os
import polars as pl
import shutil

from IPython.display import HTML, display

In [63]:
# TODO: Variable is scored. 

In [64]:
# Setup paths relative to this notebook's location
NOTEBOOK_DIR = os.getcwd()  # submissions folder
PROJ_DIR = os.path.dirname(NOTEBOOK_DIR)  # main project folder
DATA_DIR = os.path.join(PROJ_DIR, "jane-street-real-time-market-data-forecasting")
LOCAL_TEST_DIR = os.path.join(NOTEBOOK_DIR, "local_test_data")

# Create local test directory if it doesn't exist
os.makedirs(LOCAL_TEST_DIR, exist_ok=True)

### Grab a sample of train to turn into test data

In [65]:
# Load training data
train = pl.scan_parquet(os.path.join(DATA_DIR, "train.parquet"))

# Get the last 5 date_ids
max_date = train.select(pl.col('date_id').max()).collect().item()
test_dates = pl.Series(range(max_date-4, max_date+1))

# Get all data for these dates
local_test = train.filter(
    pl.col('date_id').is_in(test_dates)
).collect()

print(f"Sampled test data shape: {local_test.shape}")
print("\nDistribution in test data:")
print(local_test.group_by('date_id').agg(
    pl.n_unique('symbol_id').alias('n_symbols'),
    pl.n_unique('time_id').alias('n_times')
))

Sampled test data shape: (1870176, 93)

Distribution in test data:
shape: (50, 3)
┌─────────┬───────────┬─────────┐
│ date_id ┆ n_symbols ┆ n_times │
│ ---     ┆ ---       ┆ ---     │
│ i16     ┆ u32       ┆ u32     │
╞═════════╪═══════════╪═════════╡
│ 1656    ┆ 39        ┆ 968     │
│ 1659    ┆ 39        ┆ 968     │
│ 1650    ┆ 39        ┆ 968     │
│ 1653    ┆ 39        ┆ 968     │
│ 1662    ┆ 39        ┆ 968     │
│ …       ┆ …         ┆ …       │
│ 1688    ┆ 39        ┆ 968     │
│ 1691    ┆ 39        ┆ 968     │
│ 1685    ┆ 39        ┆ 968     │
│ 1697    ┆ 39        ┆ 968     │
│ 1694    ┆ 39        ┆ 968     │
└─────────┴───────────┴─────────┘


### Create test and lags from this sample

In [66]:
# Create test data matching competition format
local_test_formatted = local_test.select([
    pl.int_range(0, pl.len()).cast(pl.UInt64).alias('id'),
    pl.int_range(0, pl.len()).cast(pl.Int64).alias('row_id'),
    # Shift date_ids to start at 0 while preserving order
    (pl.col('date_id') - pl.col('date_id').min()).cast(pl.Int16).alias('date_id'),
    pl.col('time_id').cast(pl.Int16),
    pl.col('symbol_id').cast(pl.Int8),
    pl.col('weight').cast(pl.Float32),
    pl.lit(True).alias('is_scored'),  # All rows scored in our local test
    
    # Get all feature columns in order
    *[pl.col(f'feature_{i:02d}').cast(pl.Float32) for i in range(79)],
    # Keep responder_6 for scoring
    pl.col('responder_6').cast(pl.Float32)
])

In [67]:
# Create lags data matching competition format
local_lags_formatted = local_test.select([
    pl.int_range(0, pl.len()).cast(pl.UInt64).alias('id'),
    # Shift date_ids to start at 0 while preserving order
    (pl.col('date_id') - pl.col('date_id').min()).cast(pl.Int16).alias('date_id'),
    pl.col('time_id').cast(pl.Int16),
    pl.col('symbol_id').cast(pl.Int8),
    # Get all responders with _lag_1 suffix
    *[pl.col(f'responder_{i}').cast(pl.Float32).alias(f'responder_{i}_lag_1') 
      for i in range(9)]
])

### Have a peak

In [68]:
def create_title(title):
    return HTML(f"""
    <h3>{title}</h3>
    """)

# Look at the data before saving
with pl.Config(tbl_rows=4, tbl_cols=-1):
    display(create_title("First rows of our formatted test data"))
    display(local_test_formatted)

# Look at the data before saving
with pl.Config(tbl_rows=4, tbl_cols=-1):
    display(create_title("First rows of our formatted lags data"))
    display(local_lags_formatted)

id,row_id,date_id,time_id,symbol_id,weight,is_scored,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_6
u64,i64,i16,i16,i8,f32,bool,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0,0,0,0,3.003073,true,3.186338,-1.366338,3.008263,3.230091,-0.022933,-0.288832,-1.116216,-0.192898,0.028092,11.0,7.0,76.0,-1.415942,0.231454,-0.396361,,-0.384243,,-1.226073,-2.024567,1.108607,-0.17385,1.159326,0.686446,0.951065,-0.195814,1.03836,0.970259,0.705094,-0.487678,-0.577518,-0.163765,,,1.149333,1.065865,0.601366,-0.032697,-0.344836,,0.488839,,,-0.089759,,-1.218171,1.566622,0.159503,0.449522,0.640522,,0.566251,,,-1.386141,,-1.119459,1.108013,,0.14534,0.20483,1.01305,-0.287825,-0.233961,-0.48892,-2.994297,-1.702484,-1.052043,-0.017296,-0.685692,-0.905013,0.573387,-0.373927,,,0.817482,0.991265,0.139631,0.091396,-1.26792
1,1,0,0,1,2.352112,true,2.835768,-0.932864,3.076257,3.421653,0.629731,-0.280605,-1.218809,-0.230617,0.024665,11.0,7.0,76.0,-0.827229,0.765628,-0.388804,,-0.491722,,-1.424354,-1.667679,0.295056,-0.043796,0.696018,0.102291,0.892547,0.143796,-1.652797,-0.082011,1.000867,-0.586496,-0.799762,-0.049265,,,1.08281,1.135317,-0.416106,0.073537,-0.078972,,-0.62144,,,-1.014973,,-1.5496,1.411797,-0.01339,-1.559758,-0.654666,,-0.59432,,,-1.516951,,-2.807929,1.649815,,-1.272683,-0.341456,1.01305,-0.364148,-0.420267,-0.394358,-1.350513,-1.988172,-0.696023,1.382999,-0.273318,-1.261957,0.326739,-0.721075,,,1.685098,1.150916,-0.008306,0.000216,0.764267
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1870174,1870174,49,967,37,1.243116,true,2.663298,-0.889112,2.313155,3.101428,0.324454,0.618944,1.185663,1.599724,0.319719,34.0,4.0,214.0,0.759314,0.284057,0.41716,-0.611075,-0.513717,-0.891423,1.84994,0.406756,-1.608196,-0.252663,-0.271574,-0.051405,0.098146,-0.653961,0.173676,-0.016497,-0.404509,-0.577262,-0.731429,-0.21646,3.018564,-0.472061,3.13922,3.065858,0.842925,0.053283,-0.074403,0.500129,0.08263,0.336223,0.643934,-0.422367,-0.418195,0.203037,-0.702278,0.543305,-0.195764,0.693364,0.953293,0.352567,0.471775,1.876459,-0.143377,0.845516,0.301135,-0.395703,0.738038,-0.04124,1.270645,-1.101531,-0.358106,-0.141883,-0.255192,2.489247,0.537652,0.982107,-0.158009,0.137389,0.478357,0.782692,0.581421,-0.106056,-0.111017,0.163867,0.169331,-0.037563,-0.029483,-0.148711
1870175,1870175,49,967,38,3.193685,true,2.728506,-0.745238,2.788789,2.343393,0.454731,0.862839,0.964795,2.089673,0.344931,50.0,1.0,522.0,0.406531,0.618247,1.01327,-0.952069,-0.679168,-0.597603,0.375125,1.97537,-0.440974,-0.072018,1.741353,1.380735,-0.110494,-0.874806,0.553424,0.532243,0.263214,-0.757856,-0.869204,-0.062955,3.619233,-0.386316,3.54456,3.120631,-1.443649,-0.257411,-0.309567,1.366358,-0.220885,0.029798,1.094489,-0.051078,-0.114243,0.517313,0.852201,0.522199,-0.027275,0.471593,1.213111,0.263278,0.915804,1.862022,0.503819,1.310126,0.662521,1.654948,1.090367,0.535922,0.653011,-1.101531,-0.622853,-0.363631,-0.395652,-0.016812,2.016734,0.241486,0.253229,0.228745,0.462717,0.799635,0.706102,-0.376377,-0.286764,-0.359046,-0.246135,-0.288941,-0.247774,-0.138548


id,date_id,time_id,symbol_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
u64,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0,0,0,0.294667,0.063398,0.171153,-0.75924,0.302261,0.075915,-1.26792,0.399113,0.022011
1,0,0,1,0.158603,-0.067452,-0.217511,0.479521,0.157926,-0.139942,0.764267,0.200697,-0.043654
…,…,…,…,…,…,…,…,…,…,…,…,…
1870174,49,967,37,1.925987,0.479394,3.621867,-0.107114,-0.063599,1.204755,-0.148711,-0.026583,-0.256395
1870175,49,967,38,1.228778,0.512562,-0.050865,0.160883,0.080756,-0.078237,-0.138548,-0.038771,-0.21194


In [69]:
# Define paths
test_dir = os.path.join(LOCAL_TEST_DIR, "test.parquet")

# Remove existing test directory if it exists
if os.path.exists(test_dir):
    shutil.rmtree(test_dir)

# Create main directory
os.makedirs(test_dir)

# Get max date_id
max_date = local_test_formatted['date_id'].max()

# Save data by date_id
for date_id in range(max_date + 1):
    # Create date subdirectory
    date_dir = os.path.join(test_dir, f"date_id={date_id}")
    os.makedirs(date_dir)
    
    # Filter data for this date and save
    local_test_formatted.filter(
        pl.col('date_id') == date_id
    ).write_parquet(
        os.path.join(date_dir, "part-0.parquet")
    )

print("Directory structure created:")
for root, dirs, files in os.walk(test_dir):
    print(f"Directory: {root}")
    for file in files:
        print(f"  File: {file}")

Directory structure created:
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/test.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/test.parquet/date_id=40
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/test.parquet/date_id=35
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/test.parquet/date_id=15
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/test.parquet/date_id=4
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/test.parquet/date_id=6
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/test.parquet/date_id=1
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing

In [70]:
# Define paths for lags
lags_dir = os.path.join(LOCAL_TEST_DIR, "lags.parquet")

# Remove existing lags directory if it exists
if os.path.exists(lags_dir):
    shutil.rmtree(lags_dir)

# Create main directory
os.makedirs(lags_dir)

# Save lags data by date_id
for date_id in range(max_date + 1):
    # Create date subdirectory
    date_dir = os.path.join(lags_dir, f"date_id={date_id}")
    os.makedirs(date_dir)
    
    # For each date_id in test, we want the previous day's responders
    # For date_id 0, we'll use the earliest data we have
    source_date = local_test_formatted['date_id'].min() if date_id == 0 else date_id - 1
    
    # Filter data for this date and save
    local_lags_formatted.filter(
        pl.col('date_id') == source_date
    ).write_parquet(
        os.path.join(date_dir, "part-0.parquet")
    )

print("\nLags directory structure created:")
for root, dirs, files in os.walk(lags_dir):
    print(f"Directory: {root}")
    for file in files:
        print(f"  File: {file}")


Lags directory structure created:
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/lags.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/lags.parquet/date_id=40
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/lags.parquet/date_id=35
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/lags.parquet/date_id=15
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/lags.parquet/date_id=4
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/lags.parquet/date_id=6
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data/lags.parquet/date_id=1
  File: part-0.parquet
Directory: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_t