In [1]:
import os
import polars as pl
import numpy as np

# Setup paths relative to this notebook's location
NOTEBOOK_DIR = os.getcwd()  # submissions folder
PROJ_DIR = os.path.dirname(NOTEBOOK_DIR)  # main project folder
DATA_DIR = os.path.join(PROJ_DIR, "jane-street-real-time-market-data-forecasting")
LOCAL_TEST_DIR = os.path.join(NOTEBOOK_DIR, "local_test_data")

# Create local test directory if it doesn't exist
os.makedirs(LOCAL_TEST_DIR, exist_ok=True)

Original data directory: /monfs01/projects/ys68/JaneStreet-Kaggle/jane-street-real-time-market-data-forecasting
Local test data will be saved to: /monfs01/projects/ys68/JaneStreet-Kaggle/submission_testing/local_test_data


In [2]:
# Load a small subset of training data
train = pl.scan_parquet(os.path.join(DATA_DIR, "train.parquet")).\
    select(
        pl.int_range(pl.len(), dtype=pl.UInt64).alias("id"),
        pl.all(),
    )

# Show schema to verify columns we need
print("Training data schema:")
train.schema

# Fetch first few rows to verify data
print("\nSample of training data:")
sample = train.fetch(5)
print(sample)

Training data schema:

Sample of training data:


  train.schema
  sample = train.fetch(5)


shape: (5, 94)
┌─────┬─────────┬─────────┬───────────┬───┬─────────────┬─────────────┬─────────────┬──────────────┐
│ id  ┆ date_id ┆ time_id ┆ symbol_id ┆ … ┆ responder_6 ┆ responder_7 ┆ responder_8 ┆ partition_id │
│ --- ┆ ---     ┆ ---     ┆ ---       ┆   ┆ ---         ┆ ---         ┆ ---         ┆ ---          │
│ u64 ┆ i16     ┆ i16     ┆ i8        ┆   ┆ f32         ┆ f32         ┆ f32         ┆ i64          │
╞═════╪═════════╪═════════╪═══════════╪═══╪═════════════╪═════════════╪═════════════╪══════════════╡
│ 0   ┆ 0       ┆ 0       ┆ 1         ┆ … ┆ 0.775981    ┆ 0.346999    ┆ 0.095504    ┆ 0            │
│ 1   ┆ 0       ┆ 0       ┆ 7         ┆ … ┆ 0.703665    ┆ 0.216683    ┆ 0.778639    ┆ 0            │
│ 2   ┆ 0       ┆ 0       ┆ 9         ┆ … ┆ 2.109352    ┆ 0.670881    ┆ 0.772828    ┆ 0            │
│ 3   ┆ 0       ┆ 0       ┆ 10        ┆ … ┆ 1.114137    ┆ 0.775199    ┆ -1.379516   ┆ 0            │
│ 4   ┆ 0       ┆ 0       ┆ 14        ┆ … ┆ -3.57282    ┆ -1.089123   ┆ -5.0

In [7]:
# Load training data
train = pl.scan_parquet(os.path.join(DATA_DIR, "train.parquet"))

# Get the last 5 date_ids
max_date = train.select(pl.col('date_id').max()).collect().item()
test_dates = pl.Series(range(max_date-4, max_date+1))

# Get all data for these dates
local_test = train.filter(
    pl.col('date_id').is_in(test_dates)
).collect()

print(f"Sampled test data shape: {local_test.shape}")
print("\nDistribution in test data:")
print(local_test.group_by('date_id').agg(
    pl.n_unique('symbol_id').alias('n_symbols'),
    pl.n_unique('time_id').alias('n_times')
))

Sampled test data shape: (187792, 93)

Distribution in test data:
shape: (5, 3)
┌─────────┬───────────┬─────────┐
│ date_id ┆ n_symbols ┆ n_times │
│ ---     ┆ ---       ┆ ---     │
│ i16     ┆ u32       ┆ u32     │
╞═════════╪═══════════╪═════════╡
│ 1698    ┆ 39        ┆ 968     │
│ 1695    ┆ 39        ┆ 968     │
│ 1696    ┆ 38        ┆ 968     │
│ 1694    ┆ 39        ┆ 968     │
│ 1697    ┆ 39        ┆ 968     │
└─────────┴───────────┴─────────┘


In [8]:
def create_title(title):
    return HTML(f"""
    <h3>{title}</h3>
    """)

# Look at the data before saving
with pl.Config(tbl_rows=16, tbl_cols=-1):
    display(create_title("First rows of our formatted test data"))
    display(local_test_formatted.head(16))

# Get some statistics
test_statistics = local_test_formatted.select([
    pl.all().mean().round(4).alias('mean'),
    pl.all().std().round(4).alias('std'),
    pl.all().min().alias('min'),
    pl.all().max().alias('max'),
    pl.all().null_count().alias('null_count')
]).transpose(
    column_names=['statistic'],
    include_header=True
)

with pl.Config(tbl_rows=-1, tbl_cols=-1):
    display(create_title("Statistical Summary of Test Data"))
    display(test_statistics)

Test data structure:
Schema([('row_id', Int64), ('date_id', Int16), ('time_id', Int16), ('symbol_id', Int8), ('weight', Float32), ('is_scored', Boolean), ('feature_00', Float32), ('feature_01', Float32), ('feature_02', Float32), ('feature_03', Float32), ('feature_04', Float32), ('feature_05', Float32), ('feature_06', Float32), ('feature_07', Float32), ('feature_08', Float32), ('feature_09', Float32), ('feature_10', Float32), ('feature_11', Float32), ('feature_12', Float32), ('feature_13', Float32), ('feature_14', Float32), ('feature_15', Float32), ('feature_16', Float32), ('feature_17', Float32), ('feature_18', Float32), ('feature_19', Float32), ('feature_20', Float32), ('feature_21', Float32), ('feature_22', Float32), ('feature_23', Float32), ('feature_24', Float32), ('feature_25', Float32), ('feature_26', Float32), ('feature_27', Float32), ('feature_28', Float32), ('feature_29', Float32), ('feature_30', Float32), ('feature_31', Float32), ('feature_32', Float32), ('feature_33', Float3

In [None]:
# Save to our local test directory
local_test_formatted.write_parquet(
    os.path.join(LOCAL_TEST_DIR, "test.parquet")
)