In [19]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import polars as pl

from timelake import TimeLake

# Location of the demo lake, relative to the notebook's working directory.
PATH = Path("./my-test-timelake")

In [None]:
def create_sample_data(
    num_rows: int = 100_000,
    # The default list is only read, never mutated, so sharing it across calls is safe.
    asset_ids: list[str] = ["AAPL", "MSFT", "GOOG", "TSLA"],
    start_date: datetime = datetime(2023, 1, 1),
    seed: "int | None" = None,
) -> pl.DataFrame:
    """Generate a synthetic hourly price/volume table for several assets.

    Every asset gets the same run of hourly timestamps beginning at
    ``start_date``. Prices and volumes follow a deterministic upward trend
    with a short repeating pattern, plus random noise.

    Args:
        num_rows: Requested total number of rows. Rows are split evenly
            between assets using integer division, so the returned frame has
            ``(num_rows // len(asset_ids)) * len(asset_ids)`` rows, which is
            fewer than ``num_rows`` when it is not evenly divisible.
        asset_ids: Identifiers to generate data for; must not be empty.
        start_date: Timestamp of the first row of every asset.
        seed: Optional seed for the random noise. When given, a private
            ``random.Random(seed)`` is used so the output is reproducible and
            the global ``random`` state is left untouched. When ``None``, the
            global ``random`` module is used.

    Returns:
        A frame with columns ``date``, ``asset_id``, ``price`` and ``volume``,
        with each asset's rows stacked in the order given by ``asset_ids``.

    Raises:
        ValueError: If ``asset_ids`` is empty or ``num_rows`` is smaller than
            the number of assets.
    """
    # Explicit exceptions instead of `assert`: asserts are stripped under
    # `python -O`, and an empty list would otherwise reach the division below.
    if not asset_ids:
        raise ValueError("asset_ids must contain at least one asset id")
    if num_rows < len(asset_ids):
        raise ValueError(
            "num_rows must be at least equal to the number of asset_ids"
        )

    # Fall back to the module-level generator so that callers who seeded it
    # globally keep getting the same stream as before.
    rng = random if seed is None else random.Random(seed)

    rows_per_asset = num_rows // len(asset_ids)

    # The timestamps are identical for every asset, so build them once.
    dates = [start_date + timedelta(hours=i) for i in range(rows_per_asset)]

    frames = []
    for asset_id in asset_ids:
        # Linear trend + 24-step repeating pattern + uniform noise in [-1, 1].
        prices = [
            round(100 + i * 0.05 + (i % 24) * 0.3 + rng.uniform(-1, 1), 2)
            for i in range(rows_per_asset)
        ]
        # Linear trend + 10-step repeating pattern + integer noise in [-20, 20].
        volumes = [
            int(1000 + i * 2 + (i % 10) * 50 + rng.randint(-20, 20))
            for i in range(rows_per_asset)
        ]

        frames.append(
            pl.DataFrame(
                {
                    "date": dates,
                    "asset_id": [asset_id] * rows_per_asset,
                    "price": prices,
                    "volume": volumes,
                }
            )
        )

    return pl.concat(frames)


In [21]:
# Build the sample frame with the function's default arguments.
df = create_sample_data()
# Preview the first five rows.
df.head(5)

date,asset_id,price,volume
datetime[μs],str,f64,i64
2023-01-01 00:00:00,"""AAPL""",100.21,1000
2023-01-01 01:00:00,"""AAPL""",100.07,1053
2023-01-01 02:00:00,"""AAPL""",100.87,1086
2023-01-01 03:00:00,"""AAPL""",101.65,1167
2023-01-01 04:00:00,"""AAPL""",102.09,1188


In [22]:
# Dimensions of the generated frame, as (rows, columns).
df.shape

(100000, 4)

In [23]:
# Create a TimeLake at PATH from the sample frame, using "date" as the
# timestamp column and "asset_id" as the partition key.
# NOTE(review): unclear from here whether create() tolerates an existing lake
# at PATH — confirm this cell survives a Restart & Run All.
lake = TimeLake.create(
    path=PATH,
    df=df,
    timestamp_column="date",
    partition_by=["asset_id"],
)

In [24]:
# Read the lake's contents back.
# NOTE(review): this rebinds `df`, so the in-memory sample frame is no longer
# reachable under that name — consider a distinct name for the read-back data.
df = lake.read()

In [26]:
# Dimensions of the data read back from the lake.
# NOTE(review): the recorded output shows 5 columns versus the 4 that were
# written — presumably TimeLake adds a column of its own; confirm which one.
df.shape

(100000, 5)