In [None]:
import polars as pl
import datetime as dt
import pandas as pd
import os
from quant101.core_2.config import data_dir
test_file_dir = os.path.join(data_dir, "lake/test/polars.csv")

df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        # "birthdate": [
        #     dt.date(1997, 1, 10),
        #     dt.date(1985, 2, 15),
        #     dt.date(1983, 3, 22),
        #     dt.date(1981, 4, 30),
        # ],
        "birthdate": [
            '1997-01-10',
            '1985-02-15',
            '1983-03-22',
            '1981-04-30',
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)

df.cast({"weight": pl.Float32, "birthdate": pl.Date, "height": pl.Float32})
# df

# df.write_csv(test_file_dir)

# df_csv = pl.read_csv(test_file_dir, try_parse_dates=True)

# print(df_csv.head())

name,birthdate,weight,height
str,str,f64,f64
"""Alice Archer""","""1997-01-10""",57.9,1.56
"""Ben Brown""","""1985-02-15""",72.5,1.77
"""Chloe Cooper""","""1983-03-22""",53.6,1.65
"""Daniel Donovan""","""1981-04-30""",83.1,1.75


In [None]:
import polars as pl
from quant101.core_2.data_loader import data_dir_calculate
import os

lake_file_paths = data_dir_calculate(asset='us_stock_sip', data_type='minute_aggs_v1', start_date='2025-07-01', end_date='2025-07-31', lake=True)

lf = pl.scan_parquet(
        lake_file_paths,
    )

# 计算 VWAP (每个 ticker)
vwap = (
    lf.filter(pl.col("volume") > 0)
        .group_by("ticker")
        .agg((pl.col("close") * pl.col("volume")).sum() / pl.col("volume").sum())
        .rename({"close": "vwap"})
)

# print(vwap.collect().sort('ticker').head())


# 找到每个 ticker 当月最后的 close
last_close = (
    lf.sort("window_start")
      .group_by("ticker")
      .tail(1)
      .select(["ticker", "close"])
      .rename({"close": "last_close"})
)

# print(last_close.collect().sort('ticker').head())

# 合并结果，计算比率
result = (
    vwap.join(last_close, on="ticker")
        .with_columns(
            (
                pl.col("last_close") / pl.col("vwap")).alias("ratio"),
                (pl.col("last_close") - pl.col("vwap")).abs().alias("vwap_diff"),
                ((pl.col("last_close") - pl.col("vwap")) / pl.col("vwap")).abs().alias("ratio_stability_1"),
                abs((pl.col("last_close") - pl.col("vwap")) / pl.col("vwap")).cast(pl.Float32).alias("ratio_stability_2")
             )
        .sort("ratio_stability_1",'last_close', descending=[False, True])
)

print(result.collect().head(5))

shape: (5, 7)
┌────────┬────────────┬────────────┬───────┬───────────┬───────────────────┬───────────────────┐
│ ticker ┆ vwap       ┆ last_close ┆ ratio ┆ vwap_diff ┆ ratio_stability_1 ┆ ratio_stability_2 │
│ ---    ┆ ---        ┆ ---        ┆ ---   ┆ ---       ┆ ---               ┆ ---               │
│ str    ┆ f64        ┆ f32        ┆ f64   ┆ f64       ┆ f64               ┆ f32               │
╞════════╪════════════╪════════════╪═══════╪═══════════╪═══════════════════╪═══════════════════╡
│ ESGR   ┆ 337.910004 ┆ 337.910004 ┆ 1.0   ┆ 0.0       ┆ 0.0               ┆ 0.0               │
│ FIG    ┆ 115.5      ┆ 115.5      ┆ 1.0   ┆ 0.0       ┆ 0.0               ┆ 0.0               │
│ ZXIET  ┆ 100.0      ┆ 100.0      ┆ 1.0   ┆ 0.0       ┆ 0.0               ┆ 0.0               │
│ LBDAV  ┆ 91.860001  ┆ 91.860001  ┆ 1.0   ┆ 0.0       ┆ 0.0               ┆ 0.0               │
│ AEBIV  ┆ 83.260002  ┆ 83.260002  ┆ 1.0   ┆ 0.0       ┆ 0.0               ┆ 0.0               │
└────────┴──────

In [1]:
import duckdb

data_dir = "../data/lake/us_stocks_sip/minute_aggs_v1/2025/07/*.parquet"
con = duckdb.connect()

query = f"""
WITH vwap AS (
    SELECT 
        ticker,
        SUM(close * volume)::DOUBLE / NULLIF(SUM(volume),0) AS vwap
    FROM '{data_dir}'
    GROUP BY ticker
),
last_close AS (
    SELECT DISTINCT ON (ticker)
        ticker, close AS last_close
    FROM '{data_dir}'
    ORDER BY ticker, window_start DESC
)
SELECT 
    vwap.ticker,
    vwap.vwap,
    last_close.last_close,
    last_close.last_close / vwap.vwap AS ratio
FROM vwap
JOIN last_close USING (ticker)
ORDER BY ratio DESC
LIMIT 1;
"""

result = con.sql(query).df()
print(result)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  ticker      vwap  last_close      ratio
0    GVH  0.133029        5.56  41.795326


In [1]:
import polars as pl
import exchange_calendars as xcals

# 纽约证券交易所 (美国)
xnys = xcals.get_calendar('XNYS')
snys_schedule = xnys.schedule.loc['2005-01-01':'2025-12-31'] 

# 转换为 Polars DataFrame
df_schedule = pl.from_pandas(snys_schedule.reset_index())

df_schedule = df_schedule.with_columns([
    pl.col('open').dt.convert_time_zone('America/New_York'),
    pl.col('close').dt.convert_time_zone('America/New_York')
])

with pl.Config(tbl_cols=200, fmt_str_lengths=200, tbl_width_chars=3000, tbl_formatting="UTF8_FULL_CONDENSED"):
    df_schedule_filtered = df_schedule.filter(
        pl.col('close').dt.hour() != 16
    )
    print(df_schedule_filtered)

shape: (45, 5)
┌─────────────────────┬────────────────────────────────┬───────────────────┬───────────────────┬────────────────────────────────┐
│ index               ┆ open                           ┆ break_start       ┆ break_end         ┆ close                          │
│ ---                 ┆ ---                            ┆ ---               ┆ ---               ┆ ---                            │
│ datetime[ns]        ┆ datetime[ns, America/New_York] ┆ datetime[ns, UTC] ┆ datetime[ns, UTC] ┆ datetime[ns, America/New_York] │
╞═════════════════════╪════════════════════════════════╪═══════════════════╪═══════════════════╪════════════════════════════════╡
│ 2005-11-25 00:00:00 ┆ 2005-11-25 09:30:00 EST        ┆ null              ┆ null              ┆ 2005-11-25 13:00:00 EST        │
│ 2006-07-03 00:00:00 ┆ 2006-07-03 09:30:00 EDT        ┆ null              ┆ null              ┆ 2006-07-03 13:00:00 EDT        │
│ 2006-11-24 00:00:00 ┆ 2006-11-24 09:30:00 EST        ┆ null              

In [2]:
import polars as pl
from quant101.core_2.config import data_dir, data_dir_calculate
import os

# Get the file paths
lake_file_paths = data_dir_calculate(
    asset='us_stocks_sip', 
    data_type='day_aggs_v1', 
    start_date='2025-07-02',
    end_date='2025-07-07',
    lake=True
)

if not lake_file_paths:
        raise ValueError(
            f"No data files found."
        )

tickers = 'UVXY'
tickers = [t.strip().upper() for t in tickers.split(",")] if tickers else None
if tickers:
    lf = pl.scan_parquet(lake_file_paths).filter(pl.col('ticker').is_in(tickers))
else:
    lf = pl.scan_parquet(lake_file_paths)

lf = lf.with_columns(
    pl.from_epoch(pl.col('window_start'), time_unit='ns')
    .dt.convert_time_zone('America/New_York')
    .alias('timestamps')

).sort('timestamps').collect()
with pl.Config(tbl_cols=10):
    print(lf.head())


ImportError: cannot import name 'data_dir_calculate' from 'quant101.core_2.config' (/home/jerryhong/code-projects/quant101/src/quant101/core_2/config.py)

In [None]:
from quant101.core_2.data_loader import data_loader
import polars as pl
from quant101.core_2.plotter import plot_candlestick

tickers = ['NVDA']
# tickers = None
timeframe = "1h" # timeframe: '1m', '3m', '5m', '10m', '15m', '20m', '30m', '45m', '1h', '2h', '3h', '4h', '1d' 等
asset = 'us_stocks_sip'
data_type = ('day_aggs_v1' if timeframe == '1d' else 'minute_aggs_v1')
start_date = "2025-07-01"
end_date = "2025-07-08"
full_hour = True

lf_result = data_loader(
        tickers=tickers, 
        timeframe=timeframe, 
        asset=asset, 
        data_type=data_type, 
        start_date=start_date, 
        end_date=end_date,
        full_hour=full_hour
    ).collect()

print(lf_result.head())

# plot_candlestick(lf_result.to_pandas(), ticker=tickers, timeframe=timeframe)