In [1]:
import polars as pl

In [24]:
df = pl.scan_csv('https://www.nasdaqtrader.com/dynamic/SymDir/nasdaqtraded.txt', separator="|")
df.with_row_index('i').filter(Symbol='PRN').collect()

i,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
u32,str,str,str,str,str,str,i64,str,str,str,str,str
7936,"""Y""","""PRN""","""Invesco Dorsey Wright Industri…","""Q""","""G""","""Y""",100,"""N""","""N""",,"""PRN""","""N"""


In [41]:
import tqdm
import yfinance as yf
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([
    ('symbol', pa.large_string()),
    ('date', pa.date32()),
    ('open', pa.float64()),
    ('close', pa.float64()),
    ('high', pa.float64()),
    ('low', pa.float64()),
    ('volume', pa.int64()),
])

with pq.ParquetWriter('stocks-data.parquet', schema=schema) as writer:
    for symbol in tqdm.tqdm(df.select('Symbol').collect()['Symbol']):
        history = pl.from_pandas(yf.Ticker(symbol).history(period='max'), include_index=True).select(
            pl.lit(symbol).alias('symbol'),
            pl.col('Date', 'Open', 'Close', 'High', 'Low', 'Volume').name.to_lowercase()
        ).with_columns(
            pl.col('date').cast(pl.Date)
        )
        if len(history) > 0:
            writer.write(history.to_arrow())

  0%|          | 6/11134 [00:01<43:10,  4.30it/s]  $AACT.U: possibly delisted; no timezone found
$AACT.W: possibly delisted; no timezone found
  0%|          | 12/11134 [00:03<46:14,  4.01it/s]  $AAM.U: possibly delisted; no timezone found
  0%|          | 13/11134 [00:04<1:24:49,  2.19it/s]$AAM.W: possibly delisted; no timezone found
  0%|          | 39/11134 [00:12<39:13,  4.71it/s]  ABLLW: Period 'max' is invalid, must be one of ['1d', '5d']
  0%|          | 41/11134 [00:12<29:44,  6.21it/s]ABLVW: Period 'max' is invalid, must be one of ['1d', '5d']
  0%|          | 47/11134 [00:13<39:01,  4.73it/s]$ABR$D: possibly delisted; no timezone found
  0%|          | 48/11134 [00:14<1:23:59,  2.20it/s]$ABR$E: possibly delisted; no timezone found
  0%|          | 49/11134 [00:15<1:50:00,  1.68it/s]$ABR$F: possibly delisted; no timezone found
  1%|          | 56/11134 [00:18<54:20,  3.40it/s]  ABVEW: Period 'max' is invalid, must be one of ['1d', '5d']
  1%|          | 62/11134 [00:19<27:47, 

AttributeError: 'NoneType' object has no attribute 'upper'

In [None]:
train_split = (
    pl.scan_parquet('stocks-data.parquet')
    .filter(~pl.col('symbol').is_in(['GOOG', 'TSLA', 'SHOP']))
    .filter(pl.col('date') < pl.date(2024, 1, 1))
)
train_split.sink_parquet('test-stocks.parquet')

In [None]:
test_split = (
    pl.scan_parquet('stocks-data.parquet')
    .filter(pl.col('date') >= pl.date(2024, 1, 1))
)
test_split.sink_parquet('train-stocks.parquet')

In [4]:
excluded = pl.scan_parquet('stocks-data.parquet').filter(pl.col('symbol').is_in(['GOOG', 'TSLA', 'SHOP']))
excluded.sink_parquet('excluded.parquet')

In [10]:
pl.scan_parquet('train-stocks.parquet').select(
    'symbol', 'date',
    gap=pl.col('close') / pl.col('close').shift().over('symbol', order_by='date')
).filter(~pl.col('gap').is_between(0.125, 8)).select(
    'symbol',
    n_gaps=pl.len().over('symbol')
).unique('symbol').sort('n_gaps', descending=True).collect()

symbol,n_gaps
str,u32
"""WKSP""",17
"""NVO""",11
"""BTTR""",9
"""ODV""",9
"""ABVC""",9
…,…
"""BTCT""",1
"""ALLK""",1
"""IMTE""",1
"""VSSYW""",1
