In [1]:
! pip install duckdb



In [2]:
# Open (creating it if necessary) a file-backed DuckDB database under /tmp and
# limit DuckDB to four threads.

import duckdb
from duckdb_support import set_duckdb_aws_credentials

con = duckdb.connect(database='/tmp/duckdb-cache-selectedcols.duckdb')
con.execute('SET threads TO 4;')


# Give DuckDB the AWS S3 credentials this Workbench instance runs under, so that
# the Parquet file list returned by `parquet_query` can be ingested by DuckDB
# directly.

set_duckdb_aws_credentials(con)

In [3]:
# Cache the 3-Jan-2022 NBBO quotes for product 'A' whose best-ask participant is
# DirectEdgeX, read from the 'mt_nbbo_quote' source table (restricted to
# "cqs_pillar" -- presumably the feed name; confirm), in the local table
# all_a_trades. Only six columns are kept and duplicate rows are dropped
# (SELECT DISTINCT).

import datetime
import maystreet_data as md

con.execute('DROP TABLE IF EXISTS all_a_trades;')

parquet_files = md.parquet_query("mt_nbbo_quote", ["cqs_pillar"], datetime.date(2022, 1, 3))

# Entries that are already https:// URLs are quoted verbatim; every other entry
# is given an s3:// prefix before quoting.
quoted_paths = [
    f"'{path}'" if path.startswith('https://') else f"'s3://{path}'"
    for path in parquet_files
]
create_table_sql = f"CREATE TABLE all_a_trades AS SELECT DISTINCT AskPrice, AskQuantity, BidPrice, BidQuantity, BestBidParticipant, SequenceNumber FROM read_parquet([{', '.join(quoted_paths)}]) WHERE Product = 'A' AND BestAskParticipant = 'DirectEdgeX';"

con.execute(create_table_sql)


# How many rows ended up in the table?

number_entered = con.execute('SELECT COUNT(*) FROM all_a_trades;').fetchdf()
number_entered

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,count_star()
0,7003


In [4]:
# A deliberately impractical and very slow way to count the NBBO entries for
# cqs_pillar matching the criteria below: every run re-scans the remote Parquet
# files instead of querying a locally cached table.
#
# NOTE: this is COUNT(*) over every matching row. The all_a_trades table is built
# with SELECT DISTINCT over six selected columns, so its row count can be smaller
# than the number returned here.

import datetime
import maystreet_data as md

files_list = md.parquet_query("mt_nbbo_quote", ["cqs_pillar"], datetime.date(2022, 1, 3))

# Entries that are already https:// URLs must be passed through unchanged;
# prefixing them with s3:// would produce an invalid 's3://https://...' path.
as_strings = [f"'{f}'" if f.startswith('https://') else f"'s3://{f}'" for f in files_list]
as_string = f"SELECT COUNT(*) FROM read_parquet([{', '.join(as_strings)}]) WHERE Product = 'A' AND BestAskParticipant = 'DirectEdgeX';"

con.execute(as_string).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,count_star()
0,14006


In [5]:
# Load the CSV file from the notebook's working directory into the DuckDB table
# example_csv, then display its contents.
#
# The connection `con` opened earlier in the notebook is reused: opening a second
# connection to the same database file here would only rebind `con` and discard
# the handle that was configured above.

con.execute('DROP TABLE IF EXISTS example_csv;')

# The file name is a single-quoted SQL string literal; double quotes denote
# identifiers in SQL.
command_string = "CREATE TABLE example_csv AS SELECT * FROM read_csv_auto('example_csv_file.csv');"
con.execute(command_string)

command_string = 'SELECT * FROM example_csv;'

data_frame = con.execute(command_string).fetchdf()
data_frame

Unnamed: 0,SequenceID,ExampleName
0,157521,This is an example
1,452142,This is another example


In [6]:
# Retrieve the NBBOs from LLG's data (cached in all_a_trades) joined with the data
# we supplied via CSV. It is a LEFT JOIN, so every CSV row is kept even when no
# quote has a matching SequenceNumber.
#
# The quotes table is aliased `q` rather than `at`: newer DuckDB releases treat AT
# as a keyword, so an unquoted `at` alias can fail to parse. The result columns
# are unaffected by the alias name.

join_sql = 'SELECT ex.*, q.* FROM example_csv ex LEFT JOIN all_a_trades q ON q.SequenceNumber = ex.SequenceID'
data_frame = con.execute(join_sql).fetchdf()
data_frame

Unnamed: 0,SequenceID,ExampleName,AskPrice,AskQuantity,BidPrice,BidQuantity,BestBidParticipant,SequenceNumber
0,157521,This is an example,163.2,300,155.0,100,NYSEArca,157521
1,452142,This is another example,159.1,100,158.59,800,MEMX,452142


In [7]:
# Write the whole all_a_trades table out as a CSV file (with the HEADER option
# enabled) at an absolute path under /home/workbench.

export_sql = "COPY all_a_trades TO '/home/workbench/all-a-trades-DirectEdgeX.csv' WITH (HEADER 1);"
con.execute(export_sql)

<duckdb.DuckDBPyConnection at 0x7fdfbc5ddb70>

In [8]:
# Finally, close the DuckDB connection. `con` cannot be used for further queries
# after this cell; re-run the connection cell to continue working.

con.close()