In [1]:
import time
import sys
import duckdb
import numpy as np
import pandas as pd
from pathlib import Path
from sqlalchemy import create_engine

In [2]:
app_dir = Path().cwd().parent.absolute()
sys.path.insert(0, str(app_dir))

In [6]:
from app.configuration import Config
from app.functions import query1

In [4]:
config = Config()

In [5]:
engine = create_engine(config.tpch_database)

In [13]:
lineitems_ = """SELECT * FROM tpch.lineitem LIMIT 50"""
with engine.connect() as con:
    lineitem = pd.read_sql(lineitems_, con, parse_dates=['l_shipdate', 'l_commitdate', 'l_receiptdate'])
lineitem.head(2)

Unnamed: 0,l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment,l_dummy
0,1285152,48189,3859,2,595.47,9230.35,0.02,0.05,A,F,1994-05-28,1993-07-02,1997-09-20,COLLECT COD,FEDEX,random comment,
1,286757,72860,433,5,438.26,7210.52,0.08,0.05,A,F,1998-01-03,1995-02-16,1995-07-30,COLLECT COD,FEDEX,random comment,


In [15]:
start = time.time()
print(query1(lineitem))
end = time.time()

print('Base Pandas: ' + str(end - start) + " seconds")

                           l_quantity  l_extendedprice   disc_price  \
l_returnflag l_linestatus                                             
A            F             538.840682      5506.643636  228406.3871   
             O             425.020000      3956.205000    7827.8828   

                                  charge  l_discount  l_shipdate  
l_returnflag l_linestatus                                         
A            F             234582.209543    0.054773          44  
             O               7896.441000    0.040000           2  
Base Pandas: 0.010205268859863281 seconds


In [16]:
con = duckdb.connect(':memory:')
start = time.time()
print(con.from_df(lineitem).filter("l_shipdate <= cast('1998-09-02' AS date)").aggregate(
    '''
	l_returnflag,
    l_linestatus,
    sum(l_quantity) AS sum_qty,
    sum(l_extendedprice) AS sum_base_price,
    sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    avg(l_quantity) AS avg_qty,
    avg(l_extendedprice) AS avg_price,
    avg(l_discount) AS avg_disc,
    count(*) AS count_order'''
))
end = time.time()

print('DuckDB Scan: ' + str(end - start) + " seconds")

┌──────────────┬──────────────┬────────────────────┬────────────────────┬────────────────────┬────────────────────┬───────────────────┬───────────────────┬──────────────────────┬─────────────┐
│ l_returnflag │ l_linestatus │      sum_qty       │   sum_base_price   │   sum_disc_price   │     sum_charge     │      avg_qty      │     avg_price     │       avg_disc       │ count_order │
│   varchar    │   varchar    │       double       │       double       │       double       │       double       │      double       │      double       │        double        │    int64    │
├──────────────┼──────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┼───────────────────┼───────────────────┼──────────────────────┼─────────────┤
│ A            │ F            │ 23708.989999999994 │ 242292.31999999998 │ 228406.38710000002 │ 234582.20954299998 │ 538.8406818181817 │ 5506.643636363636 │ 0.054772727272727285 │          44 │
│ A            │ O            │    

In [19]:
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('lineitem', 22544),
 ('Config', 1072),
 ('Path', 904),
 ('create_engine', 144),
 ('open', 144),
 ('query1', 144),
 ('app_dir', 88),
 ('lineitems_', 85),
 ('np', 72),
 ('pd', 72),
 ('con', 56),
 ('config', 48),
 ('engine', 48),
 ('end', 24),
 ('start', 24)]