# Setup and Data Generation
Performs global setup for the benchmark suite.
- Imports necessary libraries.
- Imports helper functions from `test_utils`: `generate_large_dataset`, `benchmark_operation`, `verify_correctness`, and `calculate_speedup`.
- Connects to the q/kdb+ process.
- Generates a **10 million row** dataset (`LARGE_DF`) and converts it to a pykx Table (`LARGE_Q_TABLE`).


In [1]:
import os
import sys
import importlib

# Make the parent directory and the current directory importable so that the
# `qutePandas` and `test_utils` imports below can resolve.
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('.'))

# Point PyKX at a repository-local license directory when one exists.
# PyKX reads QLIC while it is being imported, so the variable must be set
# before `pykx` (or any package that imports it) is loaded; setting it
# afterwards, as this cell used to do, comes too late to affect license lookup.
local_lic = os.path.abspath('../kdb_lic')
if os.path.exists(local_lic):
    os.environ['QLIC'] = local_lic

import qutePandas as qpd
importlib.reload(qpd)  # re-execute the module so edits made since the last import are picked up
import pandas as pd
import numpy as np
import pykx as kx
import time
import gc
from test_utils import generate_large_dataset, benchmark_operation, verify_correctness, calculate_speedup

# Last expression of the cell: its return value is shown as the cell output.
qpd.connect()


True

In [2]:
print("Generating Large Datasets (10M rows, 20 cols)... This may take a moment.")

# Pandas-side fixture and its q-side counterpart.
LARGE_DF = generate_large_dataset(rows=10_000_000, cols=20, seed=42)
LARGE_Q_TABLE = kx.toq(LARGE_DF)

# Two-column subsets of both fixtures, selected with the same column list.
subset_cols = ['col_0', 'col_4']
LARGE_DF_SUBSET = LARGE_DF[subset_cols]
LARGE_Q_SUBSET = LARGE_Q_TABLE[subset_cols]

print(f"Data Generation Complete. Shape: {LARGE_DF.shape}")


Generating Large Datasets (10M rows, 20 cols)... This may take a moment.


Data Generation Complete. Shape: (10000000, 20)


## Core: DataFrame Creation
Tests the creation of qutePandas DataFrames.


In [3]:
print('Benchmarking: DataFrame Creation')


def pd_func():
    """Pandas baseline: wrap the existing frame in a new ``pd.DataFrame``."""
    # NOTE(review): pandas does not copy by default when handed a DataFrame, so
    # this baseline likely measures almost no work -- confirm that this is the
    # comparison intended against the qutePandas constructor.
    pd.DataFrame(LARGE_DF)


def q_func():
    """qutePandas candidate: build a ``qpd.DataFrame`` from the pandas frame."""
    qpd.DataFrame(LARGE_DF)


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
calculate_speedup(pd_stats, q_stats)


Benchmarking: DataFrame Creation
  Pandas Mean: 0.0000 s


  qutePandas Mean: 15.6697 s
  Speedup: 2.4202728018776073e-06


## Cleaning Functions
Tests performance of data cleaning operations.


In [4]:
print('Benchmarking: dropna')

# Reference outputs for the correctness check (qutePandas result requested
# with return_type='p' so it can be compared against the pandas result).
pd_res = LARGE_DF.dropna()
q_res = qpd.dropna(LARGE_Q_TABLE, return_type='p')


def pd_func():
    """Pandas baseline: ``DataFrame.dropna`` with default arguments."""
    LARGE_DF.dropna()


def q_func():
    """qutePandas candidate: ``qpd.dropna`` on the q table with return_type='q'."""
    qpd.dropna(LARGE_Q_TABLE, return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: dropna


  Pandas Mean: 1.0367 s


  qutePandas Mean: 0.1282 s


  Speedup: 8.088857435855486


In [5]:
print('Benchmarking: dropna_col')

# Reference outputs for the correctness check (qutePandas result requested
# with return_type='p' so it can be compared against the pandas result).
pd_res = LARGE_DF.dropna(subset=['col_0'])
q_res = qpd.dropna_col(LARGE_Q_TABLE, 'col_0', return_type='p')


def pd_func():
    """Pandas baseline: ``DataFrame.dropna`` restricted to ``col_0``."""
    LARGE_DF.dropna(subset=['col_0'])


def q_func():
    """qutePandas candidate: ``qpd.dropna_col`` on ``col_0`` with return_type='q'."""
    qpd.dropna_col(LARGE_Q_TABLE, 'col_0', return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: dropna_col


  Pandas Mean: 0.5299 s


  qutePandas Mean: 0.0509 s


  Speedup: 10.414090177548252


In [6]:
print('Benchmarking: fillna')

# Reference outputs for the correctness check (qutePandas result requested
# with return_type='p' so it can be compared against the pandas result).
pd_res = LARGE_DF.fillna({'col_0': 0})
q_res = qpd.fillna(LARGE_Q_TABLE, 'col_0', 0, return_type='p')


def pd_func():
    """Pandas baseline: fill missing values in ``col_0`` with 0."""
    LARGE_DF.fillna({'col_0': 0})


def q_func():
    """qutePandas candidate: ``qpd.fillna`` on ``col_0`` with 0, return_type='q'."""
    qpd.fillna(LARGE_Q_TABLE, 'col_0', 0, return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: fillna


  Pandas Mean: 0.5829 s
  qutePandas Mean: 0.0039 s


  Speedup: 147.95833592344866


## Transformation Functions
Tests structural and type transformations on the DataFrame.


In [7]:
print('Benchmarking: rename')

# Reference outputs for the correctness check (qutePandas result requested
# with return_type='p' so it can be compared against the pandas result).
pd_res = LARGE_DF.rename(columns={'col_0': 'new_col_0'})
q_res = qpd.rename(LARGE_Q_TABLE, {'col_0': 'new_col_0'}, return_type='p')


def pd_func():
    """Pandas baseline: rename ``col_0`` to ``new_col_0``."""
    LARGE_DF.rename(columns={'col_0': 'new_col_0'})


def q_func():
    """qutePandas candidate: ``qpd.rename`` with the same mapping, return_type='q'."""
    qpd.rename(LARGE_Q_TABLE, {'col_0': 'new_col_0'}, return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: rename


  Pandas Mean: 0.5760 s
  qutePandas Mean: 0.0001 s


  Speedup: 8875.599370127617


In [8]:
print('Benchmarking: cast')

# Reference outputs for the correctness check. The pandas side is a single
# column, so the same column is pulled out of the qutePandas result.
pd_res = LARGE_DF['col_0'].astype('float32')
q_res = qpd.cast(LARGE_Q_TABLE, 'col_0', 'float32', return_type='p')['col_0']


def pd_func():
    """Pandas baseline: cast ``col_0`` to float32."""
    LARGE_DF['col_0'].astype('float32')


def q_func():
    """qutePandas candidate: ``qpd.cast`` of ``col_0`` to 'float32', return_type='q'."""
    qpd.cast(LARGE_Q_TABLE, 'col_0', 'float32', return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: cast


  Pandas Mean: 0.0035 s
  qutePandas Mean: 0.0007 s


  Speedup: 5.150318720090641


In [9]:
print('Benchmarking: drop_col')

# Reference outputs for the correctness check (qutePandas result requested
# with return_type='p' so it can be compared against the pandas result).
pd_res = LARGE_DF.drop(columns=['col_9'])
q_res = qpd.drop_col(LARGE_Q_TABLE, 'col_9', return_type='p')


def pd_func():
    """Pandas baseline: drop column ``col_9``."""
    LARGE_DF.drop(columns=['col_9'])


def q_func():
    """qutePandas candidate: ``qpd.drop_col`` on ``col_9`` with return_type='q'."""
    qpd.drop_col(LARGE_Q_TABLE, 'col_9', return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: drop_col


  Pandas Mean: 0.5007 s
  qutePandas Mean: 0.0001 s


  Speedup: 8005.307059271302


## Grouping & Aggregation
Tests the performance of grouping operations.


In [10]:
print('Benchmarking: groupby_sum')

# Reference outputs for the correctness check. The pandas result is a Series
# indexed by 'col_3', so the qutePandas frame is re-indexed and narrowed to
# the 'col_0' column to give it the same layout.
pd_res = LARGE_DF.groupby('col_3', dropna=False)['col_0'].sum()
q_res = (
    qpd.groupby_sum(LARGE_Q_TABLE, 'col_3', 'col_0', return_type='p')
    .set_index('col_3')['col_0']
)


def pd_func():
    """Pandas baseline: sum of ``col_0`` per ``col_3`` group, NaN keys kept."""
    LARGE_DF.groupby('col_3', dropna=False)['col_0'].sum()


def q_func():
    """qutePandas candidate: ``qpd.groupby_sum`` by ``col_3`` over ``col_0``, return_type='q'."""
    qpd.groupby_sum(LARGE_Q_TABLE, 'col_3', 'col_0', return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: groupby_sum


  Pandas Mean: 0.2702 s
  qutePandas Mean: 0.0028 s
  Speedup: 97.36245198579702


In [11]:
print('Benchmarking: groupby_avg')

# Reference outputs for the correctness check. The pandas result is a Series
# indexed by 'col_3', so the qutePandas frame is re-indexed and narrowed to
# the 'col_1' column to give it the same layout.
pd_res = LARGE_DF.groupby('col_3', dropna=False)['col_1'].mean()
q_res = (
    qpd.groupby_avg(LARGE_Q_TABLE, 'col_3', 'col_1', return_type='p')
    .set_index('col_3')['col_1']
)


def pd_func():
    """Pandas baseline: mean of ``col_1`` per ``col_3`` group, NaN keys kept."""
    LARGE_DF.groupby('col_3', dropna=False)['col_1'].mean()


def q_func():
    """qutePandas candidate: ``qpd.groupby_avg`` by ``col_3`` over ``col_1``, return_type='q'."""
    qpd.groupby_avg(LARGE_Q_TABLE, 'col_3', 'col_1', return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: groupby_avg


  Pandas Mean: 0.2718 s
  qutePandas Mean: 0.0044 s
  Speedup: 62.38333408896259


## Custom Function Application
Tests the `apply` mechanism for row-wise operations.


In [12]:
print('Benchmarking: apply (sum axis=1)')

# Reference outputs for the correctness check (qutePandas result requested
# with return_type='p' so it can be compared against the pandas result).
pd_res = LARGE_DF_SUBSET.sum(axis=1)
q_res = qpd.apply(LARGE_Q_SUBSET, 'sum', axis=1, return_type='p')


def pd_func():
    """Pandas baseline: row-wise sum over the two-column subset."""
    LARGE_DF_SUBSET.sum(axis=1)


def q_func():
    """qutePandas candidate: ``qpd.apply`` with 'sum' along axis=1, return_type='q'."""
    qpd.apply(LARGE_Q_SUBSET, 'sum', axis=1, return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: apply (sum axis=1)


  Pandas Mean: 0.4774 s


  qutePandas Mean: 0.9025 s


  Speedup: 0.5290323263401102


## Indexing & Selection
Tests performance of `loc` and `iloc` operations.

In [13]:
print('Benchmarking: loc (boolean mask)')

# No random draws happen in this cell; the seed call is kept only so that the
# global NumPy RNG state seen by later cells is unchanged.
np.random.seed(42)

# Boolean row selector shared by both implementations.
mask = LARGE_DF['col_0'] > 0.5
# Hand PyKX the mask's NumPy array directly. Going through list(...) first
# creates one Python object per row (10M here) and forces an element-by-element
# conversion, which is needlessly slow and memory-hungry for fixture setup.
q_mask = kx.toq(mask.to_numpy())


def pd_func():
    """Pandas baseline: select rows with the boolean mask via ``.loc``."""
    LARGE_DF.loc[mask]


def q_func():
    """qutePandas candidate: ``qpd.loc`` with the q-side mask, return_type='q'."""
    qpd.loc(LARGE_Q_TABLE, rows=q_mask, return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
calculate_speedup(pd_stats, q_stats)

Benchmarking: loc (boolean mask)


  Pandas Mean: 0.5142 s


  qutePandas Mean: 0.0500 s
  Speedup: 10.282196554138963


In [14]:
print('Benchmarking: iloc (rows slice)')


def pd_func():
    """Pandas baseline: positional slice of the first 50000 rows."""
    LARGE_DF.iloc[0:50000]


def q_func():
    """qutePandas candidate: ``qpd.iloc`` with rows=slice(0, 50000), return_type='q'."""
    qpd.iloc(LARGE_Q_TABLE, rows=slice(0, 50000), return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
calculate_speedup(pd_stats, q_stats)

Benchmarking: iloc (rows slice)
  Pandas Mean: 0.0001 s
  qutePandas Mean: 0.0024 s
  Speedup: 0.021573719951009047


In [15]:
print('Benchmarking: iloc (cols slice)')


def pd_func():
    """Pandas baseline: positional slice of the first 5 columns."""
    LARGE_DF.iloc[:, 0:5]


def q_func():
    """qutePandas candidate: ``qpd.iloc`` with cols=slice(0, 5), return_type='q'."""
    qpd.iloc(LARGE_Q_TABLE, cols=slice(0, 5), return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
calculate_speedup(pd_stats, q_stats)

Benchmarking: iloc (cols slice)


  Pandas Mean: 0.1237 s
  qutePandas Mean: 0.0001 s
  Speedup: 1157.997994950496


## Joining
Tests performance of join operations (merge_left, merge_inner).

In [16]:
print('Setting up for Joining Benchmarks...')
JOIN_KEY = 'col_0'
UNIQUE_KEYS = LARGE_DF[JOIN_KEY].unique()
# At most 1000 distinct keys for the lookup side; the slice is shorter when the
# column holds fewer distinct values.
LOOKUP_KEYS = UNIQUE_KEYS[:1000]
np.random.seed(42)
LOOKUP_DF = pd.DataFrame({
    JOIN_KEY: LOOKUP_KEYS,
    # One payload value per key. Sizing from the key slice (rather than a
    # hard-coded 1000) avoids a length-mismatch ValueError when fewer than 1000
    # distinct keys exist; with 1000 or more keys the draw is unchanged.
    'extra_val': np.random.randn(len(LOOKUP_KEYS))
})
# q-side copy of the lookup table for the qutePandas joins.
LOOKUP_Q = kx.toq(LOOKUP_DF)
print('Setup Complete')

Setting up for Joining Benchmarks...
Setup Complete


In [17]:
print('Benchmarking: merge_left')


def pd_func():
    """Pandas baseline: left merge of the large frame with the lookup frame on JOIN_KEY."""
    LARGE_DF.merge(LOOKUP_DF, on=JOIN_KEY, how='left')


def q_func():
    """qutePandas candidate: ``qpd.merge_left`` on JOIN_KEY with return_type='q'."""
    qpd.merge_left(LARGE_Q_TABLE, LOOKUP_Q, keys=JOIN_KEY, return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
calculate_speedup(pd_stats, q_stats)

Benchmarking: merge_left


  Pandas Mean: 0.7571 s
  qutePandas Mean: 0.0188 s
  Speedup: 40.247674332519715


In [18]:
print('Benchmarking: merge_inner')


def pd_func():
    """Pandas baseline: inner merge of the large frame with the lookup frame on JOIN_KEY."""
    LARGE_DF.merge(LOOKUP_DF, on=JOIN_KEY, how='inner')


def q_func():
    """qutePandas candidate: ``qpd.merge_inner`` on JOIN_KEY with return_type='q'."""
    qpd.merge_inner(LARGE_Q_TABLE, LOOKUP_Q, keys=JOIN_KEY, return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
calculate_speedup(pd_stats, q_stats)

Benchmarking: merge_inner


  Pandas Mean: 0.7556 s


  qutePandas Mean: 0.0704 s
  Speedup: 10.728369428234382


## I/O Operations
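Intended to test performance of `to_csv` and `from_csv` on a simplified subset. These benchmarks are currently bypassed: the I/O cells shown here only print a notice, and their benchmark code is commented out.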

In [19]:
# I/O benchmark setup is disabled: this cell only prints a notice and seeds NumPy.
print('I/O Benchmarks Bypassed (Notebook Validation)')
# Seeds NumPy's global RNG; the only consumer in this cell is the disabled fixture below.
np.random.seed(42)
# Disabled fixture, kept for reference:
# IO_DF = pd.DataFrame({'a': np.random.randn(100000)})
# IO_Q = kx.toq(IO_DF)

I/O Benchmarks Bypassed (Notebook Validation)


In [20]:
# The to_csv benchmark is disabled: this cell only prints a notice.
print('Benchmarking: to_csv (100k rows) - SKIPPED')
# Disabled benchmark body, kept for reference. It relies on IO_DF / IO_Q, which
# are themselves only defined in commented-out code, so re-enabling this cell
# requires re-enabling that fixture first.
# def pd_func(): IO_DF.to_csv('bench_pd.csv', index=False)
# def q_func(): qpd.to_csv(IO_Q, 'bench_q.csv')
# pd_stats = benchmark_operation(pd_func)
# q_stats = benchmark_operation(q_func)
# calculate_speedup(pd_stats, q_stats)

Benchmarking: to_csv (100k rows) - SKIPPED


## Introspection
Tests performance of metadata and type inspection.

In [21]:
print('Benchmarking: dtypes')

# One-off evaluation of both implementations before timing.
# NOTE(review): these results are not compared or asserted in this cell.
pd_res = LARGE_DF.dtypes
q_res = qpd.dtypes(LARGE_Q_TABLE, return_type='p')


def pd_func():
    """Pandas baseline: read the ``dtypes`` attribute of the large frame."""
    LARGE_DF.dtypes


def q_func():
    """qutePandas candidate: ``qpd.dtypes`` on the q table with return_type='q'."""
    qpd.dtypes(LARGE_Q_TABLE, return_type='q')


# Time pandas first, then qutePandas, reporting each mean as soon as it is known.
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
calculate_speedup(pd_stats, q_stats)

Benchmarking: dtypes
  Pandas Mean: 0.0001 s
  qutePandas Mean: 0.0001 s
  Speedup: 0.936105560785494
