# Setup and Data Generation
Performs global setup for the benchmark suite.
- Imports necessary libraries.
- Defines local helper functions: `generate_large_dataset`, `benchmark_operation`, and `verify_correctness`.
- Connects to the q/kdb+ process.
- Generates a **10 million row** dataset (`LARGE_DF`) and converts it to a pykx Table (`LARGE_Q_TABLE`).


In [1]:
import os
import sys
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('.'))
import qutePandas as qpd
import pandas as pd
import numpy as np
import pykx as kx
import time
import gc
from test_utils import generate_large_dataset, benchmark_operation, verify_correctness, calculate_speedup
local_lic = os.path.abspath('../kdb_lic')
if os.path.exists(local_lic): os.environ['QLIC'] = local_lic
qpd.connect()


True

In [2]:
print("Generating Large Datasets (10M rows, 20 cols)... This may take a moment.")
LARGE_DF = generate_large_dataset(rows=10_000_000, cols=20)
LARGE_Q_TABLE = kx.toq(LARGE_DF)
LARGE_DF_SUBSET = LARGE_DF[['col_0', 'col_4']]
LARGE_Q_SUBSET = LARGE_Q_TABLE[['col_0', 'col_4']]
print(f"Data Generation Complete. Shape: {LARGE_DF.shape}")


Generating Large Datasets (10M rows, 20 cols)... This may take a moment.
Data Generation Complete. Shape: (10000000, 20)


## Core: DataFrame Creation
Tests the creation of qutePandas DataFrames.


In [3]:
print('Benchmarking: DataFrame Creation')

def pd_func(): pd.DataFrame(LARGE_DF)
def q_func(): qpd.DataFrame(LARGE_DF)
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
calculate_speedup(pd_stats, q_stats)


Benchmarking: DataFrame Creation
  Pandas Mean: 0.0000 s
  qutePandas Mean: 15.8110 s
  Speedup: 2.3448956157500126e-06


## Cleaning Functions
Tests performance of data cleaning operations.


In [4]:
print('Benchmarking: dropna')

def pd_func(): LARGE_DF.dropna()
def q_func(): qpd.dropna(LARGE_Q_TABLE, return_type='q')
pd_res = LARGE_DF.dropna()
q_res = qpd.dropna(LARGE_Q_TABLE, return_type='p')
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: drop_nulls
  Pandas Mean: 1.0563 s
  qutePandas Mean: 0.1322 s
  Speedup: 7.989108056379326


In [5]:
print('Benchmarking: dropna_col')

def pd_func(): LARGE_DF.dropna(subset=['col_0'])
def q_func(): qpd.dropna_col(LARGE_Q_TABLE, 'col_0', return_type='q')
pd_res = LARGE_DF.dropna(subset=['col_0'])
q_res = qpd.dropna_col(LARGE_Q_TABLE, 'col_0', return_type='p')
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: drop_nulls
  Pandas Mean: 1.0472 s
  qutePandas Mean: 0.1291 s
  Speedup: 8.109591835775436


In [6]:
print('Benchmarking: fillna')

def pd_func(): LARGE_DF.fillna({'col_0': 0})
def q_func(): qpd.fillna(LARGE_Q_TABLE, 'col_0', 0, return_type='q')
pd_res = LARGE_DF.fillna({'col_0': 0})
q_res = qpd.fillna(LARGE_Q_TABLE, 'col_0', 0, return_type='p')
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: fill_null
  Pandas Mean: 0.6066 s
  qutePandas Mean: 0.0045 s
  Speedup: 135.00837439702923


## Transformation Functions
Tests structural and type transformations on the DataFrame.


In [7]:
print('Benchmarking: rename')

def pd_func(): LARGE_DF.rename(columns={'col_0': 'new_col_0'})
def q_func(): qpd.rename(LARGE_Q_TABLE, {'col_0': 'new_col_0'}, return_type='q')
pd_res = LARGE_DF.rename(columns={'col_0': 'new_col_0'})
q_res = qpd.rename(LARGE_Q_TABLE, {'col_0': 'new_col_0'}, return_type='p')
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: rename
  Pandas Mean: 0.5788 s
  qutePandas Mean: 0.0001 s
  Speedup: 7967.240760443745


In [8]:
print('Benchmarking: cast')

def pd_func(): LARGE_DF['col_0'].astype('float32')
def q_func(): qpd.cast(LARGE_Q_TABLE, 'col_0', 'float32', return_type='q')
pd_res = LARGE_DF['col_0'].astype('float32')
q_res = qpd.cast(LARGE_Q_TABLE, 'col_0', 'float32', return_type='p')['col_0']
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: cast
  Pandas Mean: 0.0038 s
  qutePandas Mean: 0.0007 s
  Speedup: 5.1046475993512255


In [9]:
print('Benchmarking: drop_col')

def pd_func(): LARGE_DF.drop(columns=['col_9'])
def q_func(): qpd.drop_col(LARGE_Q_TABLE, 'col_9', return_type='q')
pd_res = LARGE_DF.drop(columns=['col_9'])
q_res = qpd.drop_col(LARGE_Q_TABLE, 'col_9', return_type='p')
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: drop_col
  Pandas Mean: 0.5197 s
  qutePandas Mean: 0.0001 s
  Speedup: 5889.654680966104


## Grouping & Aggregation
Tests the performance of grouping operations.


In [10]:
print('Benchmarking: groupby_sum')

def pd_func(): LARGE_DF.groupby('col_3', dropna=False)['col_0'].sum()
def q_func(): qpd.groupby_sum(LARGE_Q_TABLE, 'col_3', 'col_0', return_type='q')
pd_res = LARGE_DF.groupby('col_3', dropna=False)['col_0'].sum()
q_res = qpd.groupby_sum(LARGE_Q_TABLE, 'col_3', 'col_0', return_type='p').set_index('col_3')['col_0']
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: groupby_sum
  Pandas Mean: 0.2828 s
  qutePandas Mean: 0.0032 s
  Speedup: 88.06316355074743


In [11]:
print('Benchmarking: groupby_avg')

def pd_func(): LARGE_DF.groupby('col_3', dropna=False)['col_1'].mean()
def q_func(): qpd.groupby_avg(LARGE_Q_TABLE, 'col_3', 'col_1', return_type='q')
pd_res = LARGE_DF.groupby('col_3', dropna=False)['col_1'].mean()
q_res = qpd.groupby_avg(LARGE_Q_TABLE, 'col_3', 'col_1', return_type='p').set_index('col_3')['col_1']
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: groupby_avg
  Pandas Mean: 0.2750 s
  qutePandas Mean: 0.0047 s
  Speedup: 58.2727647664006


## Custom Function Application
Tests the `apply` mechanism for row-wise operations.


In [12]:
print('Benchmarking: apply (sum axis=1)')

def pd_func(): LARGE_DF_SUBSET.sum(axis=1)
def q_func(): qpd.apply(LARGE_Q_SUBSET, 'sum', axis=1, return_type='q')
pd_res = LARGE_DF_SUBSET.sum(axis=1)
q_res = qpd.apply(LARGE_Q_SUBSET, 'sum', axis=1, return_type='p')
pd_stats = benchmark_operation(pd_func)
print(f"  Pandas Mean: {pd_stats['mean']:.4f} s")
q_stats = benchmark_operation(q_func)
print(f"  qutePandas Mean: {q_stats['mean']:.4f} s")
assert verify_correctness(pd_res, q_res), 'Results do not match!'
calculate_speedup(pd_stats, q_stats)


Benchmarking: apply (sum axis=1)
  Pandas Mean: 0.4620 s
  qutePandas Mean: 0.9110 s
  Speedup: 0.5071774415705059
