# Edge Case Testing
This notebook tests edge cases and error handling for `qutePandas` against `pandas` baselines.

In [1]:
# Setup cell: imports, import-path tweaks, optional licence location, and the
# qutePandas connection used by every later cell.
import os
import sys
import importlib
import pandas as pd
import numpy as np  # NOTE(review): not referenced in the visible cells - confirm it is still needed
import pykx as kx
# Add the parent directory and the current directory to the import path;
# presumably this is what lets `qutePandas` and `test_utils` resolve below.
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('.'))
import qutePandas as qpd
# Re-execute the already-imported module (useful when re-running this cell
# after editing the package without restarting the kernel).
importlib.reload(qpd)
from test_utils import verify_correctness
# If a `kdb_lic` path exists one level up, expose it through the QLIC
# environment variable before connecting.
# NOTE(review): this assignment happens after `import pykx`, and the recorded
# output warns that QLIC is set to a non-directory value - confirm whether QLIC
# should be set before importing pykx and whether it should name a directory.
local_lic = os.path.abspath('../kdb_lic')
if os.path.exists(local_lic): os.environ['QLIC'] = local_lic
qpd.connect()
print('Setup Complete')


  warn(f'Configuration value QLIC set to non directory value: {_qlic}')


Setup Complete


## Create Test DataFrame
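Build `qpd.DataFrame` tables from a variety of inputs (pandas DataFrames, including ones with nulls or no rows; dictionaries; lists; generators; scalars; existing q tables) and check that invalid inputs raise `RuntimeError`.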

In [2]:
# Case: pandas DataFrame with an integer column and a string column.
# Expectation: conversion produces a kx.Table with three rows.
mixed_pd = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
mixed_q = qpd.DataFrame(mixed_pd)
qpd.print(mixed_q)

assert isinstance(mixed_q, kx.Table)
num_rows = mixed_q.shape[0]
assert num_rows == 3


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ x │
│ 2 │ y │
│ 3 │ z │
└───┴───┘


In [3]:
# Case: pandas DataFrame with a missing numeric value and a NaT timestamp.
# Expectation: conversion produces a kx.Table that still has three rows.
stamps = [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2022-01-01")]
nulls_pd = pd.DataFrame({"a": [1, None, 3], "b": stamps})
nulls_q = qpd.DataFrame(nulls_pd)
qpd.print(nulls_q)

assert isinstance(nulls_q, kx.Table)
num_rows = nulls_q.shape[0]
assert num_rows == 3


┌─────┬─────────────────────┐
│ a   │ b                   │
├─────┼─────────────────────┤
│ 1.0 │ 2020-01-01 00:00:00 │
│ nan │ NaT                 │
│ 3.0 │ 2022-01-01 00:00:00 │
└─────┴─────────────────────┘


In [4]:
# Case: pandas DataFrame that has column labels but no rows.
# Expectation: conversion produces a kx.Table with zero rows.
empty_pd = pd.DataFrame(columns=["a", "b"])
empty_q = qpd.DataFrame(empty_pd)
qpd.print(empty_q)

assert isinstance(empty_q, kx.Table)
num_rows = empty_q.shape[0]
assert num_rows == 0


┌───┬───┐
│ a │ b │
├───┼───┤
└───┴───┘


In [5]:
# Case: plain dict mapping column names to equal-length lists.
# Expectation: conversion produces a kx.Table with two rows.
column_map = {"a": [1, 2], "b": [3, 4]}
dict_q = qpd.DataFrame(column_map)
qpd.print(dict_q)

assert isinstance(dict_q, kx.Table)
num_rows = dict_q.shape[0]
assert num_rows == 2


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 3 │
│ 2 │ 4 │
└───┴───┘


In [6]:
# Case: dict whose columns have different lengths.
# Expectation: qpd.DataFrame raises RuntimeError.
ragged_columns = {"a": [1, 2], "b": [3]}

try:
    qpd.DataFrame(ragged_columns)
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # Construction succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


In [7]:
# Case: dict whose columns have different lengths.
# Expectation: qpd.DataFrame raises RuntimeError.
# NOTE(review): this cell repeats the preceding cell's input and check exactly.
uneven_columns = {"a": [1, 2], "b": [3]}

try:
    qpd.DataFrame(uneven_columns)
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # Construction succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


In [8]:
# Case: row-oriented list of lists together with explicit column names.
# Expectation: conversion produces a kx.Table with two columns.
rows = [[1, "x"], [2, "y"], [3, "z"]]
named_q = qpd.DataFrame(rows, columns=["a", "b"])
qpd.print(named_q)

assert isinstance(named_q, kx.Table)
num_cols = named_q.shape[1]
assert num_cols == 2


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ x │
│ 2 │ y │
│ 3 │ z │
└───┴───┘


In [9]:
# Case: row-oriented list of lists with no column names supplied.
# Expectation: conversion produces a kx.Table with two (auto-named) columns.
rows = [[1, 2], [3, 4]]
unnamed_q = qpd.DataFrame(rows)
qpd.print(unnamed_q)

assert isinstance(unnamed_q, kx.Table)
num_cols = unnamed_q.shape[1]
assert num_cols == 2


┌───────┬───────┐
│ col_0 │ col_1 │
├───────┼───────┤
│ 1     │ 2     │
│ 3     │ 4     │
└───────┴───────┘


In [10]:
# Case: an empty Python list.
# Expectation: conversion produces a kx.Table with zero rows.
from_empty_list = qpd.DataFrame([])
qpd.print(from_empty_list)

assert isinstance(from_empty_list, kx.Table)
num_rows = from_empty_list.shape[0]
assert num_rows == 0


┌──────┐
│ data │
├──────┤
└──────┘


In [11]:
# Case: a generator expression yielding the integers 0..4.
# Expectation: the generator is consumed into a kx.Table with five rows.
number_stream = (value for value in range(5))
from_generator = qpd.DataFrame(number_stream)
qpd.print(from_generator)

assert isinstance(from_generator, kx.Table)
num_rows = from_generator.shape[0]
assert num_rows == 5


┌──────┐
│ data │
├──────┤
│ 0    │
│ 1    │
│ 2    │
│ 3    │
│ 4    │
└──────┘


In [12]:
# Case: a bare scalar value.
# Expectation: the scalar is wrapped into a kx.Table with exactly one row.
from_scalar = qpd.DataFrame(42)
qpd.print(from_scalar)

assert isinstance(from_scalar, kx.Table)
num_rows = from_scalar.shape[0]
assert num_rows == 1


┌──────┐
│ data │
├──────┤
│ 42   │
└──────┘


In [13]:
# Case: input is already a q table created through kx.q.
# Expectation: qpd.DataFrame hands back the very same object (identity check).
existing_table = kx.q("([] a:1 2 3; b:4 5 6)")
passthrough = qpd.DataFrame(existing_table)
qpd.print(passthrough)

assert passthrough is existing_table


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 4 │
│ 2 │ 5 │
│ 3 │ 6 │
└───┴───┘


In [14]:
# Case: input is a q keyed table (key column k, value column v).
# Expectation: qpd.DataFrame accepts it and the result is a kx.KeyedTable.
keyed_input = kx.q("([k:1 2 3] v:10 20 30)")
keyed_result = qpd.DataFrame(keyed_input)

assert isinstance(keyed_result, kx.KeyedTable)


In [15]:
# Case: an arbitrary object() instance as input.
# Expectation: qpd.DataFrame raises RuntimeError.
try:
    qpd.DataFrame(object())
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # Construction succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


In [16]:
# Case: one three-element row, but only two column names supplied.
# Expectation: qpd.DataFrame raises RuntimeError.
three_wide_rows = [[1, 2, 3]]

try:
    qpd.DataFrame(three_wide_rows, columns=["a", "b"])
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # Construction succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


## Cleaning Functions
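Test the data cleaning operations (`dropna_col`, `dropna`, `fillna`, `remove_duplicates`) against their pandas equivalents. Each cell builds its own small DataFrame.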

In [17]:
# Case: dropna_col on column "a", which has one missing value.
# Expectation: the row with the missing "a" is removed.
source_pd = pd.DataFrame({"a": [1, None, 3], "b": [10, 20, 30]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.dropna_col(source_q, "a", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.dropna(subset=["a"])
actual_pd = qpd.dropna_col(source_q, "a", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬────┐
│ a   │ b  │
├─────┼────┤
│ 1.0 │ 10 │
│ 3.0 │ 30 │
└─────┴────┘


In [18]:
# Case: dropna_col on a column that has no missing values.
# Expectation: the table comes back unchanged.
source_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.dropna_col(source_q, "a", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.dropna(subset=["a"])
actual_pd = qpd.dropna_col(source_q, "a", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 4 │
│ 2 │ 5 │
│ 3 │ 6 │
└───┴───┘


In [19]:
# Case: dropna_col on a table with columns but no rows.
# Expectation: an empty table is returned.
source_pd = pd.DataFrame(columns=["a", "b"])
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.dropna_col(source_q, "a", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.dropna(subset=["a"])
actual_pd = qpd.dropna_col(source_q, "a", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
└───┴───┘


In [20]:
# Case: dropna_col on a column where every value is missing.
# Expectation: every row is removed, leaving an empty table.
source_pd = pd.DataFrame({"a": [None, None], "b": [1, 2]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.dropna_col(source_q, "a", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.dropna(subset=["a"])
actual_pd = qpd.dropna_col(source_q, "a", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
└───┴───┘


In [21]:
# Case: dropna on a table where two different rows each contain one null.
# Expectation: every row containing at least one null is removed.
source_pd = pd.DataFrame({"a": [1, None, 3], "b": [10, 20, None]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.dropna(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.dropna()
actual_pd = qpd.dropna(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬──────┐
│ a   │ b    │
├─────┼──────┤
│ 1.0 │ 10.0 │
└─────┴──────┘


In [22]:
# Case: dropna on a table without any missing values.
# Expectation: the table comes back unchanged.
source_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.dropna(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.dropna()
actual_pd = qpd.dropna(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 3 │
│ 2 │ 4 │
└───┴───┘


In [23]:
# Case: dropna on a table with columns but no rows.
# Expectation: an empty table is returned.
source_pd = pd.DataFrame(columns=["a", "b"])
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.dropna(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.dropna()
actual_pd = qpd.dropna(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
└───┴───┘


In [24]:
# Case: dropna on a table in which every cell is missing.
# Expectation: every row is removed, leaving an empty table.
source_pd = pd.DataFrame({"a": [None, None], "b": [None, None]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.dropna(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.dropna()
actual_pd = qpd.dropna(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
└───┴───┘


In [25]:
# Case: fillna with a dictionary naming one column.
# Expectation: the missing value in "a" is replaced by 0.
source_pd = pd.DataFrame({"a": [1, None, 3], "b": [10, 20, 30]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.fillna(source_q, {"a": 0}, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.fillna({"a": 0})
actual_pd = qpd.fillna(source_q, {"a": 0}, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬────┐
│ a   │ b  │
├─────┼────┤
│ 1.0 │ 10 │
│ 0.0 │ 20 │
│ 3.0 │ 30 │
└─────┴────┘


In [26]:
# Case: fillna with a dictionary naming two columns with different fill values.
# Expectation: each column is filled with its own value.
source_pd = pd.DataFrame({"a": [1, None, 3], "b": [None, 2, None]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.fillna(source_q, {"a": 0, "b": 9}, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.fillna({"a": 0, "b": 9})
actual_pd = qpd.fillna(source_q, {"a": 0, "b": 9}, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬─────┐
│ a   │ b   │
├─────┼─────┤
│ 1.0 │ 9.0 │
│ 0.0 │ 2.0 │
│ 3.0 │ 9.0 │
└─────┴─────┘


In [27]:
# Case: fillna on a string column with a string fill value.
# Expectation: the missing string is replaced by "y".
source_pd = pd.DataFrame({"a": ["x", None, "z"], "b": [1, 2, 3]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.fillna(source_q, {"a": "y"}, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.fillna({"a": "y"})
actual_pd = qpd.fillna(source_q, {"a": "y"}, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ x │ 1 │
│ y │ 2 │
│ z │ 3 │
└───┴───┘


In [28]:
# Case: fillna on a table without any missing values.
# Expectation: the table comes back unchanged.
source_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.fillna(source_q, {"a": 0, "b": 0}, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.fillna({"a": 0, "b": 0})
actual_pd = qpd.fillna(source_q, {"a": 0, "b": 0}, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 3 │
│ 2 │ 4 │
└───┴───┘


In [29]:
# Case: fillna with an empty dictionary (nothing to fill).
# Expectation: the table comes back unchanged, nulls included.
source_pd = pd.DataFrame({"a": [1, None], "b": [2, None]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.fillna(source_q, {}, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.fillna({})
actual_pd = qpd.fillna(source_q, {}, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬─────┐
│ a   │ b   │
├─────┼─────┤
│ 1.0 │ 2.0 │
│ nan │ nan │
└─────┴─────┘


In [30]:
# Case: fillna on a table with columns but no rows.
# Expectation: an empty table is returned.
source_pd = pd.DataFrame(columns=["a", "b"])
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.fillna(source_q, {"a": 0}, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.fillna({"a": 0})
actual_pd = qpd.fillna(source_q, {"a": 0}, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
└───┴───┘


In [31]:
# Case: the fill dictionary names column "b", which the table does not have.
# Expectation: qpd.fillna raises RuntimeError.
source_pd = pd.DataFrame({"a": [1, None]})
source_q = qpd.DataFrame(source_pd)

try:
    qpd.fillna(source_q, {"b": 0})
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # The call succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


In [32]:
# Case: the fill argument is a string instead of a dictionary.
# Expectation: qpd.fillna raises RuntimeError.
source_pd = pd.DataFrame({"a": [1, None]})
source_q = qpd.DataFrame(source_pd)

try:
    qpd.fillna(source_q, "a")
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # The call succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


In [33]:
# Case: remove_duplicates on a table with two pairs of identical rows.
# Expectation: duplicates are removed, keeping the first occurrence of each.
source_pd = pd.DataFrame({"a": [1, 1, 2, 2], "b": [10, 10, 20, 20]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.remove_duplicates(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop_duplicates()
actual_pd = qpd.remove_duplicates(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬────┐
│ a │ b  │
├───┼────┤
│ 1 │ 10 │
│ 2 │ 20 │
└───┴────┘


In [34]:
# Case: remove_duplicates on a table whose rows are all distinct.
# Expectation: the table comes back unchanged.
source_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.remove_duplicates(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop_duplicates()
actual_pd = qpd.remove_duplicates(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 4 │
│ 2 │ 5 │
│ 3 │ 6 │
└───┴───┘


In [35]:
# Case: remove_duplicates on a table whose three rows are identical.
# Expectation: a single row remains.
source_pd = pd.DataFrame({"a": [1, 1, 1], "b": [2, 2, 2]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.remove_duplicates(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop_duplicates()
actual_pd = qpd.remove_duplicates(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)

┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 2 │
└───┴───┘


In [36]:
# Case: remove_duplicates on a table with columns but no rows.
# Expectation: an empty table is returned.
source_pd = pd.DataFrame(columns=["a", "b"])
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.remove_duplicates(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop_duplicates()
actual_pd = qpd.remove_duplicates(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
└───┴───┘


In [37]:
# Case: remove_duplicates on a table mixing an integer and a string column.
# Expectation: the repeated (1, "x") row is collapsed to one.
source_pd = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.remove_duplicates(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop_duplicates()
actual_pd = qpd.remove_duplicates(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ x │
│ 2 │ y │
└───┴───┘


In [38]:
# Case: remove_duplicates on a one-row table.
# Expectation: the single row is kept.
source_pd = pd.DataFrame({"a": [1], "b": [2]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.remove_duplicates(source_q, return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop_duplicates()
actual_pd = qpd.remove_duplicates(source_q, return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 2 │
└───┴───┘


## Transformation Functions
Test type casting (`cast`) and column removal (`drop_col`) against pandas baselines. Each cell builds its own small DataFrame.

In [39]:
# Case: cast integer column "a" to "float".
# Expectation: "a" becomes a float column; "b" is untouched.
source_pd = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.cast(source_q, "a", "float", return_type="q")
qpd.print(result_q)

# pandas baseline: a copy of the input with "a" converted via astype.
expected_pd = source_pd.assign(a=source_pd["a"].astype(float))
actual_pd = qpd.cast(source_q, "a", "float", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬────┐
│ a   │ b  │
├─────┼────┤
│ 1.0 │ 10 │
│ 2.0 │ 20 │
│ 3.0 │ 30 │
└─────┴────┘


In [40]:
# Case: cast float column "a" (with fractional values) to "int".
# Expectation: the result agrees with pandas astype(int) on the same data.
source_pd = pd.DataFrame({"a": [1.2, 2.7, 3.0], "b": [1, 2, 3]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.cast(source_q, "a", "int", return_type="q")
qpd.print(result_q)

# pandas baseline: a copy of the input with "a" converted via astype.
expected_pd = source_pd.assign(a=source_pd["a"].astype(int))
actual_pd = qpd.cast(source_q, "a", "int", return_type="p")
assert verify_correctness(expected_pd, actual_pd)

┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 1 │
│ 2 │ 2 │
│ 3 │ 3 │
└───┴───┘


In [41]:
# Case: cast string column "a" using the "s" type code.
# Expectation: the pandas-side result agrees with the input after astype(str).
source_pd = pd.DataFrame({"a": ["x", "y", "z"], "b": [1, 2, 3]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.cast(source_q, "a", "s", return_type="q")
qpd.print(result_q)

# pandas baseline: a copy of the input with "a" converted via astype.
expected_pd = source_pd.assign(a=source_pd["a"].astype(str))
actual_pd = qpd.cast(source_q, "a", "s", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ x │ 1 │
│ y │ 2 │
│ z │ 3 │
└───┴───┘


In [42]:
# Case: cast column "a", which contains a missing value, to "float".
# Expectation: the missing value is still missing after the cast.
source_pd = pd.DataFrame({"a": [1, None, 3], "b": [4, 5, 6]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.cast(source_q, "a", "float", return_type="q")
qpd.print(result_q)

# pandas baseline: a copy of the input with "a" converted via astype.
expected_pd = source_pd.assign(a=source_pd["a"].astype(float))
actual_pd = qpd.cast(source_q, "a", "float", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬───┐
│ a   │ b │
├─────┼───┤
│ 1.0 │ 4 │
│ nan │ 5 │
│ 3.0 │ 6 │
└─────┴───┘


In [43]:
# Case: cast column "a" of a table with columns but no rows to "int".
# Expectation: an empty table is returned.
source_pd = pd.DataFrame(columns=["a", "b"])
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.cast(source_q, "a", "int", return_type="q")
qpd.print(result_q)

# pandas baseline: a copy of the input with "a" converted via astype
# (errors="ignore" is kept exactly as the baseline was written).
expected_pd = source_pd.assign(a=source_pd["a"].astype("int64", errors="ignore"))
actual_pd = qpd.cast(source_q, "a", "int", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
└───┴───┘


In [44]:
# Case: cast to a type name that is not recognised ("unsupported_type").
# Expectation: qpd.cast raises RuntimeError.
source_pd = pd.DataFrame({"a": [1, 2, 3]})
source_q = qpd.DataFrame(source_pd)

try:
    qpd.cast(source_q, "a", "unsupported_type")
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # The call succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


In [45]:
# Case: cast column "b", which the table does not have.
# Expectation: qpd.cast raises RuntimeError.
source_pd = pd.DataFrame({"a": [1, 2, 3]})
source_q = qpd.DataFrame(source_pd)

try:
    qpd.cast(source_q, "b", "int")
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # The call succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


In [46]:
# Case: cast column "a" using the q shorthand type code "j".
# Expectation: the pandas-side result agrees with the input after astype(int).
source_pd = pd.DataFrame({"a": [1, 2, 3]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.cast(source_q, "a", "j", return_type="q")
qpd.print(result_q)

# pandas baseline: a copy of the input with "a" converted via astype.
expected_pd = source_pd.assign(a=source_pd["a"].astype(int))
actual_pd = qpd.cast(source_q, "a", "j", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┐
│ a │
├───┤
│ 1 │
│ 2 │
│ 3 │
└───┘


In [47]:
# Case: drop_col given a list naming two of the three columns.
# Expectation: both named columns are removed, leaving only "b".
source_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.drop_col(source_q, ["a", "c"], return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop(columns=["a", "c"])
actual_pd = qpd.drop_col(source_q, ["a", "c"], return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┐
│ b │
├───┤
│ 3 │
│ 4 │
└───┘


In [48]:
# Case: drop_col given a one-element list.
# Expectation: the named column "b" is removed.
source_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.drop_col(source_q, ["b"], return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop(columns=["b"])
actual_pd = qpd.drop_col(source_q, ["b"], return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┐
│ a │
├───┤
│ 1 │
│ 2 │
└───┘


In [49]:
# Case: drop_col given a bare string instead of a list.
# Expectation: the string is accepted and column "a" is removed.
source_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.drop_col(source_q, "a", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.drop(columns=["a"])
actual_pd = qpd.drop_col(source_q, "a", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┐
│ b │
├───┤
│ 3 │
│ 4 │
└───┘


In [50]:
# Case: drop_col given an empty list (nothing to drop).
# Expectation: the table comes back unchanged.
source_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.drop_col(source_q, [], return_type="q")
qpd.print(result_q)

# The pandas baseline is simply a copy of the input.
expected_pd = source_pd.copy()
actual_pd = qpd.drop_col(source_q, [], return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌───┬───┐
│ a │ b │
├───┼───┤
│ 1 │ 3 │
│ 2 │ 4 │
└───┴───┘


In [51]:
# Case: drop_col asked to remove every column of the table.
# Expectation (pandas return): a DataFrame with no columns and three rows.
# Expectation (q return): the result is a kx.Dictionary.
# The q input in this cell is built with kx.toq rather than qpd.DataFrame.
source_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
source_q = kx.toq(source_pd)

# pandas-returning variant: print it, then check type and dimensions.
dropped_pd = qpd.drop_col(source_q, ["a", "b"], return_type='p')
qpd.print(dropped_pd)
assert isinstance(dropped_pd, pd.DataFrame)
assert len(dropped_pd.columns) == 0
assert len(dropped_pd) == 3

# q-returning variant: check type, then print it.
dropped_q = qpd.drop_col(source_q, ["a", "b"], return_type='q')
assert isinstance(dropped_q, kx.Dictionary)
qpd.print(dropped_q)

Empty DataFrame
Empty DataFrame


## Grouping & Aggregation
Test grouping operations on categorical data.

In [52]:
# Case: groupby_avg grouped by one column.
# Expectation: one average of "val" per distinct "grp".
source_pd = pd.DataFrame({"grp": ["A", "A", "B", "B"], "val": [10, 20, 30, 40]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_avg(source_q, "grp", "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby("grp", as_index=False)["val"].mean()
actual_pd = qpd.groupby_avg(source_q, "grp", "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬──────┐
│ grp │ val  │
├─────┼──────┤
│ A   │ 15.0 │
│ B   │ 35.0 │
└─────┴──────┘


In [53]:
# Case: groupby_avg grouped by two columns.
# Expectation: one average of "val" per distinct (grp1, grp2) combination.
source_pd = pd.DataFrame({
    "grp1": ["A", "A", "B", "B"],
    "grp2": [1, 2, 1, 2],
    "val": [10, 20, 30, 40],
})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_avg(source_q, ["grp1", "grp2"], "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby(["grp1", "grp2"], as_index=False)["val"].mean()
actual_pd = qpd.groupby_avg(source_q, ["grp1", "grp2"], "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌──────┬──────┬──────┐
│ grp1 │ grp2 │ val  │
├──────┼──────┼──────┤
│ A    │ 1    │ 10.0 │
│ A    │ 2    │ 20.0 │
│ B    │ 1    │ 30.0 │
│ B    │ 2    │ 40.0 │
└──────┴──────┴──────┘


In [54]:
# Case: groupby_avg where the value column has a missing entry in group "A".
# Expectation: the missing entry is ignored when averaging.
source_pd = pd.DataFrame({"grp": ["A", "A", "B"], "val": [10, None, 30]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_avg(source_q, "grp", "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby("grp", as_index=False)["val"].mean()
actual_pd = qpd.groupby_avg(source_q, "grp", "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬──────┐
│ grp │ val  │
├─────┼──────┤
│ A   │ 10.0 │
│ B   │ 30.0 │
└─────┴──────┘


In [55]:
# Case: groupby_avg where every group contains exactly one row.
# Expectation: each group's average equals its only value.
source_pd = pd.DataFrame({"grp": ["A", "B", "C"], "val": [5, 10, 15]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_avg(source_q, "grp", "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby("grp", as_index=False)["val"].mean()
actual_pd = qpd.groupby_avg(source_q, "grp", "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬──────┐
│ grp │ val  │
├─────┼──────┤
│ A   │ 5.0  │
│ B   │ 10.0 │
│ C   │ 15.0 │
└─────┴──────┘


In [56]:
# Case: groupby_avg on a table with columns but no rows.
# Expectation: an empty result.
source_pd = pd.DataFrame(columns=["grp", "val"])
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_avg(source_q, "grp", "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby("grp", as_index=False)["val"].mean()
actual_pd = qpd.groupby_avg(source_q, "grp", "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬─────┐
│ grp │ val │
├─────┼─────┤
└─────┴─────┘


In [57]:
# Case: groupby_avg asked to average column "missing", which does not exist.
# Expectation: qpd.groupby_avg raises RuntimeError.
source_pd = pd.DataFrame({"grp": ["A", "B"], "val": [1, 2]})
source_q = qpd.DataFrame(source_pd)

try:
    qpd.groupby_avg(source_q, "grp", "missing")
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # The call succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


In [58]:
# Case: groupby_sum grouped by one column.
# Expectation: one sum of "val" per distinct "grp".
source_pd = pd.DataFrame({"grp": ["A", "A", "B"], "val": [10, 20, 30]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_sum(source_q, "grp", "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby("grp", as_index=False)["val"].sum()
actual_pd = qpd.groupby_sum(source_q, "grp", "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬─────┐
│ grp │ val │
├─────┼─────┤
│ A   │ 30  │
│ B   │ 30  │
└─────┴─────┘


In [59]:
# Case: groupby_sum grouped by two columns.
# Expectation: one sum of "val" per distinct (grp1, grp2) combination.
source_pd = pd.DataFrame({
    "grp1": ["A", "A", "B"],
    "grp2": [1, 1, 2],
    "val": [5, 15, 20],
})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_sum(source_q, ["grp1", "grp2"], "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby(["grp1", "grp2"], as_index=False)["val"].sum()
actual_pd = qpd.groupby_sum(source_q, ["grp1", "grp2"], "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌──────┬──────┬─────┐
│ grp1 │ grp2 │ val │
├──────┼──────┼─────┤
│ A    │ 1    │ 20  │
│ B    │ 2    │ 20  │
└──────┴──────┴─────┘


In [60]:
# Case: groupby_sum where the value column has a missing entry in group "A".
# Expectation: the missing entry contributes nothing to the sum.
source_pd = pd.DataFrame({"grp": ["A", "A", "B"], "val": [10, None, 5]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_sum(source_q, "grp", "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby("grp", as_index=False)["val"].sum()
actual_pd = qpd.groupby_sum(source_q, "grp", "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬──────┐
│ grp │ val  │
├─────┼──────┤
│ A   │ 10.0 │
│ B   │ 5.0  │
└─────┴──────┘


In [61]:
# Case: groupby_sum where group "A" contains a negative value.
# Expectation: the negative value is included in the sum.
source_pd = pd.DataFrame({"grp": ["A", "A", "B"], "val": [10, -5, 20]})
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_sum(source_q, "grp", "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby("grp", as_index=False)["val"].sum()
actual_pd = qpd.groupby_sum(source_q, "grp", "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬─────┐
│ grp │ val │
├─────┼─────┤
│ A   │ 5   │
│ B   │ 20  │
└─────┴─────┘


In [62]:
# Case: groupby_sum on a table with columns but no rows.
# Expectation: an empty result.
source_pd = pd.DataFrame(columns=["grp", "val"])
source_q = qpd.DataFrame(source_pd)

# q-side result, printed for inspection.
result_q = qpd.groupby_sum(source_q, "grp", "val", return_type="q")
qpd.print(result_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = source_pd.groupby("grp", as_index=False)["val"].sum()
actual_pd = qpd.groupby_sum(source_q, "grp", "val", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌─────┬─────┐
│ grp │ val │
├─────┼─────┤
└─────┴─────┘


In [63]:
# Case: groupby_sum asked to group by column "missing", which does not exist.
# Expectation: qpd.groupby_sum raises RuntimeError.
source_pd = pd.DataFrame({"grp": ["A", "B"], "val": [1, 2]})
source_q = qpd.DataFrame(source_pd)

try:
    qpd.groupby_sum(source_q, "missing", "val")
except RuntimeError:
    pass  # this is the outcome the test wants
else:
    # The call succeeded, which the test treats as a failure.
    raise AssertionError("Expected RuntimeError")


## Merge Tests

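Test `qpd.merge` against `pandas.merge` for inner, left, right, and outer joins, including cases with no matching keys, multiple key columns, and differently named key columns.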

In [64]:
# Case: inner merge on the shared key column "id".
# Expectation: only ids present on both sides (2 and 3) are kept.
left_pd = pd.DataFrame({"id": [1, 2, 3], "a": [10, 20, 30]})
right_pd = pd.DataFrame({"id": [2, 3, 4], "b": [200, 300, 400]})
left_q = qpd.DataFrame(left_pd)
right_q = qpd.DataFrame(right_pd)

# q-side result, printed for inspection.
merged_q = qpd.merge(left_q, right_q, how="inner", on="id", return_type="q")
qpd.print(merged_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = left_pd.merge(right_pd, how="inner", on="id")
actual_pd = qpd.merge(left_q, right_q, how="inner", on="id", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌────┬────┬─────┐
│ id │ a  │ b   │
├────┼────┼─────┤
│ 2  │ 20 │ 200 │
│ 3  │ 30 │ 300 │
└────┴────┴─────┘


In [65]:
# Case: inner merge where the two sides share no key values.
# Expectation: the result has zero rows.
left_pd = pd.DataFrame({"id": [1, 2], "a": [10, 20]})
right_pd = pd.DataFrame({"id": [3, 4], "b": [30, 40]})
left_q = qpd.DataFrame(left_pd)
right_q = qpd.DataFrame(right_pd)

# q-side result: checked directly for emptiness instead of being printed.
merged_q = qpd.merge(left_q, right_q, how="inner", on="id", return_type="q")
num_rows = merged_q.shape[0]
assert num_rows == 0

# pandas-side result, checked against the pandas baseline.
expected_pd = left_pd.merge(right_pd, how="inner", on="id")
actual_pd = qpd.merge(left_q, right_q, how="inner", on="id", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


In [66]:
# Case: left merge where only id 2 has a match on the right.
# Expectation: all left rows are kept; unmatched rows get nulls in "b".
left_pd = pd.DataFrame({"id": [1, 2, 3], "a": [10, 20, 30]})
right_pd = pd.DataFrame({"id": [2], "b": [200]})
left_q = qpd.DataFrame(left_pd)
right_q = qpd.DataFrame(right_pd)

# q-side result, printed for inspection.
merged_q = qpd.merge(left_q, right_q, how="left", on="id", return_type="q")
qpd.print(merged_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = left_pd.merge(right_pd, how="left", on="id")
actual_pd = qpd.merge(left_q, right_q, how="left", on="id", return_type="p")
assert verify_correctness(expected_pd, actual_pd)

┌────┬────┬──────┐
│ id │ a  │ b    │
├────┼────┼──────┤
│ 1  │ 10 │ <NA> │
│ 2  │ 20 │ 200  │
│ 3  │ 30 │ <NA> │
└────┴────┴──────┘


In [67]:
# Case: right merge where id 2 exists only on the right.
# Expectation: all right rows are kept; unmatched rows get nulls in "a".
left_pd = pd.DataFrame({"id": [1], "a": [10]})
right_pd = pd.DataFrame({"id": [1, 2], "b": [100, 200]})
left_q = qpd.DataFrame(left_pd)
right_q = qpd.DataFrame(right_pd)

# q-side result, printed for inspection.
merged_q = qpd.merge(left_q, right_q, how="right", on="id", return_type="q")
qpd.print(merged_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = left_pd.merge(right_pd, how="right", on="id")
actual_pd = qpd.merge(left_q, right_q, how="right", on="id", return_type="p")
assert verify_correctness(expected_pd, actual_pd)

┌────┬──────┬─────┐
│ id │ a    │ b   │
├────┼──────┼─────┤
│ 1  │ 10   │ 100 │
│ 2  │ <NA> │ 200 │
└────┴──────┴─────┘


In [68]:
# Case: outer merge where the two sides overlap only on id 2.
# Expectation: the result covers the union of ids from both sides.
left_pd = pd.DataFrame({"id": [1, 2], "a": [10, 20]})
right_pd = pd.DataFrame({"id": [2, 3], "b": [200, 300]})
left_q = qpd.DataFrame(left_pd)
right_q = qpd.DataFrame(right_pd)

# q-side result, printed for inspection.
merged_q = qpd.merge(left_q, right_q, how="outer", on="id", return_type="q")
qpd.print(merged_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = left_pd.merge(right_pd, how="outer", on="id")
actual_pd = qpd.merge(left_q, right_q, how="outer", on="id", return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌────┬──────┬──────┐
│ id │ a    │ b    │
├────┼──────┼──────┤
│ 1  │ 10   │ <NA> │
│ 2  │ 20   │ 200  │
│ 3  │ <NA> │ 300  │
└────┴──────┴──────┘


In [69]:
# Case: inner merge on the two-column key (k1, k2).
# Expectation: rows match only when both key columns agree.
left_pd = pd.DataFrame({
    "k1": ["A", "A", "B"],
    "k2": [1, 2, 1],
    "a": [10, 20, 30],
})
right_pd = pd.DataFrame({
    "k1": ["A", "B"],
    "k2": [1, 1],
    "b": [100, 300],
})
left_q = qpd.DataFrame(left_pd)
right_q = qpd.DataFrame(right_pd)

# q-side result, printed for inspection.
merged_q = qpd.merge(left_q, right_q, how="inner", on=["k1", "k2"], return_type="q")
qpd.print(merged_q)

# pandas-side result, checked against the pandas baseline.
expected_pd = left_pd.merge(right_pd, how="inner", on=["k1", "k2"])
actual_pd = qpd.merge(left_q, right_q, how="inner", on=["k1", "k2"], return_type="p")
assert verify_correctness(expected_pd, actual_pd)


┌────┬────┬────┬─────┐
│ k1 │ k2 │ a  │ b   │
├────┼────┼────┼─────┤
│ A  │ 1  │ 10 │ 100 │
│ B  │ 1  │ 30 │ 300 │
└────┴────┴────┴─────┘


In [None]:
# merge with different key column names (left_on / right_on)
# Expected: rows are joined where id_left equals id_right, and the result
# agrees with the pandas baseline once the baseline's extra key column is removed.
left = pd.DataFrame({
    "id_left": [1, 2],
    "a": [10, 20]
})

right = pd.DataFrame({
    "id_right": [2, 3],
    "b": [200, 300]
})

q_left = qpd.DataFrame(left)
q_right = qpd.DataFrame(right)

q_res = qpd.merge(
    q_left, q_right,
    how="inner",
    left_on="id_left",
    right_on="id_right",
    return_type="q"
)

qpd.print(q_res)

# pandas keeps BOTH key columns when left_on and right_on name different
# columns, while the recorded q output for this cell shows only id_left, a, b.
# Drop id_right from the baseline so both sides have the same columns.
# NOTE(review): assumes the return_type="p" result has the same columns as the
# printed q result - confirm against qpd.merge.
pd_res = pd.merge(
    left, right,
    how="inner",
    left_on="id_left",
    right_on="id_right"
).drop(columns=["id_right"])

# Assert like every other merge cell (replaces the leftover debug prints).
assert verify_correctness(
    pd_res,
    qpd.merge(
        q_left, q_right,
        how="inner",
        left_on="id_left",
        right_on="id_right",
        return_type="p"
    )
)

┌─────────┬────┬─────┐
│ id_left │ a  │ b   │
├─────────┼────┼─────┤
│ 2       │ 20 │ 200 │
└─────────┴────┴─────┘


AssertionError: 