# qplib - a query language for pandas

The query language works by sequentially applying filter conditions and modification instructions to the dataframe.
Each condition/instruction starts with a connector ("%", "&", "/", "%%", "&&", "//", "$"), followed by some option flags, an operator and a value.
Each of these components is optional, with an associated default behaviour.

The examples will use a very small test dataset so that all filtering and modification is easily traceable.

Some instructions make use of colors which might not render depending on where you view the notebook (eg: github).

In [1]:
import pandas as pd
import numpy as np
import qplib as qp
from qplib import log

pd.set_option('display.max_columns', None)

df = qp.get_df()
df

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


# interactive mode

Use df.qi() to call the interactive mode if you want to try out the query language for yourself, or take a look through the examples in this notebook.

Please note that while the underlying query logic is quite heavily tested, the df.qi() user interface is not, and there might be visual bugs.

In [2]:
#Use the interactive mode in this cell to try out the query language,
#or take a look through the notebook for a walkthrough.

df.qi()

HBox(children=(Textarea(value='$verbosity=3\n$diff=None\n\n#Enter query code here,\n#or use the buttons to the…

HBox(children=(Output(),))

# filter/select

## columns




In [3]:
#Select the column called "name":
#(Multiple equivalent examples are shown to demonstrate default behaviour)

df.q('name')
df.q('%name')
df.q('%=name')
df.q('%==name')
df.q('% == name')

Unnamed: 0,name
0,John Doe
1,Jane Smith
2,Alice Johnson
3,Bob Brown
4,eva white
5,Frank miller
6,Grace TAYLOR
7,Harry Clark
8,IVY GREEN
9,JAck Williams


In [4]:
#select all columns containing the string "bp":
df.q('?bp')

Unnamed: 0,bp systole,bp diastole
0,20,80
1,130,85
2,,
3,140,90mmHg
4,135mmhg,
5,125,75
6,NAN,
7,122,
8,,95
9,130,0


In [5]:
#Multiple selection conditions can be used by combining them with "&" or "/".

#Either condition must be fulfilled:
df.q('name  /?bp')

Unnamed: 0,name,bp systole,bp diastole
0,John Doe,20,80
1,Jane Smith,130,85
2,Alice Johnson,,
3,Bob Brown,140,90mmHg
4,eva white,135mmhg,
5,Frank miller,125,75
6,Grace TAYLOR,NAN,
7,Harry Clark,122,
8,IVY GREEN,,95
9,JAck Williams,130,0


In [6]:
#Both conditions must be fulfilled:
df.q('?bp  &?systole')

Unnamed: 0,bp systole
0,20
1,130
2,
3,140
4,135mmhg
5,125
6,NAN
7,122
8,
9,130


In [7]:
#Notice the warning when no columns fulfill both conditions.
df.q('name  &?bp')

0,1,2,3,4,5
123,WARNING,"no columns fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-05-15 09:52:01.810389,0.683


0
1
2
3
4
5
6
7
8
9
10


In [8]:
#"%" creates a new selection, discarding the previous one:
df.q(r'name   /?bp   %id')

Unnamed: 0,ID
0,10001
1,10002
2,10003
3,20001
4,20002
5,20003
6,30001
7,30002
8,30003
9,30004


In [9]:
#Reset selection by selecting everything:
df.q(r'id  /name   %is any;')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


## row filtering

Row filter conditions use the same connector symbols as column conditions, but doubled, to make them easy to distinguish.

In [10]:
#Select all rows where the value in the "id" column is greater than 20000:
df.q(r'%id    %%>20000')

Unnamed: 0,ID
3,20001
4,20002
5,20003
6,30001
7,30002
8,30003
9,30004
10,30005


In [11]:
#Select rows based on multiple conditions for the same column:
df.q(r'%id    %%>20000    &&<30003')

Unnamed: 0,ID
3,20001
4,20002
5,20003
6,30001
7,30002


In [12]:
#Creating a new column selection does not change the row selection:
df.q(r'%id    %%>20000    &&<30003   %name')

#Equivalent but more readable version:
df.q(
    r"""
    %id     %%>20000    &&<30003
    %name
    """
    )

Unnamed: 0,name
3,Bob Brown
4,eva white
5,Frank miller
6,Grace TAYLOR
7,Harry Clark


In [13]:
#Now let's add a third column selection connected to the second one:
#(notice that the order of columns is not changed)
df.q(
    r"""
    %id     %%>20000    &&<30003
    %name
    /id
    """
    )


Unnamed: 0,ID,name
3,20001,Bob Brown
4,20002,eva white
5,20003,Frank miller
6,30001,Grace TAYLOR
7,30002,Harry Clark


In [14]:
#This behaviour can be used to select rows using conditions on multiple columns.
df.q(
    r"""
    %id     %%>20000    &&<30003
    %name   &&?bob
    /id
    """
    )

Unnamed: 0,ID,name
3,20001,Bob Brown


In [15]:
#Reset selection by selecting everything:
df.q(
    r"""
    %id      %%>20000    &&<30003
    %name    &&?bob
    /id
    is any;  %%is any;
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


## flags

A number of flags can be used to modify the behaviour of selection conditions.

In [16]:
#Negate a condition
df.q(r'%id    %%!>20000')

Unnamed: 0,ID
0,10001
1,10002
2,10003


In [17]:
#All values in the selected columns must fulfill the row filter condition:
df.q(r'weight  /height    %%all>0')

Unnamed: 0,height,weight
0,170,70.2


In [18]:
#Any value in the selected columns must fulfill the row filter condition (default behaviour):
df.q(r'weight  /height    %%any>10')

Unnamed: 0,height,weight
0,170,70.2
1,175.5cm,68
3,280,na
5,185,75kg
7,6ft 1in,80.3
9,,82
10,200,-65


In [19]:
#Select each value in the selected columns that fulfills the row filter condition:
#(using background color to highlight the selected values)
#(highlighting does not work in all notebook renderers, eg: github)
df.q(r'weight  /height    %%each>10   $bg=orange')

Unnamed: 0,height,weight
0,170,70.200000
1,175.5cm,68
3,280,na
5,185,75kg
7,6ft 1in,80.3
9,,82
10,200,-65


In [20]:
#Compare to the previous example but now with highlighting:
df.q(r'weight  /height    %%any>10   $bg=orange')

Unnamed: 0,height,weight
0,170,70.200000
1,175.5cm,68
3,280,na
5,185,75kg
7,6ft 1in,80.3
9,,82
10,200,-65


In [21]:
#The index must fulfill the row filter condition:
df.q(r'weight  /height    %%idx>5')

Unnamed: 0,height,weight
6,1,
7,6ft 1in,80.3
8,-10,130lbs
9,,82
10,200,-65


In [22]:
#Interpret the value for comparison as a regex:
df.q(r'name  %%regex=........')  #matches any name with 8 characters

Unnamed: 0,name
0,John Doe
10,john Doe


In [23]:
#Also works with substring search:

#Select all rows where the name contains "J" followed by any 3 characters and then whitespace:
df.q(r'name    %% regex ? J...\s')

Unnamed: 0,name
0,John Doe
1,Jane Smith
9,JAck Williams


In [24]:
#All selection flags:
qp.qlang.FLAGS.by_trait['select']

{"!: NEGATE",
 "all: ALL",
 "any: ANY",
 "col: COL_EVAL",
 "each: EACH",
 "idx: IDX",
 "load: LOAD_SELECTION",
 "regex: REGEX",
 "save: SAVE_SELECTION",
 "strict: STRICT"}

## type filtering

The query language was designed to handle very messy datasets where sometimes no strict typing (or any typing at all!) is enforced during data entry. Therefore, operators like "is date;" do not filter based on the types in the dataset (sometimes all values are strings), but rather on whether it makes sense for a value to be of a certain type. Obviously, what makes sense depends on the domain, and the assumptions made by qplib might not align with your use case.

Using the flag "strict" switches to strict type filtering.

In [25]:
#Let's take a look at our dirty data again:
df

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [26]:
#We can see that "date of birth" is not a date, but a string.
#Let's see if qplib still recognizes them as dates:
df.q(r'date of birth    %%is date;')  #note that unary operators end with a semicolon

  result = pd.to_datetime(x, dayfirst=True)


Unnamed: 0,date of birth
0,1995-01-02
1,1990/09/14
2,1985.08.23
3,19800406
4,05-11-2007
5,06-30-1983
6,28-05-1975
7,1960Mar08
8,1955-Jan-09
9,1950 Sep 10


In [27]:
#both strings '40.0' and '30' are recognized as ints by default
df.q(r'age  %%is int;')

Unnamed: 0,age
0,-25.0
1,30.0
4,40.0
10,35.0


In [28]:
#strict mode does not recognize those strings as ints
df.q(r'age  %%strict is int;')

Unnamed: 0,age
0,-25
10,35


In [29]:
#70.2 is not treated as an int
df.q(r'weight  %%!is int;')

Unnamed: 0,weight
0,70.2
2,72.5lb
3,na
4,
5,75kg
6,
7,80.3
8,130lbs


## undefined behaviour

Due to expecting very messy data, qplib uses a type of [three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic) utilizing "True", "False" and "undefined". This means that, for example, numeric operators for numbers can be used on columns which also contain strings. As a result, ">=" is not necessarily the same as "!<=" (inversion of "<=").

In [30]:
df.q(r'height  %%>0')

Unnamed: 0,height
0,170
3,280
5,185
6,1
10,200


In [31]:
df.q(r'height  %%<0')

Unnamed: 0,height
8,-10


In [32]:
df.q(r'height  %%!>0')

Unnamed: 0,height
1,175.5cm
2,
4,
7,6ft 1in
8,-10
9,


In [33]:
#notice that only -10 is recognized as both <0 (orange) and !>0 (red)
#since the other values are not numbers, they can be !>0 but not <0
df.q(
    r"""
    height
        %%>0   $bg=lime
        %%<0   $bg=orange
        %%!>0  $color=red
        %%is any;
    """
    )

Unnamed: 0,height
0,170
1,175.5cm
2,
3,280
4,
5,185
6,1
7,6ft 1in
8,-10
9,


## saving selections

The simple linear syntax does not allow for nesting of conditions, but the same result can be achieved by saving the intermediate results in a variable.

In [34]:
#Selections can be saved using the "save" flag
df.q(
    r"""
    %id         %%>20000    &&<30003    %%save=1   #save selection to variable "1"
    %name       %%?bob      //?grace    %%save=2   #save selection to variable "2"
    %%load=1    &&load=2   #load both selections and combine them
    /id
    """
    )

Unnamed: 0,ID,name
3,20001,Bob Brown
6,30001,Grace TAYLOR


## more operators

In [35]:
#All operators for selection/filtering:
qp.qlang.OPERATORS.by_trait['select']

{"<: SMALLER",
 "<=: SMALLER_EQUAL",
 "==: EQUALS",
 ">: BIGGER",
 ">=: BIGGER_EQUAL",
 "?: CONTAINS",
 "is any;: IS_ANY",
 "is bool;: IS_BOOL",
 "is date;: IS_DATE",
 "is datetime;: IS_DATETIME",
 "is first;: IS_FIRST",
 "is float;: IS_FLOAT",
 "is int;: IS_INT",
 "is last;: IS_LAST",
 "is na;: IS_NA",
 "is nk;: IS_NK",
 "is no;: IS_NO",
 "is num;: IS_NUM",
 "is str;: IS_STR",
 "is unique;: IS_UNIQUE",
 "is yes;: IS_YES",
 "is yn;: IS_YN",
 "trim;: TRIM",
 "~: EVAL"}

In [36]:
#select repeated values
df.q(r'diabetes  %%!is unique;')

Unnamed: 0,diabetes
0,No
3,No
5,Yes
10,Yes


In [37]:
#select only first occurrence of any value
#(automatically true for all unique values)
df.q(r'diabetes  %%is first;')

Unnamed: 0,diabetes
0,No
1,yes
2,
4,Y
5,Yes
6,NO
7,
8,
9,n


In [38]:
#select and highlight all na values
#(highlighting does not work in all notebook renderers, eg: github)
df.q(r'%%each is na;  $bg=orange')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35
10,30005,john Doe,1945 October 11,35,female,200,-65,45,,Normal,Yes,40ml


In [39]:
#remove all columns where no rows were selected
df.q(r'%%each is na;   %trim;  $bg=orange')

Unnamed: 0,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
1,30,F,175.5cm,68,130,85,Highe,yes,
2,,Female,,72.5lb,,,,,15 mg once a day
3,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
6,,ff,1,,NAN,,Normal,NO,
7,unk,,6ft 1in,80.3,122,,,,
8,,,-10,130lbs,,95,high,,30 MG
9,unknown,Mal,,82,130,0,,n,35
10,35,female,200,-65,45,,Normal,Yes,40ml


# modify

All modification instructions use the connector "$" and do not modify data in place (by default). All modification instructions which could affect the original df create and return a copy instead.

## format

In [40]:
#change color:
#(color does not work in all notebook renderers, eg: github)
df.q('$color=red')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [41]:
#change background color:
#(color does not work in all notebook renderers, eg: github)
df.q('$bg=orange')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [42]:
#Use to highlight selection:
#(color does not work in all notebook renderers, eg: github)
df.q(
    r"""
    height      %%>180    $bg=orange
    is any;     %%is any;
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [43]:
#Highlight all rows where any value is NA:
#(color does not work in all notebook renderers, eg: github)
df.q('%%any is na;  $bg=orange')
df.q('%%is na;  $bg=orange')  #default behaviour is equivalent to using the "any" flag

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35
10,30005,john Doe,1945 October 11,35,female,200,-65,45,,Normal,Yes,40ml


In [44]:
#Highlight each individual NA value:
#(color does not work in all notebook renderers, eg: github)
df.q('%%each is na;  $bg=orange')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35
10,30005,john Doe,1945 October 11,35,female,200,-65,45,,Normal,Yes,40ml


In [45]:
#change alignment:
df.q(r'age %%!is int;  $align=left  %%is any;')

Unnamed: 0,age
0,-25
1,30
2,
3,
4,40.0
5,forty-five
6,
7,unk
8,
9,unknown


In [46]:
#change width:
df.q(r'age $width=200px')

Unnamed: 0,age
0,-25
1,30
2,
3,
4,40.0
5,forty-five
6,
7,unk
8,
9,unknown


## values

Modification is applied to all values in the current selection.

In [47]:
#Modify whole column:
df.q('age  $val=na')
df.q('age  $na')  #default behaviour is equivalent to using the "val" flag and the "=" operator

Unnamed: 0,age
0,na
1,na
2,na
3,na
4,na
5,na
6,na
7,na
8,na
9,na


In [48]:
#Set all NA values to "NA":
df.q(r'%%each is na;  $val=NA')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,,Y,20 Mg
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35
10,30005,john Doe,1945 October 11,35,female,200,-65,45,,Normal,Yes,40ml


## headers

In [49]:
df.q('date of birth   $header=dob')

Unnamed: 0,dob
0,1995-01-02
1,1990/09/14
2,1985.08.23
3,19800406
4,05-11-2007
5,06-30-1983
6,28-05-1975
7,1960Mar08
8,1955-Jan-09
9,1950 Sep 10


## column

In [50]:
#Create and fill a new column:
df.q('$new=abc')

Unnamed: 0,new1
0,abc
1,abc
2,abc
3,abc
4,abc
5,abc
6,abc
7,abc
8,abc
9,abc


In [51]:
#Create, fill and rename a new column:
df.q('$new=abc  $header=new text')

Unnamed: 0,new text
0,abc
1,abc
2,abc
3,abc
4,abc
5,abc
6,abc
7,abc
8,abc
9,abc


# logging

qplib has a lightweight logging system, somewhere between actual logging and using print().

In [52]:
#logs from the current session (since importing qplib) can be found here:
logs = qp.log().copy()
logs

Unnamed: 0,level,text,context,time,delta_ms
0,DEBUG,df was checked. no problems found,qp.qlang.check_df,2025-05-15 09:52:01.474501,0.000
1,TRACE,transformed code into raw instructions:\nInstr...,qp.qlang.tokenize,2025-05-15 09:52:01.475323,0.856
2,TRACE,"found ""CONNECTORS.MODIFY"" in ""$verbosity=3""",qp.qlang.extract_symbol,2025-05-15 09:52:01.475462,0.167
3,TRACE,"found ""FLAGS.VERBOSITY"" in ""verbosity=3""",qp.qlang.extract_symbol,2025-05-15 09:52:01.475525,0.072
4,TRACE,"found ""OPERATORS.SET"" in ""=3""",qp.qlang.extract_symbol,2025-05-15 09:52:01.475568,0.051
...,...,...,...,...,...
1260,DEBUG,"df will be copied since instruction ""$header=n...",qp.qlang.parse,2025-05-15 09:52:03.630533,0.047
1261,TRACE,"parsed instruction: ""$header=new text""",qp.qlang.parse,2025-05-15 09:52:03.630568,0.042
1262,TRACE,"instruction ""$header=new text"" is valid",qp.qlang.validate,2025-05-15 09:52:03.632026,1.484
1263,DEBUG,"applying instruction:\n""Instruction:\n\tline_n...",qp.qlang.query,2025-05-15 09:52:03.632097,0.080


In [53]:
#since the logs are stored in a dataframe, we can use qplib to filter them:
logs.q(r'level  %%warning   $bg=orange  %is any;')

Unnamed: 0,level,text,context,time,delta_ms
122,WARNING,"no columns fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-05-15 09:52:01.810389,0.683


In [54]:
#clear logs:
qp.log(clear=True)
logs = qp.log().copy()
logs

cleared all logs in qp.util.logs.


In [55]:
#by default, all levels are logged, but only warnings and errors are shown while using qplib:
df.q('name  &?bp')

0,1,2,3,4,5
17,WARNING,"no columns fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-05-15 09:52:03.893181,1.61


0
1
2
3
4
5
6
7
8
9
10


In [56]:
#show all log levels:
df.q(
    r"""
    $verbosity=5
    name  &?bp
    """
    )



0,1,2,3,4,5
28,TRACE,instruction applied,qp.qlang.query,2025-05-15 09:52:03.950840,0.045


0,1,2,3,4,5
29,TRACE,"found ""CONNECTORS.NEW_SELECT_COLS"" in ""%name """,qp.qlang.extract_symbol,2025-05-15 09:52:03.956839,6.051


0,1,2,3,4,5
30,TRACE,"no operator found in ""%name "". using default """"=: SET""""",qp.qlang.parse,2025-05-15 09:52:03.966624,9.823


0,1,2,3,4,5
31,TRACE,"""""=: SET"""" is interpreted as """"==: EQUALS"""" for selection instruction",qp.qlang.parse,2025-05-15 09:52:03.973820,7.229


0,1,2,3,4,5
32,TRACE,"parsed instruction: ""%name """,qp.qlang.parse,2025-05-15 09:52:03.982559,8.774


0,1,2,3,4,5
33,TRACE,"instruction ""%name "" is valid",qp.qlang.validate,2025-05-15 09:52:03.992645,10.13


0,1,2,3,4,5
34,DEBUG,"applying instruction: ""Instruction:  line_num: 2  code: %name connector: ""%: NEW_SELECT_COLS""  operator: ""==: EQUALS""  value: name  function: _select_cols""",qp.qlang.query,2025-05-15 09:52:03.998540,5.928


0,1,2,3,4,5
35,TRACE,"value ""name"" is treated as type ""str"" for comparison",qp.qlang._filter_series,2025-05-15 09:52:04.005755,7.239


0,1,2,3,4,5
36,TRACE,instruction applied,qp.qlang.query,2025-05-15 09:52:04.015173,9.461


0,1,2,3,4,5
37,TRACE,"found ""CONNECTORS.AND_SELECT_COLS"" in ""&?bp""",qp.qlang.extract_symbol,2025-05-15 09:52:04.021173,6.031


0,1,2,3,4,5
38,TRACE,"found ""OPERATORS.CONTAINS"" in ""?bp""",qp.qlang.extract_symbol,2025-05-15 09:52:04.028117,7.084


0,1,2,3,4,5
39,TRACE,"parsed instruction: ""&?bp""",qp.qlang.parse,2025-05-15 09:52:04.035377,7.286


0,1,2,3,4,5
40,TRACE,"instruction ""&?bp"" is valid",qp.qlang.validate,2025-05-15 09:52:04.041136,5.793


0,1,2,3,4,5
41,DEBUG,"applying instruction: ""Instruction:  line_num: 2  code: &?bp  connector: ""&: AND_SELECT_COLS""  operator: ""?: CONTAINS""  value: bp  function: _select_cols""",qp.qlang.query,2025-05-15 09:52:04.048182,7.075


0,1,2,3,4,5
42,WARNING,"no columns fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-05-15 09:52:04.054310,6.153


0,1,2,3,4,5
43,TRACE,instruction applied,qp.qlang.query,2025-05-15 09:52:04.059778,5.525


0
1
2
3
4
5
6
7
8
9
10


# syntax symbols

Syntax symbols and their relations are defined in a csv file which gets read into a dataframe when importing qplib.

In [57]:
#all syntax symbols (and their traits):
defs = qp.qlang.DEFINITIONS
defs

Unnamed: 0,type,glyph,description,select,select_rows,select_rows_scope,select_cols,modify,unary,conversion,settings,metadata,format,copy_df,is_type,NEW_SELECT_ROWS,AND_SELECT_ROWS,OR_SELECT_ROWS,NEW_SELECT_COLS,AND_SELECT_COLS,OR_SELECT_COLS,MODIFY,BIGGER_EQUAL,SMALLER_EQUAL,BIGGER,SMALLER,EQUALS,CONTAINS,TRIM,IS_ANY,IS_STR,IS_INT,IS_FLOAT,IS_NUM,IS_BOOL,IS_DATETIME,IS_DATE,IS_NA,IS_NK,IS_YN,IS_YES,IS_NO,IS_UNIQUE,IS_FIRST,IS_LAST,ADD,SET,EVAL,SORT,TO_STR,TO_INT,TO_FLOAT,TO_NUM,TO_BOOL,TO_DATETIME,TO_DATE,TO_NA,TO_NK,TO_YN,NEGATE,ANY,ALL,IDX,EACH,STRICT,SAVE_SELECTION,LOAD_SELECTION,VERBOSITY,DIFF,METADATA,TAG_METADATA,COLOR,BACKGROUND_COLOR,ALIGN,WIDTH,CSS,VAL,HEADER,NEW_COL,COL_EVAL,REGEX
select,trait,,,3,1,1,1,0,1,0,0,0,0,0,1,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,2,2
select_rows,trait,,,1,3,1,0,0,1,0,0,0,0,0,1,2,2,2,0,0,0,0,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,2,2
select_rows_scope,trait,,,1,1,3,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,2,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
select_cols,trait,,,1,0,0,3,0,1,0,0,0,0,0,1,0,0,0,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,2,2
modify,trait,,,0,0,0,0,3,1,1,1,1,1,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VAL,flag,val,modify selected values,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0
HEADER,flag,header,modify the headers of the selected columns,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0
NEW_COL,flag,new,create a new column with the selected values,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0
COL_EVAL,flag,col,"when used with the eval operator, evaluates on...",2,2,1,2,2,0,0,0,0,0,2,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0


In [None]:
#a value of 2 means that a symbol has this specific trait.
#eg: all of the following symbols are used for selection:
defs.q(r'select  %%2')

Unnamed: 0,select
NEW_SELECT_ROWS,2
AND_SELECT_ROWS,2
OR_SELECT_ROWS,2
NEW_SELECT_COLS,2
AND_SELECT_COLS,2
OR_SELECT_COLS,2
BIGGER_EQUAL,2
SMALLER_EQUAL,2
BIGGER,2
SMALLER,2


In [None]:
#a value of 1 means that 2 traits or symbols are compatible with each other.
#eg: the following symbols can be used with the negation flag:
defs.q(r'NEGATE  %%1')

Unnamed: 0,NEGATE
select_rows_scope,1
modify,1
unary,1
is_type,1
NEW_SELECT_ROWS,1
AND_SELECT_ROWS,1
OR_SELECT_ROWS,1
NEW_SELECT_COLS,1
AND_SELECT_COLS,1
OR_SELECT_COLS,1


# other qplib utilities

## qp.diff

Creates colored diff output for two dataframes. Please note that not all notebook renderers support colored output; e.g. when viewing on GitHub, the following examples will lose much of their usefulness.

color code:
- <font color="#f73434">red</font>: deleted row or column (missing in new df)
- <font color="#6dae51">green</font>: added row or column (present in new df but not in old df)
- <font color="orange">orange</font>: changed row or column (present in both dfs but with different values)
- <font color="#f7746a">light red</font>: deleted value
- <font color="#c0e7b0">light green</font>: added value
- <font color="#f7d67c">light orange</font>: changed value

In [2]:
import qplib as qp

df_new, df_old = qp.get_dfs()

print('df_new:')
display(df_new)

print('df_old:')
display(df_old)

print('mode=new:')
display(qp.diff(df_new, df_old, uid='uid', mode='new'))

print('mode=new+:')
display(qp.diff(df_new, df_old, uid='uid', mode='new+'))

print('mode=old:')
display(qp.diff(df_new, df_old, uid='uid', mode='old'))

print('mode=mix:')
display(qp.diff(df_new, df_old, uid='uid', mode='mix'))


df_new:


Unnamed: 0,uid,d,b,a
y,y,2,2,0.0
x2,x2,1,1,1.0
z,z,3,3,


df_old:


Unnamed: 0,uid,a,b,c
x,x,1,1.0,1
y,y,2,2.0,2
z,z,3,,3


mode=new:


Unnamed: 0_level_0,meta,uid,d,b,a
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
y,vals changed: 1,y,2,2,0.0
x2,added row,x2,1,1,1.0
z,vals added: 1 vals removed: 1,z,3,3,


mode=new+:


Unnamed: 0_level_0,meta,uid,d,old: d,b,old: b,a,old: a
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
y,vals changed: 1,y,2,,2,,0.0,2.0
x2,added row,x2,1,,1,,1.0,
z,vals added: 1 vals removed: 1,z,3,,3,,,3.0


mode=old:


Unnamed: 0_level_0,meta,uid,a,b,c
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
x,removed row,x,1,1.0,1
y,vals changed: 1,y,2,2.0,2
z,vals added: 1 vals removed: 1,z,3,,3


mode=mix:


Unnamed: 0_level_0,meta,uid,d,b,a,c
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
y,vals changed: 1,y,2.0,2,0.0,2.0
x2,added row,x2,1.0,1,1.0,
z,vals added: 1 vals removed: 1,z,3.0,3,,3.0
x,removed row,x,,1,1.0,1.0
