# qplib - a query language for pandas

The query language works by sequentially applying filter conditions and modification instructions to the dataframe.  
Each condition/instruction starts with one of these connectors:  
- "$"
- "%"
- "%%"
- "%%%"
- "&"
- "&&"
- "&&&"
- "/"
- "//"
- "///"

followed by some option flags, an operator and a value.
Each of these components is optional, with an associated default behaviour.

The examples will use a very small test dataset so that all filtering and modification is easily traceable.

Some instructions make use of colors which might not render depending on where you view the notebook (eg: github).

In [83]:
import pandas as pd
import numpy as np
import qplib as qp
from qplib import log

pd.set_option('display.max_columns', None)

df = qp.get_df()
df

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


# interactive mode

Use df.qi() to call the interactive mode if you want to try out the query language for yourself, or take a look through the examples in this notebook.

Please note that while the underlying query logic is quite heavily tested, the df.qi() user interface is not, and there might be visual bugs.

In [84]:
#Use the interactive mode in this cell to try out the query language,
#or take a look through the notebook for a walkthrough.

df.qi()

HBox(children=(Textarea(value='$verbosity=3\n$diff=None\n\n#Enter query code here,\n#or use the buttons to the…

HBox(children=(Output(),))

# filter/select

## cols




In [85]:
#Select the column called "name":
#(Multiple equivalent examples are shown to demonstrate default behaviour)

df.q('name')
df.q('%name')
df.q('%=name')
df.q('%==name')
df.q('% == name')

Unnamed: 0,name
0,John Doe
1,Jane Smith
2,Alice Johnson
3,Bob Brown
4,eva white
5,Frank miller
6,Grace TAYLOR
7,Harry Clark
8,IVY GREEN
9,JAck Williams


In [86]:
#select all columns containing the string "bp":
df.q('?bp')

Unnamed: 0,bp systole,bp diastole
0,20,80
1,130,85
2,,
3,140,90mmHg
4,135mmhg,
5,125,75
6,NAN,
7,122,
8,,95
9,130,0


In [87]:
#Multiple selection conditions can be used by combining them with "&" or "/".

#Either condition must be fulfilled:
df.q('name  /?bp')

Unnamed: 0,name,bp systole,bp diastole
0,John Doe,20,80
1,Jane Smith,130,85
2,Alice Johnson,,
3,Bob Brown,140,90mmHg
4,eva white,135mmhg,
5,Frank miller,125,75
6,Grace TAYLOR,NAN,
7,Harry Clark,122,
8,IVY GREEN,,95
9,JAck Williams,130,0


In [88]:
#Both conditions must be fulfilled:
df.q('?bp  &?systole')

Unnamed: 0,bp systole
0,20
1,130
2,
3,140
4,135mmhg
5,125
6,NAN
7,122
8,
9,130


In [89]:
#Notice the warning when no columns fulfill both conditions.
df.q('name  &?bp')

0,1,2,3,4,5
18,WARNING,"no cols fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-10-21 14:45:48.596303,1011188.167


0
1
2
3
4
5
6
7
8
9
10


In [90]:
#"%" creates a new selection, discarding the previous one:
df.q(r'name   /?bp   %id')

Unnamed: 0,ID
0,10001
1,10002
2,10003
3,20001
4,20002
5,20003
6,30001
7,30002
8,30003
9,30004


In [91]:
#Reset selection by selecting everything:
df.q(r'id  /name   %is any;')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


## rows

Row filter conditions use the same connector symbols as column conditions, but doubled (e.g. "%%" instead of "%"), so that they are easy to distinguish.

In [92]:
#Select all rows where the value in the "id" column is greater than 20000:
df.q(r'%id    %%>20000')

Unnamed: 0,ID
3,20001
4,20002
5,20003
6,30001
7,30002
8,30003
9,30004
10,30005


In [93]:
#Select rows based on multiple conditions for the same column:
df.q(r'%id    %%>20000    &&<30003')

Unnamed: 0,ID
3,20001
4,20002
5,20003
6,30001
7,30002


In [94]:
#Creating a new column selection does not change the row selection:
df.q(r'%id    %%>20000    &&<30003   %name')

#suggested formatting for longer queries:
df.q(
    r"""
    %id     %%>20000    &&<30003
    %name
    """
    )

Unnamed: 0,name
3,Bob Brown
4,eva white
5,Frank miller
6,Grace TAYLOR
7,Harry Clark


In [95]:
#Now let's add a third column selection connected to the second one:
#(notice that the order of columns is not changed)
df.q(
    r"""
    %id     %%>20000    &&<30003
    %name
    /id
    """
    )


Unnamed: 0,ID,name
3,20001,Bob Brown
4,20002,eva white
5,20003,Frank miller
6,30001,Grace TAYLOR
7,30002,Harry Clark


In [96]:
#This behaviour can be used to select rows using conditions on multiple columns.
df.q(
    r"""
    %id     %%>20000    &&<30003
    %name   &&?bob
    /id
    """
    )

Unnamed: 0,ID,name
3,20001,Bob Brown


In [97]:
#Reset selection by selecting everything:
df.q(
    r"""
    %id      %%>20000    &&<30003
    %name    &&?bob
    /id
    is any;  %%is any;
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


## vals

In [98]:
#Select all na values and highlight them using orange background color:
#(highlighting does not work in all notebook renderers, eg: github)
df.q(r'%%%is na;  $bg=orange')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [99]:
#Instead of highlighting we could also change them
#to an easily identifiable value for demonstration:
df.q(r'%%%is na;  $vals=!!!!!!!!!!!!!')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,!!!!!!!!!!!!!
2,10003,Alice Johnson,1985.08.23,!!!!!!!!!!!!!,Female,!!!!!!!!!!!!!,72.5lb,!!!!!!!!!!!!!,!!!!!!!!!!!!!,!!!!!!!!!!!!!,!!!!!!!!!!!!!,15 mg once a day
3,20001,Bob Brown,19800406,!!!!!!!!!!!!!,Male,280,!!!!!!!!!!!!!,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,!!!!!!!!!!!!!,!!!!!!!!!!!!!,135mmhg,!!!!!!!!!!!!!,!!!!!!!!!!!!!,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,!!!!!!!!!!!!!,ff,1,!!!!!!!!!!!!!,!!!!!!!!!!!!!,!!!!!!!!!!!!!,Normal,NO,!!!!!!!!!!!!!
7,30002,Harry Clark,1960Mar08,unk,!!!!!!!!!!!!!,6ft 1in,80.3,122,!!!!!!!!!!!!!,!!!!!!!!!!!!!,!!!!!!!!!!!!!,!!!!!!!!!!!!!
8,30003,IVY GREEN,1955-Jan-09,!!!!!!!!!!!!!,!!!!!!!!!!!!!,-10,130lbs,!!!!!!!!!!!!!,95,high,!!!!!!!!!!!!!,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,!!!!!!!!!!!!!,82,130,0,!!!!!!!!!!!!!,n,35


In [100]:
#select values between 0 and 100:
df.q(r'%%%>0;   &&&<100   $bg=orange')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [101]:
#After selecting values, "trim;" can be used to
#remove all columns in which no values were selected:
df.q(r'%%%>0;   &&&<100   $bg=orange  %trim;')

Unnamed: 0,age,height,weight,bp systole,bp diastole,dose
0,-25,170,70.200000,20,80,10kg
1,30,175.5cm,68,130,85,
2,,,72.5lb,,,15 mg once a day
3,,280,na,140,90mmHg,20mg
4,40.0,,,135mmhg,,20 Mg
5,forty-five,185,75kg,125,75,25g
6,,1,,NAN,,
7,unk,6ft 1in,80.3,122,,
8,,-10,130lbs,,95,30 MG
9,unknown,,82,130,0,35


In [102]:
#This also works for rows:
df.q(r'%%%>0;   &&&<100   $bg=orange   %%trim;')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80.0,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85.0,Highe,yes,
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75.0,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95.0,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0.0,,n,35
10,30005,john Doe,1945 October 11,35,female,200,-65,45,,Normal,Yes,40ml


In [103]:
#Or both:
df.q(r'%%%>0;   &&&<100   $bg=orange  %trim;   %%trim;')

Unnamed: 0,age,height,weight,bp systole,bp diastole,dose
0,-25,170,70.200000,20,80.0,10kg
1,30,175.5cm,68,130,85.0,
4,40.0,,,135mmhg,,20 Mg
5,forty-five,185,75kg,125,75.0,25g
6,,1,,NAN,,
7,unk,6ft 1in,80.3,122,,
8,,-10,130lbs,,95.0,30 MG
9,unknown,,82,130,0.0,35
10,35,200,-65,45,,40ml


In [104]:
#Only select values in a subset of columns by
#first selecting the rows and cols of the subset
#and then using a value selection:
df.q(
    r"""
    %name  /weight  /height
        %%%>0;   &&&<100
        $bg=orange
        
    %is any;
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [105]:
#doing it the other way round will first select values
#in the whole df, then select the subset of rows and cols
#without changing the value selection:
df.q(
    r"""
    %%%>0;   &&&<100
    $bg=orange

    %name  /weight  /height
    
    %is any;
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [106]:
#this means that "%trim;" will still be based on the originally
#selected values, not on the current row/col selection:
df.q(
    r"""
    %%%>0;   &&&<100
    $bg=orange

    %name  /weight  /height
    %trim;
    """
    )

Unnamed: 0,age,height,weight,bp systole,bp diastole,dose
0,-25,170,70.200000,20,80,10kg
1,30,175.5cm,68,130,85,
2,,,72.5lb,,,15 mg once a day
3,,280,na,140,90mmHg,20mg
4,40.0,,,135mmhg,,20 Mg
5,forty-five,185,75kg,125,75,25g
6,,1,,NAN,,
7,unk,6ft 1in,80.3,122,,
8,,-10,130lbs,,95,30 MG
9,unknown,,82,130,0,35


In [107]:
#but, other connectors can be used to
#combine "trim;" with the current selection:
df.q(
    r"""
    %%%>0;   &&&<100
    $bg=orange

    %name  /weight  /height
    &trim;
    """
    )

Unnamed: 0,height,weight
0,170,70.200000
1,175.5cm,68
2,,72.5lb
3,280,na
4,,
5,185,75kg
6,1,
7,6ft 1in,80.3
8,-10,130lbs
9,,82


In [108]:
#this capability to use the 3 layers of selection (rows, cols, vals)
#independently or in combination allows for selecting various
#values across the df based on different criteria.
#e.g. let's select various types of suspicious values,
#add them all to a saved selection which we can load
#and highlight at the end:
df.q(
    r"""
    %%%is na;  $save=1

    %age  /height  /weight  /?bp
        ///!is num;
        ///<0
        $save+=1

    gender
        %%%male   ///m   ///female   ///f   ///other
        %%%invert;
        $vals save+=1

    cholesterol
        %%%normal   ///high   ///low   ///good   ///bad
        %%%invert;
        $vals save+=1

    is any;  %%is any;
    %%%load=1   $bg=orange
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


## flags

A number of flags can be used to modify the behaviour of selection conditions.

In [109]:
#Negate a condition
df.q(r'%id    %%!>20000')

Unnamed: 0,ID
0,10001
1,10002
2,10003


In [110]:
#All values in the selected columns must fulfill the row filter condition:
df.q(r'weight  /height    %%all>0')

Unnamed: 0,height,weight
0,170,70.2


In [111]:
#Any value in the selected columns must fulfill the row filter condition (default behaviour):
df.q(r'weight  /height    %%any>10')

Unnamed: 0,height,weight
0,170,70.2
1,175.5cm,68
3,280,na
5,185,75kg
7,6ft 1in,80.3
9,,82
10,200,-65


In [112]:
#The index must fulfill the row filter condition:
df.q(r'weight  /height    %%idx>5')

Unnamed: 0,height,weight
6,1,
7,6ft 1in,80.3
8,-10,130lbs
9,,82
10,200,-65


In [113]:
#Interpret the value for comparison as a regex:
df.q(r'name  %%regex=........')  #matches any name with 8 characters

Unnamed: 0,name
0,John Doe
10,john Doe


In [114]:
#Also works with substring search:

#Select all rows where the name contains "J" followed by any 3 characters and then whitespace:
df.q(r'name    %% regex ? J...\s')

Unnamed: 0,name
0,John Doe
1,Jane Smith
9,JAck Williams


In [115]:
#All selection flags:
qp.qlang.FLAGS.by_trait['select']

{<"!": NEGATE>,
 <"all": ALL>,
 <"any": ANY>,
 <"col": COL_EVAL>,
 <"idx": IDX>,
 <"load": LOAD_SELECTION>,
 <"regex": REGEX>,
 <"strict": STRICT>}

## type filtering

The query language was designed to handle very messy datasets where sometimes no strict typing (or any typing at all!) is enforced during data entry. Therefore, operators like "is date;" do not filter based on the types in the dataset (sometimes all values are strings), but rather on whether it makes sense for a value to be of a certain type. Obviously, what makes sense depends on the domain, and the assumptions made by qplib might not align with your use case.

Using the flag "strict" switches to strict type filtering.

In [116]:
#Let's take a look at our dirty data again:
df

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [117]:
#We can see that "date of birth" is not a date, but a string.
#Let's see if qplib still recognizes them as dates:
df.q(r'date of birth    %%is date;')  #note that unary operators end with a semicolon

  result = pd.to_datetime(x, dayfirst=True)


Unnamed: 0,date of birth
0,1995-01-02
1,1990/09/14
2,1985.08.23
3,19800406
4,05-11-2007
5,06-30-1983
6,28-05-1975
7,1960Mar08
8,1955-Jan-09
9,1950 Sep 10


In [118]:
#both strings '40.0' and '30' are recognized as ints by default
df.q(r'age  %%is int;')

Unnamed: 0,age
0,-25.0
1,30.0
4,40.0
10,35.0


In [119]:
#strict mode does not recognize those strings as ints
df.q(r'age  %%strict is int;')

Unnamed: 0,age
0,-25
10,35


In [120]:
#70.2 is not treated as an int
df.q(r'weight  %%!is int;')

Unnamed: 0,weight
0,70.2
2,72.5lb
3,na
4,
5,75kg
6,
7,80.3
8,130lbs


## undefined behaviour

Because it expects very messy data, qplib uses a type of [three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic) utilizing "True", "False" and "undefined". This means that, for example, numeric operators can be used on columns which also contain strings. As a result, ">" is not necessarily the same as "!<=" (inversion of "<="): a non-numeric value is neither ">" nor "<=" a number, but it does fulfill "!<=".

In [121]:
df.q(r'height  %%>0')

Unnamed: 0,height
0,170
3,280
5,185
6,1
10,200


In [122]:
df.q(r'height  %%<0')

Unnamed: 0,height
8,-10


In [123]:
df.q(r'height  %%!>0')

Unnamed: 0,height
1,175.5cm
2,
4,
7,6ft 1in
8,-10
9,


In [124]:
#notice that only -10 is recognized as both <0 (orange) and !>0 (red)
#since the other values are not numbers, they can be !>0 but not <0
df.q(
    r"""
    height
        %%>0   $bg=lime
        %%<0   $bg=orange
        %%!>0  $color=red
        %%is any;
    """
    )

Unnamed: 0,height
0,170
1,175.5cm
2,
3,280
4,
5,185
6,1
7,6ft 1in
8,-10
9,


In [125]:
#The same result but marking values via a second meta col
#in case the notebook renderer does not support highlighting:
df.q(
    r"""
    height  %%>0    $meta+=>0 <br>
    height  %%<0    $meta+=<0 <br>
    height  %%!>0   $meta+=!>0 <br>
    %%is any;
    $align=right
    /=meta
    """
    )

0,1,2,3,4,5
19,INFO,"no metadata col found in dataframe. creating new col named ""meta""",qp.qlang._modify_metadata,2025-10-21 14:45:51.042160,2445.865


Unnamed: 0,height,meta
0,170,>0
1,175.5cm,!>0
2,,!>0
3,280,>0
4,,!>0
5,185,>0
6,1,>0
7,6ft 1in,!>0
8,-10,<0 !>0
9,,!>0


## saving selections

The simple linear syntax does not allow for nesting of conditions, but the same result can be achieved by saving the intermediate results in a variable.

In [126]:
#Selections can be saved using the "save" flag
df.q(
    r"""
    %id         %%>20000    &&<30003    $save=1   #save selection to variable "1"
    %name       %%?bob      //?grace    $save=2   #save selection to variable "2"
    %%load=1    &&load=2   #load both row selections and combine them
    /id
    """
    )

Unnamed: 0,ID,name
3,20001,Bob Brown
6,30001,Grace TAYLOR


In [127]:
#The "save" flag saves the current col, row, val selection.
#loading just the col selection:
df.q(
    r"""
    %height  /weight
        %%all is num;
            %%%>0
    $save=1
    is any;  %%is any;

    %load=1
    """
    )

Unnamed: 0,height,weight
0,170,70.2
1,175.5cm,68
2,,72.5lb
3,280,na
4,,
5,185,75kg
6,1,
7,6ft 1in,80.3
8,-10,130lbs
9,,82


In [128]:
#loading row selection:
df.q(
    r"""
    %height  /weight
        %%all is num;
            %%%>0
    $save=1
    is any;  %%is any;

    %%load=1
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170.0,70.2,20,80.0,Normal,No,10kg
6,30001,Grace TAYLOR,28-05-1975,,ff,1.0,,NAN,,Normal,NO,
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82.0,130,0.0,,n,35
10,30005,john Doe,1945 October 11,35,female,200.0,-65.0,45,,Normal,Yes,40ml


In [129]:
#loading val selection:
df.q(
    r"""
    %height  /weight
        %%all is num;
            %%%>0
    $save=1
    is any;  %%is any;

    %%%load=1  $bg=orange

    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


## more operators

In [130]:
#All operators for selection/filtering:
qp.qlang.OPERATORS.by_trait['select']

{<"<": SMALLER>,
 <"<=": SMALLER_EQUAL>,
 <"=": SET>,
 <"==": EQUALS>,
 <">": BIGGER>,
 <">=": BIGGER_EQUAL>,
 <"?": CONTAINS>,
 <"invert;": INVERT>,
 <"is any;": IS_ANY>,
 <"is bool;": IS_BOOL>,
 <"is date;": IS_DATE>,
 <"is datetime;": IS_DATETIME>,
 <"is first;": IS_FIRST>,
 <"is float;": IS_FLOAT>,
 <"is int;": IS_INT>,
 <"is last;": IS_LAST>,
 <"is na;": IS_NA>,
 <"is nk;": IS_NK>,
 <"is no;": IS_NO>,
 <"is num;": IS_NUM>,
 <"is str;": IS_STR>,
 <"is unique;": IS_UNIQUE>,
 <"is yes;": IS_YES>,
 <"is yn;": IS_YN>,
 <"trim;": TRIM>,
 <"~": EVAL>}

In [131]:
#select repeated values
df.q(r'diabetes  %%!is unique;')

Unnamed: 0,diabetes
0,No
3,No
5,Yes
10,Yes


In [132]:
#select only first occurrence of any value
#(automatically true for all unique values)
df.q(r'diabetes  %%is first;')

Unnamed: 0,diabetes
0,No
1,yes
2,
4,Y
5,Yes
6,NO
7,
8,
9,n


# modify

All modification instructions use the connector "$" and do not modify data in place (by default). All modification instructions which could affect the original df create and return a copy instead.

## format

In [133]:
#change color:
#(color does not work in all notebook renderers, eg: github)
df.q('$color=red')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [134]:
#change background color:
#(color does not work in all notebook renderers, eg: github)
df.q('$bg=orange')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [135]:
#Use to highlight selection:
#(color does not work in all notebook renderers, eg: github)
df.q(
    r"""
    height      %%>180    $bg=orange
    is any;     %%is any;
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [136]:
#Highlight all rows where any value is NA:
#(color does not work in all notebook renderers, eg: github)
df.q(r'%%any is na;   $bg=orange   %%is any;')
df.q(r'%%is na;   $bg=orange   %%is any;')  #default behaviour is equivalent to using the "any" flag

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [137]:
#Highlight each individual NA value:
#(color does not work in all notebook renderers, eg: github)
df.q(r'%%%is na;  $bg=orange')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [138]:
#change alignment:
df.q(r'age %%!is int;  $align=left  %%is any;')

Unnamed: 0,age
0,-25
1,30
2,
3,
4,40.0
5,forty-five
6,
7,unk
8,
9,unknown


In [139]:
#change width:
df.q(r'age $width=200px')

Unnamed: 0,age
0,-25
1,30
2,
3,
4,40.0
5,forty-five
6,
7,unk
8,
9,unknown


## values

Modification is applied to all values in the current selection.

In [140]:
#Modify whole column:
df.q('age  $vals=na')

Unnamed: 0,age
0,na
1,na
2,na
3,na
4,na
5,na
6,na
7,na
8,na
9,na


In [141]:
#Set all NA values to "NA":
df.q(r'%%%is na;  $vals=NA')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,John Doe,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [142]:
#values can also be replaced with
#respective values from other columns:
df.q(r'name  %%%?j  $vals=@ID  %is any;')

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
0,10001,10001,1995-01-02,-25,M,170,70.2,20,80,Normal,No,10kg
1,10002,10002,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,
2,10003,10003,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g
6,30001,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,NO,
7,30002,Harry Clark,1960Mar08,unk,,6ft 1in,80.3,122,,,,
8,30003,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,30 MG
9,30004,30004,1950 Sep 10,unknown,Mal,,82,130,0,,n,35


In [143]:
#Values can also be appended to (using "+=") with values from other columns:
df.q(
    r"""
    #create col with error codes
    $new=___ERROR  $cols=error code
        %%%is any;  $vals+=@ID


    #append error codes to na values in subset of df
    %age  /gender
        %%idx>5  &&idx<=8
            %%%is na;
                $vals+=@error code
                $bg=orange


    is any;  %%is any;
    """
    )

Unnamed: 0,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose,error code
0,10001,John Doe,1995-01-02,-25,M,170,70.200000,20,80,Normal,No,10kg,___ERROR10001
1,10002,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,yes,,___ERROR10002
2,10003,Alice Johnson,1985.08.23,,Female,,72.5lb,,,,,15 mg once a day,___ERROR10003
3,20001,Bob Brown,19800406,,Male,280,na,140,90mmHg,GOOD,No,20mg,___ERROR20001
4,20002,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,Y,20 Mg,___ERROR20002
5,20003,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,Yes,25g,___ERROR20003
6,30001,Grace TAYLOR,28-05-1975,nan___ERROR30001,ff,1,,NAN,,Normal,NO,,___ERROR30001
7,30002,Harry Clark,1960Mar08,unk,NaN___ERROR30002,6ft 1in,80.3,122,,,,,___ERROR30002
8,30003,IVY GREEN,1955-Jan-09,___ERROR30003,None___ERROR30003,-10,130lbs,,95,high,,30 MG,___ERROR30003
9,30004,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,n,35,___ERROR30004


## column names

In [144]:
#Rename the selected column ("date of birth" becomes "dob"):
df.q('date of birth   $cols=dob')

Unnamed: 0,dob
0,1995-01-02
1,1990/09/14
2,1985.08.23
3,19800406
4,05-11-2007
5,06-30-1983
6,28-05-1975
7,1960Mar08
8,1955-Jan-09
9,1950 Sep 10


## new column

In [145]:
#Create a new column and fill it with "abc"
#(without renaming, it shows up under the name "new1"):
df.q('$new=abc')

Unnamed: 0,new1
0,abc
1,abc
2,abc
3,abc
4,abc
5,abc
6,abc
7,abc
8,abc
9,abc


In [146]:
#Create a new column, fill it with "abc" and rename it to "new text" in the same query:
df.q('$new=abc  $cols=new text')

Unnamed: 0,new text
0,abc
1,abc
2,abc
3,abc
4,abc
5,abc
6,abc
7,abc
8,abc
9,abc


# logging

qplib has a lightweight logging system, somewhere between actual logging and using print().

In [147]:
#Logs from the current session (since importing qplib) are returned by qp.log().
#A copy is stored here so that it can be inspected and queried below:
logs = qp.log().copy()
logs

Unnamed: 0,level,text,context,time,delta_ms
0,WARNING,"no cols fulfill the condition in ""&?bp"" and th...",qp.qlang._select_cols,2025-10-21 14:28:57.207178,0.0
1,TRACE,instruction applied,qp.qlang.query,2025-10-21 14:28:57.261116,53.944
2,TRACE,"found ""CONNECTORS.NEW_SELECT_COLS"" in ""%name """,qp.qlang.extract_symbol,2025-10-21 14:28:57.269587,8.496
3,TRACE,"no operator found in ""%name "". using default ...",qp.qlang.parse,2025-10-21 14:28:57.277730,8.205
4,TRACE,"""<""="": SET>"" is interpreted as ""<""=="": EQUALS>...",qp.qlang.parse,2025-10-21 14:28:57.286375,8.673
5,TRACE,"parsed instruction: ""%name """,qp.qlang.parse,2025-10-21 14:28:57.293851,7.506
6,TRACE,"instruction ""%name "" is valid",qp.qlang.validate,2025-10-21 14:28:57.303425,9.602
7,DEBUG,applying instruction:<br>Instruction:<br>&emsp...,qp.qlang.query,2025-10-21 14:28:57.309976,6.591
8,TRACE,"value ""name"" is treated as type ""str""",qp.qlang._filter_series,2025-10-21 14:28:57.318445,8.49
9,TRACE,instruction applied,qp.qlang.query,2025-10-21 14:28:57.329243,10.839


In [148]:
#Since the logs are stored in a dataframe, we can use qplib to filter them.
#Here: select the rows where "level" is "warning", give those cells an orange
#background and then select all columns again for display:
logs.q(r'level  %%warning   $bg=orange  %is any;')

Unnamed: 0,level,text,context,time,delta_ms
0,WARNING,"no cols fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-10-21 14:28:57.207178,0.0
15,WARNING,"no cols fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-10-21 14:28:57.400816,10.499
17,WARNING,"no cols fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-10-21 14:45:48.596303,1011188.167


In [149]:
#Clear the logs of the current session, then fetch a fresh copy of the log:
qp.log(clear=True)
logs = qp.log().copy()
logs

cleared all logs in qp.util.logs.


In [150]:
#By default, only warnings and errors are shown.
#This query triggers a warning because no column fulfills
#both "name" and the additional condition "&?bp":
df.q('name  &?bp')

0,1,2,3,4,5
1,WARNING,"no cols fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-10-21 14:45:52.927758,0.0


0
1
2
3
4
5
6
7
8
9
10


In [151]:
#Show all log levels by setting the verbosity at the start of the query:
df.q(
    r"""
    $verbosity=5
    name  &?bp
    """
    )



0,1,2,3,4,5
2,TRACE,instruction applied,qp.qlang.query,2025-10-21 14:45:52.975372,47.625


0,1,2,3,4,5
3,TRACE,"found ""CONNECTORS.NEW_SELECT_COLS"" in ""%name """,qp.qlang.extract_symbol,2025-10-21 14:45:52.984200,8.865


0,1,2,3,4,5
4,TRACE,"no operator found in ""%name "". using default ""<""="": SET>""",qp.qlang.parse,2025-10-21 14:45:52.991730,7.556


0,1,2,3,4,5
5,TRACE,"""<""="": SET>"" is interpreted as ""<""=="": EQUALS>"" for selection instruction",qp.qlang.parse,2025-10-21 14:45:52.998193,6.487


0,1,2,3,4,5
6,TRACE,"parsed instruction: ""%name """,qp.qlang.parse,2025-10-21 14:45:53.004948,6.792


0,1,2,3,4,5
7,TRACE,"instruction ""%name "" is valid",qp.qlang.validate,2025-10-21 14:45:53.012801,7.898


0,1,2,3,4,5
8,DEBUG,"applying instruction: Instruction:  line_num: 2  code: %name connector: <""%"": NEW_SELECT_COLS>  operator: <""=="": EQUALS>  value: name  function: _select_cols",qp.qlang.query,2025-10-21 14:45:53.020629,7.858


0,1,2,3,4,5
9,TRACE,"value ""name"" is treated as type ""str""",qp.qlang._filter_series,2025-10-21 14:45:53.028104,7.494


0,1,2,3,4,5
10,TRACE,instruction applied,qp.qlang.query,2025-10-21 14:45:53.035946,7.866


0,1,2,3,4,5
11,TRACE,"found ""CONNECTORS.AND_SELECT_COLS"" in ""&?bp""",qp.qlang.extract_symbol,2025-10-21 14:45:53.043384,7.466


0,1,2,3,4,5
12,TRACE,"found ""OPERATORS.CONTAINS"" in ""?bp""",qp.qlang.extract_symbol,2025-10-21 14:45:53.050491,7.133


0,1,2,3,4,5
13,TRACE,"parsed instruction: ""&?bp""",qp.qlang.parse,2025-10-21 14:45:53.057178,6.716


0,1,2,3,4,5
14,TRACE,"instruction ""&?bp"" is valid",qp.qlang.validate,2025-10-21 14:45:53.064145,6.994


0,1,2,3,4,5
15,DEBUG,"applying instruction: Instruction:  line_num: 2  code: &?bp  connector: <""&"": AND_SELECT_COLS>  operator: <""?"": CONTAINS>  value: bp  function: _select_cols",qp.qlang.query,2025-10-21 14:45:53.068702,4.582


0,1,2,3,4,5
16,WARNING,"no cols fulfill the condition in ""&?bp"" and the previous condition(s)",qp.qlang._select_cols,2025-10-21 14:45:53.075257,6.582


0,1,2,3,4,5
17,TRACE,instruction applied,qp.qlang.query,2025-10-21 14:45:53.082511,7.282


0
1
2
3
4
5
6
7
8
9
10


# syntax symbols

Syntax symbols and their relations are defined in a csv file, which is read into a dataframe when qplib is imported.

In [152]:
#All syntax symbols (and their traits) are available as a dataframe:
defs = qp.qlang.DEFINITIONS
defs

Unnamed: 0,type,glyph,description,select,select_vals,select_rows,select_rows_scope,select_cols,modify,modify_scope,unary,conversion,settings,metadata,format,copy_df,is_type,NEW_SELECT_VALS,AND_SELECT_VALS,OR_SELECT_VALS,NEW_SELECT_ROWS,AND_SELECT_ROWS,OR_SELECT_ROWS,NEW_SELECT_COLS,AND_SELECT_COLS,OR_SELECT_COLS,MODIFY,BIGGER_EQUAL,SMALLER_EQUAL,BIGGER,SMALLER,EQUALS,CONTAINS,TRIM,INVERT,IS_ANY,IS_STR,IS_INT,IS_FLOAT,IS_NUM,IS_BOOL,IS_DATETIME,IS_DATE,IS_NA,IS_NK,IS_YN,IS_YES,IS_NO,IS_UNIQUE,IS_FIRST,IS_LAST,ADD,SET,EVAL,SORT,TO_STR,TO_INT,TO_FLOAT,TO_NUM,TO_BOOL,TO_DATETIME,TO_DATE,TO_NA,TO_NK,TO_YN,NEGATE,ANY,ALL,IDX,STRICT,SAVE_SELECTION,LOAD_SELECTION,VERBOSITY,DIFF,METADATA,TAG_METADATA,COLOR,BACKGROUND_COLOR,ALIGN,WIDTH,CSS,COLS,ROWS,VALS,NEW_COL,COL_EVAL,REGEX
select,trait,,,3,1,1,1,1,0,0,1,0,0,0,0,0,1,2,2,2,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
select_vals,trait,,,1,3,0,0,0,0,0,1,0,0,0,0,0,1,2,2,2,0,0,0,0,0,0,0,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
select_rows,trait,,,1,0,3,1,0,0,0,1,0,0,0,0,0,1,0,0,0,2,2,2,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
select_rows_scope,trait,,,1,0,1,3,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,2,2,2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
select_cols,trait,,,1,0,0,0,3,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ROWS,flag,rows,modify the values in the selected rows,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,0,3,0,0,1,0
VALS,flag,vals,modify selected values,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,0,0,3,0,1,0
NEW_COL,flag,new,create a new column with the selected values,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0
COL_EVAL,flag,col,"when used with the eval operator, evaluates on...",2,2,2,1,2,2,1,0,0,0,0,0,2,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,3,0


In [153]:
#A value of 2 means that a symbol has this specific trait,
#e.g. all of the following symbols are used for selection:
defs.q(r'select  %%2')

Unnamed: 0,select
NEW_SELECT_VALS,2
AND_SELECT_VALS,2
OR_SELECT_VALS,2
NEW_SELECT_ROWS,2
AND_SELECT_ROWS,2
OR_SELECT_ROWS,2
NEW_SELECT_COLS,2
AND_SELECT_COLS,2
OR_SELECT_COLS,2
BIGGER_EQUAL,2


In [154]:
#A value of 1 means that two traits or symbols are compatible with each other,
#e.g. the following symbols can be used with the negation flag:
defs.q(r'NEGATE  %%1')

Unnamed: 0,NEGATE
select_rows_scope,1
modify,1
modify_scope,1
unary,1
is_type,1
NEW_SELECT_VALS,1
AND_SELECT_VALS,1
OR_SELECT_VALS,1
NEW_SELECT_ROWS,1
AND_SELECT_ROWS,1


# other qplib utilities

## qp.diff

Creates colored diff output for two dataframes. Please note that not all notebook renderers support colored output, e.g. when viewing on GitHub, the following examples will lose much of their usefulness.

color code:
- <font color="#f7746a">light red</font>: deleted value (missing in new df)
- <font color="#c0e7b0">light green</font>: added value (present in new df but not in old df)
- <font color="#f7d67c">light orange</font>: changed value (present in both dfs but different values)
- <font color="#f73434">red</font>: deleted row or column
- <font color="#6dae51">green</font>: added row or column

In [155]:
import os  #used by the cleanup cell further below
import qplib as qp

#two small example dataframes that differ in rows, columns and values
df_new, df_old = qp.get_dfs()

print('df_new:')
display(df_new)

print('df_old:')
display(df_old)

#compute the diff once and reuse the resulting object for all views below
diff_result = qp.diff(df_new, df_old, uid='uid')

print('Summary:')
display(diff_result.summary())

#show the same diff in several display modes
for mode in ('new', 'new+', 'old', 'mix'):
    print(f'mode={mode}:')
    display(diff_result.show(mode))


df_new:


Unnamed: 0,uid,d,b,a
y,y,2,2,0.0
x2,x2,1,1,1.0
z,z,3,3,


df_old:


Unnamed: 0,uid,a,b,c
x,x,1,1.0,1
y,y,2,2.0,2
z,z,3,,3


Summary:


Unnamed: 0,uid col,cols shared,rows shared,cols added,cols removed,rows added,rows removed,dtypes changed,cols renamed in new,cols renamed in old,cols ignored in new,cols ignored in old
0,uid,2,2,d,c,x2,x,,,,,


mode=new:


Unnamed: 0_level_0,meta,uid,d,b,a
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
y,vals changed: 1,y,2,2,0.0
x2,added row,x2,1,1,1.0
z,vals added: 1 vals removed: 1,z,3,3,


mode=new+:


Unnamed: 0_level_0,meta,uid,d,old: d,b,old: b,a,old: a
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
y,vals changed: 1,y,2,,2,,0.0,2.0
x2,added row,x2,1,,1,,1.0,
z,vals added: 1 vals removed: 1,z,3,,3,,,3.0


mode=old:


Unnamed: 0_level_0,meta,uid,a,b,c
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
x,removed row,x,1,1.0,1
y,vals changed: 1,y,2,2.0,2
z,vals added: 1 vals removed: 1,z,3,,3


mode=mix:


Unnamed: 0_level_0,meta,uid,d,b,a,c
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
y,vals changed: 1,y,2.0,2,0.0,2.0
x2,added row,x2,1.0,1,1.0,
z,vals added: 1 vals removed: 1,z,3.0,3,,3.0
x,removed row,x,,1,1.0,1.0


In [156]:
#qp.diff can also be given file paths instead of dataframes.
#works with .csv and .xlsx files (also with multiple sheets):

df1_new, df1_old = qp.get_dfs()

#build a second old/new pair from the test dataset
df2_old = qp.get_df()
df2_old.rename(columns={'ID': 'uid'}, inplace=True)
df2_new = df2_old.iloc[1:, :10].copy()  #without the first row and the last columns
df2_new['new_col'] = np.nan  #added column
df2_new.loc[3, 'name'] = 'new_name'  #changed value
df2_new.loc[[2,7], 'height'] = [170, None]  #one value added, one removed
df2_new.loc[[3,8], 'age'] = [60, pd.NA]


#write both "new" dfs into one file and both "old" dfs into another,
#using the same sheet names so that the sheets can be matched
with pd.ExcelWriter('dfs_new.xlsx') as writer:
    df1_new.to_excel(writer, sheet_name='df1', index=False)
    df2_new.to_excel(writer, sheet_name='df2', index=False)

with pd.ExcelWriter('dfs_old.xlsx') as writer:
    df1_old.to_excel(writer, sheet_name='df1', index=False)
    df2_old.to_excel(writer, sheet_name='df2', index=False)



diff_obj = qp.diff(
    'dfs_new.xlsx',
    'dfs_old.xlsx',
    uid='uid',
    )


#the summary has one row per sheet; show() takes the sheet name as second argument
display(diff_obj.summary(), diff_obj.show('mix', 'df1'), diff_obj.show('mix', 'df2'))

Unnamed: 0,sheets,in both files,uid col,cols shared,rows shared,cols added,cols removed,rows added,rows removed,dtypes changed,cols renamed in new,cols renamed in old,cols ignored in new,cols ignored in old
0,df1,yes,uid,2,2,d,c,x2,x,a: int64 -> float64 b: float64 -> int64,,,,
1,df2,yes,uid,9,10,new_col,diabetes; dose,,10001,,,,,


Unnamed: 0_level_0,meta,uid,d,b,a,c
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
y,vals changed: 1,y,2.0,2.0,0.0,2.0
x2,added row,x2,1.0,1.0,1.0,
z,vals added: 1 vals removed: 1,z,3.0,3.0,,3.0
x,removed row,x,,1.0,1.0,1.0


Unnamed: 0_level_0,meta,uid,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,new_col,diabetes,dose
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10002,,10002.0,Jane Smith,1990/09/14,30,F,175.5cm,68,130,85,Highe,,yes,
10003,vals added: 1,10003.0,Alice Johnson,1985.08.23,,Female,170,72.5lb,,,,,,15 mg once a day
20001,vals added: 1 vals changed: 1,20001.0,new_name,19800406,60,Male,280,na,140,90mmHg,GOOD,,No,20mg
20002,,20002.0,eva white,05-11-2007,40.0,Other,,,135mmhg,,n.a.,,Y,20 Mg
20003,,20003.0,Frank miller,06-30-1983,forty-five,m,185,75kg,125,75,High,,Yes,25g
30001,,30001.0,Grace TAYLOR,28-05-1975,,ff,1,,NAN,,Normal,,NO,
30002,vals removed: 1,30002.0,Harry Clark,1960Mar08,unk,,,80.3,122,,,,,
30003,,30003.0,IVY GREEN,1955-Jan-09,,,-10,130lbs,,95,high,,,30 MG
30004,,30004.0,JAck Williams,1950 Sep 10,unknown,Mal,,82,130,0,,,n,35
30005,,30005.0,john Doe,1945 October 11,35,female,200,-65,45,,Normal,,Yes,40ml


In [157]:
#cleanup: delete the temporary excel files written for the diff example
import gc
import os

#run garbage collection before deleting
#NOTE(review): presumably releases lingering handles on the workbooks - confirm
gc.collect()

for path in ('dfs_new.xlsx', 'dfs_old.xlsx'):
    try:
        os.remove(path)
    except FileNotFoundError:
        #already gone (e.g. the cell was run twice): nothing left to clean up
        pass

## qp.merge

Performs a modified left join on two dataframes, aggregating duplicates in the right df into a single cell as a string.

In [158]:
#combining multiple dfs even of moderate sizes can result
#in an explosion in size when some of them contain
#multiple values for the same id.

import pandas as pd
import qplib as qp

a = pd.DataFrame({
    'id': [1, 2, 3],
    'value_a': ['a', 'b', 'c']
    })
b = pd.DataFrame({
    'id': [1, 1, 2],
    'value_b': ['d', 'e', 'f']
    })
c = pd.DataFrame({
    'id': [3, 1, 1],
    'value_c': ['g', 'h', 'i']
    })
d = pd.DataFrame({
    'id': [1, 1, 1],
    'value_d': ['j', 'k', 'l']
    })
e = pd.DataFrame({
    'id': [1, 1, 1],
    'value_e': ['m', 'n', 'o']
    })
f = pd.DataFrame({
    'id': [1, 2, 1],
    'value_f': ['p', 'q', 'r']
    })

#all dfs that get joined onto "a", in order
right_dfs = [b, c, d, e, f]

#total size of the inputs; each right df contributes all of its columns
#except 'id', which is already present in "a"
rows = len(a.index) + sum(len(right_df.index) for right_df in right_dfs)
cols = len(a.columns) + sum(len(right_df.columns) - 1 for right_df in right_dfs)

#regular left joins: every duplicate id in a right df multiplies the matching rows.
#the loop variable is deliberately not called "df", so that the
#test dataset "df" used throughout the notebook is not overwritten.
merged = a
for right_df in right_dfs:
    merged = pd.merge(merged, right_df, on='id', how='left')

#qp.merge: duplicates in the right df are aggregated into a single cell
merged_qp = a
for right_df in right_dfs:
    merged_qp = qp.merge(merged_qp, right_df, on='id', prefix='')

print(f'before merging: {rows} rows, {cols} columns')
print(f'after regular merging: {len(merged.index)} rows, {len(merged.columns)} columns')
print(f'after qp.merge: {len(merged_qp.index)} rows, {len(merged_qp.columns)} columns')

print('result of regular merging:')
display(merged)

#replace newlines in the aggregated cells with <br> so that they render as line breaks
print('result of qp.merge:')
display(merged_qp.q(r'%!=id  $~x.replace("\n", "<br>")  $align=left  %is any;'))


before merging: 18 rows, 7 columns
after regular merging: 74 rows, 7 columns
after qp.merge: 3 rows, 7 columns
result of regular merging:


Unnamed: 0,id,value_a,value_b,value_c,value_d,value_e,value_f
0,1,a,d,h,j,m,p
1,1,a,d,h,j,m,r
2,1,a,d,h,j,n,p
3,1,a,d,h,j,n,r
4,1,a,d,h,j,o,p
...,...,...,...,...,...,...,...
69,1,a,e,i,l,n,r
70,1,a,e,i,l,o,p
71,1,a,e,i,l,o,r
72,2,b,f,,,,q


result of qp.merge:


Unnamed: 0,id,value_a,value_b,value_c,value_d,value_e,value_f
0,1,a,#1: d ; #2: e ;,#1: h ; #2: i ;,#1: j ; #2: k ; #3: l ;,#1: m ; #2: n ; #3: o ;,#1: p ; #2: r ;
1,2,b,f,,,,q
2,3,c,,g,,,


In [159]:
#this works well for text values but numerical values cannot
#be compared or used in calculations after merging.
#to solve this, the "flatten" arg can be used to flatten specified values:
#each flattened value additionally gets its own column (value_b_1, value_b_2, ...).

b1 = b.copy()
b1['value_b'] = [42.0, 43.0, 44.0]

#the loop variable is deliberately not called "df", so that the
#test dataset "df" used throughout the notebook is not overwritten by this loop.
merged_qp1 = a
for right_df in [b1, c, d, e, f]:
    merged_qp1 = qp.merge(merged_qp1, right_df, on='id', prefix='', flatten=['value_b'])

merged_qp1.q(r'%!=id  $to str;  $~x.replace("\n", "<br>")  $align=left  %is any;')

Unnamed: 0,id,value_a,value_b,value_b_1,value_b_2,value_c,value_d,value_e,value_f
0,1,a,#1: 42.0 ; #2: 43.0 ;,42.0,43.0,#1: h ; #2: i ;,#1: j ; #2: k ; #3: l ;,#1: m ; #2: n ; #3: o ;,#1: p ; #2: r ;
1,2,b,44.0,44.0,,,,,q
2,3,c,,,,g,,,
