# imports

In [None]:
import pandas as pd
import numpy as np
import copy
import os
import sys
import shutil
import datetime
import pickle
import qplib as qp
from qplib import log, na, nk, num



# pd_query

In [1]:

import numpy as np
import pandas as pd
import copy
import re
import qplib as qp

from IPython.display import display
from ipywidgets import widgets, interactive_output, HBox, VBox, fixed, Layout, interact_manual

from qplib.util import log, qpDict
from qplib.pd_util import _check_df, indexQpExtension, seriesQpExtension, dfQpExtension
from qplib.pd_util import diff as mv_diff



from qplib.types import qp_int, qp_float, qp_num, qp_bool, qp_datetime, qp_date, qp_na, qp_nk, qp_yn




_operators1 = qpDict({
    #need a value for comparison or modification
    'bigger_equal': '>=',
    'smaller_equal': '<=',
    'bigger': '>',
    'smaller': '<',
    'strict_equal': '==',
    'equal': '=',
    'regex_match': '~~',
    'regex_search': '~',
    'strict_contains': '(())',
    'contains': '()',

    'lambda_condition': '?',
    'lambda_condition_col': 'col?',
    })
_operators2 = qpDict({
    #these dont need a values for comparison or modification
    'is_str': 'is str',
    'is_int': 'is int',
    'is_float': 'is float',
    'is_num': 'is num',
    'is_bool': 'is bool',

    'is_datetime': 'is datetime',
    'is_date': 'is date',

    'is_any': 'is any',
    'is_na': 'is na',
    'is_nk': 'is nk',
    'is_yn': 'is yn',
    'is_yes': 'is yes',
    'is_no': 'is no',
    })
_operators = qpDict({**_operators1, **_operators2})
_operators_only_df = qpDict({
    'lambda_condition_col': _operators['lambda_condition_col'],
    })

_modifiers1 = qpDict({
    #need a value for comparison or modification
    'set_x': ['x=', 'x ='],
    'set_col': ['col=', 'col ='],
    'set_row': ['row=', 'row ='],
    'set_col_tags': ['col#=', 'col# =', 'col #=', 'col # ='],
    'set_row_tags': ['row#=', 'row# =', 'row #=', 'row # ='],
    })
_modifiers2 = qpDict({
    #these dont need a values for comparison or modification
    'to_str': 'to str',
    'to_int': 'to int',
    'to_float': 'to float',
    'to_num': 'to num',
    'to_bool': 'to bool',

    'to_datetime': 'to datetime',
    'to_date': 'to date',

    'to_na': 'to na',
    'to_nk': 'to nk',
    'to_yn': 'to yn',

    'get_num': 'get num',
    })
_modifiers = qpDict({**_modifiers1, **_modifiers2})
_modifiers_only_df = qpDict({
    'set_col': _modifiers['set_col'],
    'set_row': _modifiers['set_row'],
    'set_col_tags': _modifiers['set_col_tags'],
    'set_row_tags': _modifiers['set_row_tags'],
    })



@pd.api.extensions.register_index_accessor('q')
class IndexQuery:
    """
    A custom query language for filtering and modifying data  in pandas Indices.

    each query consists of 2 string expressions, each of which can be left empty:
        1. filter: '=abc', '!=1', 'is int && >0'
        2. data modification: 'to num', 'x= str(x)'

    eg.: se.q('is str', 'x= "prefix" + x')

    multiple conditions in an expression as well as multiple whole queries can be connected.

    Connectors:
        &&: the previous conditions AND this new conditions have to apply
        //: the previous conditions OR this new conditions have to apply
        >>: ignore previous conditions and use these new conditions INSTEAD

    Operators:
        >: bigger
        <: smaller
        >=: bigger or equal
        <=: smaller or equal
        
        ~: contains a regex pattern (re.search)
        ~~: matches a regex pattern (re.match)

        =: equal
        ==: strictly equal (case sensitive)

        (): contains a string
        (()): contains a string (case sensitive)

        ?: filter using a python expression (must evaluate to True or False)

        is str: is a string
        is int: is an integer
        is float: is a float
        is num: is a number (int or float)
        is bool: is a boolean

        is date: is a date (quite format lenient but defaults to european formats)
        is datetime: is a datetime

        is any: matches any value, use to select all
        is na: not available data (quite lenient)
        is nk: not known data (quite lenient)
        is yk: is a yes or no value
        is yes: is a yes value
        is no: is a no value

    Modifiers:
        x=: set x to a value

        to str: convert to string
        to int: convert to integer
        to float: convert to float
        to num: convert to number
        to bool: convert to boolean

        to date: convert to date
        to datetime: convert to datetime

        to na: convert to not available
        to nk: convert to not known
        to yn: convert to yes or no
    """


    def __init__(self, idx: pd.Index):
        idx.qp._operators = _operators
        idx.qp._operators_only_df = _operators_only_df
        idx.qp._modifiers = _modifiers
        idx.qp._modifiers_only_df = _modifiers_only_df
        self.idx = idx 


    #wip
    # def check(self):
    #     _check_index(self.idx)


    def __repr__(self):
        return 'docstring of dataframe accessor pd_object.q():' + self.__doc__
    

    def __call__(self, *expressions, verbosity=3):

        #setup
        idx = self.idx
        idx.qp = self.idx.qp
        idx.qp._input = f".q{expressions}"
        
        index_expression = index_condition = pd.Index([True for i in idx])
        for i_expression,expression in enumerate(expressions):
            expression_type = ['filter index', 'modify index'][i_expression%2]
            if expression == '':
                continue


            if expression_type == 'filter index':
                ast = _parse_expression(idx, expression, 'filter index', verbosity)

                for condition in ast['conditions']:
                    index_new = _apply_condition(idx, condition, _operators, verbosity, index=idx)

                    index_condition = _update_index(index_condition, index_new, condition['condition_connector'], i=None, verbosity=verbosity)
                index_expression = _update_index(index_expression, index_condition, ast['connector'], i=None, verbosity=verbosity)

                if index_expression.any() == False and verbosity >= 2:
                        log(f'no values fulfill the condition(s) in "{expression}"', level='warning', source='idx.q', input=idx.qp._input)


            elif expression_type == 'modify index':
                ast = _parse_expression_modify(idx, expression, expression_type, verbosity)

                for modification in ast['modifications']:
                    se = idx.to_series()
                    se.qp = idx.qp
                    idx = pd.Index(_apply_modification(se, index_expression, modification, verbosity, index=idx))
                    idx.qp = self.idx.qp
        


        idx_filtered = idx[index_expression]
        idx_filtered.qp = idx.qp

        return idx_filtered


@pd.api.extensions.register_series_accessor('q')
class SeriesQuery:
    """
    A custom query language for filtering and modifying data  in pandas Series.

    each query consists of 2 string expressions, each of which can be left empty:
        1. filter: '=abc', '!=1', 'is int && >0'
        2. data modification: 'to num', 'x= str(x)'

    eg.: se.q('is str', 'x= "prefix" + x')

    multiple conditions in an expression as well as multiple whole queries can be connected.

    Connectors:
        &&: the previous conditions AND this new conditions have to apply
        //: the previous conditions OR this new conditions have to apply
        >>: ignore previous conditions and use these new conditions INSTEAD

    Operators:
        >: bigger
        <: smaller
        >=: bigger or equal
        <=: smaller or equal
        
        ~: contains a regex pattern (re.search)
        ~~: matches a regex pattern (re.match)

        =: equal
        ==: strictly equal (case sensitive)

        (): contains a string
        (()): contains a string (case sensitive)

        ?: filter using a python expression (must evaluate to True or False)

        is str: is a string
        is int: is an integer
        is float: is a float
        is num: is a number (int or float)
        is bool: is a boolean

        is date: is a date (quite format lenient but defaults to european formats)
        is datetime: is a datetime

        is any: matches any value, use to select all
        is na: not available data (quite lenient)
        is nk: not known data (quite lenient)
        is yk: is a yes or no value
        is yes: is a yes value
        is no: is a no value

    Modifiers:
        x=: set x to a value

        to str: convert to string
        to int: convert to integer
        to float: convert to float
        to num: convert to number
        to bool: convert to boolean

        to date: convert to date
        to datetime: convert to datetime

        to na: convert to not available
        to nk: convert to not known
        to yn: convert to yes or no
    """


    def __init__(self, se: pd.Series):
        se.qp._operators = _operators
        se.qp._operators_only_df = _operators_only_df
        se.qp._modifiers = _modifiers
        se.qp._modifiers_only_df = _modifiers_only_df
        self.se = se 


    #wip
    # def check(self):
    #     _check_series(self.se)


    def __repr__(self):
        return 'docstring of dataframe accessor pd_object.q():' + self.__doc__
    

    def __call__(self, *expressions, verbosity=3):

        #setup
        se = copy.deepcopy(self.se)
        se.qp = self.se.qp
        se.qp._input = f".q{expressions}"
        
        index_expression = index_condition = pd.Index([True for i in se])
        for i_expression,expression in enumerate(expressions):
            expression_type = ['filter series', 'modify series'][i_expression%2]
            if expression == '':
                continue


            if expression_type == 'filter series':
                ast = _parse_expression(se, expression, expression_type, verbosity)

                for condition in ast['conditions']:
                    index_new = _apply_condition(se, condition, _operators, verbosity, series=se)

                    index_condition = _update_index(index_condition, index_new, condition['condition_connector'], i=None, verbosity=verbosity)
                index_expression = _update_index(index_expression, index_condition, ast['connector'], i=None, verbosity=verbosity)

                if index_expression.any() == False and verbosity >= 2:
                    log(f'no values fulfill the condition(s) in "{expression}"', level='warning', source='se.q', input=se.qp._input)


            elif expression_type == 'modify series':
                ast = _parse_expression_modify(se, expression, expression_type, verbosity)

                for modification in ast['modifications']:
                    se = _apply_modification(se, index_expression, modification, verbosity, series=se)
                    se.qp = self.se.qp


        se_filtered = se[index_expression]
        se_filtered.qp = se.qp
        return se_filtered


@pd.api.extensions.register_dataframe_accessor('q')
class DataFrameQuery:
    """
    A custom query language for filtering and modifying data and metadata in pandas DataFrames.

    each query consists of 3 string expressions, any of which can be left empty:
        1. column filter: '=col1', '!=col2', '()1 // ()2'
        2. row filter: 'is date', 'is int && >0', '? len(str(x)) > 5'
        3. (meta-)data modification: 'x= str(x)', 'col#= integer column', 'row#= positive'

    eg.: df.q('=col1', 'is int && >0', 'row#= positive')

    multiple conditions in an expression as well as multiple whole queries can be connected.

    Connectors:
        &&: the previous conditions AND this new conditions have to apply
        //: the previous conditions OR this new conditions have to apply
        >>: ignore previous conditions and use these new conditions INSTEAD

    Operators:
        >: bigger
        <: smaller
        >=: bigger or equal
        <=: smaller or equal
        
        ~: contains a regex pattern (re.search)
        ~~: matches a regex pattern (re.match)

        =: equal
        ==: strictly equal (case sensitive)

        (): contains a string
        (()): contains a string (case sensitive)

        ?: filter using a python expression (must evaluate to True or False)

        is str: is a string
        is int: is an integer
        is float: is a float
        is num: is a number (int or float)
        is bool: is a boolean

        is date: is a date (quite format lenient but defaults to european formats)
        is datetime: is a datetime

        is any: matches any value, use to select all
        is na: not available data (quite lenient)
        is nk: not known data (quite lenient)
        is yk: is a yes or no value
        is yes: is a yes value
        is no: is a no value

    Modifiers:
        x=: set x to a value
        col=: set whole column to a value
        row=: set whole row to a value
        col#=: tag columns
        row#=: tag rows

        to str: convert to string
        to int: convert to integer
        to float: convert to float
        to num: convert to number
        to bool: convert to boolean

        to date: convert to date
        to datetime: convert to datetime

        to na: convert to not available
        to nk: convert to not known
        to yn: convert to yes or no
    """


    def __init__(self, df: pd.DataFrame):
        _check_df(df)
        self.df = df
        self.df.qp._operators = _operators
        self.df.qp._modifiers = _modifiers


    def __repr__(self):
        return 'docstring of dataframe accessor pd_object.q():' + self.__doc__
    

    def __call__(self,
            *expressions,  #string expressions for filtering and modifying data
            diff=None,  #[None, 'mix', 'old', 'new', 'new+']
            max_cols=200,  #maximum number of columns to display. None: show all
            max_rows=20,  #maximum number of rows to display. None: show all
            inplace=True,  #make modifications inplace or just return a new dataframe
            verbosity=3,  #verbosity level for logging. 0: no logging, 1: errors, 2: warnings, 3: info, 4: debug
            **kwargs
            ):

        ###########################################
        #                  setup                  #
        ###########################################

        #input string for logging
        input_str = ".q("
        for i,expression in enumerate(expressions):
            if (i+1)%3 == 1:
                input_str += f"\n\t{expression !r},"
            else:
                input_str += f" {expression !r},"
   
        if diff is not None:
            input_str += f"\n\tdiff='{diff}',"
        if max_cols is not None:
            input_str += f"\n\tmax_cols={max_cols},"
        if max_rows is not None:
            input_str += f"\n\tmax_rows={max_rows},"
        
        input_str += f"\n\tinplace={inplace},"
        input_str += f"\n\tverbosity={verbosity},"

        for kwarg in kwargs:
            input_str += f"\n\t{kwarg}='{kwargs[kwarg]}'"

        self.df.qp._input = input_str + "\n\t)"


    
        if inplace is False:
            df = self.df.copy()
        else:
            df = self.df
            
        df.qp = self.df.qp

        

        #inserting metadata row at the top, sadly a bit hacky
        #because there does not seem to be an inplace function for that
        if '#' not in df.index:
            if verbosity >= 2:
                log(f'a row named "#" containing metadata at location 0 is expected but was not found. '
                    +'adding metadata row now. its advised to prepare the data with df=df.format().',
                    level='warning', source='df.q()')
            df_old = df.copy()
            index_old = df.index
            index_new = pd.Index(['#', *index_old])
            
            df.loc['#'] = ''
            df.set_index(index_new, inplace=True)
            df.loc[index_old, :] = df_old
            df.loc['#'] = ''

        #inserting metadata column at the start
        if '#' not in df.columns:
            if verbosity >= 2:
                log(f'a column named "#" containing metadata at location 0 is expected but was not found. '
                    +'adding metadata column now. its advised to prepare the data with df=df.format().',
                    level='warning', source='df.q()')
            df.insert(0, '#', '')





        
        ###########################################
        #               run queries               #
        ###########################################

        cols_filtered = cols_filtered_condition = pd.Index([True for col in df.columns])
        rows_filtered = rows_filtered_condition = rows_filtered_col = pd.Index([True for row in df.index])
        

        for i_expression,expression in enumerate(expressions):
            expression_type = ['col', 'row', 'modify'][i_expression%3]
            if expression == '':
                continue


            #filter columns
            if expression_type == 'col':
                ast = _parse_expression(df, expression, expression_type, verbosity)

                for i_condition,condition in enumerate(ast['conditions']):
                    if condition['by_tag'] == 'row or col metadata':
                        cols_filtered_new = _apply_condition(df.loc['#'], condition, _operators, verbosity, df=df)
                    else:
                        cols_filtered_new = _apply_condition(df.columns, condition, _operators, verbosity, df=df)

                    cols_filtered_condition = _update_index(cols_filtered_condition, cols_filtered_new, condition['condition_connector'], i_condition, verbosity=verbosity)
                cols_filtered = _update_index(cols_filtered, cols_filtered_condition, ast['connector'], i_expression, verbosity=verbosity)

                if cols_filtered.any() == False and verbosity >= 2:
                    log(f'no columns fulfill the condition(s) in "{expression}"', level='warning', source='df.q', input=df.qp._input)

                


            #filter rows
            elif expression_type == 'row':
                ast = _parse_expression(df, expression, expression_type, verbosity)

                for i_condition,condition in enumerate(ast['conditions']):
                    for i_col,col in enumerate(df.columns[cols_filtered_condition][1:]):  #ignore metadata column
                        if condition['by_tag'] == 'row or col metadata':
                            rows_filtered_new = _apply_condition(df['#'], condition, _operators, verbosity, df=df)
                        else:
                            rows_filtered_new = _apply_condition(df[col], condition, _operators, verbosity, df=df)

                        rows_filtered_col = _update_index(rows_filtered_col, rows_filtered_new, condition['which_cols'], i_col, verbosity=verbosity)
                    rows_filtered_condition = _update_index(rows_filtered_condition, rows_filtered_col, condition['condition_connector'], i_condition, verbosity=verbosity)
                rows_filtered = _update_index(rows_filtered, rows_filtered_condition, ast['connector'], i_expression-1, verbosity=verbosity)

                if rows_filtered.any() == False and verbosity >= 2:
                    log(f'no rows fulfill the condition(s) in "{expression}"', level='warning', source='df.q', input=df.qp._input)



            #modify data
            elif expression_type == 'modify':
                cols_filtered_no_metadata = [col for col in df.columns[cols_filtered] if not col.startswith('#')]
                rows_filtered_no_metadata = [row for row in df.index[rows_filtered] if row != '#']

                ast = _parse_expression_modify(df, expression, expression_type, verbosity)

                for modification in ast['modifications']:
                            
                    #tag columns
                    if modification['modifier'] in _modifiers['set_col_tags']:
                        df.loc['#', cols_filtered_no_metadata] = df.loc['#', cols_filtered_no_metadata].apply(
                            lambda x: eval(modification['value'])
                            )
                    
                    #tag rows
                    elif modification['modifier'] in _modifiers['set_row_tags']:
                        df.loc[rows_filtered_no_metadata, '#'] = df.loc[rows_filtered_no_metadata, '#'].apply(
                            lambda x: eval(modification['value'])
                            )
                    
                    #modify filtered data
                    else:
                        df.loc[:, :] = _apply_modification_df(df, rows_filtered_no_metadata, cols_filtered_no_metadata, modification, verbosity).loc[:, :]






        ##########################################
        #            display settings            #
        ##########################################
   
        df_filtered = df.loc[rows_filtered, cols_filtered]
        df_filtered.qp = self.df.qp
    
        if diff is None:

            cols_num = len(df.columns[cols_filtered])
            rows_num = len(df.index[rows_filtered])

            if verbosity >= 2:
                if max_cols is not None and max_cols < cols_num:
                    log(f'showing {max_cols} out of {cols_num} columns', level='warning', source='df.q', input=df.qp._input)
                if max_rows is not None and max_rows < rows_num:
                    log(f'showing {max_rows} out of {rows_num} rows', level='warning', source='df.q', input=df.qp._input)
 
            pd.set_option('display.max_columns', max_cols)
            pd.set_option('display.max_rows', max_rows)
            pd.set_option('display.min_rows', max_rows)

            return df_filtered
        
        else:


            #show difference before and after filtering
            result = qp.diff(
                df_filtered, self.df, show=diff,
                max_cols=max_cols, max_rows=max_rows,
                verbosity=verbosity)  
            return  result



def _parse_expression(pd_object, expression, mode, verbosity=3):

    ast = {
        'expression': expression,
        'mode': mode,
        'connector': None,
        'conditions': [],
        }
    
    if expression[:2] not in ['&&', '//', '>>']:
        expression = '>>' + expression

    ast['connector'] = expression[:2]


    expressions = re.split('(&&|//|>>)', expression)
    for ind in range(len(expressions)):

        condition = {'expression': expressions[ind], 'mode': mode}
        condition_str = expressions[ind].strip()

        if len(condition_str) == 0 or condition_str in ['&&', '//', '>>']:
            continue

        
        if expressions[ind-1] == '&&':  #and
            condition['condition_connector'] = '&&'
        elif expressions[ind-1] == '//':  #inclusive or
            condition['condition_connector'] = '//'
        elif expressions[ind-1] == '>>':  #reset
            condition['condition_connector'] = '>>'

        
        #row conditions also specify in which cols the condition is applied. default: any
        if mode == 'row':
            if condition_str.startswith('any'):
                condition['which_cols'] = 'any'
                condition_str = condition_str[3:].lstrip()
            elif condition_str.startswith('all'):
                condition['which_cols'] = 'all'
                condition_str = condition_str[3:].lstrip()
            else:
                condition['which_cols'] = 'any'


        #rows and cols can be filtered by metadata
        if condition_str.startswith('#'):
            condition['by_tag'] = 'row or col metadata'
            condition_str = condition_str[1:].lstrip()
        else:
            condition['by_tag'] = False

        #wip
        # if condition_str.startswith('x#'):
        #     condition['by_tag_value'] = 'value metadata'
        #     condition_str = condition_str[1:].lstrip()
        # else:
        #     condition['by_tag_value'] = False

        


        #should the condition be negated. default: False
        if condition_str.startswith('!'):
            condition['negate'] = True
            condition_str = condition_str[1:].lstrip()
        else:
            condition['negate'] = False


        #operator for condition. default: =
        for operator in _operators.values_flat():
            if condition_str.startswith(operator):
                condition['operator'] = operator
                condition_str = condition_str[len(operator):].lstrip()
                break
        
        if 'operator' not in condition:
            if verbosity >= 3:
                log(f'no operator found in condition "{condition_str}". Using default operator "="',
                    level='info', source='_parse_expression', input=pd_object.qp._input)
            condition['operator'] = '='


        if mode in ['filter series', 'filter index'] and condition['operator'] in _operators_only_df.values_flat():
            if verbosity >= 1:
                operator_temp = condition['operator']
                log(f'operator "{operator_temp}" only works with dataframes. Ignoring condition "{condition_str}"',
                    level='error', source='_parse_expression', input=pd_object.qp._input)
            continue
        
        if condition['operator'] in _operators2.values_flat() and len(condition_str) > 0:
            if verbosity >= 2:
                operator_temp = condition['operator']
                log(f'operator "{operator_temp}" does not need a value for comparison. Ignoring value "{condition_str}"',
                    level='warning', source='_parse_expression', input=pd_object.qp._input)


        #value for comparison
        condition['value'] = condition_str.strip()
    
        ast['conditions'].append(condition)

    if verbosity >= 4:
        display(f'abstract syntax tree for expression "{expression}":', ast)
        
    return ast

def _parse_expression_modify(pd_object, expression, mode, verbosity=3):

    ast = {
        'expression': expression,
        'mode': mode,
        'modifications': [],
        }
    

    expressions = re.split('(&&)', expression)
    for ind in range(len(expressions)):

        condition = {}
        condition_str = expressions[ind].strip()

        if len(condition_str) == 0 or condition_str in ['&&']:
            continue


        #modifier to use. default: "set x:"
        for modifier in _modifiers.values_flat():
            if condition_str.startswith(modifier):
                condition['modifier'] = modifier
                condition_str = condition_str[len(modifier):].lstrip()
                break
        
        if 'modifier' not in condition:
            if verbosity > 3:
                log(f'no modifier found in condition "{condition_str}". Using default modifier "x="',
                    level='info', source='_parse_expression', input=pd_object.qp._input)
            condition['modifier'] = 'x='

        if mode in ['modify series', 'modify index'] and condition['modifier'] in _modifiers_only_df.values_flat():
            if verbosity >= 1:
                modifier_temp = condition['modifier']
                log(f'modifier "{modifier_temp}" only works with dataframes. Ignoring condition "{condition_str}"',
                    level='error', source='_parse_expression', input=pd_object.qp._input)
            continue

        if condition['modifier'] in _modifiers2.values_flat() and len(condition_str) > 0:
            if verbosity >= 2:
                modifier_temp = condition['modifier']
                log(f'modifier "{modifier_temp}" does not need a value for comparison. Ignoring value "{condition_str}"',
                    level='warning', source='_parse_expression', input=pd_object.qp._input)


        #expression used for modification
        condition['value'] = condition_str.strip()
    
        ast['modifications'].append(condition)
        
    if verbosity >= 4:
        display(f'abstract syntax tree for expression "{expression}":', ast)
    
    return ast


def _apply_condition(pd_object, condition, operators, verbosity=3, df=None, series=None, index=None):
    """
    filters a pandas object using a query condition
    """

    value = condition['value']

    if isinstance(pd_object, pd.Index):
        pd_object = pd_object.to_series()

    
    match condition['operator']:
        #numeric comparison
        case operators.bigger_equal:
            filtered = pd.to_numeric(pd_object, errors='coerce') >= pd.to_numeric(value)
        case operators.smaller_equal:
            filtered = pd.to_numeric(pd_object, errors='coerce') <= pd.to_numeric(value)
        case operators.bigger:
            filtered = pd.to_numeric(pd_object, errors='coerce') > pd.to_numeric(value)
        case operators.smaller:
            filtered = pd.to_numeric(pd_object, errors='coerce') < pd.to_numeric(value)
        
        
        #regex comparison
        case operators.regex_match:
            filtered = pd_object.astype(str).str.fullmatch(value) 
        case operators.regex_search:
            filtered = pd_object.astype(str).str.contains(value)


        #string equality comparison
        case operators.strict_equal:
            filtered = pd_object.astype(str) == value
        case operators.equal:
            value_lenient = [value]
            try:
                value_lenient.append(str(float(value)))
                value_lenient.append(str(int(float(value))))
            except:
                value_lenient.append(value.lower())
            filtered = pd_object.astype(str).str.lower().isin(value_lenient)
        
        #substring comparison
        case operators.strict_contains:
            filtered = pd_object.astype(str).str.contains(value, case=True, regex=False)
        case operators.contains:
            filtered = pd_object.astype(str).str.contains(value, case=False, regex=False)



        #lambda function
        case operators.lambda_condition:
            filtered = pd_object.apply(lambda x, df=df, series=series, index=index, pd=pd, np=np: eval(value))
        case operators.lambda_condition_col:
            filtered = eval(value, {'col': pd_object, 'df': df, 'pd': pd, 'np': np, 'qp': qp})


        #type checks
        case operators.is_bool:
            filtered = pd_object.apply(lambda x: isinstance(x, bool))
        case operators.is_str:
            filtered = pd_object.apply(lambda x: isinstance(x, str))
        case operators.is_int:
            filtered = pd_object.apply(lambda x: isinstance(x, int))
        case operators.is_float:
            filtered = pd_object.apply(lambda x: isinstance(x, float))
        case operators.is_num:
            filtered = pd_object.apply(lambda x: qp_num(x, errors='ERROR')) != 'ERROR'

        case operators.is_date:
            filtered = pd_object.apply(lambda x: qp_date(x, errors='ERROR')) != 'ERROR'
        case operators.is_datetime:
            filtered = pd_object.apply(lambda x: qp_datetime(x, errors='ERROR')) != 'ERROR'

        case operators.is_any:
            filtered = pd_object.apply(lambda x: True)
        case operators.is_na:
            filtered = pd_object.apply(lambda x: qp_na(x, errors='ERROR')) != 'ERROR'
        case operators.is_nk:
            filtered = pd_object.apply(lambda x: qp_nk(x, errors='ERROR')) != 'ERROR'
        case operators.is_yn:
            filtered = pd_object.apply(lambda x: qp_yn(x, errors='ERROR')) != 'ERROR'
        case operators.is_yes:
            filtered = pd_object.apply(lambda x: qp_yn(x, errors='ERROR', yes=1)) == 1
        case operators.is_no:
            filtered = pd_object.apply(lambda x: qp_yn(x, errors='ERROR', no=0)) == 0

        case _:
            if verbosity >= 1:
                operator_temp = condition['operator']
                log(f'operator "{operator_temp}" is not implemented', level='error', source='_apply_condition()', input=pd_object.qp._input)
            filtered = None


    if condition['negate']:
        filtered = ~filtered

    if condition['mode'] in ['col', 'row']:
        filtered.iloc[0] = True  #metadata row/column should always be included

    return filtered

def _apply_modification(pd_object, indices, modification, verbosity=3, series=None, index=None):
    modifiers = pd_object.qp._modifiers

    #data modification
    if modification['modifier'] in modifiers['set_x']:
        pd_object[indices] = pd_object[indices].map(lambda x, pd=pd, np=np, qp=qp: eval(modification['value']))

    #type conversion
    elif modification['modifier'] == modifiers['to_str']:
        pd_object[indices] = pd_object[indices].map(str)
    elif modification['modifier'] == modifiers['to_int']:
        pd_object[indices] = pd_object[indices].map(qp_int)
    elif modification['modifier'] == modifiers['to_float']:
        pd_object[indices] = pd_object[indices].map(qp_float)
    elif modification['modifier'] == modifiers['to_num']:
        pd_object[indices] = pd_object[indices].map(qp_num)
    elif modification['modifier'] == modifiers['to_bool']:
        pd_object[indices] = pd_object[indices].map(qp_bool)
    
    elif modification['modifier'] == modifiers['to_date']:
        pd_object[indices] = pd_object[indices].map(qp_date)
    elif modification['modifier'] == modifiers['to_datetime']:
        pd_object[indices] = pd_object[indices].map(qp_datetime)
    elif modification['modifier'] == modifiers['to_na']:
        pd_object[indices] = pd_object[indices].map(qp_na)
    elif modification['modifier'] == modifiers['to_nk']:
        pd_object[indices] = pd_object[indices].map(qp_nk)
    elif modification['modifier'] == modifiers['to_yn']:
        pd_object[indices] = pd_object[indices].map(qp_yn)

    return pd_object

def _apply_modification_df(df, rows, cols, modification, verbosity=3):
    modifiers = df.qp._modifiers

    if pd.__version__ >= '2.1.0':
        #data modification
        if modification['modifier'] in modifiers['set_x']:
            df.loc[rows, cols] = df.loc[rows, cols].map(lambda x, df=df, pd=pd, np=np, qp=qp: eval(modification['value']))
                

        elif modification['modifier'] in modifiers['set_col']:
            df.loc[:, cols] = df.loc[:, cols].apply(lambda x, df=df, pd=pd, np=np, qp=qp: eval(modification['value']), axis=0)

        elif modification['modifier'] in modifiers['set_row']:
            df.loc[rows, :] = df.loc[rows, :].apply(lambda x, df=df, pd=pd, np=np, qp=qp: eval(modification['value']), axis=1)


        #type conversion
        elif modification['modifier'] == modifiers['to_str']:
            df.loc[rows, cols] = df.loc[rows, cols].map(str)
        elif modification['modifier'] == modifiers['to_int']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_int)
        elif modification['modifier'] == modifiers['to_float']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_float)
        elif modification['modifier'] == modifiers['to_num']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_num)
        elif modification['modifier'] == modifiers['to_bool']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_bool)
        
        elif modification['modifier'] == modifiers['to_date']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_date)
        elif modification['modifier'] == modifiers['to_datetime']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_datetime)

        elif modification['modifier'] == modifiers['to_na']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_na)
        elif modification['modifier'] == modifiers['to_nk']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_nk)
        elif modification['modifier'] == modifiers['to_yn']:
            df.loc[rows, cols] = df.loc[rows, cols].map(qp_yn)

    else:
        #data modification
        if modification['modifier'] in modifiers['set_x']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(lambda x, df=df, pd=pd, np=np, qp=qp: eval(modification['value']))
                

        elif modification['modifier'] in modifiers['set_col']:
            df.loc[:, cols] = df.loc[:, cols].apply(lambda x, df=df, pd=pd, np=np, qp=qp: eval(modification['value']), axis=0)

        elif modification['modifier'] in modifiers['set_row']:
            df.loc[rows, :] = df.loc[rows, :].apply(lambda x, df=df, pd=pd, np=np, qp=qp: eval(modification['value']), axis=1)


        #type conversion
        elif modification['modifier'] == modifiers['to_str']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(str)
        elif modification['modifier'] == modifiers['to_int']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_int)
        elif modification['modifier'] == modifiers['to_float']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_float)
        elif modification['modifier'] == modifiers['to_num']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_num)
        elif modification['modifier'] == modifiers['to_bool']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_bool)
        
        elif modification['modifier'] == modifiers['to_date']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_date)
        elif modification['modifier'] == modifiers['to_datetime']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_datetime)

        elif modification['modifier'] == modifiers['to_na']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_na)
        elif modification['modifier'] == modifiers['to_nk']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_nk)
        elif modification['modifier'] == modifiers['to_yn']:
            df.loc[rows, cols] = df.loc[rows, cols].applymap(qp_yn) 

    return df
    

def _update_index(values, values_new, connector, i, verbosity=3):
    if i == 0:
        values = values_new
    elif connector == '>>':
        values = values_new
    elif connector in ['&&', 'all']:
        values &= values_new
    elif connector in ['//', 'any']:
        values |= values_new
    else:
        if verbosity >= 1:
            log(f'connector "{connector}" is not implemented', level='error', source='_update_index()')
    return values



@pd.api.extensions.register_dataframe_accessor('qi')
class DataFrameQueryInteractiveMode:
    """
    Wrapper for df.q() for interactive use in Jupyter notebooks.
    """
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __call__(self, num_filters=5):
        kwargs = {'df': fixed(self.df)}


        ###########################################
        #            tab0 queries&diff            #
        ###########################################

        ui_label_filter_cols = widgets.Label(value='filter columns')
        ui_label_filter_rows = widgets.Label(value='filter rows')
        ui_label_modify_data = widgets.Label(value='modify data')
        ui_expressions = [ui_label_filter_cols, ui_label_filter_rows, ui_label_modify_data]



        for i in range(num_filters):
            if i == 0:
                placeholder_col = '=name'
                placeholder_row = '()john'
                placeholder_modify = 'x= x.upper() '
            elif i == 1:
                placeholder_col = '// =age'
                placeholder_row = '&& <0 // >120'
                placeholder_modify = 'row#= "implausible age" '
            elif i == 2:
                placeholder_col = '// =weight // =height'
                placeholder_row = '!is num'
                placeholder_modify = 'to num'
            else:
                placeholder_col = ''
                placeholder_row = ''
                placeholder_modify = ''
            

            col = widgets.Combobox(
                value='',
                placeholder=placeholder_col,
                options=['=' + col for col in self.df.columns if col != '#'],
                )
            kwargs[f'col_expression{i}'] = col
            ui_expressions.append(col)
            
            row = widgets.Combobox(
                value='',
                placeholder=placeholder_row,
                options=[
                    '>0', '<0', '>=0', '<=0',  #numerical comparison
                    '=abc', '!=abc', '==AbC', '!==AbC',  #equality checks
                    '()abc', '(())AbC',  #substring checks
                    '~*a.c', '~~*a.c',  #regex checks
                    
                    '? len(x) > 3',  #lambda function condition for each value
                    'col? col > df["age"]',  #lambda function condition for whole columns

                    #type checks
                    'is str', 'is int', 'is float', 'is num', 'is bool',
                    'is date', 'is datetime',
                    'is any', 'is na', 'is nk',
                    'is yn', 'is yes', 'is no'
                    ]
                )
            kwargs[f'row_expression{i}'] = row
            ui_expressions.append(row)
            
            modify = widgets.Combobox(
                value='',
                placeholder=placeholder_modify,
                options=[
                    'x= x.upper() ', 'col= df["ID"]', 'row= df.loc[0]',  #change data
                    'col#= "tag" ', 'col#= x + "tag" ', 'row#= "tag" ', 'row#= x + "tag"',  #change metadata

                    #change type
                    'to str', 'to int', 'to float', 'to num', 'to bool',
                    'to date', 'to datetime',
                    'to na', 'to nk', 'to yn',
                    ]
                )
            kwargs[f'modify_expression{i}'] = modify
            ui_expressions.append(modify)
        

        #show differences
        ui_diff = widgets.ToggleButtons(
            options=[None, 'mix', 'old', 'new', 'new+'],
            description='show differences mode:',
            tooltips=[
                'dont show differences, just show the new (filtered) dataframe.',
                'show new (filtered) dataframe plus all the removed (filtered) values from the old dataframe. values affected by the filters are marked green (newly added), yellow (modified), red (deleted)',
                'show old (unfiltered) dataframe. values affected by the filters are marked green (newly added), yellow (modified), red (deleted)',
                'show new (filtered) dataframe. values affected by the filters are marked green (newly added), yellow (modified), red (deleted)',
                'show new (filtered) dataframe but also adds metadata columns with the prefix "#". If a value changed, the metadata column contains the old value. values affected by the filters are marked green (newly added), yellow (modified), red (deleted)',
                ],
            )
        kwargs['diff'] = ui_diff



        ###########################################
        #              tab1 settings              #
        ###########################################

        ui_inplace = widgets.ToggleButtons(
            options=[True, False],
            value=False,
            description='make modifications inplace:',
            tooltips=[
                'make modifications inplace, e.g. change the original dataframe',
                'return a new dataframe with the modifications',
                ],
            )
        kwargs['inplace'] = ui_inplace

        ui_verbosity = widgets.ToggleButtons(
            options=[0, 1, 2, 3, 4],
            value=3,
            description='verbosity level:',
            tooltips=[
                'no logging',
                'only errors',
                'errors and warnings',
                'errors, warnings and info',
                'errors, warnings, info and debug',
                ],
            )
        kwargs['verbosity'] = ui_verbosity



        cols_num = len(self.df.columns)
        rows_num = len(self.df.index)
        if '#' not in self.df.columns:
            cols_num = len(self.df.columns) + 1
        if '#' not in self.df.index:
            rows_num = len(self.df.index) + 1

        ui_max_cols = widgets.IntSlider(
            value=200,
            min=0,
            max=cols_num*2-1,  #*2 because of metadata columns which get added by diff='new+'
            description='columns',
            )
        kwargs['max_cols'] = ui_max_cols

        ui_max_rows = widgets.IntSlider(
            value=20,
            min=0,
            max=rows_num,
            description='rows',
            )
        kwargs['max_rows'] = ui_max_rows



        ###########################################
        #                tab2 info                #
        ###########################################


        ui_gridbox = widgets.GridBox(ui_expressions, layout=widgets.Layout(grid_template_columns="repeat(3, 330px)"))   
        tab0 = VBox([ui_gridbox, ui_diff])
        tab1 = VBox([ui_inplace, ui_verbosity, HBox([ui_max_cols, ui_max_rows])])
        ui_tab = widgets.Tab(
            children=[tab0, tab1],
            titles=['queries', 'settings'],
            )
        ui = VBox([ui_tab])
        display(ui)


        out = HBox([interactive_output(_interactive_mode, kwargs)], layout=Layout(overflow_y='auto'))

        display(out)

def _interactive_mode(**kwargs):

    df = kwargs.pop('df')
    expressions = [val for key, val in kwargs.items() if 'expression' in key]


    result = df.q(
        *expressions,
        diff=kwargs['diff'],
        max_cols=kwargs['max_cols'],
        max_rows=kwargs['max_rows'],
        inplace=kwargs['inplace'],
        verbosity=kwargs['verbosity'],
        )


    
    display(result)
    print('input code: ', df.qp._input)
    return result



if 'cards' not in globals():
    cards = pd.read_csv('data/cards.csv').format()
df = qp.get_df().format()


df.qi()


  @pd.api.extensions.register_index_accessor('q')
  @pd.api.extensions.register_series_accessor('q')
  @pd.api.extensions.register_dataframe_accessor('q')
  @pd.api.extensions.register_dataframe_accessor('qi')
  cards = pd.read_csv('data/cards.csv').format()


created dataframe qp.util.logs for tracking log entries
use qp.log(message, level, source) or qp.log(message) to add log entries
logs are saved in qp.util.logs


0,1,2,3,4,5
0,info,"striping column headers of leading and trailing whitespace, replacing ""//"" with ""/ /"", ""&&"" with ""& &"" and "">>"" with ""> >""",qp.df.format(),,2024-06-28 15:36:56.638555


0,1,2,3,4,5
1,info,adding metadata row and column,qp.df.format(),,2024-06-28 15:36:58.485038


0,1,2,3,4,5
2,info,"striping column headers of leading and trailing whitespace, replacing ""//"" with ""/ /"", ""&&"" with ""& &"" and "">>"" with ""> >""",qp.df.format(),,2024-06-28 15:37:02.090504


0,1,2,3,4,5
3,info,adding metadata row and column,qp.df.format(),,2024-06-28 15:37:02.100506


VBox(children=(Tab(children=(VBox(children=(GridBox(children=(Label(value='filter columns'), Label(value='filt…

HBox(children=(Output(),))

In [7]:
cards.qi()

VBox(children=(Tab(children=(VBox(children=(GridBox(children=(Label(value='filter columns'), Label(value='filt…

HBox(children=(Output(),))

In [50]:
import cProfile, pstats

profiler = cProfile.Profile()
profiler.enable()

cards.q('name', '()a', inplace=True, verbosity=0)

profiler.disable()
stats = pstats.Stats(profiler).sort_stats('tottime')
stats.print_stats(10)

         169278 function calls (169224 primitive calls) in 0.420 seconds

   Ordered by: internal time
   List reduced from 349 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    83233    0.128    0.000    0.186    0.000 C:\Users\MartinVölkl-GouyaIns\AppData\Roaming\Python\Python312\site-packages\pandas\core\strings\object_array.py:144(<lambda>)
        2    0.120    0.060    0.306    0.153 {pandas._libs.lib.map_infer_mask}
    83234    0.059    0.000    0.059    0.000 {method 'upper' of 'str' objects}
        3    0.043    0.014    0.043    0.014 {pandas._libs.algos.take_2d_axis0_object_object}
        3    0.011    0.004    0.012    0.004 {pandas._libs.lib.maybe_convert_objects}
        2    0.011    0.005    0.011    0.005 {built-in method pandas._libs.missing.isnaobj}
        2    0.010    0.005    0.010    0.005 {built-in method pandas._libs.lib.ensure_string_array}
        4    0.006    0.002    0.008    0.002 C:\Users\Marti

<pstats.Stats at 0x2899c2065d0>

In [None]:
import pandas as pd
import cProfile, pstats
import statistics

if 'cards' not in globals():
    cards = pd.read_csv('data/cards.csv').format()

cols = [True for col in cards.columns]
rows = [True for row in cards.index]
times = []


profiler = cProfile.Profile()
profiler.enable()

cards.q('name', '()a', inplace=True)
# print(a)
# a = cards.index[rows]
# cards.loc[rows, cols]

profiler.disable()
stats = pstats.Stats(profiler).sort_stats('tottime')
stats.print_stats(10)
times.append(stats.total_tt)


# print(statistics.mean(times), times)

In [None]:
import cProfile, pstats

profiler = cProfile.Profile()
profiler.enable()

for i in range(100):
    cards.q('name', '()a', inplace=True, verbosity=0)

profiler.disable()
stats = pstats.Stats(profiler).sort_stats('tottime')
stats.print_stats(10)

In [None]:
0.20989580000000002 [0.2695525, 0.1782757, 0.1733644, 0.17382249999999996, 0.18152440000000003, 0.1757844, 0.17239650000000006, 0.17846470000000003, 0.2548654000000002, 0.34090750000000003]



# qp.diff()

In [None]:

import pandas as pd
import numpy as np
import copy
import os
import datetime

from IPython.display import display
from ipywidgets import interact, widgets
from pandas.api.extensions import register_dataframe_accessor

from qplib.util import log, qpDict
from qplib.types import qp_date, qp_na




if 'cards' not in globals():
    cards = pd.read_csv('dat/cards.csv')

# cards1 = cards.q('=toughness', '>1 && <4', modify='int(x)+10', inplace=False)

# diff(cards1, cards, show='new')

# types

In [None]:
import pandas as pd
import numpy as np
import re


def qp_int(x, errors='coerce', na=np.nan):
    try:
        return int(float(x))  #float first to handle strings like '1.0'
    except:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to integer.
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns np.nan
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return na
            case _:
                return errors

def qp_float(x, errors='coerce', na=np.nan):
    try:
        return float(x)
    except:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to float.
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns np.nan
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return na
            case _:
                return errors
            
def qp_num(x, errors='coerce', na=np.nan):
    try:
        return pd.to_numeric(x)
    except:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to numeric.
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns np.nan
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return na
            case _:
                return errors
            
def qp_bool(x, errors='coerce', na=None):
    if str(x).lower() in ['y', 'yes', 'true', '1', '1.0', 'positive', 'pos']:
        return True
    elif str(x).lower() in ['n', 'no', 'false', '0', '0.0', 'negative', 'neg']:
        return False
    else:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to boolean.
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns NaN
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return na
            case _:
                return errors


def qp_date(x, errors='coerce', na=pd.NaT):
    if isinstance(x, str):
        x = x.replace('_', '-')
    try:
        if re.match(r'\D*(1|2)\d\d\d', x):
            return pd.to_datetime(x, dayfirst=False).date()
        else:
            return pd.to_datetime(x, dayfirst=True).date()
    except:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to date.
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns NaT
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return na
            case _:
                return errors
       
def qp_datetime(x, errors='coerce', na=pd.NaT):
    if isinstance(x, str):
        x = x.replace('_', '-')
    try:
        if re.match(r'\D*(1|2\d\d\d)', x):
            return pd.to_datetime(x, dayfirst=False)
        else:
            return pd.to_datetime(x, dayfirst=True)
    except:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to datetime.
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns NaT
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return na
            case _:
                return errors


def qp_na(x, errors='ignore', na=None):
    possible_nas = [
        'not available', 'na', 'n/a', 'n.a', 'n.a.', 'na.', 'n.a',
        'not a number', 'nan',
        'null', 'nil',
        'none',
        '',
        ]
    
    if pd.isna(x) or str(x).lower() in possible_nas:
        return na
    else:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to "{na}".
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns NaN
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return None
            case _:
                return errors

def qp_nk(x, errors='ignore', nk='unknown'):
    possible_nks = [
        'unk', 'unknown', 'not known', 'not known.',
        'nk', 'n.k.', 'n.k', 'n/k',
        'not specified', 'not specified.',
        ]
    
    if str(x).lower() in possible_nks:
        return nk
    else:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to "{nk}".
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns NaN
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return None
            case _:
                return errors

def qp_yn(x, errors='coerce', yes='yes', no='no', na=None):
    if str(x).lower() in ['y', 'yes', 'true', '1', '1.0', 'positive', 'pos']:
        return yes
    elif str(x).lower() in ['n', 'no', 'false', '0', '0.0', 'negative', 'neg']:
        return no
    else:
        match errors:
            case 'raise':
                raise ValueError(f"""could not convert "{x}" to "{yes}" or "{no}".
                    Error handling:
                    errors='ignore': returns the original value
                    errors='raise': raises a ValueError
                    errors='coerce': returns NaN
                    errors=<any other value>: returns <any other value>
                    """)
            case 'ignore':
                return x
            case 'coerce':
                return na
            case _:
                return errors

        




ValueError: Unable to parse string "123,456.789" at position 0

In [14]:
import re

# Define the regex pattern to match a sequence of numbers allowing for different kinds of separators
pattern = r"\d+([,.\s_]\d+)*"

# Example string to match against
example_string = "123,456.789 0123\t456_789"

# Perform the search
matches = re.findall(pattern, example_string)

print(matches)

['_789']


# "bashlike" wrappers

# temp

In [1]:
import dis
import qplib as qp
import pandas as pd
import numpy as np


df = qp.get_df().format()

df.qi()

created dataframe qp.util.logs for tracking log entries
use qp.log(message, level, source) or qp.log(message) to add log entries
logs are saved in qp.util.logs


0,1,2,3,4,5
0,info,"striping column headers of leading and trailing whitespace, replacing ""//"" with ""/ /"", ""&&"" with ""& &"" and "">>"" with ""> >""",qp.df.format(),,2024-07-04 16:45:06.353200


0,1,2,3,4,5
1,info,adding metadata row and column,qp.df.format(),,2024-07-04 16:45:07.875228


VBox(children=(Tab(children=(VBox(children=(GridBox(children=(Label(value='filter columns'), Label(value='filt…

HBox(children=(Output(),))

In [2]:
import qplib as qp

df = qp.get_df().format()

df.q(
	'=name',
		'is any',
			'x = x.title()',
	'=date of birth',
		'is any',
			'to date',
	'=age',
		'!is num // <0',
			'x=None',
	'=gender',
		"? str(x).lower()  in  ['m', 'male', 'mal']",
			'x = "M"',
    '=gender',
		"? str(x).lower()  in  ['f', 'female', 'ff']",
			'x = "F"',
    '=diabetes',
		'is any',
			'to yn',
	'is any',
		'is any',
			'',
	diff=None,
	inplace=False,
	verbosity=3,
	)

0,1,2,3,4,5
87,info,"striping column headers of leading and trailing whitespace, replacing ""//"" with ""/ /"", ""&&"" with ""& &"" and "">>"" with ""> >""",qp.df.format(),,2024-07-04 16:52:33.374622


0,1,2,3,4,5
88,info,adding metadata row and column,qp.df.format(),,2024-07-04 16:52:33.386517


  return pd.to_datetime(x, dayfirst=True).date()


Unnamed: 0,#,ID,name,date of birth,age,gender,height,weight,bp systole,bp diastole,cholesterol,diabetes,dose
#,,,,,,,,,,,,,
0,,10001.0,John Doe,1995-01-02,,M,170,70.2,20,80,Normal,no,10kg
1,,10002.0,Jane Smith,1990-09-14,30.0,F,175.5cm,68,130,85,Highe,yes,
2,,10003.0,Alice Johnson,1985-08-23,,F,,72.5lb,,,,,15 mg once a day
3,,20001.0,Bob Brown,1980-04-06,,M,280,na,140,90mmHg,GOOD,no,20mg
4,,20002.0,Eva White,2007-11-05,40.0,Other,,,135mmhg,,n.a.,yes,20 Mg
5,,20003.0,Frank Miller,1983-06-30,,M,185,75kg,125,75,High,yes,25g
6,,30001.0,Grace Taylor,1975-05-28,,F,1,,NAN,,Normal,no,
7,,30002.0,Harry Clark,NaT,,,6ft 1in,80.3,122,,,,
8,,30003.0,Ivy Green,1955-01-09,,,-10,130lbs,,95,high,,30 MG


In [None]:
#test