# log

In [None]:
import pandas as pd
import numpy as np
import copy
import os
import sys
import shutil
import datetime
import qplib as qp
from qplib import log
pd.set_option('display.max_columns', None)


# Emit one message per level prefix ("trace:" through "error:").
# Presumably `log` derives the level from the text before the colon — TODO confirm in qplib.
log('trace: this is a trace message')
log('debug: this is a debug message')
log('info: this is an info message')
log('warning: this is a warning message')
log('error: this is an error message')

# Called without a message, log() returns a table-like object (the performance-analysis
# cell copies it and indexes it like a DataFrame). As the last expression it is displayed.
log()

# diff

In [None]:
import qplib as qp

# Fetch the old/new pair of frames that the diff demo compares.
df_old, df_new = qp.get_dfs()

print('Summary:')
display(qp.diff(df_old, df_new, uid='uid').summary())

# Show both inputs (new first, then old) before the per-mode views.
for label, frame in (('df_new:', df_new), ('df_old:', df_old)):
    print(label)
    display(frame)

# A fresh diff is built for every mode, exactly as if each call were written out.
for mode in ('new', 'new+', 'old', 'mix'):
    print(f'mode={mode}:')
    display(qp.diff(df_old, df_new, uid='uid').show(mode))


# qlang

## format symbols

In [None]:
import pandas as pd
import numpy as np
import copy
import os
import sys
import shutil
import datetime
import qplib as qp
from qplib import log
pd.set_option('display.max_columns', None)

def format_symbols():
    """
    Rebuild 'qplib/data/symbols.csv' from 'qplib/data/symbols.xlsx'.

    The workbook is copied to a temporary file and the copy is read with the
    first column as index. For every index label from position 2 onwards,
    the values of that row (up to the diagonal) are mirrored into the
    matching column, i.e. ``df.loc[col, row] = df.loc[row, col]``.
    The result is written to CSV and returned.

    The temporary copy is removed even when reading, mirroring or writing
    fails, so a failed run no longer leaves 'symbols_temp.xlsx' behind.

    Returns:
        pandas.DataFrame: the mirrored table that was written to the CSV.

    Raises:
        Whatever shutil.copy, pandas.read_excel, DataFrame.to_csv or
        os.remove raise (e.g. FileNotFoundError for a missing workbook).
    """
    source_path = 'qplib/data/symbols.xlsx'
    temp_path = 'qplib/data/symbols_temp.xlsx'
    csv_path = 'qplib/data/symbols.csv'

    # Work on a copy rather than the workbook itself
    # (presumably so a workbook that is open in Excel can still be read — TODO confirm).
    shutil.copy(source_path, temp_path)
    try:
        df = pd.read_excel(temp_path, index_col=0)
        labels = df.index

        # NOTE(review): the outer loop starts at position 2, so the first two
        # index labels are never used as source rows; intent is not evident here.
        for i in range(2, len(labels)):
            row = labels[i]
            for j in range(i + 1):
                col = labels[j]
                if row != col:
                    df.loc[col, row] = df.loc[row, col]

        df.to_csv(csv_path)
    finally:
        # Always clean up the temporary copy, also on failure.
        os.remove(temp_path)
    return df

# Regenerate symbols.csv now and keep the mirrored table in `show`
# (the `params` list in the tests section also reads `show`).
show = format_symbols()

# df


## performance analysis

In [None]:
import pandas as pd
import numpy as np
import copy
import os
import sys
import shutil
import datetime
import qplib as qp
from qplib import log
pd.set_option('display.max_columns', None)
# Set qlang's module-level VERBOSITY to 3
# (presumably so the query below writes detailed log entries — TODO confirm).
qp.qlang.VERBOSITY = 3


# Read the cards data only if `cards` is not already defined, so re-running
# this cell in the same kernel reuses the frame instead of reloading the CSV.
if not 'cards' in locals():
    cards = pd.read_csv('archive/cards.csv')


# presumably empties the log collected so far — TODO confirm against qplib.log
log(clear=True)

# 'start' / 'stop' entries bracket the query so its log entries can be located in the table.
log('start')

cards.q(
    r"""
    power  %%>3  &&<5
    """
    )

log('stop')

# Work on a copy of the log table so the extra column below is not added to the object log() returned.
logs = log().copy()
# Elapsed time since the log entry with index label 0, converted from seconds to milliseconds.
logs['timestamp'] = logs['time'] - logs.loc[0, 'time']
logs['timestamp'] = logs['timestamp'].dt.total_seconds() * 1000


# Query the log table itself for display.
# NOTE(review): the last query line appears to drop the 'context' and 'time' columns — confirm against the qlang syntax.
logs.q(
    r"""
    $ verbosity = 3
    $ diff = None

    text $width=500px $align=left
    
    %!=context  &!=time
    """
    )

### v0.7.5 vs v0.8

Python code used:

```py
df = qp.get_df()
query = ''
%timeit qp.qlang.query(df, query)
%timeit qp.qlang_v3.query(df, query)
```


query = ''  
722 μs ± 44.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)  
487 μs ± 24.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each) 

------------------------------------------------------------------------------------------

query = 'id'  
1.89 ms ± 45.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)  
1.69 ms ± 29.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)  

------------------------------------------------------------------------------------------

query = r"""  
name %%?john  
age &&>30  
is any;  
"""  
7.8 ms ± 778 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)  
6.95 ms ± 945 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)  

------------------------------------------------------------------------------------------

df = pd.read_excel('archive/cards.xlsx')  
query = r"""  
"""  
21.2 ms ± 342 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)  
23.4 ms ± 332 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)  

------------------------------------------------------------------------------------------

query = r"""  
power %%>3  &&<5  
"""  
29.8 ms ± 2.99 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)  
22.8 ms ± 588 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)  

------------------------------------------------------------------------------------------

query = r"""  
power      %%>5  
toughness  &&=1  
"""  
26.7 ms ± 804 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)  
31.6 ms ± 3.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)  

------------------------------------------------------------------------------------------

query1 = r"""  
%%each is na;  
"""  
query2 = r"""  
%%%is na;  
"""  
600 ms ± 22.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)  
629 ms ± 29.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)  

# tests

In [None]:

# #run tests in folder "tests" using pytest and create a test report
# !pytest tests --html=tests/test_report.html


In [None]:
import pandas as pd
import numpy as np
import copy
import os
import sys
import shutil
import datetime
import qplib as qp
from qplib import log
pd.set_option('display.max_columns', None)


def get_df_simple():
    """Return a fresh 3-row frame with integer columns 'a' and 'b' and the default index."""
    return pd.DataFrame(dict(a=[-1, 0, 1], b=[1, 2, 3]))

def get_df_simple_tagged():
    """
    Return the simple 3-row frame with a leading 'meta' column of empty
    strings and the non-sequential index labels 3, 1, 2.
    """
    data = dict(meta=['', '', ''], a=[-1, 0, 1], b=[1, 2, 3])
    return pd.DataFrame(data, index=pd.Index([3, 1, 2]))


def get_df():
    """
    Return a fresh 11-row sample frame of patient-style records.

    The values are deliberately inconsistent: mixed types within a column,
    several date formats, units embedded in strings, and many spellings of
    missing values ('NaN', 'n/a', '', None, np.nan, ...).
    """
    columns = [
        ('ID', [10001, 10002, 10003, 20001, 20002, 20003, 30001, 30002, 30003, 30004, 30005]),
        ('name', ['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'eva white', 'Frank miller', 'Grace TAYLOR', 'Harry Clark', 'IVY GREEN', 'JAck Williams', 'john Doe']),
        ('date of birth', ['1995-01-02', '1990/09/14', '1985.08.23', '19800406', '05-11-2007', '06-30-1983', '28-05-1975', '1960Mar08', '1955-Jan-09', '1950 Sep 10', '1945 October 11']),
        ('age', [-25, '30', np.nan, None, '40.0', 'forty-five', 'nan', 'unk', '', 'unknown', 35]),
        ('gender', ['M', 'F', 'Female', 'Male', 'Other', 'm', 'ff', 'NaN', None, 'Mal', 'female']),
        ('height', [170, '175.5cm', None, '280', 'NaN', '185', '1', '6ft 1in', -10, '', 200]),
        ('weight', [70.2, '68', '72.5lb', 'na', '', '75kg', None, '80.3', '130lbs', '82', -65]),
        ('bp systole', ['20', 130, 'NaN', '140', '135mmhg', '125', 'NAN', '122', '', 130, '45']),
        ('bp diastole', [80, '85', 'nan', '90mmHg', np.nan, '75', 'NaN', None, '95', '0', 'NaN']),
        ('cholesterol', ['Normal', 'Highe', 'NaN', 'GOOD', 'n.a.', 'High', 'Normal', 'n/a', 'high', '', 'Normal']),
        ('diabetes', ['No', 'yes', 'N/A', 'No', 'Y', 'Yes', 'NO', None, 'NaN', 'n', 'Yes']),
        ('dose', ['10kg', 'NaN', '15 mg once a day', '20mg', '20 Mg', '25g', 'NaN', None, '30 MG', '35', '40ml']),
    ]
    # dict() keeps the pair order, so the column order matches the list above.
    return pd.DataFrame(dict(columns))


def get_df_tagged():
    """
    Return the get_df() data preceded by a 'meta' column of empty strings.

    A frame filled with '' is created first (same index, 'meta' plus the
    original columns) and the sample values are then written positionally
    into every column after 'meta'.
    """
    source = get_df()
    tagged_columns = ['meta'] + list(source.columns)
    tagged = pd.DataFrame('', index=source.index, columns=tagged_columns)
    tagged.iloc[:, 1:] = source.loc[:, :]
    return tagged



# (query code, expected result) pairs consumed by test().
# NOTE(review): `expected` is sliced from `show`, which is assigned in the "format symbols"
# cell (the symbols table) rather than derived from get_df() — looks like leftover scratch
# state; confirm the intended expected frame.
params = [
    (r'name  %%is str;',                 show.loc[:, ['name']]),
    ]
# pytest parametrization is left disabled so the cases can be run inline in the notebook:
# @pytest.mark.parametrize('code, expected', params)
def test(code, expected):
    """
    Run the query `code` on a fresh get_df() frame and assert that the
    result equals `expected`; on mismatch the assertion message is the
    string diff produced by qp.diff.
    """
    actual = get_df().q(code)
    matches = actual.equals(expected)
    assert matches, qp.diff(actual, expected, output='str')

# Run every (code, expected) case from `params` inline by calling test() directly.
for code, expected in params:
    test(code, expected)




# get loc

In [17]:
# Count the lines of Python code in the folders "qplib" and "tests".

# This cell uses os and pandas, so it imports them itself (like the other
# cells do) instead of relying on an earlier cell having been run.
import os

import pandas as pd
import qplib as qp


def _count_lines(folder, files):
    """Return one {'file', 'lines'} record per name in `files`, each read from `folder`."""
    records = []
    for file in files:
        with open(os.path.join(folder, file), 'r', encoding='utf-8') as f:
            records.append({'file': file, 'lines': sum(1 for _ in f)})
    return records


# Names of the files to count in each folder, as selected by the qlang query on the 'name' column.
code = qp.lsr('qplib').q('%name %%?.py  &&!?.pyc')['name'].tolist()
tests = qp.lsr('tests').q('%name %%?.py  &&!?.pyc')['name'].tolist()

records_code = _count_lines('qplib', code)
records_tests = _count_lines('tests', tests)

loc_code = sum(record['lines'] for record in records_code)
loc_tests = sum(record['lines'] for record in records_tests)

# Build the per-file table once from all records instead of appending row by row.
locs = pd.DataFrame(records_code + records_tests, columns=['file', 'lines'])

print(f'Lines of code in qplib: {loc_code}')
print(f'Lines of code in tests: {loc_tests}')
locs

Lines of code in qplib: 5833
Lines of code in tests: 7983


Unnamed: 0,file,lines
0,diffing.py,1360
1,excel.py,171
2,pandas.py,687
3,qlang.py,2131
4,types.py,619
5,util.py,772
6,__init__.py,93
7,test_days_between.py,147
8,test_deduplicate.py,129
9,test_diff.py,2612


# temp

In [6]:
import pandas as pd
import numpy as np
import copy
import os
import sys
import shutil
import datetime
import gc
import qplib as qp
from qplib import log
pd.set_option('display.max_columns', None)

# Scratch cell: fetch one frame via qp.get_df() and a pair of frames via qp.get_dfs()
# (presumably qplib's built-in sample data — TODO confirm) for ad-hoc experiments.
df = qp.get_df()
df1, df2 = qp.get_dfs()
