In [149]:
import polars as pl

In [151]:
def process_body(field_list) -> dict:
    """Convert a list of ``tag=value`` strings into a ``{tag: value}`` dict.

    Args:
        field_list: Iterable of strings, each expected to look like
            ``"<tag>=<value>"``.

    Returns:
        A dict mapping each tag to its value (both kept as strings). Entries
        without an ``=`` are ignored. If a tag occurs more than once, the
        last occurrence wins.
    """
    record_dict = {}
    for field in field_list:
        # Split on the FIRST '=' only: a value may itself contain '='
        # (a plain split("=") would then fail to unpack into two names).
        tag, sep, value = field.partition("=")
        if sep:
            record_dict[tag] = value
    return record_dict

In [152]:
def read_log(filename) -> list[dict]:
    """Parse a log file into one dict per line.

    Each non-blank line is split on the SOH character (``chr(1)``). The first
    chunk is expected to look like ``"<log timestamp>: <first field>"``; only
    the timestamp part is kept (stored under ``"log_dt"``) and the first
    field after the ``": "`` is discarded. The remaining chunks are turned
    into ``{tag: value}`` pairs by ``process_body``.

    Args:
        filename: Path of the UTF-8 encoded log file.

    Returns:
        A list with one dict per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line has no ``": "`` separator in its
            first chunk.
    """
    record_list = []
    with open(filename, "r", encoding="utf8") as file:
        for line_no, raw_line in enumerate(file, start=1):
            # Drop the line terminator so it can never leak into the last
            # field's value when the line does not end with SOH.
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate blank lines instead of failing to unpack
            field_list = line.split(chr(1))
            # Split on the first ": " only, so a later ": " inside the first
            # field cannot break the unpacking.
            log_dt, sep, _ = field_list[0].partition(": ")
            if not sep:
                raise ValueError(
                    f"{filename}, line {line_no}: missing ': ' after log timestamp"
                )
            record_dict = process_body(field_list[1:])
            record_dict["log_dt"] = log_dt
            record_list.append(record_dict)
    return record_list


In [204]:
# Parse the message log into a list of per-line {tag: value} dicts.
record_list = read_log(
    "input/FIX.4.2-CSCII2-FIXTEST_Messages_20240313-0.log"
)

In [205]:
# Build one wide frame with a column per tag seen anywhere in the log.
# infer_schema_length=None scans every record: different message types carry
# different tags, and with the default (first 100 rows only) a tag that first
# appears later in the log would not become a column.
# time_unit='ms' pins the datetime resolution to the three fractional digits
# in the format, so that integer casts of timestamp differences further down
# are explicitly in milliseconds.
df0 = pl.from_dicts(
    record_list,
    infer_schema_length=None,
).with_columns(
    pl.col('log_dt').str.to_datetime('%F %X.%3f', time_unit='ms')
)

In [206]:
# Split the log by tag 35 and keep only the columns needed for matching.
# (In the FIX 4.2 dictionary tag 35 is MsgType: 'D' = New Order Single,
#  '8' = Execution Report, 'F' = Order Cancel Request.)
df_order = (
    df0
    .filter(pl.col('35').eq('D'))
    .select(['1', '11', '55', 'log_dt'])
)
df_trade = (
    df0
    .filter(pl.col('35').eq('8'))
    .select(['1', '11', '55', 'log_dt'])
)
df_cancel = (
    df0
    .filter(pl.col('35').eq('F'))
    .select(['11', '41', 'log_dt'])
)

In [207]:
# Match every order row with the report rows that share tags 1, 11 and 55
# (inner join), then measure the gap between the two log timestamps.
# Casting the Duration to UInt64 yields its integer count in the time unit of
# the datetime columns.
df_order_delay = (
    df_order
    .join(df_trade, on=['1', '11', '55'], how='inner')
    .rename(
        {
            '1': 'account_id',
            '11': 'order_id',
            '55': 'code',
            'log_dt': 'order_dt',
            'log_dt_right': 'order_report_dt',
        }
    )
    .with_columns(
        order_delay=pl.col('order_report_dt').sub(pl.col('order_dt')).cast(pl.UInt64)
    )
)

In [208]:
# Match every cancel row with the report rows that share its tag 11 (inner
# join). Tag 41 of the cancel row is renamed to 'order_id' so it can later be
# joined against the order table; tags 11 and 55 stay under their raw names.
# Casting the Duration to UInt64 yields its integer count in the time unit of
# the datetime columns.
df_cancel_delay = (
    df_cancel
    .join(df_trade, on='11', how='inner')
    .rename(
        {
            '1': 'account_id',
            '41': 'order_id',
            'log_dt': 'cancel_dt',
            'log_dt_right': 'cancel_report_dt',
        }
    )
    .with_columns(
        cancel_delay=pl.col('cancel_report_dt').sub(pl.col('cancel_dt')).cast(pl.UInt64)
    )
)

In [214]:
# One row per (account, order) that has both an order delay and a cancel
# delay; the raw '11' and '55' columns carried over from the cancel side are
# dropped from the result.
df_summary = (
    df_order_delay
    .join(df_cancel_delay, on=['account_id', 'order_id'], how='inner')
    .select(pl.exclude(['11', '55']))
)

In [215]:
# Slice the summary by account and by whether the code starts with '6'.
# NOTE(review): the variable names suggest account 19997001 = "o32",
# 19997002 = "ldp", and codes starting with '6' = "sh" vs. the rest = "sz";
# confirm these mappings with the data owner.
df_o32_sh = df_summary.filter(
    (pl.col('account_id') == "19997001") & pl.col('code').str.starts_with('6')
)
df_o32_sz = df_summary.filter(
    (pl.col('account_id') == "19997001") & ~pl.col('code').str.starts_with('6')
)
df_ldp_sh = df_summary.filter(
    (pl.col('account_id') == "19997002") & pl.col('code').str.starts_with('6')
)
df_ldp_sz = df_summary.filter(
    (pl.col('account_id') == "19997002") & ~pl.col('code').str.starts_with('6')
)

In [223]:
# Summary statistics of both delay columns for the o32 / sh slice.
df_o32_sh.select(['order_delay', 'cancel_delay']).describe()

describe,order_delay,cancel_delay
str,f64,f64
"""count""",201.0,201.0
"""null_count""",0.0,0.0
"""mean""",61.129353,861.522388
"""std""",10.689396,715.192164
"""min""",50.0,215.0
"""25%""",55.0,427.0
"""50%""",59.0,527.0
"""75%""",62.0,821.0
"""max""",133.0,2437.0


In [219]:
# Summary statistics of both delay columns for the o32 / sz slice.
df_o32_sz.select(['order_delay', 'cancel_delay']).describe()

describe,order_delay,cancel_delay
str,f64,f64
"""count""",201.0,201.0
"""null_count""",0.0,0.0
"""mean""",62.940299,956.746269
"""std""",10.445402,218.056989
"""min""",50.0,410.0
"""25%""",56.0,810.0
"""50%""",60.0,964.0
"""75%""",65.0,1091.0
"""max""",98.0,1490.0


In [220]:
# Summary statistics of both delay columns for the ldp / sh slice.
df_ldp_sh.select(['order_delay', 'cancel_delay']).describe()

describe,order_delay,cancel_delay
str,f64,f64
"""count""",203.0,203.0
"""null_count""",0.0,0.0
"""mean""",15.492611,64.448276
"""std""",3.477466,8.578758
"""min""",8.0,48.0
"""25%""",12.0,59.0
"""50%""",16.0,63.0
"""75%""",19.0,69.0
"""max""",22.0,109.0


In [221]:
# Summary statistics of both delay columns for the ldp / sz slice.
df_ldp_sz.select(['order_delay', 'cancel_delay']).describe()

describe,order_delay,cancel_delay
str,f64,f64
"""count""",203.0,203.0
"""null_count""",0.0,0.0
"""mean""",15.315271,20.871921
"""std""",3.448219,3.719543
"""min""",9.0,11.0
"""25%""",12.0,19.0
"""50%""",15.0,21.0
"""75%""",18.0,24.0
"""max""",24.0,32.0
