## Input Files

In [89]:
import plotly.express as px
import pandas as pd

writes_throughput_data_path = "logs/benchmark/benchmark_LocationDb_Locations_RTree_Constant_10000_1000_1745834903.607223.log"
writes_io_data_path = "logs/disk/disk_cc_1745830835.log"
reads_throughput_data_path = ""
reads_io_data_path = ""

In [90]:
# Define a function to parse the log file
def parse_io_log_file(file_path):
    # Open and read the log file
    with open(file_path, 'r') as file:
        log_data = file.read()
    
    # Split the log data into individual records using "---" as the delimiter
    records = log_data.split('---')
    
    # Remove any empty records (e.g., due to trailing "---")
    records = [record.strip() for record in records if record.strip()]
    
    # Parse each record into a dictionary
    parsed_records = []
    for record in records:
        record_dict = {}
        for line in record.split('\n'):
            if ':' in line:  # Assuming key-value pairs are separated by ":"
                key, value = line.split(':', 1)
                record_dict[key.strip()] = value.strip()
        parsed_records.append(record_dict)
    
    # Convert the list of dictionaries into a pandas DataFrame
    columns = ['timestamp', 'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes', 'cancelled_write_bytes']
    dtypes = {'timestamp': 'int64', 'rchar': 'int64', 'wchar': 'int64', 'syscr': 'int64', 'syscw': 'int64', 'read_bytes': 'int64', 'write_bytes': 'int64', 'cancelled_write_bytes': 'int64'}
    df = pd.DataFrame(parsed_records, columns=columns).astype(dtypes)
    df['timestamp'] = pd.to_datetime(df['timestamp'],unit='s')
    return df

io_df = parse_io_log_file(writes_io_data_path)
io_df

Unnamed: 0,timestamp,rchar,wchar,syscr,syscw,read_bytes,write_bytes,cancelled_write_bytes
0,2025-04-28 09:00:39,25115449,35267,12936,136,0,61440,0
1,2025-04-28 09:00:40,25115449,35267,12936,136,0,61440,0
2,2025-04-28 09:00:41,25115449,35267,12936,136,0,61440,0
3,2025-04-28 09:00:42,25115449,35267,12936,136,0,61440,0
4,2025-04-28 09:00:43,25115449,35267,12936,136,0,61440,0
...,...,...,...,...,...,...,...,...
3634,2025-04-28 10:02:01,51788896,56401109,152321,71667,32768,12926976,0
3635,2025-04-28 10:02:02,51788896,56401109,152321,71667,32768,12926976,0
3636,2025-04-28 10:02:03,51788896,56401109,152321,71667,32768,12926976,0
3637,2025-04-28 10:02:04,51788896,56401109,152321,71667,32768,12926976,0


In [91]:
def visualize_io_data(df, ycolumn='rchar', yaxis_title='Characters Read', title='Characters Read vs Timestamp'):
    fig = px.line(
        df,
        x='timestamp',
        y=ycolumn,
        title=title,
        labels={'timestamp': 'Timestamp', ycolumn: ycolumn},
        template='plotly_white'
    )

    # Customize the layout
    fig.update_layout(
        xaxis_title='Timestamp',
        yaxis_title=yaxis_title,
        xaxis=dict(tickangle=45),
        title_x=0.5
    )

    # Show the plot
    return fig

In [92]:
import json
def parse_throughput_log_file(file_path):
    # Open and read the log file
    rows = []
    with open(file_path, 'r') as file:
        for line in file:
            timestamp, status, message = line.split(" - ")
            json_message = json.loads(message)
            if json_message["status"] != "success" or json_message["event"] != "command.query":
                continue
            metrics = json_message.get("metrics")
            rows.append({
                "timestamp": timestamp,
                #"status": json_message.get("status"),
                "elapsed_time": metrics.get("elapsedTime"),
                "execution_time": metrics.get("executionTime"),
                "compile_time": metrics.get("compileTime"),
                "queue_wait_time": metrics.get("queueWaitTime"),
                # "resultCount": metrics.get("resultCount"),
                # "result_size": metrics.get("resultSize"),
                # "processed_objects": metrics.get("processedObjects"),
            })
    df = pd.DataFrame(rows)
    duration_columns = ["elapsed_time", "execution_time", "compile_time", "queue_wait_time"]
    for col in duration_columns:
        df[col] = df[col].str.replace(r'(ms|ns)$', '', regex=True).astype("double")
    df['timestamp'] = pd.to_datetime(df['timestamp'],unit='s')
    return df
    
            
throughput_df = parse_throughput_log_file(writes_throughput_data_path)
throughput_df


The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.



Unnamed: 0,timestamp,elapsed_time,execution_time,compile_time,queue_wait_time
0,2025-04-28 10:08:28,24.193666,23.709542,5.461500,0.0
1,2025-04-28 10:08:28,12.395333,12.066917,3.947833,0.0
2,2025-04-28 10:08:28,10.943167,10.698209,3.711125,0.0
3,2025-04-28 10:08:28,11.343542,11.111917,4.511500,0.0
4,2025-04-28 10:08:28,9.302708,9.098209,2.651125,0.0
...,...,...,...,...,...
976,2025-04-28 10:08:45,10.592458,10.392750,3.170916,0.0
977,2025-04-28 10:08:45,10.152500,9.954958,3.469750,0.0
978,2025-04-28 10:08:45,11.760083,11.544125,3.407541,0.0
979,2025-04-28 10:08:45,11.235875,10.999500,4.003208,0.0


In [93]:
def visualize_throughput_data(df):
    df_grouped = df.resample('1s', on='timestamp').mean().reset_index()

    # Create a line plot for execution time vs timestamp
    fig = px.line(
        df_grouped,
        x='timestamp',
        y='execution_time',
        title='Execution Time vs Timestamp (Bucketed by 1 Second)',
        labels={'timestamp': 'Timestamp', 'execution_time': 'Execution Time (ms)'},
        template='plotly_white'
    )

    # Customize the layout
    fig.update_layout(
        xaxis_title='Timestamp',
        yaxis_title='Execution Time (ms)',
        xaxis=dict(tickangle=45),
        title_x=0.5
    )
    return fig
visualize_throughput_data(throughput_df)

In [94]:
def show_aggregate_stats(df):
    return df.describe([.50, .75, .95, .99])

## Write
- Write Throughput
  - mean
  - std.dev
  - median
  - p95
  - p99
- Bytes Written to Disk
  - per second
  - aggregate

### Write Throughput

In [95]:
show_aggregate_stats(throughput_df[["elapsed_time"]])

Unnamed: 0,elapsed_time
count,981.0
mean,12.735829
std,7.327431
min,7.277042
50%,10.489958
75%,13.140875
95%,23.761416
99%,42.79785
max,94.719167


### Bytes Written to Disk

#### Aggregate

In [96]:
show_aggregate_stats(io_df[["wchar"]])

Unnamed: 0,wchar
count,3639.0
mean,722922.0
std,5262858.0
min,35267.0
50%,125317.0
75%,170342.0
95%,541607.0
99%,23666400.0
max,56401110.0


#### As time

In [97]:
visualize_io_data(io_df, ycolumn='wchar', yaxis_title='Characters Written', title='Characters Written vs Timestamp').show()

## Reads
- Read Throughput
  - mean
  - std.dev
  - median
  - p95
  - p99
- Bytes Read from Disk
  - per second
  - aggregate

In [98]:
show_aggregate_stats(io_df[["rchar"]])

Unnamed: 0,rchar
count,3639.0
mean,25569080.0
std,2510083.0
min,25115450.0
50%,25221660.0
75%,25274650.0
95%,26797640.0
99%,39893240.0
max,51788900.0


In [99]:
visualize_io_data(io_df, ycolumn='rchar', yaxis_title='Characters Read', title='Characters Read vs Timestamp').show()