## Input Files

In [9]:
# --- Write-side inputs ---
# I/O counter log; this is the file handed to parse_io_log_file() further down.
writes_io_data_path = "logs/disk/disk_cc_1745830835.log"
# Left empty for now (presumably the write-throughput benchmark log — TODO confirm).
writes_throughput_data_path = ""

# --- Read-side inputs ---
# Both left empty for now; nothing in the visible cells reads them yet.
reads_io_data_path = ""
reads_throughput_data_path = ""

In [35]:
import pandas as pd

# Define a function to parse the log file
def parse_io_log_file(file_path):
    """Load an I/O counter log into a typed DataFrame.

    The file is treated as a sequence of records separated by the literal
    text ``---``. Inside a record, every line containing a ``:`` is read as
    ``key: value`` (split on the first colon only, both sides stripped);
    lines without a colon are ignored.

    Parameters
    ----------
    file_path : str
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty record with the columns ``timestamp``,
        ``rchar``, ``wchar``, ``syscr``, ``syscw``, ``read_bytes``,
        ``write_bytes`` and ``cancelled_write_bytes``. Every column is cast
        to ``int64``; ``timestamp`` is then converted to datetime, reading
        the integer as seconds.

    Notes
    -----
    Keys outside the column list are dropped. A record that lacks one of the
    listed keys, or carries a non-integer value, makes the ``int64`` cast fail.
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()

    # Walk the "---"-delimited chunks, skipping blank ones (e.g. after a trailing delimiter).
    rows = []
    for chunk in raw_text.split('---'):
        chunk = chunk.strip()
        if not chunk:
            continue
        fields = {}
        for entry in chunk.split('\n'):
            # partition() splits on the first ":" only; an empty separator means no ":" was present.
            name, separator, payload = entry.partition(':')
            if separator:
                fields[name.strip()] = payload.strip()
        rows.append(fields)

    # Fixed schema: selects and orders the columns, and every one of them is an int64 counter.
    schema = [
        'timestamp',
        'rchar',
        'wchar',
        'syscr',
        'syscw',
        'read_bytes',
        'write_bytes',
        'cancelled_write_bytes',
    ]
    int_types = {column: 'int64' for column in schema}

    frame = pd.DataFrame(rows, columns=schema)
    frame = frame.astype(int_types)
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], unit='s')
    return frame

# Parse the I/O counter log configured in `writes_io_data_path`. `df` is the only
# frame loaded here; the describe and plot cells below (both the write and the
# read sections) all read from it.
df = parse_io_log_file(writes_io_data_path)

In [None]:
import json
# Running total of benchmark records whose JSON "status" was not "success".
# Reset to 0 whenever this cell is re-run; accumulated across parser calls.
unsuccessful_count = 0


def parse_benchmark_log_file(file_path):
    """Parse a benchmark log into one DataFrame row per successful record.

    Each non-blank line is expected to look like
    ``<timestamp> - <status> - <json message>``. The JSON message must carry a
    ``"status"`` key; records whose status is not ``"success"`` are skipped
    and counted in the module-level ``unsuccessful_count``. Successful
    records must carry a ``"metrics"`` object, from which the fields listed
    in ``metric_columns`` below are extracted.

    Parameters
    ----------
    file_path : str
        Path of the benchmark log file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` and ``status`` (both taken verbatim from the
        line prefix, i.e. still strings) followed by one column per metric.
        An input with no successful records yields an empty frame with the
        same columns.

    Raises
    ------
    ValueError
        If a non-blank line does not have three `` - ``-separated parts, or
        its message is not valid JSON (``json.JSONDecodeError``).
    KeyError
        If the message lacks ``"status"``, or a successful message lacks
        ``"metrics"`` or one of the expected metric fields.
    """
    # Without this declaration the `+=` below makes the name function-local
    # and raises UnboundLocalError on the first unsuccessful record.
    global unsuccessful_count

    # Output column name -> key inside the record's "metrics" object.
    # NOTE(review): "resultCount" is the only camelCase output column; kept
    # as-is so the column names match what the notebook originally produced.
    metric_columns = {
        "elapsed_time": "elapsedTime",
        "execution_time": "executionTime",
        "compile_time": "compileTime",
        "queue_wait_time": "queueWaitTime",
        "resultCount": "resultCount",
        "result_size": "resultSize",
        "processed_objects": "processedObjects",
        "buffer_cache_hit_ratio": "bufferCacheHitRatio",
        "buffer_cache_page_read_count": "bufferCachePageReadCount",
    }

    rows = []
    with open(file_path, 'r') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            # maxsplit=2 keeps the JSON payload intact even if it contains " - ".
            timestamp, status, message = line.split(" - ", 2)
            json_message = json.loads(message)
            if json_message["status"] != "success":
                unsuccessful_count += 1
                continue
            metrics = json_message["metrics"]
            row = {"timestamp": timestamp, "status": status}
            for column, metric_key in metric_columns.items():
                row[column] = metrics[metric_key]
            rows.append(row)

    # Explicit column list so an empty result still has the full schema.
    return pd.DataFrame(rows, columns=["timestamp", "status", *metric_columns])

## Writes
- Write Throughput
  - mean
  - std.dev
  - median
  - p95
  - p99
- Bytes Written to Disk
  - per second
  - aggregate

### Write Throughput

### Bytes Written to Disk

#### Aggregate

In [33]:
# Summary of the wchar column: the default describe() statistics plus the
# 50th, 75th, 95th and 99th percentiles.
df[["wchar"]].describe(percentiles=[0.5, 0.75, 0.95, 0.99])

Unnamed: 0,wchar
count,323.0
mean,42774.232198
std,4514.836803
min,35267.0
50%,42739.0
75%,46475.0
95%,50431.0
99%,50898.0
max,50898.0


#### Over Time

In [29]:
# Import Plotly
import plotly.express as px

# Create a line plot for wchar vs timestamp
fig = px.line(
    df,
    x='timestamp',
    # wchar is the write-side column; it is the series that matches this
    # figure's title and the 'wchar' label entry below.
    y='wchar',
    title='Characters Written vs Timestamp',
    labels={'timestamp': 'Timestamp', 'wchar': 'wchar'},
    template='plotly_white'
)

# Customize the layout: explicit axis titles, slanted x tick labels so the
# timestamps do not overlap, and a centred figure title.
fig.update_layout(
    xaxis_title='Timestamp',
    yaxis_title='Characters Written',
    xaxis=dict(tickangle=45),
    title_x=0.5
)

# Show the plot
fig.show()

## Reads
- Read Throughput
  - mean
  - std.dev
  - median
  - p95
  - p99
- Bytes Read from Disk
  - per second
  - aggregate

In [34]:
# Summary of the rchar column: the default describe() statistics plus the
# 50th, 75th, 95th and 99th percentiles.
df[["rchar"]].describe(percentiles=[0.5, 0.75, 0.95, 0.99])

Unnamed: 0,rchar
count,323.0
mean,25124380.0
std,5350.523
min,25115450.0
50%,25124350.0
75%,25128790.0
95%,25133360.0
99%,25133920.0
max,25133920.0


In [30]:
# Import Plotly
import plotly.express as px

# Create a line plot for rchar vs timestamp
fig = px.line(
    df,
    x='timestamp',
    y='rchar',
    title='Characters Read vs Timestamp',
    labels={'timestamp': 'Timestamp', 'rchar': 'rchar'},
    template='plotly_white'
)

# Customize the layout: explicit axis titles, slanted x tick labels so the
# timestamps do not overlap, and a centred figure title.
fig.update_layout(
    xaxis_title='Timestamp',
    # Plural, so the axis title agrees with the figure title above.
    yaxis_title='Characters Read',
    xaxis=dict(tickangle=45),
    title_x=0.5
)

# Show the plot
fig.show()