# Description

This notebook reads the performance data of the jobes from the `performance` directory and shows the performance of the jobes in a table.

In [4]:
import pandas as pd
import glob

In [6]:

dfs = []
files = glob.glob('performance/*.csv')
print(files)
for file in files:
    df = pd.read_csv(file)
    # Drop the rows with the metrics jobs_records	tasks_records if they are present
    df = df[~df['metric'].str.contains('jobs_records|tasks_records')]
    # add first 10 characters of the filename as a column
    df['filename'] = file.split('/')[-1][:10]
    dfs.append(df)

['performance/q1_performance_metrics_1736699369.csv', 'performance/q2_performance_metrics_1736699518.csv', 'performance/q3_performance_metrics_1736700092.csv', 'performance/q4_performance_metrics_1736700640.csv', 'performance/q5_bonus1_performance_metrics_1736702121.csv', 'performance/q6_performance_metrics_1736702746.csv', 'performance/q8_performance_metrics_1736718629.csv', 'performance/q9_bonus2_performance_metrics_1736719852.csv']


In [7]:
# For each df change the reading_time & processing_time from seconds to a readable format (HH:MM:SS)
for df in dfs:
    mask = df['metric'].isin(['reading_time', 'processing_time'])
    df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')

  df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')
  df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')
  df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')
  df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')
  df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')
  df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')
  df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')
  df.loc[mask, 'value'] = pd.to_datetime(df.loc[mask, 'value'].astype(float), unit='s').dt.strftime('%H:%M:%S')


In [8]:
def format_records(value):
    num = int(value)
    if num >= 1_000_000:
        return f'{num / 1_000_000:.1f}M'
    elif num >= 1_000:
        return f'{num / 1_000:.1f}k'
    else:
        return str(num)

for df in dfs:
    mask = df['metric'].str.endswith('_records')
    df.loc[mask, 'value'] = df.loc[mask, 'value'].apply(format_records)

In [9]:
# Initialize an empty list to hold the processed data
rows = []

# Process each dataframe
for idx, df in enumerate(dfs):
    # Extract the question ID (e.g., q1, q2, ...)
    question_id = f'Question {idx + 1}'
    # Convert the dataframe to a dictionary of metrics and add the question ID
    row = {row['metric']: row['value'] for _, row in df.iterrows()}
    row['filename'] = df['filename'].iloc[0]
    rows.append(row)

# Create a dataframe from the list of rows
combined_df = pd.DataFrame(rows)

# Fill missing values with a placeholder (e.g., '-')
combined_df = combined_df.fillna('-')



In [11]:
from matplotlib import pyplot as plt
import re

def extract_question(filename):
    """Extract the number of the question number from the filename."""
    match = re.search(r'q(\d+)', filename)
    return int(match.group(1)) if match else None

# Extract cores and convert times to seconds
combined_df['questions'] = combined_df['filename'].apply(extract_question)

# Sort by number of cores
combined_df = combined_df.sort_values('questions')

combined_df.head(10)

Unnamed: 0,reading_time,processing_time,total_records,filename,questions
0,00:00:27,00:00:13,37.8k,q1_perform,1
1,00:00:27,00:00:21,37.8k,q2_perform,2
2,00:01:44,00:02:50,146.7M,q3_perform,3
3,00:00:59,00:01:53,144.6M,q4_perform,4
4,00:01:28,00:08:06,144.7M,q5_bonus1_,5
5,00:00:55,00:04:29,144.6M,q6_perform,6
6,00:01:52,00:18:41,1377.4M,q8_perform,8
7,00:00:45,00:08:03,1232.8M,q9_bonus2_,9
