In [None]:
import os
import glob
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def combine_csv_files(directory):
    """Load every ``*.csv`` file directly inside ``directory`` and stack them.

    Files are read in sorted path order so that the row order of the combined
    frame is reproducible (``glob`` makes no ordering guarantee).

    Parameters
    ----------
    directory : str or path-like
        Folder that is searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    combined_df : pandas.DataFrame
        Row-wise concatenation of all files, re-indexed from 0.
    df_list : list of pandas.DataFrame
        The individual per-file frames, in the order they were concatenated.

    Raises
    ------
    ValueError
        If ``directory`` contains no CSV files.
    """
    csv_files = sorted(glob.glob(os.path.join(directory, "*.csv")))
    print(f"Found {len(csv_files)} CSV files in {directory}")

    if not csv_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # raise the same exception type with a message naming the directory.
        raise ValueError(f"No CSV files found in {directory!r}")

    df_list = [pd.read_csv(file) for file in csv_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df, df_list

In [None]:
# Shared colour palette (hex RGB strings) used by the figures below.
scatter_color = '#1f77b4'    # per-job points drawn with ax.scatter
highlight_color = '#0D47A1'  # not referenced in the cells visible here
median_color = '#E53935'     # binned "Median Prediction" curve
refline_color = '#212121'    # dashed "Predicted = Actual" reference line

# Power Prediction

In [None]:
# Merge all per-file power-prediction results into a single DataFrame.
csv_directory = "../data/baseline/power/"
# Single source of truth for the output location: used by both the write and
# the log message so the two cannot drift apart.
power_results_csv = "../data/baseline_power_results.csv"

combined_df, df_list = combine_csv_files(csv_directory)
print("Combined DataFrame shape:", combined_df.shape)

# index=False: do not write the row index as an extra column.
combined_df.to_csv(power_results_csv, index=False)
print(f"Saved combined DataFrame to '{power_results_csv}'")

In [None]:
# Predicted-vs-actual power: raw per-job scatter, plus the median prediction
# inside each quantile bin of the actual power.
num_bins = 500
# duplicates='drop' merges repeated quantile edges; without it qcut raises
# ValueError ("Bin edges must be unique") when many jobs share a power value.
combined_df['power_bin'] = pd.qcut(
    combined_df['avg_power_per_node'], q=num_bins, duplicates='drop'
)

# observed=True: aggregate only bins that actually contain jobs, and avoid the
# FutureWarning pandas emits for categorical keys without an explicit setting.
grouped = combined_df.groupby('power_bin', observed=True)['predicted_power']
stats = grouped.agg(
    median='median'
).reset_index()

# Interval midpoints as plain floats (x-position of each bin on the plot).
stats['bin_center'] = [
    (interval.left + interval.right) / 2 for interval in stats['power_bin']
]

fig, ax = plt.subplots(1, 1, figsize=(12, 10), dpi=300)

ax.scatter(combined_df['avg_power_per_node'], combined_df['predicted_power'], s=1, alpha=0.1, c=scatter_color)
# Legend proxy: the real points are too small and transparent to read in the
# legend, so register an empty, opaque artist instead of re-plotting a data row.
ax.scatter([], [], s=10, alpha=1, label='Individual Jobs', c=scatter_color)

ax.plot(stats['bin_center'], stats['median'], color=median_color, lw=5, label='Median Prediction', alpha=0.8)
# y = x reference: a perfect predictor would lie on this line.
ax.plot(stats['bin_center'], stats['bin_center'], linestyle='--', color=refline_color, lw=3, label='Predicted = Actual')

ax.set_ylabel("Predicted Power (W)", fontsize=16)
ax.set_xlabel("Actual Power (W)", fontsize=16)
ax.grid(True)
ax.legend(loc='lower right', fontsize=16)

# Identical limits on both axes keep the y = x line on the diagonal.
ax.set_xlim([200, 800])
ax.set_ylim([200, 800])

plt.tight_layout()
# plt.savefig('../figures/semantic_search_power.png', dpi=300)
plt.show()

# Runtime Prediction

In [None]:
# Merge all per-file runtime-prediction results into a single DataFrame.
csv_directory = "../data/baseline/runtime/"
# Single source of truth for the output location: used by both the write and
# the log message so the two cannot drift apart.
runtime_results_csv = "../data/baseline_runtime_results.csv"

combined_df, df_list = combine_csv_files(csv_directory)
print("Combined DataFrame shape:", combined_df.shape)

# index=False: do not write the row index as an extra column.
combined_df.to_csv(runtime_results_csv, index=False)
print(f"Saved combined DataFrame to '{runtime_results_csv}'")

In [None]:
# Predicted-vs-actual runtime (in hours, log-log axes): raw per-job scatter,
# plus the median prediction inside each quantile bin of the actual runtime.
num_bins = 100
seconds_per_hour = 3600

# Convert once and reuse for binning and plotting.
actual_runtime_hours = combined_df['wallclock_used_sec'] / seconds_per_hour
combined_df['predicted_runtime_hours'] = combined_df['predicted_runtime'] / seconds_per_hour

# duplicates='drop' merges repeated quantile edges instead of raising.
combined_df['runtime_bin'] = pd.qcut(actual_runtime_hours, q=num_bins, duplicates='drop')

# observed=True: aggregate only bins that actually contain jobs, and avoid the
# FutureWarning pandas emits for categorical keys without an explicit setting.
grouped = combined_df.groupby('runtime_bin', observed=True)['predicted_runtime_hours']
stats = grouped.agg(
    median='median'
).reset_index()

# Interval midpoints as plain floats (x-position of each bin on the plot).
stats['bin_center'] = [
    (interval.left + interval.right) / 2 for interval in stats['runtime_bin']
]

fig, ax = plt.subplots(1, 1, figsize=(12, 10), dpi=300)

ax.scatter(actual_runtime_hours, combined_df['predicted_runtime_hours'], color=scatter_color, s=1, alpha=.1)
# Legend proxy: the real points are too small and transparent to read in the
# legend, so register an empty, opaque artist instead of re-plotting a data row.
ax.scatter([], [], color=scatter_color, s=10, label='Individual Jobs')

ax.plot(stats['bin_center'], stats['median'], color=median_color, lw=5, label='Median Prediction', alpha=0.8)
# y = x reference: a perfect predictor would lie on this line.
ax.plot(stats['bin_center'], stats['bin_center'], linestyle='--', color=refline_color, lw=3, label='Predicted = Actual')

ax.set_ylabel("Predicted Runtime (hours)", fontsize=16)
ax.set_xlabel("Actual Runtime (hours)", fontsize=16)
ax.grid(True)
ax.legend(loc='lower right', fontsize=16)

# Log scale on both axes through the Axes API (no pyplot state machine),
# then the same 10 s .. 200000 s window, expressed in hours, on each axis.
ax.set_xscale('log')
ax.set_yscale('log')
runtime_limits_hours = [10 / seconds_per_hour, 200000 / seconds_per_hour]
ax.set_xlim(runtime_limits_hours)
ax.set_ylim(runtime_limits_hours)

plt.tight_layout()
# plt.savefig('../figures/semantic_search_runtime.png', dpi=300)
plt.show()