In [None]:
import ast
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import LogFormatter

In [None]:
def combine_csv_files(directory):
    """Read every ``*.csv`` file directly inside ``directory`` and stack them.

    Files are read in sorted path order so that the row order of the combined
    frame (and of anything saved from it) is the same on every machine;
    ``glob`` alone returns paths in an arbitrary, filesystem-dependent order.

    Parameters
    ----------
    directory : str or path-like
        Folder to scan. Only files matching ``*.csv`` at the top level are
        read; sub-directories are not searched.

    Returns
    -------
    combined_df : pandas.DataFrame
        Row-wise concatenation of all files with a fresh ``0..n-1`` index.
    df_list : list of pandas.DataFrame
        The individual per-file frames, in the same order they were stacked.

    Raises
    ------
    ValueError
        If ``directory`` contains no CSV files.
    """
    csv_files = sorted(glob.glob(os.path.join(directory, "*.csv")))
    print(f"Found {len(csv_files)} CSV files in {directory}")

    # pd.concat([]) would also raise ValueError, but with a message that does
    # not mention which directory was empty.
    if not csv_files:
        raise ValueError(f"No CSV files found in {directory!r}; nothing to combine")

    df_list = [pd.read_csv(path) for path in csv_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df, df_list


# Folder holding the individual semantic-search result CSVs.
csv_directory = "../data/semantic_search/"

# Load them all: one stacked frame plus the list of per-file frames.
combined_df, df_list = combine_csv_files(directory=csv_directory)
print(f"Combined DataFrame shape: {combined_df.shape}")

# Persist the stacked table. The target lives in ../data/, i.e. outside
# csv_directory, so re-running this cell does not re-read its own output.
combined_df.to_csv(path_or_buf="../data/semantic_search_results.csv", index=False)
print("Saved combined DataFrame to '../data/semantic_search_results.csv'")

In [None]:
def _top_neighbor_value(neighbors, key):
    """Return ``key`` from the first neighbor, or NaN when no neighbor was retrieved."""
    return neighbors[0][key] if neighbors else np.nan


combined_df['id'] = combined_df['id'].astype(str)

# `neighbors` is stored as text in the CSVs; parse it back into Python objects.
# The code below indexes the result as a sequence of mappings.
# NOTE(review): assumes the first entry is the closest match - confirm against
# the code that wrote the CSVs.
combined_df['neighbors_object'] = combined_df['neighbors'].apply(ast.literal_eval)
combined_df['n_neighbors'] = combined_df['neighbors_object'].apply(len)

# Derived column -> key read from the first neighbor. Rows whose neighbor list
# is empty get NaN instead of aborting the whole cell with an IndexError; they
# remain identifiable through n_neighbors == 0.
top_neighbor_columns = {
    'power_predicted_top_neighbor': 'avg_power_per_node',
    'runtime_predicted_top_neighbor': 'wallclock_used_sec',
    'distance_top_neighbor': 'distance',
    'id_top_neighbor': 'id',
}
for column, key in top_neighbor_columns.items():
    combined_df[column] = combined_df['neighbors_object'].apply(_top_neighbor_value, args=(key,))

In [None]:
# Shared colour palette for the prediction-vs-actual figures below.
scatter_color = "#1f77b4"    # individual job markers
highlight_color = "#0D47A1"  # darker blue accent
median_color = "#E53935"     # binned median-prediction curve
refline_color = "#212121"    # "Predicted = Actual" reference line

In [None]:
# Number of quantile bins (roughly equal job counts per bin) for the median curve.
num_bins = 500
# duplicates='drop' merges repeated quantile edges instead of raising ValueError
# when many jobs share the same power value (same setting as the runtime figure).
combined_df['power_bin'] = pd.qcut(combined_df['avg_power_per_node'], q=num_bins, duplicates='drop')

# Median predicted power within each bin of actual power.
# observed=True is stated explicitly because the implicit default for
# categorical group keys is deprecated in recent pandas.
grouped = combined_df.groupby('power_bin', observed=True)['predicted_power']
stats = grouped.agg(
    median='median'
).reset_index()

# Interval midpoints as plain floats (the bin column itself is categorical).
stats['bin_center'] = stats['power_bin'].apply(lambda interval: interval.mid).astype(float)

fig, ax = plt.subplots(1, 1, figsize=(12, 10), dpi=300)

ax.scatter(combined_df['avg_power_per_node'], combined_df['predicted_power'], s=1, alpha=0.1, c=scatter_color)
# Legend proxy: the real markers are tiny and nearly transparent, so draw an
# empty, fully opaque series purely to get a readable legend entry. This avoids
# re-plotting one arbitrary job as a large dot and does not depend on the index
# containing the label 0.
ax.scatter([], [], s=10, alpha=1, label='Individual Jobs', c=scatter_color)

ax.plot(stats['bin_center'], stats['median'], color=median_color, lw=5, label='Median Prediction', alpha=0.8)
# Identity line: points on it are perfect predictions.
ax.plot(stats['bin_center'], stats['bin_center'], linestyle='--', color=refline_color, lw=3, label='Predicted = Actual')

ax.set_ylabel("Predicted Power (W)", fontsize=16)
ax.set_xlabel("Actual Power (W)", fontsize=16)
ax.grid(True)
ax.legend(loc='lower right', fontsize=16)

# Same window on both axes so the identity line runs corner to corner.
ax.set_xlim([200, 800])
ax.set_ylim([200, 800])

fig.tight_layout()
# plt.savefig('../figures/semantic_search_power.png', dpi=300)
plt.show()

In [None]:
# Number of quantile bins for the median curve; duplicates='drop' lets pandas
# merge bins whose edges coincide, so the realised bin count may be smaller.
num_bins = 100
actual_runtime_hours = combined_df['wallclock_used_sec'] / 3600
combined_df['runtime_bin'] = pd.qcut(actual_runtime_hours, q=num_bins, duplicates='drop')

# The source columns are divided by 3600 so the figure can be labelled in hours.
combined_df['predicted_runtime_hours'] = combined_df['predicted_runtime'] / 3600

# Median predicted runtime within each bin of actual runtime.
# observed=True is stated explicitly because the implicit default for
# categorical group keys is deprecated in recent pandas.
grouped = combined_df.groupby('runtime_bin', observed=True)['predicted_runtime_hours']
stats = grouped.agg(
    median='median'
).reset_index()

# Interval midpoints as plain floats (the bin column itself is categorical).
stats['bin_center'] = stats['runtime_bin'].apply(lambda interval: interval.mid).astype(float)

fig, ax = plt.subplots(1, 1, figsize=(12, 10), dpi=300)

ax.scatter(actual_runtime_hours, combined_df['predicted_runtime_hours'], color=scatter_color, s=1, alpha=.1)
# Legend proxy: an empty, opaque series gives a readable legend entry without
# re-plotting one arbitrary job as a large dot or relying on index label 0.
ax.scatter([], [], color=scatter_color, s=10, label='Individual Jobs')

ax.plot(stats['bin_center'], stats['median'], color=median_color, lw=5, label='Median Prediction', alpha=0.8)
# Identity line: points on it are perfect predictions.
ax.plot(stats['bin_center'], stats['bin_center'], linestyle='--', color=refline_color, lw=3, label='Predicted = Actual')

ax.set_ylabel("Predicted Runtime (hours)", fontsize=16)
ax.set_xlabel("Actual Runtime (hours)", fontsize=16)
ax.grid(True)
ax.legend(loc='lower right', fontsize=16)

# Axis window of 10 s to 200000 s, expressed in hours, identical on both axes.
ax.set_xlim([10 / 3600, 200000 / 3600])
ax.set_ylim([10 / 3600, 200000 / 3600])
# Both scales are set through the Axes object (the original mixed plt.* and ax.*).
ax.set_xscale('log')
ax.set_yscale('log')
fig.tight_layout()
# plt.savefig('../figures/semantic_search_runtime.png', dpi=300)
plt.show()

In [None]:
def plot_error_by_range(df, num_bins=30):
    """Plot median and inter-quartile absolute error against neighbor range.

    Rows are grouped into quantile bins of ``range_of_matches``; for every bin
    the median, 25th and 75th percentile of ``absolute_error`` are drawn over
    the bin midpoint on log-log axes, with the inter-quartile band shaded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``range_of_matches`` and ``absolute_error``.
        NOTE(review): neither column is created in this notebook, so they are
        presumably already present in the loaded CSVs - confirm.
        The frame is not modified.
    num_bins : int, default 30
        Requested number of quantile bins. Bins with identical edges are
        merged, so fewer bins may be produced.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Work on a private two-column copy: callers typically pass a filtered
    # slice, and writing the bin column into it would raise
    # SettingWithCopyWarning and leak a helper column into their frame.
    binned = df[["range_of_matches", "absolute_error"]].copy()
    binned["range_bin"] = pd.qcut(binned["range_of_matches"], q=num_bins, duplicates='drop')

    # observed=True is stated explicitly because the implicit default for
    # categorical group keys is deprecated in recent pandas.
    grouped = binned.groupby("range_bin", observed=True)["absolute_error"]
    stats = grouped.agg(
        median="median",
        q1=lambda x: x.quantile(0.25),
        q3=lambda x: x.quantile(0.75)
    ).reset_index()

    # Interval midpoints as plain floats (the bin column itself is categorical).
    stats["bin_center"] = stats["range_bin"].apply(lambda x: x.mid).astype(float)

    fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
    sns.lineplot(data=stats, x="bin_center", y="median", label="Median", color='#D81159', linewidth=2.5, ax=ax)
    sns.lineplot(data=stats, x="bin_center", y="q1", label="Q1 (25%)", color='#3A86FF', linestyle='--', ax=ax)
    sns.lineplot(data=stats, x="bin_center", y="q3", label="Q3 (75%)", color='#3A86FF', linestyle='--', ax=ax)
    ax.fill_between(stats["bin_center"], stats["q1"], stats["q3"], color='#3A86FF', alpha=0.2)

    ax.set_xlabel("Power Range of Five Nearest Neighbors (W)", fontsize=16)
    ax.set_ylabel("Predicted Power Absolute Error (W)", fontsize=16)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.legend(fontsize=12, loc='upper left')

    # labelOnlyBase=False lets the formatter label ticks that are not exact
    # powers of ten as well.
    ax.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=False))
    ax.xaxis.set_minor_formatter(LogFormatter(labelOnlyBase=False))
    ax.yaxis.set_major_formatter(LogFormatter(labelOnlyBase=False))
    ax.yaxis.set_minor_formatter(LogFormatter(labelOnlyBase=False))

    # Major tick labels at 16 pt (the value that won in the original, which
    # set 14 pt first and then overrode it).
    ax.tick_params(axis='both', which='major', labelsize=16)

    # Fix the window before tight_layout so the layout is computed for the
    # tick labels that are actually drawn.
    ax.set_xlim([1, 800])
    ax.set_ylim([.1, 800])
    fig.tight_layout()
    # plt.savefig('../figures/error_by_range.png', dpi=300)
    plt.show()

In [None]:
# Only consider results with more than one retrieved neighbor:
# with a single neighbor there is no range to measure.
# An explicit copy is passed so that any column the plotting helper adds lands
# on an independent frame, not on a filtered slice of combined_df (which would
# trigger SettingWithCopyWarning).
plot_error_by_range(combined_df[combined_df.n_neighbors > 1].copy())