In [3]:
# %% [markdown]
# # Linux Performance Comparison Analysis
# 
# This notebook compares benchmark execution times across three Linux platforms (different kernel and glibc versions), using three vector sizes and three runs per platform.

# %%
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# %%
# Prepare the data (could also load from CSV)
#
# The measurements form a full 3 x 3 x 3 grid: platform -> run -> vector size.
# Rows are ordered platform-major, then by run, then by vector size, so the
# label columns are generated from that layout instead of being typed 27 times.
PLATFORMS = [
    "Linux-6.11.0-26-generic-x86_64-with-glibc2.39",
    "Linux-6.12.25-amd64-x86_64-with-glibc2.41",
    "Linux-5.15.0-142-generic-x86_64-with-glibc2.35",
]
RUNS = [1, 2, 3]
VECTOR_SIZES = [1_000_000, 5_000_000, 20_000_000]

# One line per run; the three values on a line follow the VECTOR_SIZES order.
TIMES_S = [
    # Linux-6.11.0-26-generic
    0.007009, 0.056373, 1.104579,
    0.004586, 0.022934, 0.722199,
    0.007372, 0.031431, 0.123634,
    # Linux-6.12.25-amd64
    0.018215, 0.091526, 0.391965,
    0.011815, 0.015908, 0.072467,
    0.004566, 0.017031, 0.094796,
    # Linux-5.15.0-142-generic
    0.00356, 0.039633, 0.186876,
    0.002841, 0.012146, 0.127103,
    0.002562, 0.013044, 0.062455,
]

# Fail fast with a readable message if a measurement was dropped or duplicated
# while transcribing the timings.
EXPECTED_ROWS = len(PLATFORMS) * len(RUNS) * len(VECTOR_SIZES)
assert len(TIMES_S) == EXPECTED_ROWS, (
    f"expected {EXPECTED_ROWS} timings, got {len(TIMES_S)}"
)

data = {
    "OS": [name for name in PLATFORMS for _ in RUNS for _ in VECTOR_SIZES],
    "Run": [run for _ in PLATFORMS for run in RUNS for _ in VECTOR_SIZES],
    "Vector Size": [size for _ in PLATFORMS for _ in RUNS for size in VECTOR_SIZES],
    "Time (s)": TIMES_S,
}

df = pd.DataFrame(data)


def classify_os(platform_string: str) -> str:
    """Label a platform string by a substring of its kernel flavour.

    Returns 'Desktop' when the string contains 'generic', otherwise 'Server'
    when it contains 'amd64', otherwise 'Kali'.

    NOTE(review): every platform string in this dataset contains either
    'generic' or 'amd64', so the 'Kali' fallback is never selected here.
    Confirm that 'amd64' -> 'Server' is the intended label.
    """
    if "generic" in platform_string:
        return "Desktop"
    if "amd64" in platform_string:
        return "Server"
    return "Kali"


# Clean up OS names for better readability.
# expand=False makes str.extract return a Series (one capture group) instead
# of a one-column DataFrame, which is what a column assignment expects.
df["OS_Short"] = df["OS"].str.extract(r"(Linux-\d+\.\d+\.\d+)", expand=False)
df["OS_Type"] = df["OS"].map(classify_os)
# Despite its name, this column holds the glibc version parsed from the
# platform string (e.g. "2.39"); the name is kept so existing references work.
df["OS_Version"] = df["OS"].str.extract(r"glibc(\d+\.\d+)", expand=False)

# %%
# Plot 1: Time vs Vector Size for each OS
#
# Every (OS, vector size) pair has three runs, so lineplot draws their mean
# and, by default, a bootstrapped confidence band around it. The bootstrap is
# random; the fixed seed makes the band identical on every re-run.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(
    data=df,
    x="Vector Size",
    y="Time (s)",
    hue="OS_Short",
    style="OS_Type",
    markers=True,
    dashes=False,
    linewidth=2.5,
    seed=0,  # arbitrary but fixed, for reproducible confidence bands
    ax=ax,
)
ax.set_title('Performance Comparison: Time vs Vector Size Across Linux Systems')
# Sizes and timings both span orders of magnitude, hence log-log axes.
ax.set_xscale('log')
ax.set_yscale('log')
ax.grid(True, which="both", ls="--")
# Anchor the legend outside the axes so it does not cover the lines.
ax.legend(title='Linux Version', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# %%
# Plot 2: Facet grid by Run number
#
# One panel per run, one line per Linux version, so run-to-run variation
# is visible instead of being averaged away.
g = sns.FacetGrid(df, col='Run', hue='OS_Short', height=5, aspect=1.2)
g.map(sns.lineplot, 'Vector Size', 'Time (s)', marker='o')
g.add_legend(title='Linux Version')
g.set(xscale='log', yscale='log')
g.set_titles("Run {col_name}")
g.set_axis_labels("Vector Size (log)", "Time (s) (log)")
# `FacetGrid.figure` replaces the deprecated `FacetGrid.fig` alias
# (available since seaborn 0.11.2). y > 1 lifts the overall title above the
# per-panel "Run N" titles.
g.figure.suptitle('Performance Across Different Runs', y=1.05)
plt.show()

# %%
# Plot 3: Bar plot comparing average performance
#
# Mean time over the runs for each (OS_Short, OS_Type, Vector Size) group,
# returned as a flat frame with the group keys as ordinary columns.
avg_times = df.groupby(
    ['OS_Short', 'OS_Type', 'Vector Size'], as_index=False
)['Time (s)'].mean()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=avg_times,
    x='OS_Short',
    y='Time (s)',
    hue='Vector Size',
    palette='viridis',
    edgecolor='black',
    ax=ax,
)
ax.set_title('Average Execution Time by Linux Version and Vector Size')
# Tilt the version labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel('Time (seconds)')
ax.set_xlabel('Linux Version')
ax.legend(title='Vector Size', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# %%
# Plot 4: Heatmap of performance
#
# Rows are (OS_Short, OS_Type) pairs, columns are vector sizes, and each cell
# is the mean time over the runs for that combination.
heatmap_data = pd.pivot_table(
    df,
    values='Time (s)',
    index=['OS_Short', 'OS_Type'],
    columns='Vector Size',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    heatmap_data,
    annot=True,      # print the value inside every cell
    fmt=".3f",
    cmap="YlOrRd",
    linewidths=.5,
    ax=ax,
)
ax.set_title('Heatmap of Execution Times (seconds)')
ax.set_xlabel('Vector Size')
ax.set_ylabel('Linux Version and Type')
fig.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'pandas'