In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Which experiment's result files are plotted: "clf" (classification) or
# "reg" (regression). Change this single value to switch tasks.
task = "clf"

# Human-readable task label used in plot titles.
# The second branch used to be `elif "reg":`, which tests a non-empty string
# literal and is therefore always true: any unrecognised task value was silently
# labelled "Regression". Compare against `task` and reject unknown values.
if task == "clf":
    task_name = "Classification"
elif task == "reg":
    task_name = "Regression"
else:
    raise ValueError(f"Unknown task {task!r}; expected 'clf' or 'reg'")

# Per-iteration timing logs of the run without ("notopt") and with ("opt")
# the optimization.
fpath_notopt_times = f'res_{task}_notopt_times.csv'
fpath_opt_times = f'res_{task}_opt_times.csv'

# Tree size/depth log of the run without the optimization.
fpath_notopt_depth = f'res_{task}_notopt_depth.csv'

# Sequential vs non-sequential access counts for both runs.
fpath_notopt_sorted_count = f'res_{task}_notopt_sorted_count.csv'
fpath_opt_sorted_count = f'res_{task}_opt_sorted_count.csv'

# Every figure below is saved under plots/; create the directory up front so
# savefig does not fail on a fresh checkout.
os.makedirs('plots', exist_ok=True)


# Python vs Rust

In [None]:
# Typography for the Python-vs-Rust bar charts (font sizes in points).
axissize, ticksize = 18, 16      # axis labels / tick labels
titlesize, legendsize = 20, 16   # figure title / legend text
dpi = 150                        # resolution of displayed and saved figures

In [None]:
def python_vs_rust(python_times, rust_times, task, task_name):
    """Draw, save and show a two-bar chart of mean Python vs Rust run times.

    Each bar is the mean of one list of timings and its error bar is the pandas
    (sample) standard deviation of that list. The figure is written to
    ``plots/python-vs-rust-{task}.png``.

    Args:
        python_times: Execution times of the Python runs, in seconds.
        rust_times: Execution times of the Rust runs, in seconds.
        task: Short task tag inserted into the output file name.
        task_name: Task label inserted into the figure title.

    Font sizes and ``dpi`` are read from the module-level settings at call time.
    """
    time_col = 'Execution Time (sec)'
    py_runs = pd.DataFrame(python_times, columns=[time_col])[time_col]
    rs_runs = pd.DataFrame(rust_times, columns=[time_col])[time_col]

    plt.figure(figsize=(10, 6), dpi=dpi)

    # Only the Python bar carries a legend label; the legend itself is not drawn.
    plt.bar('Python', py_runs.mean(), yerr=py_runs.std(),
            color='orange', alpha=0.7, label=time_col)
    plt.bar('Rust', rs_runs.mean(), yerr=rs_runs.std(),
            color='red', alpha=0.7)

    plt.title(f'Execution Time of Python vs Rust for {task_name}', fontsize=titlesize)
    plt.xlabel('Programming Languages', fontsize=axissize)
    plt.ylabel(time_col, fontsize=axissize)

    plt.xticks(fontsize=ticksize)
    plt.yticks(fontsize=ticksize)
    plt.grid(True)

    plt.savefig(f'plots/python-vs-rust-{task}.png', dpi=dpi)
    plt.show()


In [None]:
# Classification
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Provided execution times (seconds) for the classification task, 40 runs per
# language.
python_times = [
    11.77, 10.62, 10.00, 9.46, 10.90, 10.50, 8.48, 11.26, 9.98, 10.40,
    9.18, 9.98, 9.16, 10.43, 9.31, 8.96, 12.19, 10.97, 9.15, 9.91,
    9.77, 12.49, 11.29, 10.32, 14.11, 10.17, 9.60, 10.85, 11.47, 10.57,
    10.34, 11.78, 9.52, 10.21, 10.53, 10.83, 11.63, 11.94, 10.05, 9.11
]

rust_times = [
    2.845, 2.919, 2.835, 2.818, 3.067, 2.988, 3.084, 2.790, 2.818, 2.791,
    2.807, 2.803, 2.797, 2.804, 2.780, 2.786, 2.807, 2.792, 2.791, 2.827,
    2.971, 3.015, 2.960, 2.958, 2.835, 2.868, 2.816, 2.944, 2.845, 2.993,
    2.840, 2.881, 2.833, 2.853, 2.935, 2.908, 2.920, 2.861, 2.900, 2.824
]

python_vs_rust(python_times, rust_times, "clf", "Classification")

# Report mean ± sample standard deviation (ddof=1), i.e. the same spread the
# chart's error bars show. The value after "±" used to be the variance, which is
# in squared seconds and cannot be read as an uncertainty on the mean.
# Rust timings are recorded to three decimals, so they are printed with three.
print(f"Python: {np.mean(python_times):.2f}±{np.std(python_times, ddof=1):.2f}")
print(f"Rust: {np.mean(rust_times):.3f}±{np.std(rust_times, ddof=1):.3f}")

In [None]:
# Regression
# Provided execution times (seconds) for the regression task, 40 runs per
# language.
python_times = [
    13.02, 13.19, 12.64, 13.14, 12.93, 13.27, 13.46, 13.45, 14.24, 13.26,
    13.58, 12.96, 13.44, 13.44, 13.38, 12.60, 13.33, 12.52, 13.01, 13.54,
    13.09, 13.57, 13.33, 13.43, 13.82, 13.83, 12.94, 12.50, 13.21, 14.28,
    13.35, 12.88, 13.52, 13.57, 14.24, 13.78, 13.65, 13.59, 13.38, 13.43
]

rust_times = [
    0.456, 0.458, 0.462, 0.468, 0.465, 0.472, 0.463, 0.475, 0.475, 0.471,
    0.480, 0.447, 0.467, 0.448, 0.471, 0.472, 0.507, 0.454, 0.460, 0.476,
    0.476, 0.473, 0.485, 0.481, 0.463, 0.478, 0.529, 0.446, 0.459, 0.503,
    0.473, 0.447, 0.450, 0.523, 0.483, 0.462, 0.470, 0.449, 0.483, 0.477
]

python_vs_rust(python_times, rust_times, "reg", "Regression")

# Report mean ± sample standard deviation (ddof=1), i.e. the same spread the
# chart's error bars show. The value after "±" used to be the variance, which is
# in squared seconds and cannot be read as an uncertainty on the mean; the mean
# and the spread were also printed with different precision.
# Rust timings are recorded to three decimals, so they are printed with three.
print(f"Python: {np.mean(python_times):.2f}±{np.std(python_times, ddof=1):.2f}")
print(f"Rust: {np.mean(rust_times):.3f}±{np.std(rust_times, ddof=1):.3f}")

# Execution time

In [None]:
# Number of consecutive rows grouped into one box on the x-axis of the
# execution-time plots.
block_size = 50000

# Typography for the execution-time box plots (font sizes in points).
titlesize, axissize = 24, 20     # figure title / axis labels
legendsize, ticksize = 16, 16    # legend text / tick labels
dpi = 150                        # resolution of displayed and saved figures

In [None]:
# Timing logs of the run without and with the optimization.
# NOTE(review): read_csv is used with its default header handling, so the first
# line of each file is consumed as a header before the columns are renamed below.
# Confirm the files really start with a header row (the depth log is read with
# header=None).
data_no_caching = pd.read_csv(fpath_notopt_times)
data_caching = pd.read_csv(fpath_opt_times)

# Previews of the raw frames (taken before the columns are renamed); kept for
# interactive inspection, nothing below reads them.
data_no_caching_head = data_no_caching.head()
data_caching_head = data_caching.head()

time_columns = ['Inference Time', 'Train Time', 'Total Time']

for frame, label in ((data_no_caching, 'Without Optimization'),
                     (data_caching, 'With Optimization')):
    frame.columns = time_columns
    frame['Optimization'] = label
    # Consecutive runs of `block_size` rows form one block, numbered from 1.
    frame['Block'] = (frame.index // block_size) + 1

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs kept their own 0..n-1 labels, so every label appeared twice, which
# makes label-based selection ambiguous and can trip index-aligning operations.
combined_data = pd.concat([data_no_caching, data_caching], ignore_index=True)

# Nanoseconds -> microseconds (the plots label the axis in µs).
combined_data[time_columns] = combined_data[time_columns] / 1000

combined_data

In [None]:
# Box plot of the total time per iteration: one pair of boxes (with / without
# optimization) per block of records, outliers hidden.
n_blocks = len(combined_data['Block'].unique())
segment_labels = [
    f"{int(i*block_size/1000)}-{int((i+1)*block_size/1000)}"
    for i in range(n_blocks)
]
optimization_colors = {'With Optimization': 'lightblue',
                       'Without Optimization': 'lightgreen'}

fig = plt.figure(figsize=(14, 8), dpi=dpi)
ax = sns.boxplot(data=combined_data, x='Block', y='Total Time', hue='Optimization',
                 palette=optimization_colors, showfliers=False)

ax.set_title(f'Total execution time per iteration on {task_name} task', fontsize=titlesize)
ax.set_xlabel('Record Segments (Thousands)', fontsize=axissize)
ax.set_ylabel('Time (µs)', fontsize=axissize)

# Replace the block numbers on the x-axis with "start-end" ranges in thousands.
ax.set_xticks(range(n_blocks))
ax.set_xticklabels(segment_labels, rotation=45, fontsize=ticksize)
plt.yticks(fontsize=ticksize)

ax.legend(title='Optimization', title_fontsize=legendsize, fontsize=legendsize)
ax.grid(True)

fig.savefig(f'plots/time-per-iter-tot-{task}.png', dpi=dpi)
plt.show()


In [None]:
# Long format: one row per (original row, time type), so that each
# "optimization - time type" combination can be used as a hue category.
melted_data = combined_data.melt(
    id_vars=['Block', 'Optimization'],
    value_vars=['Inference Time', 'Train Time'],
    var_name='Time Type',
    value_name='Time',
)
melted_data['Category'] = melted_data['Optimization'] + ' - ' + melted_data['Time Type']

# Keep the training measurements only.
is_train = melted_data['Time Type'] == 'Train Time'
train_time_data = melted_data[is_train]

n_blocks = len(combined_data['Block'].unique())
segment_labels = [
    f"{int(i*block_size/1000)}-{int((i+1)*block_size/1000)}"
    for i in range(n_blocks)
]
train_colors = {'With Optimization - Train Time': 'lightcoral',
                'Without Optimization - Train Time': 'lightyellow'}

fig = plt.figure(figsize=(16, 10), dpi=dpi)
ax = sns.boxplot(data=train_time_data, x='Block', y='Time', hue='Category',
                 showfliers=False, palette=train_colors)

ax.set_title(f'Train execution time per iteration on {task_name} task', fontsize=titlesize)
ax.set_xlabel('Record Segments (Thousands)', fontsize=axissize)
ax.set_ylabel('Time (µs)', fontsize=axissize)

# Replace the block numbers on the x-axis with "start-end" ranges in thousands.
ax.set_xticks(range(n_blocks))
ax.set_xticklabels(segment_labels, rotation=45, fontsize=ticksize)

ax.legend(title='Category', title_fontsize=legendsize, fontsize=legendsize)
ax.grid(True)

fig.savefig(f'plots/time-per-iter-train-{task}.png', dpi=dpi)
plt.show()

In [None]:
# Keep the inference measurements only (melted_data comes from the train-time cell).
is_inference = melted_data['Time Type'] == 'Inference Time'
inference_time_data = melted_data[is_inference]

n_blocks = len(combined_data['Block'].unique())
segment_labels = [
    f"{int(i*block_size/1000)}-{int((i+1)*block_size/1000)}"
    for i in range(n_blocks)
]
inference_colors = {'With Optimization - Inference Time': 'lightseagreen',
                    'Without Optimization - Inference Time': 'paleturquoise'}

fig = plt.figure(figsize=(16, 10), dpi=dpi)
ax = sns.boxplot(data=inference_time_data, x='Block', y='Time', hue='Category',
                 showfliers=False, palette=inference_colors)

ax.set_title(f'Inference execution time per iteration on {task_name} task', fontsize=titlesize)
ax.set_xlabel('Record Segments (Thousands)', fontsize=axissize)
ax.set_ylabel('Time (µs)', fontsize=axissize)

# Replace the block numbers on the x-axis with "start-end" ranges in thousands.
ax.set_xticks(range(n_blocks))
ax.set_xticklabels(segment_labels, rotation=45, fontsize=ticksize)

ax.legend(title='Category', title_fontsize=legendsize, fontsize=legendsize)
ax.grid(True)

# Axis limits are left automatic.
fig.savefig(f'plots/time-per-iter-inf-{task}.png', dpi=dpi)
plt.show()


In [None]:
# Long format: one row per (original row, time type); the combined
# "optimization - time type" string is the hue category of the plot.
melted_data = combined_data.melt(
    id_vars=['Block', 'Optimization'],
    value_vars=['Inference Time', 'Train Time'],
    var_name='Time Type',
    value_name='Time',
)
melted_data['Category'] = melted_data['Optimization'] + ' - ' + melted_data['Time Type']

n_blocks = len(combined_data['Block'].unique())
segment_labels = [
    f"{int(i*block_size/1000)}-{int((i+1)*block_size/1000)}"
    for i in range(n_blocks)
]
category_colors = {
    'With Optimization - Inference Time': 'lightseagreen',
    'Without Optimization - Inference Time': 'paleturquoise',
    'With Optimization - Train Time': 'lightcoral',
    'Without Optimization - Train Time': 'lightyellow',
}

# All four categories side by side in each block, outliers hidden.
fig = plt.figure(figsize=(16, 10), dpi=dpi)
ax = sns.boxplot(data=melted_data, x='Block', y='Time', hue='Category',
                 showfliers=False, palette=category_colors)

ax.set_title(f'Train and inference execution time per iteration on {task_name} task', fontsize=titlesize)
ax.set_xlabel('Record Segments (Thousands)', fontsize=axissize)
ax.set_ylabel('Time (µs)', fontsize=axissize)

# Replace the block numbers on the x-axis with "start-end" ranges in thousands.
ax.set_xticks(range(n_blocks))
ax.set_xticklabels(segment_labels, rotation=45, fontsize=ticksize)

ax.legend(title='Category', title_fontsize=legendsize, fontsize=legendsize)
ax.grid(True)

# This overview figure is only displayed; it is not written to disk.
plt.show()

# Number of Nodes (Image not saved)

In [None]:
# Typography for the node-count and tree-depth plots (font sizes in points).
titlesize, axissize = 22, 20     # figure title / axis labels
ticksize, legendsize = 16, 14    # tick labels / legend text
dpi = 150                        # resolution of displayed and saved figures

In [None]:
# Depth log of the non-optimized run; the file has no header row.
data = pd.read_csv(fpath_notopt_depth, header=None)

# Only the first column (the node count) is needed for this plot.
nodes_count = data.iloc[:, 0]

# Row i of the log is placed at (i + 1) * 1000 records.
# NOTE(review): this assumes one log row per 1000 processed records — confirm
# against the code that writes the file.
df = pd.DataFrame({'Number of Records': range(1, len(nodes_count) + 1), 'Number of Nodes': nodes_count})
df['Number of Records'] = df['Number of Records'] * 1000

# The title reads "Number of Nodes Over Records", but the curve used to be drawn
# with the node count on the x-axis and the record count on the y-axis. Plot the
# record count (the independent variable) on x and the node count on y so the
# figure matches its title; the axis labels are swapped accordingly.
fig, ax = plt.subplots(figsize=(12, 6), dpi=dpi)
ax.plot(df['Number of Records'], df['Number of Nodes'], color='purple')
ax.set_title('Number of Nodes Over Records', fontsize=titlesize)
ax.set_xlabel('Number of Records', fontsize=axissize)
ax.set_ylabel('Number of Nodes', fontsize=axissize)
ax.tick_params(axis='both', which='major', labelsize=ticksize)
ax.grid(True)

# Displayed only; this figure is not written to disk.
plt.show()


# Tree Depth

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the depth log (no header row) and name its five columns.
depth_columns = [
    'Number of Nodes',
    'Optimal Depth',
    'Average Depth',
    'Average Weighted Depth',
    'Max Depth',
]
data = pd.read_csv(fpath_notopt_depth, header=None)
data.columns = depth_columns

# Cast each column to float, one at a time.
for column in depth_columns:
    data[column] = data[column].astype(float)

In [None]:
# Optimal, average and max depth against the number of nodes, all on a single
# y-axis (this figure has no secondary axis).
fig, ax1 = plt.subplots(figsize=(10, 6), dpi=dpi)

# Give every series an explicit, distinct colour. Previously 'Average Depth'
# took the second colour of Matplotlib's default cycle (tab:orange) while
# 'Max Depth' was drawn in 'orange', so the two lines were nearly
# indistinguishable. 'Optimal Depth' keeps the default first colour and
# 'Max Depth' stays orange.
depth_series = [
    ('Optimal Depth', 'Optimal Depth (log2(#nodes))', 'tab:blue'),
    ('Average Depth', 'Average Depth', 'tab:green'),
    ('Max Depth', 'Max Depth', 'orange'),
]
for column, label, color in depth_series:
    ax1.plot(data['Number of Nodes'], data[column], label=label, color=color)

# Labels, title, legend and grid.
ax1.set_xlabel('Number of Nodes', fontsize=axissize)
ax1.set_ylabel('Depth', fontsize=axissize)
ax1.set_title('Depth Metrics Over Number of Nodes', fontsize=titlesize)
ax1.legend(loc='lower right', fontsize=legendsize)
ax1.grid(True)

# Tick label size on both axes.
ax1.tick_params(axis='both', which='major', labelsize=ticksize)

fig.savefig(f'plots/depths-{task}.png', dpi=dpi)
plt.show()

In [None]:
def _pad_ylim_from_zero(axis):
    """Start the y-axis at 0 and raise its top by 10% of the current y-range."""
    low, high = axis.get_ylim()
    axis.set_ylim(0, high + 0.1 * (high - low))


# Max depth (left axis) against average weighted depth (right axis), both
# plotted over the number of nodes.
fig, ax1 = plt.subplots(figsize=(10, 6), dpi=dpi)
node_counts = data['Number of Nodes']

# Left axis: max depth.
ax1.plot(node_counts, data['Max Depth'], label='Max Depth', color='orange')
ax1.set_xlabel('Number of Nodes', fontsize=axissize)
ax1.set_ylabel('Depth', fontsize=axissize)
ax1.set_title(f'Max and Average weighted depth compared for {task_name}', fontsize=titlesize)
ax1.legend(loc='upper left', fontsize=legendsize)
ax1.grid(True)
ax1.tick_params(axis='both', which='major', labelsize=ticksize)
_pad_ylim_from_zero(ax1)

# Right axis: average weighted depth, drawn and labelled in purple.
ax2 = ax1.twinx()
ax2.plot(node_counts, data['Average Weighted Depth'], label='Average Weighted Depth', color='purple')
ax2.set_ylabel('Average Weighted Depth', fontsize=axissize, color='purple')
ax2.tick_params(axis='y', labelcolor='purple', labelsize=ticksize)
ax2.legend(loc='upper right', fontsize=legendsize)
_pad_ylim_from_zero(ax2)

fig.savefig(f'plots/depth-awd-{task}.png', dpi=dpi)
plt.show()


# Sequential access

In [None]:
# Typography for the sequential-access area charts (font sizes in points).
axissize, titlesize = 16, 14     # axis labels / figure title
ticksize, legendsize = 13, 12    # tick labels / legend text
dpi = 150                        # resolution of displayed and saved figures

In [None]:
def sequential_access(df, title, opt_str):
    """Draw, save and show a stacked area chart of the two access types.

    The chart is written to ``plots/seq-accesses-{opt_str}-{task}.png``, where
    ``task`` is the module-level task tag. Font sizes and ``dpi`` are likewise
    read from the module-level settings at call time.

    Args:
        df: Two-column DataFrame; the first column is labelled
            'Sequential accesses' and the second 'Non-sequential accesses'.
            Values are multiplied by 100 and shown on a 'Percentage' axis, so
            they are presumably fractions in [0, 1] — confirm against the
            producer of the CSV. The frame is not modified.
        title: Figure title.
        opt_str: Tag inserted into the output file name.
    """
    # Work on a copy. The function used to rename the columns, replace the index
    # and scale the caller's frame in place, so calling it twice on the same
    # frame multiplied the values by 100 a second time.
    shares = df.copy()
    shares.columns = ['Sequential accesses', 'Non-sequential accesses']

    # 1-based sample number on the x-axis.
    shares.index = range(1, len(shares) + 1)
    shares.index.name = 'Sample'

    # Scale by 100 for the 'Percentage' axis.
    shares *= 100

    # Draw on an explicit Axes. DataFrame.plot opens its own figure when no
    # `ax` is given, so the earlier `plt.figure(dpi=dpi)` stayed empty (and was
    # shown as a blank figure) while the real chart ignored the requested dpi.
    fig, ax = plt.subplots(dpi=dpi)
    shares.plot(kind='area', stacked=True, ax=ax)

    ax.set_title(title, fontsize=titlesize)
    ax.set_xlabel('Iteration (thousands)', fontsize=axissize)
    ax.set_ylabel('Percentage', fontsize=axissize)
    ax.tick_params(axis='both', labelsize=ticksize)
    ax.legend(title='Access Type', loc='center right', fontsize=legendsize, title_fontsize=legendsize)

    fig.savefig(f'plots/seq-accesses-{opt_str}-{task}.png', dpi=dpi)
    plt.show()

In [None]:
# Access counts of the run with the optimization, drawn as a stacked area chart.
opt_title = f"Ratio of sequential accesses in a run with optimization for {task_name}"
df = pd.read_csv(fpath_opt_sorted_count)
sequential_access(df, opt_title, "opt")

In [None]:
# Access counts of the run without the optimization, drawn as a stacked area chart.
notopt_title = f"Ratio of sequential accesses in a run without optimization for {task_name}"
df = pd.read_csv(fpath_notopt_sorted_count)
sequential_access(df, notopt_title, "nonopt")