In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Select which experiment this notebook reports on: "reg" (regression) or
# "clf" (classification). The tag is interpolated into the input CSV names
# and into the names of the saved figures.
task = "reg"
# task = "clf"

# Human-readable task names used in plot titles.
TASK_NAMES = {"clf": "Classification", "reg": "Regression"}

# Fail loudly on an unknown tag. A bare `elif "reg":` test is always true
# (non-empty string), so any typo would silently be labelled "Regression".
if task not in TASK_NAMES:
    raise ValueError(f"Unknown task {task!r}; expected one of {sorted(TASK_NAMES)}")
task_name = TASK_NAMES[task]

# Resolution (dots per inch) used for every saved figure.
dpi = 150

# Python vs Rust

In [None]:
def python_vs_rust(python_times, rust_times, task, task_name):
    """Draw, save and show a two-bar chart comparing Python and Rust timings.

    Each bar's height is the mean of the corresponding timings and its error
    bar is their standard deviation as computed by pandas. The figure is
    written to ``plots/python-vs-rust-{task}.png`` using the notebook-level
    ``dpi`` setting.

    Args:
        python_times: Execution times of the Python runs (plotted in seconds).
        rust_times: Execution times of the Rust runs (plotted in seconds).
        task: Short task tag used in the output file name.
        task_name: Human-readable task name used in the title.
    """
    time_col = 'Execution Time (sec)'
    py_series = pd.DataFrame(python_times, columns=[time_col])[time_col]
    rs_series = pd.DataFrame(rust_times, columns=[time_col])[time_col]

    plt.figure(figsize=(6, 4))

    # (bar name, timings, bar-specific keyword arguments); only the Python
    # bar carries a legend label.
    bars = (
        ('Python', py_series, {'color': '#6e6e6e', 'label': time_col}),
        ('Rust', rs_series, {'color': '#ff000f'}),
    )
    for lang, timings, bar_kwargs in bars:
        plt.bar(lang, timings.mean(), yerr=timings.std(), alpha=0.7, **bar_kwargs)

    plt.xlabel('Programming Languages')
    plt.ylabel(time_col)
    plt.title(f'Execution Time of Python vs Rust for {task_name}')

    # Fixed y range so figures produced for different tasks share a scale.
    plt.ylim(0, 15)

    # Hide vertical gray lines inside plot: blank the x-grid line style first,
    # then switch the grid on.
    plt.grid(axis='x', linestyle='')
    plt.grid(True)

    plt.savefig(f'plots/python-vs-rust-{task}.png', dpi=dpi, bbox_inches='tight')
    plt.show()


In [None]:
# Classification
# Measured execution times of the Python and Rust runs (plotted as seconds).
python_times = [
    11.77, 10.62, 10.00, 9.46, 10.90, 10.50, 8.48, 11.26, 9.98, 10.40,
    9.18, 9.98, 9.16, 10.43, 9.31, 8.96, 12.19, 10.97, 9.15, 9.91,
    9.77, 12.49, 11.29, 10.32, 14.11, 10.17, 9.60, 10.85, 11.47, 10.57,
    10.34, 11.78, 9.52, 10.21, 10.53, 10.83, 11.63, 11.94, 10.05, 9.11
]

rust_times = [
    2.845, 2.919, 2.835, 2.818, 3.067, 2.988, 3.084, 2.790, 2.818, 2.791,
    2.807, 2.803, 2.797, 2.804, 2.780, 2.786, 2.807, 2.792, 2.791, 2.827,
    2.971, 3.015, 2.960, 2.958, 2.835, 2.868, 2.816, 2.944, 2.845, 2.993,
    2.840, 2.881, 2.833, 2.853, 2.935, 2.908, 2.920, 2.861, 2.900, 2.824
]

python_vs_rust(python_times, rust_times, "clf", "Classification")
# Report mean ± sample standard deviation (ddof=1), i.e. the same spread
# measure the bar chart uses for its error bars. Variance is expressed in
# squared seconds and is not a meaningful "±" term next to a mean in seconds.
print(f"Python: {np.mean(python_times):.2f}±{np.std(python_times, ddof=1):.2f}")
print(f"Rust: {np.mean(rust_times):.2f}±{np.std(rust_times, ddof=1):.2f}")

In [None]:
# Regression
# Measured execution times of the Python and Rust runs (plotted as seconds).
python_times = [
    13.02, 13.19, 12.64, 13.14, 12.93, 13.27, 13.46, 13.45, 14.24, 13.26,
    13.58, 12.96, 13.44, 13.44, 13.38, 12.60, 13.33, 12.52, 13.01, 13.54,
    13.09, 13.57, 13.33, 13.43, 13.82, 13.83, 12.94, 12.50, 13.21, 14.28,
    13.35, 12.88, 13.52, 13.57, 14.24, 13.78, 13.65, 13.59, 13.38, 13.43
]

rust_times = [
    0.456, 0.458, 0.462, 0.468, 0.465, 0.472, 0.463, 0.475, 0.475, 0.471,
    0.480, 0.447, 0.467, 0.448, 0.471, 0.472, 0.507, 0.454, 0.460, 0.476,
    0.476, 0.473, 0.485, 0.481, 0.463, 0.478, 0.529, 0.446, 0.459, 0.503,
    0.473, 0.447, 0.450, 0.523, 0.483, 0.462, 0.470, 0.449, 0.483, 0.477
]

python_vs_rust(python_times, rust_times, "reg", "Regression")
# Report mean ± sample standard deviation (ddof=1), i.e. the same spread
# measure the bar chart uses for its error bars. Variance is expressed in
# squared seconds and is not a meaningful "±" term next to a mean in seconds.
print(f"Python: {np.mean(python_times):.2f}±{np.std(python_times, ddof=1):.2f}")
print(f"Rust: {np.mean(rust_times):.2f}±{np.std(rust_times, ddof=1):.4f}")

# Execution time

In [None]:
# Number of consecutive rows grouped into one block (one box in the box plots).
block_size = 50000

# Per-iteration timing logs of the non-optimized and optimized runs.
fpath_notopt_times = f"res_{task}_notopt_times_freq1000.csv"
fpath_opt_times = f"res_{task}_opt_times_freq1000.csv"
# NOTE(review): these are read with pandas' default header handling, so the
# first CSV line is consumed as a header and then overwritten below. Other
# loaders in this notebook pass header=None — confirm these files really
# start with a header line, otherwise the first measurement is dropped.
df_no_opt = pd.read_csv(fpath_notopt_times)
df_opt = pd.read_csv(fpath_opt_times)

df_no_opt_head = df_no_opt.head()
df_opt_head = df_opt.head()

timing_columns = ['Inference Time', 'Train Time', 'Total Time']
df_no_opt.columns = list(timing_columns)
df_opt.columns = list(timing_columns)

# Add the optimization labels
df_no_opt['Optimization'] = 'Without Optimization'
df_opt['Optimization'] = 'With Optimization'

# Add block numbers to both datasets (1-based, from the row position)
df_no_opt['Block'] = (df_no_opt.index // block_size) + 1
df_opt['Block'] = (df_opt.index // block_size) + 1

# ignore_index=True gives the combined frame a fresh unique index. Without it
# every index label appears twice (once per run), which seaborn can reject with
# "cannot reindex on an axis with duplicate labels" when plotting.
df = pd.concat([df_no_opt, df_opt], ignore_index=True)

# Nanoseconds to microseconds
df[timing_columns] = df[timing_columns] / 1000

# df

In [None]:
def print_last_block_mean(df, task, last_n=None):
    """Print the last-block median of one timing column for both runs.

    Despite the function name, the statistic computed is the median of the
    trailing ``last_n`` rows of each run, followed by the relative improvement
    of the optimized run over the non-optimized one.

    Args:
        df: Frame with an "Optimization" column holding
            "Without Optimization" / "With Optimization" and the timing
            column named by ``task``.
        task: Name of the timing column to summarise
            ("Inference Time", "Train Time" or "Total Time").
        last_n: Number of trailing rows per run that make up the last block.
            Defaults to the notebook-level ``block_size``.

    Raises:
        ValueError: If ``last_n`` is not positive.
    """
    if last_n is None:
        last_n = block_size
    if last_n <= 0:
        raise ValueError("last_n must be a positive number of rows")

    # Slice through the end of each run; a stop of -1 would silently drop the
    # final measurement and summarise only last_n - 1 rows.
    noopt = df.loc[df["Optimization"] == "Without Optimization", task].iloc[-last_n:]
    opt = df.loc[df["Optimization"] == "With Optimization", task].iloc[-last_n:]

    noopt_median = noopt.median()
    opt_median = opt.median()
    # The labels say "median" so the printed text matches the statistic.
    print(f"Not optimized median last block: {noopt_median}")
    print(f"Optimized median last block: {opt_median}")
    print(f"Improvement: {100*(noopt_median-opt_median)/noopt_median:.2f}%")

def print_last_block_inf_vs_train(df, last_n=None):
    """Print last-block median inference and training times of the non-optimized run.

    Only rows labelled "Without Optimization" are considered. The medians of
    the trailing ``last_n`` rows of "Inference Time" and "Train Time" are
    printed together with their ratio (inference / training).

    Args:
        df: Frame with "Optimization", "Inference Time" and "Train Time" columns.
        last_n: Number of trailing rows that make up the last block.
            Defaults to the notebook-level ``block_size``.

    Raises:
        ValueError: If ``last_n`` is not positive.
    """
    if last_n is None:
        last_n = block_size
    if last_n <= 0:
        raise ValueError("last_n must be a positive number of rows")

    # Slice through the end of the run; a stop of -1 would silently drop the
    # final measurement.
    noopt = df[df["Optimization"] == "Without Optimization"].iloc[-last_n:]
    inf_median = noopt["Inference Time"].median()
    train_median = noopt["Train Time"].median()

    # The labels say "median" so the printed text matches the statistic.
    print(f"Inference (no optimization) median last block: {inf_median}")
    print(f"Training (no optimization) median last block: {train_median}")
    print(f"Inference takes {inf_median/train_median:.2f} more times than train")

In [None]:
print_last_block_mean(df, "Total Time")

# One tick per block; each label gives the block's record range in thousands.
block_positions = range(len(df["Block"].unique()))
segment_labels = [
    "{}-{}".format(int(i * block_size / 1000), int((i + 1) * block_size / 1000))
    for i in block_positions
]
optimization_palette = {
    "With Optimization": "lightblue",
    "Without Optimization": "lightgreen",
}

plt.figure(figsize=(6, 4))
sns.boxplot(
    data=df,
    x="Block",
    y="Total Time",
    hue="Optimization",
    palette=optimization_palette,
    showfliers=False,  # outliers are not drawn
    linewidth=1,
    width=0.5,
)
plt.title(f"Total execution time per iteration on {task_name} task")
plt.xlabel("Record Segments (Thousands)")
plt.ylabel("Time per iteration (µs)")
plt.xticks(ticks=block_positions, labels=segment_labels, rotation=45)
plt.legend(title="")

# Hide vertical gray lines inside plot: blank the x-grid style, then enable the grid.
plt.grid(axis="x", linestyle="")
plt.grid(True)

plt.savefig(f"plots/time-per-iter-tot-{task}.png", dpi=dpi, bbox_inches="tight")
plt.show()

In [None]:
# Reshape to long format: the "Inference Time" and "Train Time" columns are
# stacked into a single "Time" column, with "Time Type" recording the origin.
df_melt = df.melt(
    id_vars=["Block", "Optimization"],
    value_vars=["Inference Time", "Train Time"],
    var_name="Time Type",
    value_name="Time",
)

# Combined "<optimization> - <time type>" label, used as the hue in the box plots.
category_label = df_melt["Optimization"] + " - " + df_melt["Time Type"]
df_melt["Category"] = category_label

In [None]:
print_last_block_inf_vs_train(df)

# NOTE(review): the summary printed above and the output file name ("noopt")
# refer to the run WITHOUT optimization, but the boxes below are drawn from the
# "With Optimization" rows — confirm which run this figure is meant to show.
is_optimized = df_melt["Optimization"] == "With Optimization"
df_melt_opt = df_melt[is_optimized]

# One tick per block; each label gives the block's record range in thousands.
block_positions = range(len(df["Block"].unique()))
segment_labels = [
    "{}-{}".format(int(i * block_size / 1000), int((i + 1) * block_size / 1000))
    for i in block_positions
]
category_palette = {
    "With Optimization - Inference Time": "lightseagreen",
    "With Optimization - Train Time": "lightcoral",
}

plt.figure(figsize=(6, 4))
sns.boxplot(
    data=df_melt_opt,
    x="Block",
    y="Time",
    hue="Category",
    palette=category_palette,
    showfliers=False,  # outliers are not drawn
    linewidth=1,
    width=0.5,
)
plt.title(f"Train vs Inference execution time per iteration on {task_name} task")
plt.xlabel("Record Segments (Thousands)")
plt.ylabel("Time per iteration (µs)")
plt.xticks(ticks=block_positions, labels=segment_labels, rotation=45)
plt.legend(title="")

# Hide vertical gray lines inside plot: blank the x-grid style, then enable the grid.
plt.grid(axis="x", linestyle="")
plt.grid(True)

plt.savefig(f"plots/time-per-iter-noopt-{task}.png", dpi=dpi, bbox_inches="tight")
plt.show()

In [None]:
print_last_block_mean(df, "Train Time")

# Keep only the training-time rows of the long-format frame.
is_train = df_melt["Time Type"] == "Train Time"
df_melt_train = df_melt[is_train]

# One tick per block; each label gives the block's record range in thousands.
block_positions = range(len(df["Block"].unique()))
segment_labels = [
    "{}-{}".format(int(i * block_size / 1000), int((i + 1) * block_size / 1000))
    for i in block_positions
]
train_palette = {
    "With Optimization - Train Time": "lightcoral",
    "Without Optimization - Train Time": "lightyellow",
}

# Box plot of training time per block, optimized vs non-optimized.
plt.figure(figsize=(6, 4))
sns.boxplot(
    data=df_melt_train,
    x="Block",
    y="Time",
    hue="Category",
    palette=train_palette,
    showfliers=False,  # outliers are not drawn
    linewidth=1,
    width=0.5,
)
plt.title(f"Train execution time per iteration on {task_name} task")
plt.xlabel("Record Segments (Thousands)")
plt.ylabel("Time per iteration (µs)")
plt.xticks(ticks=block_positions, labels=segment_labels, rotation=45)
plt.legend(title="Category")

# Hide vertical gray lines inside plot: blank the x-grid style, then enable the grid.
plt.grid(axis="x", linestyle="")
plt.grid(True)

plt.savefig(f"plots/time-per-iter-train-{task}.png", dpi=dpi, bbox_inches="tight")
plt.show()



In [None]:
print_last_block_mean(df, "Inference Time")

# Keep only the inference-time rows of the long-format frame.
is_inference = df_melt['Time Type'] == 'Inference Time'
df_melt_inf = df_melt[is_inference]

# One tick per block; each label gives the block's record range in thousands.
block_positions = range(len(df['Block'].unique()))
segment_labels = [
    "{}-{}".format(int(i * block_size / 1000), int((i + 1) * block_size / 1000))
    for i in block_positions
]
inference_palette = {
    'With Optimization - Inference Time': 'lightseagreen',
    'Without Optimization - Inference Time': 'paleturquoise',
}

# Box plot of inference time per block, optimized vs non-optimized.
plt.figure(figsize=(6, 4))
sns.boxplot(
    data=df_melt_inf,
    x='Block',
    y='Time',
    hue='Category',
    palette=inference_palette,
    showfliers=False,  # outliers are not drawn
    linewidth=1,
    width=0.5,
)
plt.title(f'Inference execution time per iteration on {task_name} task')
plt.xlabel('Record Segments (Thousands)')
plt.ylabel('Time per iteration (µs)')
plt.xticks(ticks=block_positions, labels=segment_labels, rotation=45)
plt.legend(title='Category')

# Hide vertical gray lines inside plot: blank the x-grid style, then enable the grid.
plt.grid(axis='x', linestyle='')
plt.grid(True)

plt.savefig(f'plots/time-per-iter-inf-{task}.png', dpi=dpi, bbox_inches='tight')
plt.show()

# Number of Nodes

In [None]:
def load_df_depth(freq):
    """Load the tree-depth statistics of the non-optimized run.

    Reads ``res_{task}_notopt_depth_freq{freq}.csv`` (no header row; ``task``
    is the notebook-level task tag) and names its five columns.

    Args:
        freq: Logging frequency that appears in the file name; also used to
            turn the row position into an iteration count.

    Returns:
        DataFrame with float columns "Number of Nodes" (divided by 1000),
        "Optimal Depth", "Average Depth", "Average Weighted Depth" and
        "Max Depth", plus "Iteration" computed as ``index * freq / 1000``.
    """
    depth_columns = [
        'Number of Nodes',
        'Optimal Depth',
        'Average Depth',
        'Average Weighted Depth',
        'Max Depth',
    ]

    stats = pd.read_csv(f"res_{task}_notopt_depth_freq{freq}.csv", header=None)
    stats.columns = depth_columns

    # Force every statistic to float, then express the node count in thousands.
    stats = stats.astype({name: float for name in depth_columns})
    stats['Number of Nodes'] = stats['Number of Nodes'] / 1000

    stats["Iteration"] = stats.index * freq / 1000
    return stats

In [None]:
df_depth = load_df_depth(100000)

# NOTE(review): the title reads "Number of Nodes over Records", yet the node
# count is on the x axis and the iteration count on the y axis — confirm the
# intended orientation.
node_counts = df_depth["Number of Nodes"]
iterations = df_depth["Iteration"]

plt.figure(figsize=(6, 4))
plt.plot(node_counts, iterations, marker="o", color="purple")
plt.title(f"Number of Nodes over Records for {task_name}")
plt.xlabel("Number of Nodes (thousands)")
plt.ylabel("Iterations (thousands)")
plt.grid(True)
plt.savefig(f"plots/node-count-{task}.png", dpi=dpi, bbox_inches="tight")
plt.show()

# Report the final logged point (both values are in thousands).
last_row = df_depth.iloc[-1]
last_iter = last_row["Iteration"]
last_node_count = last_row["Number of Nodes"]
print(f"Number of iterations: {last_iter}, Number of nodes: {last_node_count}")

# Tree Depth

In [None]:
# Reload the depth statistics from the freq=1000 log; this replaces the
# freq=100000 frame loaded earlier and feeds the depth plots that follow.
df_depth = load_df_depth(1000)

In [None]:
fig, ax1 = plt.subplots(figsize=(6, 4))

# (column, legend label, extra line options). Only "Max Depth" pins a colour;
# the other two lines take theirs from the default colour cycle.
depth_series = [
    ("Optimal Depth", "Optimal Depth (log2(#nodes))", {}),
    ("Average Depth", "Average Depth", {}),
    ("Max Depth", "Max Depth", {"color": "orange"}),
]
for depth_column, legend_label, line_options in depth_series:
    ax1.plot(
        df_depth["Number of Nodes"],
        df_depth[depth_column],
        label=legend_label,
        **line_options,
    )

ax1.set_xlabel("Number of Nodes (thousands)")
ax1.set_ylabel("Depth")
ax1.set_title(f"Depth metrics over Number of nodes for {task_name}")
ax1.grid(True)
ax1.tick_params(axis="both", which="major")
ax1.legend()

plt.savefig(f"plots/depths-{task}.png", dpi=dpi, bbox_inches="tight")
plt.show()

# Largest maximum depth seen over the whole log.
deepest = df_depth["Max Depth"].max()
print(f"Max depth: {deepest:.0f}")

In [None]:
# Max depth (left axis) and average weighted depth (right axis) plotted
# against the number of nodes, on twin y axes.
fig, ax1 = plt.subplots(figsize=(6, 4))
ax1.plot(
    df_depth["Number of Nodes"],
    df_depth["Max Depth"],
    label="Max Depth",
    color="orange",
)
# load_df_depth() divides the node count by 1000, so the axis must say
# "(thousands)" like the other node-count axes in this notebook.
ax1.set_xlabel("Number of Nodes (thousands)")
ax1.set_ylabel("Depth")
ax1.set_title(f"Max and Average weighted depth compared for {task_name}")
ax1.legend(loc="upper left")
ax1.grid(True)
ax1.tick_params(axis="both", which="major")

# Second y axis sharing the same x axis, coloured to match its line.
ax2 = ax1.twinx()
ax2.plot(
    df_depth["Number of Nodes"],
    df_depth["Average Weighted Depth"],
    label="Average Weighted Depth",
    color="purple",
)
ax2.set_ylabel("Average Weighted Depth", color="purple")
ax2.tick_params(axis="y", labelcolor="purple")
ax2.legend(loc="upper right")

# Start both y axes at 0 and add 10% of the auto-scaled span as headroom
# (keeps the lines clear of the two legends at the top).
y_min, y_max = ax1.get_ylim()
ax1.set_ylim(0, y_max + 0.1 * (y_max - y_min))
y2_min, y2_max = ax2.get_ylim()
ax2.set_ylim(0, y2_max + 0.1 * (y2_max - y2_min))

plt.savefig(f"plots/depth-awd-{task}.png", dpi=dpi, bbox_inches="tight")
plt.show()

# Sequential access

In [None]:
def sequential_access(df, title, opt_str):
    """Plot sequential vs non-sequential accesses as a stacked area chart.

    The figure is saved to ``plots/seq-accesses-{opt_str}-{task}.png`` (using
    the notebook-level ``task`` and ``dpi``) and then shown.

    Side effects on ``df`` (the calling cells read the frame afterwards and
    rely on them):
      * columns are renamed to "Sequential accesses" / "Non-sequential accesses";
      * the index becomes 1..len(df) and is named "Sample";
      * every value is multiplied by 100 in place, so calling this twice on
        the same frame scales it twice.

    Args:
        df: Two-column frame of access ratios.
            NOTE(review): presumably fractions in [0, 1], given the "Percentage"
            axis — confirm against the CSV producer.
        title: Plot title.
        opt_str: Tag inserted into the output file name.
    """
    df.columns = ["Sequential accesses", "Non-sequential accesses"]
    df.index = range(1, len(df) + 1)
    df.index.name = "Sample"
    df *= 100

    # Draw on an explicitly created axes. DataFrame.plot() opens its own figure
    # when no `ax` is given, so a preceding plt.figure(figsize=...) would only
    # leave an empty figure behind and its size would not apply to the chart.
    fig, ax = plt.subplots(figsize=(6, 4))
    df.plot(kind="area", stacked=True, color=["#ff000f", "#6e6e6e"], ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Iteration (thousands)")
    ax.set_ylabel("Percentage")
    ax.legend(title="Access Type", loc="center right")
    fig.savefig(
        f"plots/seq-accesses-{opt_str}-{task}.png", dpi=dpi, bbox_inches="tight"
    )
    plt.show()

In [None]:
# Access-pattern logs of the optimized and non-optimized runs.
fpath_opt_sorted_count = f"res_{task}_opt_sorted_count_freq1000.csv"
fpath_notopt_sorted_count = f"res_{task}_notopt_sorted_count_freq1000.csv"

# NOTE(review): read with pandas' default header handling (first CSV line is
# taken as a header) — confirm these files start with a header line.
df_opt = pd.read_csv(fpath_opt_sorted_count)
df_notopt = pd.read_csv(fpath_notopt_sorted_count)

# Keep only 50k iterations, i.e. the rows whose index label is below 50.
row_limit = 50
df_opt = df_opt[df_opt.index < row_limit]
df_notopt = df_notopt[df_notopt.index < row_limit]

In [None]:
# sequential_access() renames the columns and rescales df_opt in place, which
# is why the "Sequential accesses" column can be read back right after the call.
opt_title = f"Ratio of sequential accesses with optimization for {task_name}"
sequential_access(df_opt, opt_title, "opt")
opt_last_sample = df_opt.iloc[-1]
print(f"Percentage of sequential accesses: {opt_last_sample['Sequential accesses']:.1f}")

In [None]:
# sequential_access() renames the columns and rescales df_notopt in place, which
# is why the "Sequential accesses" column can be read back right after the call.
notopt_title = f"Ratio of sequential accesses without optimization for {task_name}"
sequential_access(df_notopt, notopt_title, "nonopt")
notopt_last_sample = df_notopt.iloc[-1]
print(f"Percentage of sequential accesses: {notopt_last_sample['Sequential accesses']:.1f}")

## Sort time

In [None]:
# Column names for the header-less sort-time logs.
columns = ['number_of_nodes', 'iteration_number', 'time_taken_to_sort', 'train_inference_sum']
fpath_notopt_sort_time = f"res_{task}_notopt_sort_time_freq100000.csv"
fpath_opt_sort_time = f"res_{task}_opt_sort_time_freq100000.csv"

df_not_optimized = pd.read_csv(fpath_notopt_sort_time, header=None)
df_optimized = pd.read_csv(fpath_opt_sort_time, header=None)
df_not_optimized.columns = columns
df_optimized.columns = columns

# Sort cost of the optimized run, divided by 1e9 (assumed ns -> s, per the "_sec" suffix).
df_optimized['time_taken_to_sort_sec'] = df_optimized['time_taken_to_sort'] / 1e9

# Line the two runs up on the node count; shared column names get the
# "_opt" / "_notopt" suffixes.
opt_by_nodes = df_optimized.set_index("number_of_nodes")
notopt_by_nodes = df_not_optimized.set_index("number_of_nodes")
df_combined = opt_by_nodes.join(notopt_by_nodes, lsuffix='_opt', rsuffix='_notopt')

# Gain = non-optimized minus optimized train+inference time, with the same 1e9 scaling.
execution_time_gain = (
    df_combined["train_inference_sum_notopt"] - df_combined["train_inference_sum_opt"]
) / 1e9

# Express the node count in thousands for plotting (done after the join so the
# join keys stay in raw units).
df_optimized["number_of_nodes"] = df_optimized["number_of_nodes"] / 1000

In [None]:
# Time spent sorting (optimized run) as a function of tree size.
tree_sizes = df_optimized["number_of_nodes"]
sort_cost_sec = df_optimized["time_taken_to_sort_sec"]

plt.figure(figsize=(6, 4))
plt.plot(tree_sizes, sort_cost_sec, color="orange", marker="s", label="Cost")
plt.title(f"Time Cost over Size of the Tree for {task_name}")
plt.xlabel("Number of Nodes (thousands)")
plt.ylabel("Time (sec)")
plt.grid(True)

plt.savefig(f"plots/sorting-time-cost-{task}.png", dpi=dpi, bbox_inches="tight")
plt.show()

In [None]:
# Execution-time gain of the optimized run as a function of tree size.
# df_combined is indexed by the raw node count, hence the division by 1000.
tree_sizes_thousands = df_combined.index / 1000

plt.figure(figsize=(6, 4))
plt.plot(
    tree_sizes_thousands,
    execution_time_gain,
    color="orange",
    marker="o",
    label="Gain",
)
plt.title(f"Time Gain over Size of the Tree for {task_name}")
plt.xlabel("Number of Nodes (thousands)")
plt.ylabel("Time (sec)")
plt.grid(True)

plt.savefig(f"plots/sorting-time-gain-{task}.png", dpi=dpi, bbox_inches="tight")
plt.show()

## Test on Robotic dataset

In [None]:
# Regression: memory footprint for ensembles of 1 to 10 trees.

number_of_trees = list(range(1, 11))
# Raw footprints, converted below by dividing by 1024**2.
memory_footprint_raw = [
    117977251, 254379320, 381582296, 508764584, 635225768,
    763115536, 889746376, 1016018832, 1144669632, 1271891064
]
memory_footprint_mb = [raw / (1024 ** 2) for raw in memory_footprint_raw]

plt.figure(figsize=(6, 4))
plt.bar(number_of_trees, memory_footprint_mb)
plt.xticks(number_of_trees)
plt.xlabel('Number of Trees')
plt.ylabel('Memory Footprint (MB)')
plt.title('Memory Footprint over Number of Trees for Regression')
plt.grid(True)

plt.savefig('plots/tree-count-vs-memory-reg.png', dpi=dpi, bbox_inches='tight')
plt.show()


In [None]:
# Regression: mean squared error for ensembles of 1 to 10 trees.

number_of_trees = list(range(1, 11))
mse_values = [
    0.12732017, 0.10974102, 0.10380474, 0.10095829, 0.099099696,
    0.09776777, 0.09740372, 0.097054236, 0.09652473, 0.096195795
]

# The y axis starts at 0.08 (not 0) and ends slightly above the largest bar.
y_top = max(mse_values) + 0.01

plt.figure(figsize=(6, 4))
plt.bar(number_of_trees, mse_values)
plt.xticks(number_of_trees)
plt.ylim(0.08, y_top)
plt.xlabel('Number of Trees')
plt.ylabel('Mean Squared Error (MSE)')
plt.title('MSE over Number of Trees for Regression')
plt.grid(True)

plt.savefig('plots/tree-count-vs-mse-reg.png', dpi=dpi, bbox_inches='tight')
plt.show()


In [None]:
# Classification: memory footprint for ensembles of 1 to 10 trees.
import matplotlib.pyplot as plt

number_of_trees = list(range(1, 11))
# Raw footprints, converted below by dividing by 1024**2.
memory_footprint_raw = [
    5407088, 11396704, 17181952, 23638064, 29309696,
    36857288, 40638920, 45845832, 52426928, 58275096,
]
memory_footprint_mb = [raw / (1024 ** 2) for raw in memory_footprint_raw]

plt.figure(figsize=(6, 4))
plt.bar(number_of_trees, memory_footprint_mb)
plt.xticks(number_of_trees)
plt.xlabel('Number of Trees')
plt.ylabel('Memory Footprint (MB)')
plt.title('Memory Footprint over Number of Trees for Classification')
plt.grid(True)

plt.savefig('plots/tree-count-vs-memory-clf.png', dpi=dpi, bbox_inches='tight')
plt.show()


In [None]:
# Classification: accuracy for ensembles of 1 to 10 trees.

number_of_trees = list(range(1, 11))
accuracy_values = [
    0.88760096, 0.91904557, 0.9264443, 0.9316851, 0.93180835,
    0.9337197, 0.9339047, 0.93495286, 0.9363709, 0.9363709
]

plt.figure(figsize=(6,4))
plt.bar(number_of_trees, accuracy_values)
plt.title('Accuracy over Number of Trees for Classification')
plt.xlabel('Number of Trees')
# The plotted metric is accuracy (see the title and the output file name), so
# the axis must not be labelled as MSE.
plt.ylabel('Accuracy')
plt.xticks(number_of_trees)
# The y axis starts at 0.84 (not 0) and ends slightly above the largest bar.
plt.ylim(0.84, max(accuracy_values) + 0.01)
plt.grid(True)

plt.savefig('plots/tree-count-vs-acc-clf.png', dpi=dpi, bbox_inches='tight')

plt.show()


## Future works

In [None]:
import matplotlib.pyplot as plt

# Throughput (MB/sec) for each number of features.
features = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
throughput = [2, 3.7, 5.2, 6.4, 7.4, 8.2, 8.8, 9.3, 9.7, 10]

plt.figure(figsize=(6, 4))
plt.plot(features, throughput, color="b", linestyle="-", marker="o")
plt.title("Relationship between Number of Features and Throughput")
plt.xlabel("# Features")
plt.ylabel("Throughput (MB/sec)")
plt.xticks(features)
plt.xlim(0, 21)
plt.ylim(0, 11)
plt.grid(True)
plt.show()
# Do not export the image: it does not belong in directory "6-results" but in "7-future-works"