In [None]:
import pandas as pd

# Which experiment to analyse: "clf" (classification) or "reg" (regression).
# The value selects both the result files read below and the label used in
# the plot titles.
task = "clf"

# Human-readable label for every supported task key.
TASK_NAMES = {"clf": "Classification", "reg": "Regression"}

# Fail loudly on a typo instead of silently labelling the plots "Regression"
# (the previous `elif "reg":` was always true).
if task not in TASK_NAMES:
    raise ValueError(f"Unknown task {task!r}; expected one of {sorted(TASK_NAMES)}")
task_name = TASK_NAMES[task]

# Per-iteration timing results, without / with the optimization.
fpath_notopt_times = f'res_{task}_notopt_times.csv'
fpath_opt_times = f'res_{task}_opt_times.csv'

# Tree size / depth statistics of the non-optimized run.
fpath_notopt_depth = f'res_{task}_notopt_depth.csv'

# Sequential vs. non-sequential access counts, without / with the optimization.
fpath_notopt_sorted_count = f'res_{task}_notopt_sorted_count.csv'
fpath_opt_sorted_count = f'res_{task}_opt_sorted_count.csv'


# Execution time

In [None]:
def _annotate_times(times, label, block_size):
    """Return a labelled copy of a raw timing frame.

    Parameters
    ----------
    times : pandas.DataFrame
        Raw timing table with exactly three columns, in the order
        inference time, train time, total time.
    label : str
        Value stored in the new 'Optimization' column.
    block_size : int
        Number of consecutive rows grouped under one 'Block' id.

    Returns
    -------
    pandas.DataFrame
        Copy of ``times`` with renamed columns plus 'Optimization' and a
        1-based 'Block' column. The input frame is left untouched.

    Raises
    ------
    ValueError
        Raised by pandas when ``times`` does not have exactly three columns.
    """
    annotated = times.copy()
    annotated.columns = ['Inference Time', 'Train Time', 'Total Time']
    annotated['Optimization'] = label
    # Integer division of the row position groups rows into consecutive blocks.
    annotated['Block'] = (annotated.index // block_size) + 1
    return annotated


data_no_caching = pd.read_csv(fpath_notopt_times)
data_caching = pd.read_csv(fpath_opt_times)

# Previews of the raw files (original headers). Only the last expression of a
# cell is rendered, so inspect these from a separate cell when needed.
data_no_caching_head = data_no_caching.head()
data_caching_head = data_caching.head()

# Number of consecutive records summarised by one box in the plots below.
block_size = 20000

data_no_caching = _annotate_times(data_no_caching, 'Without Optimization', block_size)
data_caching = _annotate_times(data_caching, 'With Optimization', block_size)

# Stack both runs; the 'Optimization' column tells them apart.
combined_data = pd.concat([data_no_caching, data_caching])
combined_data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Total time per record, one pair of boxes (with / without optimization) for
# every block of `block_size` consecutive records.
fig, ax = plt.subplots(figsize=(14, 8))

sns.boxplot(x='Block', y='Total Time', hue='Optimization', data=combined_data,
            palette={'With Optimization': 'lightblue', 'Without Optimization': 'lightgreen'},
            showfliers=False, ax=ax)

# Title and tick labels are derived from `block_size` and from the block ids
# actually present, so they stay correct when the block size changes.
block_ids = sorted(combined_data['Block'].unique())
ax.set_title(f'Boxplot of execution time by iteration by Segments of {block_size:,} Records '
             f'(With and Without Optimization) on {task_name} task')
ax.set_xlabel('Record Segments')
ax.set_ylabel('Total Time (ns)')
# One tick per block at positions 0..n-1, as the boxes are laid out.
ax.set_xticks(range(len(block_ids)))
ax.set_xticklabels([f"{(b - 1) * block_size}-{b * block_size}" for b in block_ids], rotation=45)
ax.grid(True)

plt.show()


In [None]:
# Long format: one row per (record, time type) so train and inference times
# can share a single y axis.
melted_data = pd.melt(combined_data, id_vars=['Block', 'Optimization'],
                      value_vars=['Inference Time', 'Train Time'],
                      var_name='Time Type', value_name='Time')

# One hue level per (run, time type) combination.
melted_data['Category'] = melted_data['Optimization'] + ' - ' + melted_data['Time Type']

fig, ax = plt.subplots(figsize=(16, 10))

sns.boxplot(x='Block', y='Time', hue='Category', data=melted_data, showfliers=False,
            palette={'With Optimization - Inference Time': 'lightseagreen',
                     'Without Optimization - Inference Time': 'paleturquoise',
                     'With Optimization - Train Time': 'lightcoral',
                     'Without Optimization - Train Time': 'lightyellow'
                     },
            ax=ax)

# Title and tick labels follow `block_size` and the block ids actually
# present instead of a hard-coded segment size.
block_ids = sorted(melted_data['Block'].unique())
ax.set_title(f'Boxplot of Train and Inference Times by Segments of {block_size:,} Records '
             f'(With and Without Optimization) on {task_name} task')
ax.set_xlabel('Record Segments')
ax.set_ylabel('Time (ns)')
# One tick per block at positions 0..n-1, as the boxes are laid out.
ax.set_xticks(range(len(block_ids)))
ax.set_xticklabels([f"{(b - 1) * block_size}-{b * block_size}" for b in block_ids], rotation=45)
ax.grid(True)

plt.show()


In [None]:
# Keep only the inference-time rows of the long-format table.
inference_time_data = melted_data[melted_data['Time Type'] == 'Inference Time']

fig, ax = plt.subplots(figsize=(16, 10))

sns.boxplot(x='Block', y='Time', hue='Category', data=inference_time_data, showfliers=False,
            palette={'With Optimization - Inference Time': 'lightseagreen',
                     'Without Optimization - Inference Time': 'paleturquoise'},
            ax=ax)

# Title and tick labels follow `block_size` and the block ids actually
# present instead of a hard-coded segment size.
block_ids = sorted(inference_time_data['Block'].unique())
ax.set_title(f'Boxplot of Inference Times by Segments of {block_size:,} Records '
             f'(With and Without Optimization) on {task_name} task')
ax.set_xlabel('Record Segments')
ax.set_ylabel('Inference Time (ns)')
# One tick per block at positions 0..n-1, as the boxes are laid out.
ax.set_xticks(range(len(block_ids)))
ax.set_xticklabels([f"{(b - 1) * block_size}-{b * block_size}" for b in block_ids], rotation=45)
ax.grid(True)

plt.show()

In [None]:
# Keep only the train-time rows. The hue label is built on the filtered copy
# so this cell no longer writes to the shared `melted_data` frame.
train_time_data = (
    melted_data[melted_data['Time Type'] == 'Train Time']
    .assign(Category=lambda rows: rows['Optimization'] + ' - ' + rows['Time Type'])
)

fig, ax = plt.subplots(figsize=(16, 10))

sns.boxplot(x='Block', y='Time', hue='Category', data=train_time_data, showfliers=False,
            palette={'With Optimization - Train Time': 'lightcoral',
                     'Without Optimization - Train Time': 'lightyellow'},
            ax=ax)

# Title and tick labels follow `block_size` and the block ids actually
# present instead of a hard-coded segment size.
block_ids = sorted(train_time_data['Block'].unique())
ax.set_title(f'Boxplot of Train Times by Segments of {block_size:,} Records '
             f'(With and Without Optimization) on {task_name} task')
ax.set_xlabel('Record Segments')
ax.set_ylabel('Train Time (ns)')
# One tick per block at positions 0..n-1, as the boxes are laid out.
ax.set_xticks(range(len(block_ids)))
ax.set_xticklabels([f"{(b - 1) * block_size}-{b * block_size}" for b in block_ids], rotation=45)
ax.grid(True)

plt.show()


# Number of Nodes

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# The depth CSV is read without a header row; its first column is used as the
# node count.
data = pd.read_csv(fpath_notopt_depth, header=None)
nodes_count = data.iloc[:, 0]

# x value of a row is its 1-based position multiplied by 1000.
record_counts = [1000 * position for position in range(1, len(nodes_count) + 1)]
df = pd.DataFrame({'Number of Records': record_counts, 'Number of Nodes': nodes_count})

# Single purple line: node count against number of records.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['Number of Records'], df['Number of Nodes'], color='purple')
ax.set(title='Number of Nodes Over Records',
       xlabel='Number of Records',
       ylabel='Number of Nodes')
ax.grid(True)
plt.show()


# Tree Depth

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Names given, in file order, to the five columns of the header-less depth CSV.
depth_columns = [
    'Number of Nodes',
    'Optimal Depth',
    'Average Depth',
    'Average Weighted Depth',
    'Max Depth',
]

data = pd.read_csv(fpath_notopt_depth, header=None)
data.columns = depth_columns

# Every statistic is handled as a float downstream.
for column in depth_columns:
    data[column] = data[column].astype(float)

In [None]:
# Optimal, average and maximum depth against the number of nodes, all on a
# single y axis.
fig, ax1 = plt.subplots(figsize=(10, 6))

# (column, legend label, extra line style). Lines without an explicit colour
# take the next colour of the default cycle.
depth_series = [
    ('Optimal Depth', 'Optimal Depth (log2(#nodes))', {}),
    ('Average Depth', 'Average Depth', {}),
    ('Max Depth', 'Max Depth', {'color': 'orange'}),
]
for column, label, line_style in depth_series:
    ax1.plot(data['Number of Nodes'], data[column], label=label, **line_style)

ax1.set(xlabel='Number of Nodes', ylabel='Depth')
ax1.legend(loc='upper left')
ax1.grid(True)

plt.show()

In [None]:
# Max depth on the left y axis and average weighted depth on a twin right
# y axis, both against the number of nodes.
fig, ax1 = plt.subplots(figsize=(10, 6))
node_counts = data['Number of Nodes']

# Left axis: maximum depth.
ax1.plot(node_counts, data['Max Depth'], label='Max Depth', color='orange')
ax1.set(xlabel='Number of Nodes', ylabel='Depth')
ax1.legend(loc='upper left')
ax1.grid(True)

# Start the left axis at 0 and add 10% of the auto-scaled range on top.
left_low, left_high = ax1.get_ylim()
ax1.set_ylim(0, left_high + 0.1 * (left_high - left_low))

# Right axis: average weighted depth, coloured to match its line.
ax2 = ax1.twinx()
ax2.plot(node_counts, data['Average Weighted Depth'], label='Average Weighted Depth', color='purple')
ax2.set_ylabel('Average Weighted Depth', color='purple')
ax2.tick_params(axis='y', labelcolor='purple')
ax2.legend(loc='upper right')

# Same treatment for the right axis: start at 0, add 10% headroom.
right_low, right_high = ax2.get_ylim()
ax2.set_ylim(0, right_high + 0.1 * (right_high - right_low))

plt.show()


# Sequential access

In [None]:
def sequential_access(df, title):
    """Draw a stacked area chart of sequential vs. non-sequential accesses.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly two columns, in this order: sequential accesses
        and non-sequential accesses. Values are multiplied by 100 before
        plotting (presumably they are fractions in [0, 1]; verify against
        the code that writes the CSV). The frame is not modified.
    title : str
        Title of the chart.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        Raised by pandas when ``df`` does not have exactly two columns.
    """
    # Work on a scaled copy so the caller's frame stays untouched and calling
    # the function twice on the same frame gives the same chart.
    shares = df * 100
    shares.columns = ['Sequential accesses', 'Non-sequential accesses']

    # 1-based sample counter on the x axis.
    shares.index = range(1, len(shares) + 1)
    shares.index.name = 'Sample'

    # Hand pandas an explicit Axes: without `ax=` it opens a figure of its
    # own, leaving an empty figure behind and ignoring the requested size.
    fig, ax = plt.subplots(figsize=(10, 6))
    shares.plot(kind='area', stacked=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Iteration (every 1000 samples)')
    ax.set_ylabel('Percentage')
    ax.legend(title='Access Type', loc='center right')

    plt.show()

In [None]:
# Access counts of the run with the optimization enabled.
df = pd.read_csv(fpath_opt_sorted_count)
sequential_access(
    df,
    title="Ratio of sequential accesses in a run. Run with Optimization.",
)

In [None]:
# Access counts of the run without the optimization.
df = pd.read_csv(fpath_notopt_sorted_count)
sequential_access(
    df,
    title="Ratio of sequential accesses in a run. Run without Optimization.",
)