In [1]:
%matplotlib widget

import pathlib
import pprint

from Visualizer.ControlledDataSet import ControlComparison, ControlledDataSet
from Visualizer.DataSet import DataSet
from Visualizer.Monitoring.Persistence.ProfilerSession import ProfilerSession

In [2]:
# Locate the results database relative to the directory the notebook is started from.
# The path is built with pathlib instead of string concatenation; the resulting string
# still contains the same ".." segments as before.
source_directory = pathlib.Path.cwd()
project_directory = source_directory / ".."
database_path = project_directory / ".." / "EnergyManager" / "Resources" / "Test Results" / "database.sqlite"
if not database_path.is_file():
    # Fail early with the offending path instead of a less obvious error further down
    raise FileNotFoundError(f"Profiler database not found: {database_path}")
database = str(database_path)

# Load the complete data set
complete_data_set = DataSet(ProfilerSession.load_all(database))

# Values shared by the selections below
MATRIX_A_WIDTH = 32 * 30
LONG_RUN_ITERATIONS = 4
KMEANS_INPUT_SUFFIX = "kdd_cup"


def _matches_iterations(profile, iterations):
    """Return whether ``profile`` belongs to a run with the requested iteration count.

    ``iterations=None`` selects the profiles that have no "iterations" entry at all;
    any other value must equal the profile's "iterations" entry.
    """
    if iterations is None:
        return "iterations" not in profile
    return profile.get("iterations") == iterations


def _select_matrix_multiply(label, iterations=None):
    """Select the sessions with ``label`` whose profile has matrixAWidth == MATRIX_A_WIDTH."""
    return DataSet([
        session
        for session in complete_data_set.data
        if session.label == label
        and session.profile.get("matrixAWidth") == MATRIX_A_WIDTH
        and _matches_iterations(session.profile, iterations)
    ])


def _select_kmeans(label, iterations=None):
    """Select the sessions with ``label`` whose profile "file" entry ends with KMEANS_INPUT_SUFFIX."""
    return DataSet([
        session
        for session in complete_data_set.data
        if session.label == label
        and "file" in session.profile
        and session.profile["file"].endswith(KMEANS_INPUT_SUFFIX)
        and _matches_iterations(session.profile, iterations)
    ])


# Load the individual data sets per application type and configuration
fixed_frequency_matrix_multiply_data_set = _select_matrix_multiply("Fixed Frequency Matrix Multiply")
matrix_multiply_data_set = _select_matrix_multiply("Matrix Multiply")
fixed_frequency_long_matrix_multiply_data_set = _select_matrix_multiply("Fixed Frequency Matrix Multiply", iterations=LONG_RUN_ITERATIONS)
long_matrix_multiply_data_set = _select_matrix_multiply("Matrix Multiply", iterations=LONG_RUN_ITERATIONS)

fixed_frequency_kmeans_data_set = _select_kmeans("Fixed Frequency KMeans")
kmeans_data_set = _select_kmeans("KMeans")
fixed_frequency_long_kmeans_data_set = _select_kmeans("Fixed Frequency KMeans", iterations=LONG_RUN_ITERATIONS)
long_kmeans_data_set = _select_kmeans("KMeans", iterations=LONG_RUN_ITERATIONS)

# Convert the data sets to separate sets of data and associated control data:
# the fixed frequency runs are the data, the runs with the plain label are the control
matrix_multiply_controlled_data_set = ControlledDataSet(
    data_set=fixed_frequency_matrix_multiply_data_set,
    control_data_set=matrix_multiply_data_set
)
long_matrix_multiply_controlled_data_set = ControlledDataSet(
    data_set=fixed_frequency_long_matrix_multiply_data_set,
    control_data_set=long_matrix_multiply_data_set
)

kmeans_controlled_data_set = ControlledDataSet(
    data_set=fixed_frequency_kmeans_data_set,
    control_data_set=kmeans_data_set
)
long_kmeans_controlled_data_set = ControlledDataSet(
    data_set=fixed_frequency_long_kmeans_data_set,
    control_data_set=long_kmeans_data_set
)

# Effects of CPU and GPU Frequency on Runtime and Energy Consumption

## Matrix Multiply

- Performed using the samples from **CUDA 10.1**
- Uses a matrix size of `960`

### Control Data

- Plot shows runs of matrix multiply with default configuration
    - Device **frequencies unrestricted**
- Runtime averages about `0.55 seconds`
- Energy consumption averages about `80 joules`

In [3]:
# Energy consumption vs. runtime for the matrix multiply control runs (normalized=False)
matrix_multiply_control_scatter = matrix_multiply_data_set.energy_consumption_vs_runtime_scatter_plot(normalized=False)
matrix_multiply_control_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Experimental Data

- This plot shows runs of **matrix multiply** where the **frequencies have been restricted**
- **Similar colors** have **similar frequency restrictions**
- **Runtime** mostly dominated by **GPU frequency**
- **Energy consumption** dominated by **CPU frequency**
- Different frequency restrictions result in similar runtimes and different energy consumptions
    - **Changing the frequency** can **save energy without increasing runtime**

In [4]:
# Energy consumption vs. runtime for the fixed frequency matrix multiply runs (normalized=False)
matrix_multiply_experiment_scatter = fixed_frequency_matrix_multiply_data_set.energy_consumption_vs_runtime_scatter_plot(normalized=False)
matrix_multiply_experiment_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- This next plot shows the same data, but instead each point is **compared to the average of the control data**
- This results in the **energy savings** and **runtime increase** for each run compared to control data

In [5]:
# Energy savings vs. runtime increase, comparing each run against the mean of the control data
matrix_multiply_savings_plot = matrix_multiply_controlled_data_set.energy_savings_vs_runtime_increase_plot(
    normalized=False,
    control_comparison=ControlComparison.MEAN,
)
matrix_multiply_savings_plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The most interesting portion is everything in the **top left corner**
    - These are runs that **save energy and time** compared to the control data
- For most of these points, **GPU frequency is at or near the maximum**
    - This also suggests that **changing GPU frequencies** for **GPU bound applications** does **not** lead to **energy savings**
    - **CPU frequency** seems to be the **major factor** in **determining energy savings** for **GPU bound applications**

In [6]:
# Same comparison against the control mean as before, but with the x axis capped at 0
# and the y axis starting at 0 so only the top left corner remains
plot = matrix_multiply_controlled_data_set.energy_savings_vs_runtime_increase_plot(
    control_comparison=ControlComparison.MEAN,
    normalized=False,
)
plot.x_max = 0
plot.y_min = 0
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- Plot shows the **effect of different CPU and GPU frequencies on runtime**
- Different **CPU frequencies have a minor impact**
- **Low GPU frequencies have a major impact**
    - Impact becomes smaller as the clock rate increases
    - Again points to the application being **GPU bound**

In [7]:
# CPU core clock rate vs. GPU clock rate vs. runtime for the fixed frequency runs
# (the plot is exposed as an attribute, not a method)
matrix_multiply_runtime_scatter = fixed_frequency_matrix_multiply_data_set.core_clock_rate_vs_gpu_clock_rate_vs_runtime_scatter_plot
matrix_multiply_runtime_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- This graph plots the same data, but **compared to the average control data**
- Runs that save time are on the upper end of the frequency spectrum as expected

In [8]:
# Clock rates vs. runtime increase relative to the mean of the control data
matrix_multiply_runtime_increase_scatter = matrix_multiply_controlled_data_set.core_clock_rate_vs_gpu_clock_rate_vs_runtime_increase_scatter_plot(
    control_comparison=ControlComparison.MEAN,
)
matrix_multiply_runtime_increase_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The next graph plots the **effects on energy consumption**
- **Low GPU frequencies** seem to **increase energy consumption by a lot**
    - Probably due to the observed **increase in runtime**
    - Beyond `1 GHz` the impact of GPU frequency is minimal
- **Low CPU frequencies reduce** the **energy consumption**
    - GPU bound applications have the CPU waiting for the GPU
    - **Lowering the CPU frequency decreases idle time on the CPU** while the GPU is busy
    - In contrast to GPU frequency, **lower frequencies save energy**

In [9]:
# CPU core clock rate vs. GPU clock rate vs. energy consumption for the fixed frequency runs
matrix_multiply_energy_scatter = fixed_frequency_matrix_multiply_data_set.core_clock_rate_vs_gpu_clock_rate_vs_energy_consumption_scatter_plot
matrix_multiply_energy_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- This graph plots the same data, but again **compared to the average control data**
- **Runs that save energy** compared to the control data tend to have a **high GPU frequency** and a **low to medium CPU frequency**

In [10]:
# Clock rates vs. energy savings relative to the mean of the control data
matrix_multiply_energy_savings_scatter = matrix_multiply_controlled_data_set.core_clock_rate_vs_gpu_clock_rate_vs_energy_savings_scatter_plot(
    control_comparison=ControlComparison.MEAN,
)
matrix_multiply_energy_savings_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## K-Means

- Performed with Rodinia 3.1
- Used `kdd_cup` data set

### Control Data

- Plot shows runs of **k-means** with **unrestricted frequencies**
- Runtime averages about `0.68 seconds`
- Energy consumption averages about `65 joules`

In [11]:
# Energy consumption vs. runtime for the k-means control runs (normalized=False)
kmeans_control_scatter = kmeans_data_set.energy_consumption_vs_runtime_scatter_plot(normalized=False)
kmeans_control_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Experimental Data

- This plot shows runs of **k-means** where the **frequencies have been restricted**
- **Runtime** mostly dominated by **CPU frequency**
- **Energy consumption** dominated by **GPU frequency**
- Application seems to be **CPU bound**
- Different frequency restrictions result in similar runtimes and different energy consumptions
    - Changing the frequency can save energy without increasing runtime

In [12]:
# Energy consumption vs. runtime for the fixed frequency k-means runs (normalized=False)
kmeans_experiment_scatter = fixed_frequency_kmeans_data_set.energy_consumption_vs_runtime_scatter_plot(normalized=False)
kmeans_experiment_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- This next plot again shows the same data **compared to the average of the control data**
- In contrast to matrix multiply, k-means has **no runs where runtime is decreased**

In [13]:
# Energy savings vs. runtime increase, comparing each k-means run against the mean of the control data
kmeans_savings_plot = kmeans_controlled_data_set.energy_savings_vs_runtime_increase_plot(
    normalized=False,
    control_comparison=ControlComparison.MEAN,
)
kmeans_savings_plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- However, there are a few runs that **save energy**, but **not many** in the entire data set
- This suggests that **CPU bound** applications offer **fewer opportunities** to **save energy**

In [14]:
# Same comparison against the control mean, with the y axis starting at 0
plot = kmeans_controlled_data_set.energy_savings_vs_runtime_increase_plot(
    control_comparison=ControlComparison.MEAN,
    normalized=False,
)
plot.y_min = 0
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- Plot shows the **effect of different CPU and GPU frequencies on runtime**
- Different **GPU frequencies have no impact**
- **Low CPU frequencies have a major impact**
    - Impact becomes smaller as the clock rate increases
    - Again points to the application being **CPU bound**

In [15]:
# CPU core clock rate vs. GPU clock rate vs. runtime for the fixed frequency k-means runs
kmeans_runtime_scatter = fixed_frequency_kmeans_data_set.core_clock_rate_vs_gpu_clock_rate_vs_runtime_scatter_plot
kmeans_runtime_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- This graph plots the same data, but **compared to the average control data**
- As shown in the previous graphs, **no runs save time**

In [16]:
# Clock rates vs. runtime increase relative to the mean of the k-means control data
kmeans_runtime_increase_scatter = kmeans_controlled_data_set.core_clock_rate_vs_gpu_clock_rate_vs_runtime_increase_scatter_plot(
    control_comparison=ControlComparison.MEAN,
)
kmeans_runtime_increase_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The next graph plots the **effects on energy consumption**
- **Low CPU frequencies** seem to **increase energy consumption by a lot**
    - Probably due to the observed **increase in runtime**
    - Beyond `0.4 GHz` the impact of CPU frequency is minimal
- **GPU frequencies** have a **minimal impact** on **energy consumption**
    - CPU bound applications have the GPU waiting for the CPU
    - Apparently, **reducing GPU frequency** does **not lead** to **major energy savings** for **CPU bound** applications

In [17]:
# CPU core clock rate vs. GPU clock rate vs. energy consumption for the fixed frequency k-means runs
kmeans_energy_scatter = fixed_frequency_kmeans_data_set.core_clock_rate_vs_gpu_clock_rate_vs_energy_consumption_scatter_plot
kmeans_energy_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- This graph plots the same data, but again **compared to the average control data**
- The few **runs that save energy** seem to have **higher CPU frequencies** and **lower GPU frequencies** but the data is **inconclusive**

In [18]:
# Clock rates vs. energy savings relative to the mean of the k-means control data
kmeans_energy_savings_scatter = kmeans_controlled_data_set.core_clock_rate_vs_gpu_clock_rate_vs_energy_savings_scatter_plot(
    control_comparison=ControlComparison.MEAN,
)
kmeans_energy_savings_scatter.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Effects of CPU and GPU Frequency on Workload Patterns

## Matrix Multiply

- Performed using the samples from **CUDA 10.1**
- Uses a matrix size of `960`
- All runs pinned to `core 0`
- Each run contains `4 executions` of matrix multiply to clearly show **periodic behavior**

### Data Overview

- The next plots show the **occurrence of different CPU utilization rates** in the control data set
- The average **CPU utilization rate** seems to be around `40%`

In [19]:
# Histograms (30 bins) of the utilization rates whose title matches "CPUCoreMonitor.0."
# NOTE(review): this reads `.data_set` (the fixed frequency runs) of the non-"long" controlled set,
# while the surrounding text refers to the control data set and to runs with 4 executions - confirm
# which data set is intended.
cpu_histogram_plots = matrix_multiply_controlled_data_set.data_set.utilization_rate_histogram_plots(
    bins=30,
    title_filter="CPUCoreMonitor.0.",
)
for cpu_histogram_plot in cpu_histogram_plots:
    cpu_histogram_plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The next plots show the **occurrence of different GPU utilization rates** in the entire control data set
- The **GPU utilization rates** are spread evenly

In [20]:
# Histograms (30 bins) of the utilization rates whose title matches "GPUMonitor.0."
# NOTE(review): this reads `.data_set` (the fixed frequency runs) of the non-"long" controlled set,
# while the surrounding text refers to the control data set and to runs with 4 executions - confirm
# which data set is intended.
gpu_histogram_plots = matrix_multiply_controlled_data_set.data_set.utilization_rate_histogram_plots(
    bins=30,
    title_filter="GPUMonitor.0.",
)
for gpu_histogram_plot in gpu_histogram_plots:
    gpu_histogram_plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Usage Pattern

- This section shows a run that is representative of the observed utilization pattern
- Information about the system on which the run was performed is shown below

In [21]:
# ID of the profiler session used as the representative matrix multiply run
representative_session_id = 4809

# Take the first session with that ID and stop scanning there. A missing ID is reported
# explicitly; IndexError is kept so the failure type matches the previous list-indexing lookup.
profiler_session = next(
    (session for session in complete_data_set.data if session.id == representative_session_id),
    None,
)
if profiler_session is None:
    raise IndexError(f"No profiler session with ID {representative_session_id} in the loaded data set")

# Show the summary of the selected session
pprint.pprint(profiler_session.summary)

{'GPUs': [{'Brand': 'GeForce',
           'Compute Capability Major Version': 7,
           'Compute Capability Minor Version': 5,
           'Default Auto Boosted Clocks Enabled': None,
           'Default Power Limit (W)': '260.0',
           'Memory Size (B)': '8.3 GB',
           'Name': 'GeForce RTX 2080',
           'PCIe Link Width (B)': '16 Bytes',
           'Supported Core Clock Rates (Hz)': [''],
           'Supported Memory Clock Rates (Hz)': ['']}],
 'Label': 'Fixed Frequency Matrix Multiply'}


- The next plot shows **device utilization rate over time**
- The **dips** in **CPU utilization rate** seem to **correlate with the start** of each iteration
- The **CPU utilization rate** seems to remain near `100%`
    - Possible explanation is the use of `cudaDeviceSynchronize()` on the **CPU**
        - This puts the **CPU** in a **busy** loop waiting on the **GPU** to complete all work
        - The **busy** loop costs more energy
        - `cudaDeviceSynchronize()` is called twice in the code
            - Matches the number of peaks per iteration in the graph

In [22]:
# Utilization rate over time, limited to the series whose name starts with one of these prefixes
utilization_series_prefixes = ("CPU 0 Core 0", "GPU 0")
plot = profiler_session.utilization_rate_timeseries_plot
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(utilization_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The following plot shows the **power consumption over time**
- As expected, it remains relatively constant since both the **CPU and GPU utilization rates** are at `100%`
    - Nonetheless, dips can be observed whenever a new iteration starts

In [23]:
# Power consumption over time for the selected session (plot_limits=False)
plot = profiler_session.power_consumption_timeseries_plot(plot_limits=False)
# NOTE(review): the series filter below is disabled, so every series of the plot is drawn here,
# whereas the other power consumption cells in this notebook apply the same filter - confirm
# whether that is intentional, then either enable the line or remove it.
#plot.plot_series = { key: plot.plot_series[key] for key in plot.plot_series if key.startswith("CPU 0 Core 0") or key.startswith("GPU 0") or key.startswith("Node") }
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- This graph shows the **memory consumption over time**
- The matrices do not consume a lot of memory
- Dips can be observed at the start of each iteration

In [24]:
# Memory consumption over time for the selected session (plot_sizes=False)
memory_timeseries_plot = profiler_session.memory_consumption_timeseries_plot(plot_sizes=False)
memory_timeseries_plot.plot()

  self.figure = pyplot.figure(figsize=(15, 5))


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## K-Means

- Performed with **Rodinia 3.1**
- Used `kdd_cup` data set
- All runs pinned to `core 0`
- Each run contains `4 executions` of k-means to clearly show **periodic behavior**

### Data Overview

- The next plots show the **occurrence of different CPU utilization rates** in the control data set
- The most common **CPU utilization rates** seem to be spread between `0-50%`

In [25]:
# Histograms (30 bins) of the utilization rates whose title matches "CPUCoreMonitor.0."
# NOTE(review): this reads `.data_set` (the fixed frequency runs) of the non-"long" controlled set,
# while the surrounding text refers to the control data set and to runs with 4 executions - confirm
# which data set is intended.
cpu_histogram_plots = kmeans_controlled_data_set.data_set.utilization_rate_histogram_plots(
    bins=30,
    title_filter="CPUCoreMonitor.0.",
)
for cpu_histogram_plot in cpu_histogram_plots:
    cpu_histogram_plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The next plots show the **occurrence of different GPU utilization rates** in the entire control data set
- The **GPU utilization rates** are spread evenly

In [26]:
# Histograms (30 bins) of the utilization rates whose title matches "GPUMonitor.0."
# NOTE(review): this reads `.data_set` (the fixed frequency runs) of the non-"long" controlled set,
# while the surrounding text refers to the control data set and to runs with 4 executions - confirm
# which data set is intended.
gpu_histogram_plots = kmeans_controlled_data_set.data_set.utilization_rate_histogram_plots(
    bins=30,
    title_filter="GPUMonitor.0.",
)
for gpu_histogram_plot in gpu_histogram_plots:
    gpu_histogram_plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Usage Pattern

- This section shows a run that is representative of the observed utilization pattern
- Information about the system on which the run was performed is shown below

In [27]:
# ID of the profiler session used as the representative k-means run
representative_session_id = 4861

# Take the first session with that ID and stop scanning there. A missing ID is reported
# explicitly; IndexError is kept so the failure type matches the previous list-indexing lookup.
profiler_session = next(
    (session for session in complete_data_set.data if session.id == representative_session_id),
    None,
)
if profiler_session is None:
    raise IndexError(f"No profiler session with ID {representative_session_id} in the loaded data set")

# Show the summary of the selected session, as the accompanying text announces and as the
# matching matrix multiply cell does
pprint.pprint(profiler_session.summary)

- The next plot shows **device utilization rate over time**
- Iterations not readily apparent in the data
- The **CPU utilization rate** switches between `0%` and `100%`
    - Again, a possible explanation is the use of `cudaDeviceSynchronize()` on the **CPU**
- **GPU utilization** moves up and down but does not reach `100%`

In [28]:
# Utilization rate over time, limited to the series whose name starts with one of these prefixes
utilization_series_prefixes = ("CPU 0 Core 0", "GPU 0")
plot = profiler_session.utilization_rate_timeseries_plot
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(utilization_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The following plot shows the **power consumption over time**
- There is an initial dip, which is probably when **data is being read from disk**
    - This data is probably **cached in subsequent iterations**, thus resulting in only one dip
- It remains relatively constant over time

In [29]:
# Power consumption over time (plot_limits=False), limited to the series whose name starts
# with one of these prefixes
power_series_prefixes = ("CPU 0 Core 0", "GPU 0", "Node")
plot = profiler_session.power_consumption_timeseries_plot(plot_limits=False)
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(power_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- This graph shows the **memory consumption over time**
- The data does not consume a lot of memory
- Here, dips can be observed at the start of each new iteration

In [30]:
# Memory consumption over time for the selected session (plot_sizes=False)
memory_timeseries_plot = profiler_session.memory_consumption_timeseries_plot(plot_sizes=False)
memory_timeseries_plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Investigating High CPU Usage

- Using **matrix multiply**
- Performed using the samples from **CUDA 10.1**
- Uses a matrix size of `960`
- All runs pinned to `core 0`
- Each run contains `3 executions` of matrix multiply to show **periodic behavior**

## Method

- To investigate the **high CPU usage** we looked at `cudaDeviceSynchronize()` as a possible cause
- **3 versions** of the source code were tested to observe the **effect on the utilization pattern**
    1. A version **without explicit synchronization** calls like `cudaDeviceSynchronize()`
        - Will still synchronize on data transfers
        - Shows if the device is doing anything besides busy waiting
    2. A version with **sleep** calls instead of **explicit synchronization** calls
        - Will still synchronize on data transfers
        - Shows if the sleep call reduces the time spent at `100%` **CPU utilization**
            - That would indicate that the device is in a busy wait loop
    3. A version with **sleep** calls and **explicit synchronization** calls
        - Tests the impact of the explicit synchronization calls

## Without Explicit Synchronization

- This section shows a run where all **explicit synchronization** calls were **removed**

In [31]:
# ID of the profiler session recorded without explicit synchronization calls
representative_session_id = 5003

# Take the first session with that ID and stop scanning there. A missing ID is reported
# explicitly; IndexError is kept so the failure type matches the previous list-indexing lookup.
profiler_session = next(
    (session for session in complete_data_set.data if session.id == representative_session_id),
    None,
)
if profiler_session is None:
    raise IndexError(f"No profiler session with ID {representative_session_id} in the loaded data set")

- The next plot shows **device utilization rate over time**
- Simply **removing the explicit synchronization** calls **does not seem to affect the observed behavior**

In [32]:
# Utilization rate over time, limited to the series whose name starts with one of these prefixes
utilization_series_prefixes = ("CPU 0 Core 0", "GPU 0")
plot = profiler_session.utilization_rate_timeseries_plot
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(utilization_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The next plot shows the power consumption is also still similar to the version with explicit synchronization

In [33]:
# Power consumption over time (plot_limits=False), limited to the series whose name starts
# with one of these prefixes
power_series_prefixes = ("CPU 0 Core 0", "GPU 0", "Node")
plot = profiler_session.power_consumption_timeseries_plot(plot_limits=False)
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(power_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## With Sleep Instead of Explicit Synchronization

- This section shows a run where all **explicit synchronization** calls were **replaced** with **sleep** calls
- Each **sleep** call takes `2 seconds`

In [34]:
# ID of the profiler session recorded with sleep calls instead of explicit synchronization
representative_session_id = 5166

# Take the first session with that ID and stop scanning there. A missing ID is reported
# explicitly; IndexError is kept so the failure type matches the previous list-indexing lookup.
profiler_session = next(
    (session for session in complete_data_set.data if session.id == representative_session_id),
    None,
)
if profiler_session is None:
    raise IndexError(f"No profiler session with ID {representative_session_id} in the loaded data set")

- The next plot shows **device utilization rate over time**
- The effect of the **sleep** calls is immediately apparent as 2 second long dips in **CPU utilization** at the start of each run
    - The time per iteration is still similar, which suggests that our hypothesis is correct
    - The high **CPU utilization** seems to be caused by the busy wait loop

In [35]:
# Utilization rate over time, limited to the series whose name starts with one of these prefixes
utilization_series_prefixes = ("CPU 0 Core 0", "GPU 0")
plot = profiler_session.utilization_rate_timeseries_plot
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(utilization_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The next plot shows that the **dips in power consumption** at the start of each iteration are now **more pronounced**
    - This suggests that by reducing the **CPU frequency** in a **busy loop** we can **save energy**

In [36]:
# Power consumption over time (plot_limits=False), limited to the series whose name starts
# with one of these prefixes
power_series_prefixes = ("CPU 0 Core 0", "GPU 0", "Node")
plot = profiler_session.power_consumption_timeseries_plot(plot_limits=False)
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(power_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## With Sleep and Explicit Synchronization

- This section shows a run where all **explicit synchronization** calls were **preceded** by **sleep** calls
- Each **sleep** call takes `2 seconds`

In [37]:
# ID of the profiler session recorded with sleep calls followed by explicit synchronization
representative_session_id = 5202

# Take the first session with that ID and stop scanning there. A missing ID is reported
# explicitly; IndexError is kept so the failure type matches the previous list-indexing lookup.
profiler_session = next(
    (session for session in complete_data_set.data if session.id == representative_session_id),
    None,
)
if profiler_session is None:
    raise IndexError(f"No profiler session with ID {representative_session_id} in the loaded data set")

- The next plot shows **device utilization rate over time**
- The **sleep** calls appear to have the **same effect** as in the previous run
    - This seems to confirm the suspicion that **explicit synchronization is not needed**
    - Synchronization will occur on memory access when results are copied back to the CPU

In [38]:
# Utilization rate over time, limited to the series whose name starts with one of these prefixes
utilization_series_prefixes = ("CPU 0 Core 0", "GPU 0")
plot = profiler_session.utilization_rate_timeseries_plot
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(utilization_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

- The power consumption pattern seems to be similar to the previous version as well
    - The dips for every iteration are slightly less pronounced but still visible

In [39]:
# Power consumption over time (plot_limits=False), limited to the series whose name starts
# with one of these prefixes
power_series_prefixes = ("CPU 0 Core 0", "GPU 0", "Node")
plot = profiler_session.power_consumption_timeseries_plot(plot_limits=False)
plot.plot_series = {
    series_name: plot.plot_series[series_name]
    for series_name in plot.plot_series
    if series_name.startswith(power_series_prefixes)
}
plot.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …