In [1]:
import sys
import pandas as pd
import pathlib

# Data Loading

Change this constant and re-run to load a different source file. The file should be generated by the 'gatherTrace.sh' script.

In [2]:
# sourceFile = pathlib.Path('example.csv')
sourceFile = pathlib.Path('data/2iter.csv')

In [3]:
fullDF = pd.read_csv(sourceFile, skiprows=[4], usecols=['Start', 'Duration', 'Size', 'Name'], dtype={'Start':float, 'Duration':float, 'Size':float, 'Name':str}, comment="=")

In [4]:
# Cleanup
#   All times will be ms
#   All sizes with be MB
# The CSV has the units as the second row

# Start times are in S, duration is already ms
fullDF['Start'] *= 1000

# Views

In [5]:
fullDF['isKern'] = fullDF['Size'].isnull()
kernDF = fullDF[ fullDF['isKern'] ]
memDF = fullDF[ ~fullDF['isKern'] ]

# Basic Stats

In [6]:
avgKDur = kernDF['Duration'].mean()
print("Average Kernel Runtime: ", avgKDur, "ms")

Average Kernel Runtime:  0.052752990486714516 ms


In [7]:
print("Average time between kernel invocations: ", kernDF['Start'].diff().mean(), "ms")

Average time between kernel invocations:  0.04443235494733129 ms


In [8]:
# This isn't actuall the total time, it's just the time from the beginning to the last GPU function, there could be more time spent in the CPU-only end of the run.
# TODO: collect more stats for runs from sources other than nvprof
totalTime = fullDF['Start'].iloc[-1]
totalKernTime = kernDF['Duration'].sum()
print("Fraction of runtime spent in GPU: ", totalKernTime / totalTime)

Fraction of runtime spent in GPU:  0.9942946433281026


In [9]:
memcpySizes = memDF[ memDF['Name'] == '[CUDA memcpy HtoD]' ][['Size','Duration']]
print("Average memcpy H->D size: ", memcpySizes['Size'].mean(), 'MB')
print("Total memcpy H->D size: ", memcpySizes['Size'].sum(), 'MB')
print("Average memcpy H->D Duration: ", memcpySizes['Duration'].mean(), 'ms')

Average memcpy H->D size:  0.005816912391993873 MB
Total memcpy H->D size:  425.47223999999983 MB
Average memcpy H->D Duration:  0.002996896943016515 ms


In [10]:
memcpySizes = memDF[ memDF['Name'] == '[CUDA memcpy DtoH]' ][['Size','Duration']]
print("Average memcpy D->H size: ", memcpySizes.mean(), 'MB')
print("Total memcpy D->H size: ", memcpySizes.sum(), 'MB')
print("Average memcpy H->D Duration: ", memcpySizes['Duration'].mean(), 'ms')
del memcpySizes

Average memcpy D->H size:  Size        0.000004
Duration    0.001822
dtype: float64 MB
Total memcpy D->H size:  Size        0.000992
Duration    0.415467
dtype: float64 MB
Average memcpy H->D Duration:  0.0018222236842105265 ms


## Interleaving of memory operations
When multiple kernels run without an intervening memory operation, we can assume that they communicate through device memory (or use the transparent memory paging). This value is a good indicator of how many kernels we could pack into a single KaaS invocation. These traces don't capture host-side execution, so we can't be sure what is happening in between kernel invocations, but the average time between calls is small (see above) which would indicate that not much is going on. Still, it's possible that non-trivial synchronization or other calculations are occuring between invocations which would limit the kernel chain size.

In [11]:
filt = fullDF['isKern']
runGroups = pd.DataFrame({ 'group' : (filt != filt.shift(1)).cumsum()})
runCounts = runGroups.groupby(by='group').size()
del runGroups

avgRunLen = runCounts.mean()
print("Number of runs kernels without interleaving memory ops: ", len(runCounts))
print("Average run length: ", avgRunLen)
del runCounts

Number of runs kernels without interleaving memory ops:  150013
Average run length:  4.2181010979048486


In [12]:
print("Estimated kernel chain duration (on average): ",  avgRunLen * avgKDur, 'ms')

Estimated kernel chain duration (on average):  0.22251744708977453 ms
