In [9]:
from collections import defaultdict
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import numpy as np
import pandas as pd
import seaborn as sns

## ランダムアクセス用の配列生成

In [10]:
def words_shuffle(max_size, shuffle=False):
	words = list(range(max_size))

	if(shuffle):
		random.shuffle(words)
	return words

def shuffle_warp_align(buffer_size, warp_size=32, shuffle=False):
	assert buffer_size % warpsize == 0

	page_size = buffer_size // warpsize
	buffer = np.zeros(buffer_size)

	for j in range(warp_size):
		words = words_shuffle(page_size, shuffle)

		for i in range(page_size):
			buffer[words[i % page_size] * warp_size + j] = words[(i+1) % page_size] * warpsize + j
	return buffer
	
def shuffle_array_align(buffer_size, shuffle=False):
	buffer = np.zeros(buffer_size)
	words = words_shuffle(buffer_size, shuffle)

	for i in range(buffer_size):
		buffer[words[i % size]]=words[(i + 1) % buffer_size]

	

## メモリレイテンシ計測用カーネルの実行

In [11]:
def calc_gpu_memcpy_time(
        buffer_gpu, gputime_gpu, dummy_gpu, buffer_size, 
        grid_size=128, block_size=256, 
        partition_num=16, copy_iter=16,
        is_random=False):
    latency_mod = SourceModule("""
    __global__ void latency_calc(
            int *buffer, int* gputime, int* dummy, int repeat_num)
    {
    	unsigned int t1, t2;
        int index = blockIdx.x * blockDim.x + threadIdx.x;

        t1 = clock();
        for( int rep=0; rep<repeat_num; rep++ )
        {
            index = buffer[index];
        }
        t2 = clock();

        gputime[0] = t2-t1;
        dummy[0] = index;
    }
    """)

    latency_kernel = latency_mod.get_function("latency_calc")
    latency_kernel.prepare("PPPi")
    latency_kernel.set_cache_config(pycuda.driver.func_cache.PREFER_L1)

    buf_num = buffer_size // 4
    kernel_times = list()
    clock_times = list()
    for iter in range(copy_iter+1):
        if is_random:
            loop_num = buffer_size * partition_num // 32
            buffer_cpu = shuffle_warp_align(buffer_size, False)
        else:
            loop_num = buffer_size * partition_num // 32 // 32
            buffer_cpu = shuffle_array_align(buffer_size, True)

        cuda.memcpy_htod(buffer_gpu, buffer_cpu)

        ev_start  = drv.Event()
        ev_finish = drv.Event()

        ev_start.record()
        latency_kernel.prepared_call(
            (grid_size,1), (block_size,1,1),
            buffer_gpu, gputime_gpu, dummy_gpu, np.int32(loop_num)
        )
        ev_finish.record()
        ev_finish.synchronize()

        gputime_cpu = np.zeros(1).astype(np.int32)
        dummpy_cpu = np.zeros(1).astype(np.int32)
        cuda.memcpy_dtoh(gputime_cpu, gputime_gpu)
        cuda.memcpy_dtoh(dummpy_cpu, dummpy_gpu)
      
        if iter != 0:
            kernel_times.append(ev_start.time_till(ev_finish) * 0.001)
            clock_times.append(gputime_cpu)
    return times, clocks

## 計測するバッファサイズ

In [12]:
mem_step = 10
min_mem_size = 1024
max_mem_size = 1 * 1024 * 1024 * 1024

min_mem_log2 = np.log2(min_mem_size)
max_mem_log2 = np.log2(max_mem_size)

# step size: 10
print(f"{min_mem_log2} - {max_mem_log2}: {mem_step} step")

10.0 - 30.0: 10 step


In [13]:
devdat = pycuda.tools.DeviceData()
copy_buffers = np.logspace(min_mem_log2, max_mem_log2, mem_step, base=2)
copy_buffers_align = [devdat.align(buf, word_size = 4) for buf in copy_buffers]

In [15]:
max_buffer_size = max(copy_buffers_align)
buffer_gpu = drv.mem_alloc(max_buffer_size)
gputime_gpu = drv.mem_alloc(np.dtype(np.int32).itemsize)
dummy_gpu = drv.mem_alloc(np.dtype(np.int32).itemsize)

## コアレスアクセス

In [16]:
def show_latency(time_df, target="buffer_size"):
    mean_df = time_df.groupby([target], as_index=False).mean()

    band_dict = defaultdict(list)
    for row in mean_df.itertuples():
        band_dict[target].append(getattr(row, target))
        band_dict["latency"].append(row.clock_time)
    band_df = pd.DataFrame(band_dict)

    ax = sns.catplot(x=target, y="latency", data=band_df, kind="bar")
    ax = ax.set_xticklabels(rotation=90)
    return ax

In [None]:
dev = pycuda.driver.Context.get_device()
memory_clock_rate = dev.get_attribute(pycuda._driver.device_attribute.MEMORY_CLOCK_RATE)

clock_dict = defaultdict(list)
for buffer_size in copy_buffers_align:
    _, clocks = calc_gpu_memcpy_time(buffer_gpu, gputime_gpu, dummy_gpu, max_buffer_size)
    for iter, clock in enumerate(clocks):
        clock_df["buffer_size"].append(buffer_size)
        clock_df["iteration"].append(iter)
        clock_df["clock_time"].append(clock/memory_clock_rate)
clock_df = pd.DataFrame(time_dict)

In [13]:
show_latency(clock_df, target="buffer_size")


NameError: name 'time_df' is not defined

## ランダムアクセス

In [None]:
dev = pycuda.driver.Context.get_device()
memory_clock_rate = dev.get_attribute(pycuda._driver.device_attribute.MEMORY_CLOCK_RATE)

clock_dict = defaultdict(list)
for buffer_size in copy_buffers_align:
    _, clocks = calc_gpu_memcpy_time(buffer_gpu, gputime_gpu, dummy_gpu, max_buffer_size, is_random=True)
    for iter, clock in enumerate(clocks):
        clock_df["buffer_size"].append(buffer_size)
        clock_df["iteration"].append(iter)
        clock_df["clock_time"].append(clock/memory_clock_rate)
clock_df = pd.DataFrame(time_dict)

In [None]:
show_latency(clock_df, target="buffer_size")
