## GPUの理論性能計算

http://optimisationcpugpu-hpc.blogspot.com/2012/10/how-to-calculate-flops-of-gpu.html

In [4]:
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import numpy

In [32]:
def convert_SMVer2ArchName(major, minor):
    """Map a CUDA compute capability to its architecture name and cores per SM.

    Args:
        major: Major compute-capability number (e.g. 7 for capability 7.5).
        minor: Minor compute-capability number (e.g. 5 for capability 7.5).

    Returns:
        A tuple ``(arch_name, cores_per_sm)`` looked up from the table below.

    Raises:
        KeyError: If the compute capability is not present in the table.
    """
    # Keyed by (major << 4) + minor, i.e. capability 7.5 -> 0x75.
    # Name and core count live in one table so the two can never drift apart.
    sm_table = {
        0x30: ("Kepler", 192),
        0x32: ("Kepler", 192),
        0x35: ("Kepler", 192),
        0x37: ("Kepler", 192),
        0x50: ("Maxwell", 128),
        0x52: ("Maxwell", 128),
        0x53: ("Maxwell", 128),
        0x60: ("Pascal", 64),
        0x61: ("Pascal", 128),
        0x62: ("Pascal", 128),
        0x70: ("Volta", 64),
        0x72: ("Xavier", 64),
        0x75: ("Turing", 64),
        0x80: ("Ampere", 64),
        0x86: ("Ampere", 128),
    }

    key = (major << 4) + minor
    try:
        arch_name, cores_per_sm = sm_table[key]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a readable message.
        raise KeyError(
            "unsupported compute capability {}.{}".format(major, minor)
        ) from None
    return arch_name, cores_per_sm

### 1. GPUの浮動小数点演算性能(FLOPS)

```
1秒あたりの浮動小数点演算 = コア数 * SIMDユニット数 * ((muladdユニット数 * 2) + mulユニット数) * クロック速度
flops = core_num * simd_num * ((mul_add_units)*2 + mul_units) * clock_rate
```

In [42]:
# Device of the currently active CUDA context
# (presumably the one created by importing pycuda.autoinit).
dev = drv.Context.get_device()

In [43]:
# Look up the architecture name and the number of cores per SM from the
# device's compute capability. Attributes are read through the public
# `drv.device_attribute` enum rather than the private `pycuda._driver` module.
major = dev.get_attribute(drv.device_attribute.COMPUTE_CAPABILITY_MAJOR)
minor = dev.get_attribute(drv.device_attribute.COMPUTE_CAPABILITY_MINOR)
arch_name, cores_per_sm = convert_SMVer2ArchName(major, minor)

In [44]:
# Total core count = cores per SM x number of SMs on the device.
# Uses the public `drv.device_attribute` enum instead of `pycuda._driver`.
sm_num = dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)
core_num = cores_per_sm * sm_num

In [45]:
# Per-core execution-unit counts plugged into the peak-FLOPS formula.
mul_units = 0      # standalone multiply units
mul_add_units = 1  # multiply-add units (the formula counts each as 2 operations)
simd_num = 1       # SIMD units

In [46]:
# Core clock converted to GHz.
# NOTE(review): assumes the CLOCK_RATE attribute is reported in kHz — confirm
# against the CUDA driver API docs. Read through the public
# `drv.device_attribute` enum rather than the private `pycuda._driver` module.
clock_rate = dev.get_attribute(drv.device_attribute.CLOCK_RATE) / 1000 / 1000

In [66]:
# Theoretical peak throughput:
#   cores x SIMD units x operations per cycle x clock.
# With clock_rate expressed in GHz the result is in GFLOPS.
flops = clock_rate * (
    core_num
    * simd_num
    * (2 * mul_add_units + mul_units)
)
print("{} GFLOPS".format(flops))

2876.16 GFLOPS


### 2. GPUのメモリ帯域幅

```
メモリ帯域幅 = メモリバス幅(バイト単位) * レーン数 * メモリクロック速度
memory_bandwidth = memory_bus_width * memory_lane_num * memory_clock_rate
```

In [63]:
# Memory bus width; the division by 8 converts the reported value to bytes
# (assumes the attribute is reported in bits — confirm against the CUDA docs).
# Attributes are read through the public `drv.device_attribute` enum rather
# than the private `pycuda._driver` module.
memory_bus_width = dev.get_attribute(drv.device_attribute.GLOBAL_MEMORY_BUS_WIDTH) / 8
# Multiplier used by the bandwidth formula
# (presumably transfers per memory clock, i.e. double data rate — TODO confirm).
memory_lane_num = 2
# Memory clock converted to GHz (assumes the attribute is reported in kHz).
memory_clock_rate = dev.get_attribute(drv.device_attribute.MEMORY_CLOCK_RATE) / 1000 / 1000

In [67]:
# Theoretical peak bandwidth:
#   bus width (bytes) x lane multiplier x memory clock (GHz) -> GB/sec.
memory_bandwidth = memory_clock_rate * (memory_bus_width * memory_lane_num)
print("{} GB/sec".format(memory_bandwidth))

192.032 GB/sec


### 3. コアごとのリソース割り当て

In [69]:
# Per-SM limits and device-wide memory sizes, read through the public
# `drv.device_attribute` enum rather than the private `pycuda._driver` module.
maxthreads_per_sm = dev.get_attribute(drv.device_attribute.MAX_THREADS_PER_MULTIPROCESSOR)
maxregs_per_sm = dev.get_attribute(drv.device_attribute.MAX_REGISTERS_PER_MULTIPROCESSOR)
maxsharedmem_per_sm = dev.get_attribute(drv.device_attribute.MAX_SHARED_MEMORY_PER_MULTIPROCESSOR)
l2_cache_size = dev.get_attribute(drv.device_attribute.L2_CACHE_SIZE)
constmem_size = dev.get_attribute(drv.device_attribute.TOTAL_CONSTANT_MEMORY)

In [88]:
# Per-core share of each resource. Per-SM limits are divided by the cores per
# SM; device-wide sizes (L2 cache, constant memory) by the total core count.
# The memory-size attributes are reported in bytes and are not rescaled here,
# so the values are labelled as bytes (they were previously mislabelled "KB").
print("Thread:    {} threads/core".format(maxthreads_per_sm / cores_per_sm))
print("Register:  {} regs/core".format(maxregs_per_sm / cores_per_sm))
print("SharedMem: {} bytes/core".format(maxsharedmem_per_sm / cores_per_sm))
print("L2 cache:  {:0.1f} bytes/core".format(l2_cache_size / core_num))
print("ConstMem:  {:0.1f} bytes/core".format(constmem_size / core_num))

Thread:    16.0 threads/core
Register:  1024.0 regs/core
SharedMem: 1024.0 KB/core
L2 cache:  1170.3 KB/core
ConstMem:  73.1 KB/core
