# Debug SqueezeNet v1.3 (Simple Task) OpenCL implementation with PyOpenCL and PyTorch
Parts of the code are copied heavily from https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py  
SqueezeNet Paper:https://arxiv.org/abs/1602.07360  
SqueezeNet 1.1 model from https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1   
SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0, without sacrificing accuracy.

TEST DE IMPLEMENTACIÓN MAXPOOL 3x3 stride = 2

In [1]:
#some set up
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from PIL import Image
import math
import time
from time import sleep, perf_counter as pc
from matplotlib.pyplot import imshow
%matplotlib inline

## Aquí tenemos el medidor de pytorch

- En esta aproximación tengo en cuenta el tiempo de definición de capas y la carga de pesos, porque se supone que entre fire y fire estas operaciones tienen que hacerse


In [2]:
# Benchmark configuration for the PyTorch reference max-pool (3x3 window, stride 2).
canales_iniciales = 4  # input_channels
canales_finales = canales_iniciales

acumulado_pytorch = 0
idea = True
count = 100
tamanyo = 5  # input_size

# Shape of the random test image: (batch, channels, height, width).
forma_entrada = (1, canales_iniciales, tamanyo, tamanyo)

for i in range(count):
    # A fresh random integer image in [0, 10) for every repetition.
    imagen = np.random.randint(10, size=forma_entrada)

    # The timed region deliberately includes layer construction and the
    # numpy <-> torch conversions, not only the pooling call itself.
    tic = pc()
    maxpool = nn.MaxPool2d(3, stride=2)
    imagen1 = torch.from_numpy(imagen).float()
    salida1 = maxpool(imagen1)
    salida1_a_numpy = salida1.detach().numpy()
    toc = pc()

    acumulado_pytorch += toc - tic

# Show the output of the last repetition and the mean time per repetition.
print(salida1_a_numpy)
print ("tiempo en segundos con pytorch= ", acumulado_pytorch/count)

[[[[9. 6.]
   [8. 6.]]

  [[9. 9.]
   [9. 9.]]

  [[8. 7.]
   [9. 7.]]

  [[8. 8.]
   [8. 5.]]]]
tiempo en segundos con pytorch=  0.00021743616990534066


## Veamos ahora solo maxpool 3x3 con OpenCL

In [3]:
# OpenCL setup
import pyopencl as cl
import sys
# Add ../common to the module search path so that the import below can be resolved
# (presumably deviceinfo lives there — verify).
sys.path.append('../common')
import deviceinfo
# NOTE(review): the first cell does `import time`; this line rebinds the name `time`
# to the function time.time, so the module is no longer reachable as `time` afterwards.
from time import time

#wksp = '../device/v1.3/conv3x3'


#### Step0-A: compilation for emulation

In [38]:
%%bash
# Compile both max-pool kernels (NDRange and single-task variants) for the
# emulator target; the resulting .aocx files are written under bin_em/.
aoc -march=emulator ../device/v1.3/maxpool/maxpool_NDRange.cl -o ../device/v1.3/maxpool/bin_em/maxpool_NDRange.aocx
aoc -march=emulator ../device/v1.3/maxpool/maxpool_ST.cl -o ../device/v1.3/maxpool/bin_em/maxpool_ST.aocx

aoc: OpenCL kernel compilation completed successfully.
aoc: Linking Object files....
aoc: Compiling for Emulation ....
aoc: OpenCL kernel compilation completed successfully.
aoc: Linking Object files....
aoc: Compiling for Emulation ....


#### Step0-B: compilation for simulation

In [53]:
%%bash
# Compile both max-pool kernels for the simulator target (verbose, board a10gx);
# the resulting .aocx files are written under bin_sim/.
aoc -march=simulator -v -ghdl ../device/v1.3/maxpool/maxpool_NDRange.cl -o ../device/v1.3/maxpool/bin_sim/maxpool_NDRange.aocx -board=a10gx
aoc -march=simulator -v -ghdl ../device/v1.3/maxpool/maxpool_ST.cl -o ../device/v1.3/maxpool/bin_sim/maxpool_ST.aocx -board=a10gx


aoc: Environment checks completed successfully.
Quartus location: /home/joerock/intelFPGA_pro/21.1/quartus/bin/quartus_sh
aoc: Cached files in /var/tmp/aocl/joerock may be used to reduce compilation time
aoc: Selected target board package /home/joerock/intelFPGA_pro/21.1/hld/board/a10_ref
aoc: Selected target board a10gx
aoc: Running OpenCL parser....
aoc: OpenCL parser completed 
aoc: Linking Object files....
aoc: Optimizing and doing static analysis of code...
aoc: Linking with IP library ...
aoc: Checking if memory usage is larger than 100%...
aoc: Memory usage is not above 100.
aoc: First stage compilation completed successfully.
aoc: Compiling for Simulator.
Quartus location: /home/joerock/intelFPGA_pro/21.1/quartus/bin/quartus_sh
Creating simulation system...
Generating simulation system...
Compiling simulation...
aoc: Simulation generation done!
Simulator flow is successful.
To execute simulator, invoke host with 
	env CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 <host_program>
aoc: Envi

#### Step1: OpenCL preparation

In [5]:
# Select the OpenCL platform by name instead of by position: the order returned by
# cl.get_platforms() depends on how the drivers are registered on the machine.
# When no platform carries the expected name, fall back to the original index 0.
nombre_plataforma = 'Intel(R) OpenCL'
platforms = cl.get_platforms()
candidatas = [p for p in platforms if p.name.strip() == nombre_plataforma]
plataforma = candidatas[0] if candidatas else platforms[0]

# Context over every device of the chosen platform, plus a command queue on it.
context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, plataforma)])
queue = cl.CommandQueue(context)

context

<pyopencl.Context at 0x515c518 on <pyopencl.Device '12th Gen Intel(R) Core(TM) i7-12650H' on 'Intel(R) OpenCL' at 0x51689b8>>

#### Step 2: create kernels
Create & build program

In [6]:
# Directory that holds the OpenCL C sources of the max-pool kernels.
wksp = '../device/v1.3/maxpool/'

# NDRange variant: read the source (closing the file afterwards) and build it
# for the current context.
file_dir = wksp + 'maxpool_NDRange.cl'
with open(file_dir) as fuente:
    kernelSource = fuente.read()
program_NDR = cl.Program(context, kernelSource).build()

# Single-task variant: same procedure with the other source file.
file_dir = wksp + 'maxpool_ST.cl'
with open(file_dir) as fuente:
    kernelSource = fuente.read()
program_ST = cl.Program(context, kernelSource).build()

  warn("Non-empty compiler output encountered. Set the "


Create kernels

In [7]:
# Kernel handles. The dtype lists mirror the kernel signatures: np.int32 marks a
# scalar int argument, None marks an argument that is passed as a buffer.

# NDRange kernel: (input_size, output_size, input_im, output_im)
maxpool_NDR = program_NDR.maxpool2d
maxpool_NDR.set_scalar_arg_dtypes([np.int32] * 2 + [None] * 2)

# Single-task kernel: (input_size, output_size, channel_size, input_im, output_im)
maxpool_ST = program_ST.maxpool2d
maxpool_ST.set_scalar_arg_dtypes([np.int32] * 3 + [None] * 2)

#### OpenCL kernel: maxpool_NDRange.cl

maxpool2d: 2-D 3x3 maxpool stride 2.  

```C
// maxpool2d: 2-D max pooling with a 3x3 window and stride 2 (NDRange variant).
// Each work-item produces one complete output feature map (one channel).
//
// input_size  : side length of each square input feature map
// output_size : side length of each square output feature map
// input_im    : input feature maps, stored channel after channel, row-major
// output_im   : output feature maps, same layout
//
// No bounds check is done here: the host must pass an output_size such that
// 2 * (output_size - 1) + 2 < input_size, otherwise the window reads past the map.
__kernel void maxpool2d(
	const int input_size,
	const int output_size,
	__global const float* restrict input_im,
    __global float* restrict output_im)
{
	int channels = get_global_id(0);//get output channel index
	
	// Move both pointers to the feature map handled by this work-item.
	input_im += channels * input_size * input_size;
	output_im += channels * output_size * output_size;

	//loop over output feature map
	for(int i = 0; i < output_size; i++)//row
	{
		for(int j = 0; j < output_size; j++)//col
		{
			//find the max value in the 3x3 region
			//to be one element in the output feature map
			// Start from -INFINITY rather than 0 so that a window whose values
			// are all negative still reports its true maximum.
			float tmp = -INFINITY;

			#pragma unroll 1
			for(int k = 0; k < 3; k++)//row
			{
				#pragma unroll 1
				for(int l = 0; l < 3; l++)//col
				{
					float value = input_im[(i * 2 + k) * input_size  + j * 2 + l ];
					if(value > tmp)
						tmp = value;
				}
			}
			//store the result to output feature map
			output_im[i * output_size + j] = tmp; 
		}
	}
}
```
#### OpenCL kernel: maxpool_ST.cl

maxpool2d: 2-D 3x3 maxpool stride 2. 

```C
// maxpool2d: 2-D max pooling with a 3x3 window and stride 2 (single-task variant).
// A single invocation walks over all channel_size feature maps in sequence.
//
// input_size   : side length of each square input feature map
// output_size  : side length of each square output feature map
// channel_size : number of feature maps to process
// input_im     : input feature maps, stored channel after channel, row-major
// output_im    : output feature maps, same layout
//
// No bounds check is done here: the host must pass an output_size such that
// 2 * (output_size - 1) + 2 < input_size, otherwise the window reads past the map.
__kernel void maxpool2d(
	const int input_size,
	const int output_size,
    const int channel_size,
	__global const float* restrict input_im,
    __global float* restrict output_im)
{

    for(int channel_index = 0; channel_index < channel_size; channel_index++)
    {
        //loop over output feature map
        for(int i = 0; i < output_size; i++)//row
        {
            for(int j = 0; j < output_size; j++)//col
            {
                //find the max value in the 3x3 region
                //to be one element in the output feature map
                // Start from -INFINITY rather than 0 so that a window whose values
                // are all negative still reports its true maximum.
                float tmp = -INFINITY;

                #pragma unroll 1
                for(int k = 0; k < 3; k++)//row
                {
                    #pragma unroll 1
                    for(int l = 0; l < 3; l++)//col
                    {
                        float value = input_im[(i * 2 + k) * input_size  + j * 2 + l ];
                        if(value > tmp)
                            tmp = value;
                    }
                }
                //store the result to output feature map
                output_im[i * output_size + j] = tmp;
            }
        }
    
        // Advance both pointers to the next channel's feature map.
        input_im += input_size * input_size;
        output_im += output_size * output_size;
    }
}
```

Run OpenCL implementation  

In [24]:
# Problem size for the comparison run.
tamanyo = 113  # input_size
canales_iniciales = 64  # input_channels
canales_finales = canales_iniciales
# Output side length for a 3x3 window with stride 2: (n - 3 + 2) / 2, truncated.
tamanyo_final = np.int32((tamanyo - 1) / 2)

acumulado_pytorch = 0

# Random integer-valued test image in [0, 10), stored as float32.
forma_entrada = (1, canales_iniciales, tamanyo, tamanyo)
imagen = np.random.randint(10, size=forma_entrada).astype(np.float32)

# Timed PyTorch reference: layer construction, conversion, pooling, conversion back.
tic = pc()

maxpool = nn.MaxPool2d(3, stride=2)
imagen1 = torch.from_numpy(imagen).float()
salida1 = maxpool(imagen1)
salida1_a_numpy = salida1.detach().numpy()

toc = pc()
acumulado_pytorch += toc - tic

####### OPENCL COMPARISON #######


In [25]:
# NDRANGE

# Flatten the input image and copy it into a read-only device buffer.
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

# Host result array allocated directly as float32 (np.empty defaults to float64,
# which previously forced an extra conversion copy). The element count is
# computed with plain Python ints.
n_salida = canales_iniciales * int(tamanyo_final) ** 2
h_result_pool = np.empty(n_salida, dtype=np.float32)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

# Timed region: kernel launch, device-to-host copy, and the reshape of the result.
tic2 = pc()

# One work-item per channel; the kernel uses get_global_id(0) as the channel index.
maxpool_NDR(queue,(canales_iniciales, ), None, tamanyo, tamanyo_final, d_sample, d_result_pool)

queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)

queue.finish()

veamos = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)

rtime = pc() - tic2


In [26]:
# Simple task

# Flatten the input image and copy it into a read-only device buffer.
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

# Host result array allocated directly as float32 (np.empty defaults to float64,
# which previously forced an extra conversion copy). The element count is
# computed with plain Python ints.
n_salida = canales_iniciales * int(tamanyo_final) ** 2
h_result_pool = np.empty(n_salida, dtype=np.float32)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

# Timed region: kernel launch, device-to-host copy, and the reshape of the result.
tic3 = pc()

# A single work-item; the kernel itself loops over all canales_iniciales channels.
maxpool_ST(queue, (1, ), None, tamanyo, tamanyo_final, canales_iniciales, d_sample, d_result_pool)

queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)

queue.finish()

veamos1 = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)

rtime1 = pc() - tic3


In [27]:
# Report the three wall-clock times measured in the previous cells.
print ("tiempo en segundos con pytorch= ", toc-tic)
print ("tiempo en segundos con opencl (NDRANGE)=",rtime)
print ("tiempo en segundos con opencl (Simple Task)=",rtime1)

# Max pooling only selects existing input values, so all implementations must
# agree essentially exactly. With rtol=atol=1e-01 the accepted error grew to
# about 1.0 for values near 9, which is as large as the smallest possible error
# on these integer-valued samples, so the tolerance is tightened.
comparativa1=np.allclose(salida1_a_numpy, veamos,rtol=0, atol=1e-06)
comparativa2=np.allclose(salida1_a_numpy, veamos1,rtol=0, atol=1e-06)
comparativa3=np.allclose(veamos, veamos1,rtol=0, atol=1e-06)

print("comparativa (pytorch == NDRange): ",comparativa1)
print("comparativa (pytorch == Simple Task): ",comparativa2)
print("comparativa (NDRange == Simple Task): ",comparativa3)

tiempo en segundos con pytorch=  0.005047947999628377
tiempo en segundos con opencl (NDRANGE)= 0.0011522969998623012
tiempo en segundos con opencl (Simple Task)= 0.001221702999828267
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  True
comparativa (NDRange == Simple Task):  True


In [28]:
# List every output element where PyTorch and the single-task kernel differ by
# more than 0.1. The reference is reshaped once and the comparison is vectorised;
# np.argwhere yields the (channel, row, col) indices in the same row-major order
# as the nested loops it replaces.
referencia = salida1_a_numpy.reshape(-1, tamanyo_final, tamanyo_final)
discrepancias = np.argwhere(np.abs(referencia - veamos1) > 1e-01)
for i, j, k in discrepancias:
    print("i:", i, "j:", j, "k:", k, referencia[i][j][k], veamos1[i][j][k])

#### Step 3: emulation
Create & build program

In [43]:
# Select the FPGA emulation platform by name instead of by position: the order
# returned by cl.get_platforms() depends on how the drivers are registered on
# the machine. When no platform carries the expected name, fall back to the
# original index 1.
nombre_plataforma = 'Intel(R) FPGA Emulation Platform for OpenCL(TM)'
platforms = cl.get_platforms()
candidatas = [p for p in platforms if p.name.strip() == nombre_plataforma]
plataforma = candidatas[0] if candidatas else platforms[1]

context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, plataforma)])
# Device list of the same platform; needed later to load the precompiled binaries.
device = plataforma.get_devices()

queue = cl.CommandQueue(context)

context

<pyopencl.Context at 0x9862438 on <pyopencl.Device 'Intel(R) FPGA Emulation Device' on 'Intel(R) FPGA Emulation Platform for OpenCL(TM)' at 0x5173268>>

In [46]:
# Directory that holds the emulator binaries produced by the aoc step.
wksp = '../device/v1.3/maxpool/bin_em/'

# NDRange variant: load the precompiled binary (closing the file afterwards).
# One binary is passed for the device list — this assumes the platform exposes
# a single device (TODO confirm).
file_dir = wksp + 'maxpool_NDRange.aocx'
with open(file_dir, mode='rb') as binario:
    kernelSource = binario.read()
program_NDR = cl.Program(context, device, [kernelSource]).build()

# Single-task variant: same procedure with the other binary.
file_dir = wksp + 'maxpool_ST.aocx'
with open(file_dir, mode='rb') as binario:
    kernelSource = binario.read()
program_ST = cl.Program(context, device, [kernelSource]).build()

  warn("Non-empty compiler output encountered. Set the "


Create kernels

In [47]:
# Kernel handles for the emulator build. The dtype lists mirror the kernel
# signatures: np.int32 marks a scalar int argument, None marks a buffer argument.

# NDRange kernel: (input_size, output_size, input_im, output_im)
maxpool_NDR = program_NDR.maxpool2d
maxpool_NDR.set_scalar_arg_dtypes([np.int32] * 2 + [None] * 2)

# Single-task kernel: (input_size, output_size, channel_size, input_im, output_im)
maxpool_ST = program_ST.maxpool2d
maxpool_ST.set_scalar_arg_dtypes([np.int32] * 3 + [None] * 2)


Run OpenCL implementation  

In [48]:
# Problem size for the emulation comparison run.
tamanyo = 113  # input_size
canales_iniciales = 64  # input_channels
canales_finales = canales_iniciales
# Output side length for a 3x3 window with stride 2: (n - 3 + 2) / 2, truncated.
tamanyo_final = np.int32((tamanyo - 1) / 2)

acumulado_pytorch = 0

# Random integer-valued test image in [0, 10), stored as float32.
forma_entrada = (1, canales_iniciales, tamanyo, tamanyo)
imagen = np.random.randint(10, size=forma_entrada).astype(np.float32)

# Timed PyTorch reference: layer construction, conversion, pooling, conversion back.
tic = pc()

maxpool = nn.MaxPool2d(3, stride=2)
imagen1 = torch.from_numpy(imagen).float()
salida1 = maxpool(imagen1)
salida1_a_numpy = salida1.detach().numpy()

toc = pc()
acumulado_pytorch += toc - tic

####### OPENCL COMPARISON #######


In [49]:
# NDRANGE

# Flatten the input image and copy it into a read-only device buffer.
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

# Host result array allocated directly as float32 (np.empty defaults to float64,
# which previously forced an extra conversion copy). The element count is
# computed with plain Python ints.
n_salida = canales_iniciales * int(tamanyo_final) ** 2
h_result_pool = np.empty(n_salida, dtype=np.float32)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

# Timed region: kernel launch, device-to-host copy, and the reshape of the result.
tic2 = pc()

# One work-item per channel; the kernel uses get_global_id(0) as the channel index.
maxpool_NDR(queue,(canales_iniciales, ), None, tamanyo, tamanyo_final, d_sample, d_result_pool)

queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)

queue.finish()

veamos = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)

rtime = pc() - tic2


In [50]:
# Simple task

# Flatten the input image and copy it into a read-only device buffer.
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

# Host result array allocated directly as float32 (np.empty defaults to float64,
# which previously forced an extra conversion copy). The element count is
# computed with plain Python ints.
n_salida = canales_iniciales * int(tamanyo_final) ** 2
h_result_pool = np.empty(n_salida, dtype=np.float32)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

# Timed region: kernel launch, device-to-host copy, and the reshape of the result.
tic3 = pc()

# A single work-item; the kernel itself loops over all canales_iniciales channels.
maxpool_ST(queue, (1, ), None, tamanyo, tamanyo_final, canales_iniciales, d_sample, d_result_pool)

queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)

queue.finish()

veamos1 = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)

rtime1 = pc() - tic3


In [51]:
# Report the three wall-clock times measured in the previous cells.
print ("tiempo en segundos con pytorch= ", toc-tic)
print ("tiempo en segundos con opencl (NDRANGE)=",rtime)
print ("tiempo en segundos con opencl (Simple Task)=",rtime1)

# Max pooling only selects existing input values, so all implementations must
# agree essentially exactly. With rtol=atol=1e-01 the accepted error grew to
# about 1.0 for values near 9, which is as large as the smallest possible error
# on these integer-valued samples, so the tolerance is tightened.
comparativa1=np.allclose(salida1_a_numpy, veamos,rtol=0, atol=1e-06)
comparativa2=np.allclose(salida1_a_numpy, veamos1,rtol=0, atol=1e-06)
comparativa3=np.allclose(veamos, veamos1,rtol=0, atol=1e-06)

print("comparativa (pytorch == NDRange): ",comparativa1)
print("comparativa (pytorch == Simple Task): ",comparativa2)
print("comparativa (NDRange == Simple Task): ",comparativa3)

tiempo en segundos con pytorch=  0.00447581599928526
tiempo en segundos con opencl (NDRANGE)= 0.03291233500021917
tiempo en segundos con opencl (Simple Task)= 0.01775529499991535
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  True
comparativa (NDRange == Simple Task):  True


In [52]:
# List every output element where PyTorch and the single-task kernel differ by
# more than 0.1. The reference is reshaped once and the comparison is vectorised;
# np.argwhere yields the (channel, row, col) indices in the same row-major order
# as the nested loops it replaces.
referencia = salida1_a_numpy.reshape(-1, tamanyo_final, tamanyo_final)
discrepancias = np.argwhere(np.abs(referencia - veamos1) > 1e-01)
for i, j, k in discrepancias:
    print("i:", i, "j:", j, "k:", k, referencia[i][j][k], veamos1[i][j][k])

#### Step 4: simulación
Create & build program

In [4]:
# Select the FPGA SDK (simulator) platform by name instead of by position: the
# order returned by cl.get_platforms() depends on how the drivers are registered
# on the machine. When no platform carries the expected name, fall back to the
# original index 2.
nombre_plataforma = 'Intel(R) FPGA SDK for OpenCL(TM)'
platforms = cl.get_platforms()
candidatas = [p for p in platforms if p.name.strip() == nombre_plataforma]
plataforma = candidatas[0] if candidatas else platforms[2]

context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, plataforma)])
# Device list of the same platform; needed later to load the precompiled binaries.
device = plataforma.get_devices()

queue = cl.CommandQueue(context)

context

<pyopencl.Context at 0x47f7330 on <pyopencl.Device 'SimulatorDevice : Multi-process Simulator (aclmsim0)' on 'Intel(R) FPGA SDK for OpenCL(TM)' at 0x7f9d916f00d8>>

In [5]:
# Directory that holds the simulator binaries produced by the aoc step.
wksp = '../device/v1.3/maxpool/bin_sim/'

# NDRange variant: load the precompiled binary (closing the file afterwards).
# One binary is passed for the device list — this assumes the platform exposes
# a single device (TODO confirm).
file_dir = wksp + 'maxpool_NDRange.aocx'
with open(file_dir, mode='rb') as binario:
    kernelSource = binario.read()
program_NDR = cl.Program(context, device, [kernelSource]).build()

# Single-task variant: same procedure with the other binary.
file_dir = wksp + 'maxpool_ST.aocx'
with open(file_dir, mode='rb') as binario:
    kernelSource = binario.read()
program_ST = cl.Program(context, device, [kernelSource]).build()

  warn("Non-empty compiler output encountered. Set the "


Create kernels

In [6]:
# Kernel handles for the simulator build. The dtype lists mirror the kernel
# signatures: np.int32 marks a scalar int argument, None marks a buffer argument.

# NDRange kernel: (input_size, output_size, input_im, output_im)
maxpool_NDR = program_NDR.maxpool2d
maxpool_NDR.set_scalar_arg_dtypes([np.int32] * 2 + [None] * 2)

# Single-task kernel: (input_size, output_size, channel_size, input_im, output_im)
maxpool_ST = program_ST.maxpool2d
maxpool_ST.set_scalar_arg_dtypes([np.int32] * 3 + [None] * 2)


Run OpenCL implementation  

In [7]:
# Small problem size for the simulation run.
tamanyo = 5  # input_size
canales_iniciales = 8  # input_channels
canales_finales = canales_iniciales
# Output side length for a 3x3 window with stride 2: (n - 3 + 2) / 2, truncated.
tamanyo_final = np.int32((tamanyo - 1) / 2)

acumulado_pytorch = 0

# Random integer-valued test image in [0, 10), stored as float32.
forma_entrada = (1, canales_iniciales, tamanyo, tamanyo)
imagen = np.random.randint(10, size=forma_entrada).astype(np.float32)

# Timed PyTorch reference: layer construction, conversion, pooling, conversion back.
tic = pc()

maxpool = nn.MaxPool2d(3, stride=2)
imagen1 = torch.from_numpy(imagen).float()
salida1 = maxpool(imagen1)
salida1_a_numpy = salida1.detach().numpy()

toc = pc()
acumulado_pytorch += toc - tic

####### OPENCL COMPARISON #######


In [8]:
# NDRANGE

# Flatten the input image and copy it into a read-only device buffer.
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

# Host result array allocated directly as float32 (np.empty defaults to float64,
# which previously forced an extra conversion copy). The element count is
# computed with plain Python ints.
n_salida = canales_iniciales * int(tamanyo_final) ** 2
h_result_pool = np.empty(n_salida, dtype=np.float32)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

# Timed region: kernel launch, device-to-host copy, and the reshape of the result.
tic2 = pc()

# One work-item per channel; the kernel uses get_global_id(0) as the channel index.
maxpool_NDR(queue,(canales_iniciales, ), None, tamanyo, tamanyo_final, d_sample, d_result_pool)

queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)

queue.finish()

veamos = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)

rtime = pc() - tic2

In [9]:
# Simple task

# Flatten the input image and copy it into a read-only device buffer.
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

# Host result array allocated directly as float32 (np.empty defaults to float64,
# which previously forced an extra conversion copy). The element count is
# computed with plain Python ints.
n_salida = canales_iniciales * int(tamanyo_final) ** 2
h_result_pool = np.empty(n_salida, dtype=np.float32)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

# Timed region: kernel launch, device-to-host copy, and the reshape of the result.
tic3 = pc()

# A single work-item; the kernel itself loops over all canales_iniciales channels.
maxpool_ST(queue, (1, ), None, tamanyo, tamanyo_final, canales_iniciales, d_sample, d_result_pool)

queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)

queue.finish()

veamos1 = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)

rtime1 = pc() - tic3


In [10]:
# Report the three wall-clock times measured in the previous cells.
print ("tiempo en segundos con pytorch= ", toc-tic)
print ("tiempo en segundos con opencl (NDRANGE)=",rtime)
print ("tiempo en segundos con opencl (Simple Task)=",rtime1)

# Max pooling only selects existing input values, so all implementations must
# agree essentially exactly. With rtol=atol=1e-01 the accepted error grew to
# about 1.0 for values near 9, which is as large as the smallest possible error
# on these integer-valued samples, so the tolerance is tightened.
comparativa1=np.allclose(salida1_a_numpy, veamos,rtol=0, atol=1e-06)
comparativa2=np.allclose(salida1_a_numpy, veamos1,rtol=0, atol=1e-06)
comparativa3=np.allclose(veamos, veamos1,rtol=0, atol=1e-06)

print("comparativa (pytorch == NDRange): ",comparativa1)
print("comparativa (pytorch == Simple Task): ",comparativa2)
print("comparativa (NDRange == Simple Task): ",comparativa3)

tiempo en segundos con pytorch=  0.00023188700015452923
tiempo en segundos con opencl (NDRANGE)= 13.07889768899986
tiempo en segundos con opencl (Simple Task)= 52.495376027999555
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  True
comparativa (NDRange == Simple Task):  True


In [14]:
# Spot-check: first output row of channel 0 from each implementation.
# NOTE(review): the shape printed next to the "pytorch" label is veamos.shape
# (the NDRange result), not the shape of the PyTorch output.
print("pytorch", veamos.shape, salida1_a_numpy[0][0][0])
print("NDRange", veamos[0][0])
print("Simple task", veamos1[0][0])
# Max pooling only selects existing input values, so the results must agree
# essentially exactly; the former rtol=atol=1e-01 accepted errors of about 1.0
# for values near 9.
print(np.allclose(salida1_a_numpy, veamos1,rtol=0, atol=1e-06))

pytorch (8, 2, 2) [8. 7.]
NDRange [8. 7.]
Simple task [8. 7.]
True


In [11]:
# List every output element where PyTorch and the single-task kernel differ by
# more than 0.1. The reference is reshaped once and the comparison is vectorised;
# np.argwhere yields the (channel, row, col) indices in the same row-major order
# as the nested loops it replaces.
referencia = salida1_a_numpy.reshape(-1, tamanyo_final, tamanyo_final)
discrepancias = np.argwhere(np.abs(referencia - veamos1) > 1e-01)
for i, j, k in discrepancias:
    print("i:", i, "j:", j, "k:", k, referencia[i][j][k], veamos1[i][j][k])