# Debug SqueezeNet v1.3 (Simple Task) OpenCL implementation with PyOpenCL and PyTorch
Parts of the code are copied from https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py  
SqueezeNet paper: https://arxiv.org/abs/1602.07360  
SqueezeNet 1.1 model from https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1   
SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0, without sacrificing accuracy.

TEST DE IMPLEMENTACIÓN MAXPOOL 3x3 stride = 2

In [1]:
#some set up
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from PIL import Image
import math
import time
from time import sleep, perf_counter as pc
from matplotlib.pyplot import imshow
%matplotlib inline

In [2]:
# OpenCL setup
import pyopencl as cl
import sys
sys.path.append('../python_common')
import deviceinfo
from time import time

#wksp = '../device/v1.3/conv3x3'


## Veamos ahora solo maxpool 3x3 (stride 2) con OpenCL

#### Step1: OpenCL preparation

In [3]:
# Use the first OpenCL platform and build a context over all of its devices
platforms = cl.get_platforms()
context = cl.Context(
    dev_type=cl.device_type.ALL,
    properties=[(cl.context_properties.PLATFORM, platforms[0])],
)

# Device list of that same platform
device = platforms[0].get_devices()

# Command queue bound to the context
queue = cl.CommandQueue(context)

# Bare expression so the notebook displays the context
context

<pyopencl.Context at 0x-465c3138 on <pyopencl.Device 'de10_nano_sharedonly : Cyclone V SoC Development Kit' on 'Intel(R) FPGA SDK for OpenCL(TM)' at 0x-5b0db690>>

#### Step 2: create kernels
Create & build program

Create kernels

In [4]:
# Directory prefix of the precompiled .aocx images ('' = current directory)
wksp = ''

# NDRange image: read the binary inside a with-block so the file handle is
# closed as soon as the bytes are in memory, then build the program from it.
file_dir = wksp + 'maxpool_NDRange.aocx'
with open(file_dir, mode='rb') as aocx_file:
    kernelSource = aocx_file.read()
program_NDR = cl.Program(context, device, [kernelSource]).build()

# Single-task image: same procedure.
file_dir = wksp + 'maxpool_ST.aocx'
with open(file_dir, mode='rb') as aocx_file:
    kernelSource = aocx_file.read()
program_ST = cl.Program(context, device, [kernelSource]).build()

Reprogramming device [0] with handle 1
  warn("Non-empty compiler output encountered. Set the "


Create kernels

In [5]:
# NDRange kernel: two int32 scalars (input_size, output_size), then two buffers
maxpool_NDR = program_NDR.maxpool2d
maxpool_NDR.set_scalar_arg_dtypes([np.int32] * 2 + [None] * 2)

# Single-task kernel: three int32 scalars (input_size, output_size,
# channel_size), then two buffers
maxpool_ST = program_ST.maxpool2d
maxpool_ST.set_scalar_arg_dtypes([np.int32] * 3 + [None] * 2)


##### OpenCL kernel: maxpool_NDRange.cl

maxpool2d: 2-D 3x3 maxpool stride 2.  

```C
// maxpool2d: 3x3 max pooling with stride 2 over square feature maps.
// Each work-item handles the single channel selected by get_global_id(0).
//
//   input_size  - side length of one input feature map
//   output_size - side length of one output feature map
//   input_im    - input maps, stored channel after channel, row-major
//   output_im   - output maps, same layout
//
// No bounds checks are done: the caller must choose output_size so that
// 2 * (output_size - 1) + 2 < input_size.
__kernel void maxpool2d(
	const int input_size,
	const int output_size,
	__global const float* restrict input_im,
	__global float* restrict output_im)
{
	int channels = get_global_id(0); // channel index of this work-item

	// move both pointers to the start of this channel's feature map
	input_im += channels * input_size * input_size;
	output_im += channels * output_size * output_size;

	// loop over the output feature map
	for (int i = 0; i < output_size; i++) // row
	{
		for (int j = 0; j < output_size; j++) // col
		{
			// Seed the running maximum with the window's top-left element.
			// Seeding with 0.0 would clamp windows that contain only
			// negative values to 0 instead of returning their true maximum.
			float tmp = input_im[(i * 2) * input_size + j * 2];

			#pragma unroll 1
			for (int k = 0; k < 3; k++) // window row
			{
				#pragma unroll 1
				for (int l = 0; l < 3; l++) // window col
				{
					float value = input_im[(i * 2 + k) * input_size + j * 2 + l];
					if (value > tmp)
						tmp = value;
				}
			}
			// store the window maximum in the output feature map
			output_im[i * output_size + j] = tmp;
		}
	}
}
```
#### OpenCL kernel: maxpool_ST.cl

maxpool2d: 2-D 3x3 maxpool stride 2. 

```C
// maxpool2d: 3x3 max pooling with stride 2 over square feature maps.
// Single work-item version: the kernel itself iterates over all
// channel_size channels, one after another.
//
//   input_size   - side length of one input feature map
//   output_size  - side length of one output feature map
//   channel_size - number of feature maps to process
//   input_im     - input maps, stored channel after channel, row-major
//   output_im    - output maps, same layout
//
// No bounds checks are done: the caller must choose output_size so that
// 2 * (output_size - 1) + 2 < input_size.
__kernel void maxpool2d(
	const int input_size,
	const int output_size,
	const int channel_size,
	__global const float* restrict input_im,
	__global float* restrict output_im)
{
    for (int channel_index = 0; channel_index < channel_size; channel_index++)
    {
        // loop over the output feature map
        for (int i = 0; i < output_size; i++) // row
        {
            for (int j = 0; j < output_size; j++) // col
            {
                // Seed the running maximum with the window's top-left
                // element. Seeding with 0.0 would clamp windows that contain
                // only negative values to 0 instead of their true maximum.
                float tmp = input_im[(i * 2) * input_size + j * 2];

                #pragma unroll 1
                for (int k = 0; k < 3; k++) // window row
                {
                    #pragma unroll 1
                    for (int l = 0; l < 3; l++) // window col
                    {
                        float value = input_im[(i * 2 + k) * input_size + j * 2 + l];
                        if (value > tmp)
                            tmp = value;
                    }
                }
                // store the window maximum in the output feature map
                output_im[i * output_size + j] = tmp;
            }
        }

        // advance both pointers to the next channel's feature map
        input_im += input_size * input_size;
        output_im += output_size * output_size;
    }
}
```

Run OpenCL implementation  

In [6]:
# Problem size: one square map of side `tamanyo` per channel
tamanyo = 113  # input_size
canales_iniciales = 64  # input_channels
canales_finales = canales_iniciales  # same as the input channel count
tamanyo_final = np.int32((tamanyo - 3 + 2) / 2)  # side of each pooled map

acumulado_pytorch = 0

# Random integer-valued input in [0, 10), stored as float32
imagen = np.random.randint(
    10, size=(1, canales_iniciales, tamanyo, tamanyo)
).astype(np.float32)

# Timed PyTorch reference: module creation, tensor conversion, pooling and
# conversion back to NumPy all fall inside the measured interval.
tic = pc()
maxpool = nn.MaxPool2d(3, stride=2)
imagen1 = torch.from_numpy(imagen).float()
salida1 = maxpool(imagen1)
salida1_a_numpy = salida1.detach().numpy()
toc = pc()

acumulado_pytorch += toc - tic

####### OPENCL COMPARISON #######


In [14]:
# NDRANGE

# Flattened float32 copy of the input, uploaded when the buffer is created
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(
    context,
    cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
    hostbuf=h_sample,
)

# Host array and device buffer that receive the pooled maps
h_result_pool = np.empty(
    1 * canales_iniciales * tamanyo_final * tamanyo_final, dtype=np.float32
)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

# Timed section: kernel launch (one work-item per channel), read-back, reshape
tic2 = pc()

maxpool_NDR(
    queue, (canales_iniciales, ), None,
    tamanyo, tamanyo_final, d_sample, d_result_pool,
)
queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)
queue.finish()

veamos = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)

rtime = pc() - tic2


In [17]:
# Simple task

# Flattened float32 copy of the input, uploaded when the buffer is created
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(
    context,
    cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
    hostbuf=h_sample,
)

# Host array and device buffer that receive the pooled maps
h_result_pool = np.empty(
    1 * canales_iniciales * tamanyo_final * tamanyo_final, dtype=np.float32
)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

# Timed section: single work-item launch (the kernel loops over the channels
# itself), read-back, reshape
tic3 = pc()

maxpool_ST(
    queue, (1, ), None,
    tamanyo, tamanyo_final, canales_iniciales, d_sample, d_result_pool,
)
queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)
queue.finish()

veamos1 = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)

rtime1 = pc() - tic3


In [11]:
# Wall-clock time of each implementation for the single run above
for etiqueta, segundos in (
    ("tiempo en segundos con pytorch= ", toc - tic),
    ("tiempo en segundos con opencl (NDRANGE)=", rtime),
    ("tiempo en segundos con opencl (Simple Task)=", rtime1),
):
    print(etiqueta, segundos)

# Pairwise agreement of the three results within rtol = atol = 0.1
comparativa1 = np.allclose(salida1_a_numpy, veamos, rtol=1e-01, atol=1e-01)
comparativa2 = np.allclose(salida1_a_numpy, veamos1, rtol=1e-01, atol=1e-01)
comparativa3 = np.allclose(veamos, veamos1, rtol=1e-01, atol=1e-01)

for etiqueta, coincide in (
    ("comparativa (pytorch == NDRange): ", comparativa1),
    ("comparativa (pytorch == Simple Task): ", comparativa2),
    ("comparativa (NDRange == Simple Task): ", comparativa3),
):
    print(etiqueta, coincide)

tiempo en segundos con pytorch=  0.05555460599998696
tiempo en segundos con opencl (NDRANGE)= 0.04587164899976415
tiempo en segundos con opencl (Simple Task)= 0.03300632400032555
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  True
comparativa (NDRange == Simple Task):  True


In [12]:
# List every element where the PyTorch reference and the Simple Task result
# differ by more than 0.1. The reference is reshaped once and the comparison
# is vectorised, instead of reshaping twice per element inside a triple
# Python loop; np.argwhere yields the offending indices in the same
# channel/row/column order as the nested loops did.
referencia = salida1_a_numpy.reshape(-1, tamanyo_final, tamanyo_final)[:canales_iniciales]
diferencias = np.abs(referencia - veamos1[:canales_iniciales]) > 1e-01

for i, j, k in np.argwhere(diferencias):
    print("i:", i, "j:", j, "k:", k, referencia[i][j][k], veamos1[i][j][k])

In [15]:
# Benchmark: PyTorch vs. the NDRange kernel, averaged over `count` random inputs
tamanyo = 113  # input_size
canales_iniciales = 64  # input_channels
canales_finales = canales_iniciales
tamanyo_final = np.int32((tamanyo - 3 + 2) / 2)  # side of each pooled map

count = 100

acumulado_pytorch = 0
acumulado_kernel = 0
comparativa4 = True  # stays True only if every iteration matches

for i in range(count):
    # Fresh random integer-valued input in [0, 10) for each iteration
    imagen = np.random.randint(
        10, size=(1, canales_iniciales, tamanyo, tamanyo)
    ).astype(np.float32)

    # --- PyTorch reference (timed) ---
    tic = pc()
    maxpool = nn.MaxPool2d(3, stride=2)
    imagen1 = torch.from_numpy(imagen).float()
    salida1 = maxpool(imagen1)
    salida1_a_numpy = salida1.detach().numpy()
    toc = pc()
    acumulado_pytorch += toc - tic

    ####### OPENCL COMPARISON #######

    # Buffer creation (and the input upload it performs) is not timed
    h_sample = imagen.reshape(-1).astype(np.float32)
    d_sample = cl.Buffer(
        context,
        cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=h_sample,
    )

    h_result_pool = np.empty(
        1 * canales_iniciales * tamanyo_final * tamanyo_final, dtype=np.float32
    )
    d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

    # --- OpenCL NDRange kernel (timed: launch, read-back, reshape) ---
    tic5 = pc()
    maxpool_NDR(
        queue, (canales_iniciales, ), None,
        tamanyo, tamanyo_final, d_sample, d_result_pool,
    )
    queue.finish()
    cl.enqueue_copy(queue, h_result_pool, d_result_pool)
    queue.finish()
    veamos3 = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)
    toc5 = pc()
    acumulado_kernel += toc5 - tic5

    comparativa4 &= np.allclose(salida1_a_numpy, veamos3, rtol=1e-01, atol=1e-01)

# Mean time per iteration for each implementation
print("tiempo en segundos con pytorch= ", acumulado_pytorch / count)
print("tiempo en segundos con opencl (NDRange)=", acumulado_kernel / count)

print("comparativa (pytorch == NDRange): ", comparativa4)

tiempo en segundos con pytorch=  0.05174941249002132
tiempo en segundos con opencl (NDRange)= 0.04399831712999457
comparativa (pytorch == NDRange):  True


In [18]:
# Benchmark: PyTorch vs. the Simple Task kernel, averaged over `count` random inputs
tamanyo = 113  # input_size
canales_iniciales = 64  # input_channels
canales_finales = canales_iniciales
tamanyo_final = np.int32((tamanyo - 3 + 2) / 2)  # side of each pooled map

count = 100

acumulado_pytorch = 0
acumulado_kernel = 0
comparativa4 = True  # stays True only if every iteration matches

for i in range(count):
    # Fresh random integer-valued input in [0, 10) for each iteration
    imagen = np.random.randint(
        10, size=(1, canales_iniciales, tamanyo, tamanyo)
    ).astype(np.float32)

    # --- PyTorch reference (timed) ---
    tic = pc()
    maxpool = nn.MaxPool2d(3, stride=2)
    imagen1 = torch.from_numpy(imagen).float()
    salida1 = maxpool(imagen1)
    salida1_a_numpy = salida1.detach().numpy()
    toc = pc()
    acumulado_pytorch += toc - tic

    ####### OPENCL COMPARISON #######

    # Buffer creation (and the input upload it performs) is not timed
    h_sample = imagen.reshape(-1).astype(np.float32)
    d_sample = cl.Buffer(
        context,
        cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=h_sample,
    )

    h_result_pool = np.empty(
        1 * canales_iniciales * tamanyo_final * tamanyo_final, dtype=np.float32
    )
    d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

    # --- OpenCL Simple Task kernel (timed: launch, read-back, reshape) ---
    tic5 = pc()
    maxpool_ST(
        queue, (1, ), None,
        tamanyo, tamanyo_final, canales_iniciales, d_sample, d_result_pool,
    )
    queue.finish()
    cl.enqueue_copy(queue, h_result_pool, d_result_pool)
    queue.finish()
    veamos3 = h_result_pool.reshape(-1, tamanyo_final, tamanyo_final)
    toc5 = pc()
    acumulado_kernel += toc5 - tic5

    comparativa4 &= np.allclose(salida1_a_numpy, veamos3, rtol=1e-01, atol=1e-01)

# Mean time per iteration for each implementation
print("tiempo en segundos con pytorch= ", acumulado_pytorch / count)
print("tiempo en segundos con opencl (Simple Task)=", acumulado_kernel / count)

print("comparativa (pytorch == Simple Task): ", comparativa4)

tiempo en segundos con pytorch=  0.05108882825000819
tiempo en segundos con opencl (Simple Task)= 0.028037996070006555
comparativa (pytorch == Simple Task):  True
