# Debug SqueezeNet v1.3 (Simple Task) OpenCL implementation with PyOpenCL and PyTorch
Parts of the code are adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py  
SqueezeNet paper: https://arxiv.org/abs/1602.07360  
SqueezeNet 1.1 model from https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1   
SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0, without sacrificing accuracy.

TEST DE IMPLEMENTACIÓN AVGPOOL 13x13 (capa final)

In [1]:
#some set up
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from PIL import Image
import math
import time
from time import sleep, perf_counter as pc
from matplotlib.pyplot import imshow
%matplotlib inline

In [2]:
# OpenCL setup: PyOpenCL plus the local `deviceinfo` helper module.
import pyopencl as cl
import sys
# Extend the module search path before importing `deviceinfo`
# (presumably that module lives in ../python_common -- verify).
sys.path.append('../python_common')
import deviceinfo
# NOTE(review): the first cell already executed `import time`. This line rebinds
# the name `time` to the function `time.time`, so attribute access on the `time`
# module through that name no longer works once this cell has run.
from time import time

# Alternative workspace path, currently disabled.
#wksp = '../device/v1.3/conv3x3'


## Veamos ahora solo avgpool con OpenCL

#### Step1: OpenCL preparation

In [7]:
## Devices and compute context
# Enumerate the installed OpenCL platforms; the first one is used throughout.
platforms = cl.get_platforms()

# Context covering every device type offered by that platform.
context = cl.Context(
    properties=[(cl.context_properties.PLATFORM, platforms[0])],
    dev_type=cl.device_type.ALL,
)

# Device list of the same platform; it is passed to cl.Program when the
# precompiled binaries are loaded.
device = platforms[0].get_devices()

# Command queue bound to the context.
queue = cl.CommandQueue(context)

# Last expression of the cell: display the context.
context

<pyopencl.Context at 0x-465619f0 on <pyopencl.Device 'de10_nano_sharedonly : Cyclone V SoC Development Kit' on 'Intel(R) FPGA SDK for OpenCL(TM)' at 0x-5b111690>>

#### Step 2: create kernels
Create & build program

Create kernels

In [8]:
# Directory prefix prepended to the binary file names ('' = current working directory).
wksp = ''

# The .aocx files are read in binary mode and handed to cl.Program as device
# binaries. Each file is opened in a `with` block so that its handle is closed
# as soon as the bytes have been read, instead of being left open.

# NDRange variant of the average-pool kernel.
file_dir = wksp + 'avgpool_NDRange.aocx'
with open(file_dir, mode='rb') as aocx_file:
    kernelSource = aocx_file.read()
program_NDR = cl.Program(context, device, [kernelSource]).build()

# Single work-item ("Simple Task") variant of the same kernel.
file_dir = wksp + 'avgpool_ST.aocx'
with open(file_dir, mode='rb') as aocx_file:
    kernelSource = aocx_file.read()
program_ST = cl.Program(context, device, [kernelSource]).build()


Reprogramming device [0] with handle 1
  warn("Non-empty compiler output encountered. Set the "


Create kernels

In [9]:
# Fetch the `avgpool2d` entry point from each of the two built programs.
avgpool_NDR = program_NDR.avgpool2d
avgpool_ST = program_ST.avgpool2d

# Both kernel parameters are buffers, not scalars, which is declared with one
# None entry per argument.
avgpool_NDR.set_scalar_arg_dtypes([None] * 2)
avgpool_ST.set_scalar_arg_dtypes([None] * 2)


#### OpenCL kernel: avgpool_NDRange.cl

avgpool2d: 2-D 13x13 average pool for the last layer of the classifier.  

```C
//last layer use a 13 x 13 avgPool layer as classifier
//NDRange version: each work-item produces one class score
//
//input_im : feature maps stored back to back, 169 (= 13 x 13) floats per class
//output_im: one averaged score per class
__kernel void avgpool2d(
	__global const float* restrict input_im,
	__global float* restrict output_im)
{
	int class_index = get_global_id(0);//get class score index

	//advance to the first element of this work-item's feature map
	input_im += 169 * class_index;

	float tmp = 0.0f;

	for(int i = 0; i < 169; i++)
	{
		tmp += input_im[i];
	}

	//single-precision literal: a bare 169.0 is a double constant and would
	//promote the whole division to double precision
	output_im[class_index] = tmp / 169.0f;
}
```
#### OpenCL kernel: avgpool_ST.cl

avgpool2d: 2-D 13x13 average pool for the last layer of the classifier. 

```C
//last layer use a 13 x 13 avgPool layer as classifier
//single work-item version: one invocation computes every class score
//
//input_im : feature maps stored back to back, 169 (= 13 x 13) floats per class
//output_im: one averaged score per class
__kernel void avgpool2d(
	__global const float* restrict input_im,
	__global float* restrict output_im)
{
	//Since it's the final layer, we know that there are only 1000 classes,
	//so the class index comes from a fixed-bound loop instead of get_global_id(0)
	for(int class_index = 0; class_index < 1000; class_index++)
	{
		float tmp = 0.0f;

		for(int i = 0; i < 169; i++)
		{
			tmp += input_im[class_index * 169 + i];
		}

		//single-precision literal: a bare 169.0 is a double constant and would
		//promote the whole division to double precision
		output_im[class_index] = tmp / 169.0f;
	}
}
```

Run the OpenCL implementation  

In [29]:
# Problem size: 1000 feature maps of 13 x 13 values each.
tamanyo = 13                # input_size
canales_iniciales = 1000    # input_channels

acumulado_pytorch = 0

# Random integer-valued test image in [0, 10), stored as float32.
imagen = np.random.randint(0, 10, size=(1, canales_iniciales, tamanyo, tamanyo)).astype(np.float32)
# Constant-input alternative, handy when debugging:
#imagen = np.ones((1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)

# --- PyTorch reference (timed) ---
tic = pc()

# The kernel size equals the input size, so each channel is reduced to a single value.
avgpool = nn.AvgPool2d(tamanyo)
imagen1 = torch.from_numpy(imagen).float()
salida1 = avgpool(imagen1)
salida1_a_numpy = salida1.detach().numpy()

toc = pc()
acumulado_pytorch += toc - tic

####### OPENCL COMPARISON #######

####### OPENCL COMPARISON #######



In [55]:
# NDRANGE

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

h_result_pool = np.empty(1 * canales_iniciales).astype(np.float32)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

tic2 = pc()

avgpool_NDR(queue,(canales_iniciales, ), None, d_sample, d_result_pool)

queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)

queue.finish()

veamos = h_result_pool

rtime = pc() - tic2


In [48]:
# Simple task

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

h_result_pool = np.empty(1 * canales_iniciales).astype(np.float32)
d_result_pool = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool.nbytes)

tic3 = pc()

avgpool_ST(queue, (1, ), None, d_sample, d_result_pool)

queue.finish()

cl.enqueue_copy(queue, h_result_pool, d_result_pool)

queue.finish()

veamos1 = h_result_pool

rtime1 = pc() - tic3


In [56]:
# Wall-clock time of each of the three single-shot runs.
print ("tiempo en segundos con pytorch= ", toc-tic)
print ("tiempo en segundos con opencl (NDRANGE)=",rtime)
print ("tiempo en segundos con opencl (Simple Task)=",rtime1)

# Pairwise agreement between the three results, all with the same tolerances.
referencia = salida1_a_numpy.reshape(-1)
tolerancias = dict(rtol=1e-01, atol=1e-01)

comparativa1 = np.allclose(referencia, veamos, **tolerancias)
comparativa2 = np.allclose(referencia, veamos1, **tolerancias)
comparativa3 = np.allclose(veamos, veamos1, **tolerancias)

print("comparativa (pytorch == NDRange): ",comparativa1)
print("comparativa (pytorch == Simple Task): ",comparativa2)
print("comparativa (NDRange == Simple Task): ",comparativa3)

tiempo en segundos con pytorch=  0.007513587000175903
tiempo en segundos con opencl (NDRANGE)= 0.0071161689998007205
tiempo en segundos con opencl (Simple Task)= 0.00508749499977057
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  False
comparativa (NDRange == Simple Task):  False


In [60]:
# List every class whose PyTorch and Simple Task scores differ by more than 0.1.
# The magnitude of the difference is tested: with a signed difference, classes
# where the OpenCL score is the larger of the two would never be reported.
referencia = salida1_a_numpy.reshape(-1)
for i in range(canales_iniciales):
    if abs(referencia[i] - veamos1[i]) > 1e-01:
        print("i:", i, referencia[i], veamos1[i])

In [61]:
# Benchmark: PyTorch AvgPool2d against the NDRange OpenCL kernel, averaged over `count` runs.
tamanyo = 13               # input_size
canales_iniciales = 1000   # input_channels

count = 100

acumulado_pytorch = 0
acumulado_kernel = 0
comparativa = True

mem_flags = cl.mem_flags

for iteracion in range(count):
    # Fresh random integer-valued image in [0, 10) for every run.
    imagen = np.random.randint(0, 10, size=(1, canales_iniciales, tamanyo, tamanyo)).astype(np.float32)
    # Constant-input alternative, handy when debugging:
    #imagen = np.ones((1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)

    # --- PyTorch reference (timed) ---
    tic = pc()

    avgpool = nn.AvgPool2d(tamanyo)
    imagen1 = torch.from_numpy(imagen).float()
    salida1 = avgpool(imagen1)
    salida1_a_numpy = salida1.detach().numpy()

    toc = pc()
    acumulado_pytorch += toc - tic

    ####### OPENCL COMPARISON #######

    # Buffers are created outside the timed section.
    h_sample = imagen.ravel().astype(np.float32)
    d_sample = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

    h_result_pool = np.empty(canales_iniciales, dtype=np.float32)
    d_result_pool = cl.Buffer(context, mem_flags.WRITE_ONLY, size=h_result_pool.nbytes)

    # --- OpenCL kernel launch plus read-back (timed) ---
    tic5 = pc()

    avgpool_NDR(queue, (canales_iniciales,), None, d_sample, d_result_pool)
    queue.finish()

    cl.enqueue_copy(queue, h_result_pool, d_result_pool)
    queue.finish()

    veamos1 = h_result_pool

    toc5 = pc()
    acumulado_kernel += toc5 - tic5

    # The overall flag stays True only while every run matches the reference.
    coincide = np.allclose(salida1_a_numpy.reshape(-1), veamos1, rtol=1e-01, atol=1e-01)
    comparativa = comparativa and coincide

print ("tiempo en segundos con pytorch= ", acumulado_pytorch/count)
print ("tiempo en segundos con opencl (NDRange)=",acumulado_kernel/count)

print("comparativa (pytorch == NDRange): ",comparativa)

tiempo en segundos con pytorch=  0.003400834379999651
tiempo en segundos con opencl (NDRange)= 0.003507062499975291
comparativa (pytorch == NDRange):  True


In [64]:
# Benchmark: PyTorch AvgPool2d against the Simple Task OpenCL kernel, averaged over `count` runs.
tamanyo = 13               # input_size
canales_iniciales = 1000   # input_channels

count = 100

acumulado_pytorch = 0
acumulado_kernel = 0
comparativa = True

mem_flags = cl.mem_flags

for iteracion in range(count):
    # Fresh random integer-valued image in [0, 10) for every run.
    imagen = np.random.randint(0, 10, size=(1, canales_iniciales, tamanyo, tamanyo)).astype(np.float32)
    # Constant-input alternative, handy when debugging:
    #imagen = np.ones((1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)

    # --- PyTorch reference (timed) ---
    tic = pc()

    avgpool = nn.AvgPool2d(tamanyo)
    imagen1 = torch.from_numpy(imagen).float()
    salida1 = avgpool(imagen1)
    salida1_a_numpy = salida1.detach().numpy()

    toc = pc()
    acumulado_pytorch += toc - tic

    ####### OPENCL COMPARISON #######

    # Buffers are created outside the timed section.
    h_sample = imagen.ravel().astype(np.float32)
    d_sample = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

    h_result_pool = np.empty(canales_iniciales, dtype=np.float32)
    d_result_pool = cl.Buffer(context, mem_flags.WRITE_ONLY, size=h_result_pool.nbytes)

    # --- OpenCL single work-item launch plus read-back (timed) ---
    tic5 = pc()

    avgpool_ST(queue, (1,), None, d_sample, d_result_pool)
    queue.finish()

    cl.enqueue_copy(queue, h_result_pool, d_result_pool)
    queue.finish()

    veamos1 = h_result_pool

    toc5 = pc()
    acumulado_kernel += toc5 - tic5

    # The overall flag stays True only while every run matches the reference.
    coincide = np.allclose(salida1_a_numpy.reshape(-1), veamos1, rtol=1e-01, atol=1e-01)
    comparativa = comparativa and coincide

print ("tiempo en segundos con pytorch= ", acumulado_pytorch/count)
print ("tiempo en segundos con opencl (Simple Task)=",acumulado_kernel/count)
print("comparativa (pytorch == Simple Task): ",comparativa)

tiempo en segundos con pytorch=  0.0033899411500169663
tiempo en segundos con opencl (Simple Task)= 0.0027128481899762846
comparativa (pytorch == Simple Task):  True
