# Debug SqueezeNet v1.3 (Simple Task) OpenCL implement with PyOpenCL and PyTorch
Partial code are copied heavily from https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py  
SqueezeNet Paper:https://arxiv.org/abs/1602.07360  
SqueezeNet 1.1 model from https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1   
SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0, without sacrificing accuracy.

TEST DE IMPLEMENTACIÓN CONV1x1

In [1]:
#some set up
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from PIL import Image
import math
import time
from time import sleep, perf_counter as pc
from matplotlib.pyplot import imshow
%matplotlib inline

## Aquí tenemos el medidor de pytorch

- En esta aproximacion tengo en cuenta el tiempo de definición de capas y la carga de pesos porque se supone que entre fire y fire estas operaciones tiene que hacerse


In [3]:
canales_iniciales=4 #input_channels
canales_contraidos=2 #filter_size
canales_finales= canales_iniciales
acumulado_pytorch=0
idea=True
count=100
tamanyo=5 #input_size
squeeze_activation = nn.ReLU(inplace=True)

for i in range(count):

    imagen = np.random.randint(10,size=(1,canales_iniciales, tamanyo, tamanyo))
    weights1=np.random.randint(10,size=(canales_contraidos, canales_iniciales,1,1)).astype(np.float32)
    bias1=np.random.randint(10,size=(canales_contraidos,)).astype(np.float32)
      
    tic=pc()
    squeeze1=nn.Conv2d(canales_iniciales, canales_contraidos, kernel_size=1, bias=False)
    squeeze1.weight = nn.Parameter(torch.from_numpy(weights1))
    squeeze1.bias = nn.Parameter(torch.from_numpy(bias1))    
    imagen1  = torch.from_numpy(imagen).float()
    
    salida1=squeeze1(imagen1)
    salida1_activation=squeeze_activation(salida1)

    salida1_a_numpy=salida1_activation.detach().numpy()
    
    toc=pc()
    
    acumulado_pytorch=toc-tic+acumulado_pytorch
    
print(salida1_a_numpy)
print ("tiempo en segundos con pytorch= ", acumulado_pytorch/count)

[[[[ 46.  60. 115.  71.  61.]
   [ 88.  43.  91.  85.  28.]
   [ 44.  45.  43.  49.  99.]
   [ 71.  56.  62.  69.  78.]
   [ 70.  68.  89. 109.  39.]]

  [[106. 125. 163.  56.  48.]
   [ 65.  99. 166. 143.  54.]
   [ 74. 123.  44.  85. 122.]
   [111.  88.  67. 146. 149.]
   [139. 152. 142. 136. 110.]]]]
tiempo en segundos con pytorch=  0.0007013197499873058


## Veamos ahora conv1x1 con opencl

In [3]:
# OpenCL setup
import pyopencl as cl
import sys
sys.path.append('../common')
import deviceinfo
from time import time

#wksp = '../device/v1.3/'


#### Step0-A: compilation for emulation

In [177]:
%%bash
aoc -march=emulator ../device/v1.3/conv1x1_NDRange.cl -o ../device/v1.3/bin_em/conv1x1_NDRange.aocx
aoc -march=emulator ../device/v1.3/conv1x1_simple_task.cl -o ../device/v1.3/bin_em/conv1x1_simple_task.aocx

aoc: OpenCL kernel compilation completed successfully.
aoc: Linking Object files....
aoc: Compiling for Emulation ....
aoc: OpenCL kernel compilation completed successfully.
aoc: Linking Object files....
aoc: Compiling for Emulation ....


#### Step0-B: compilation for simulation

In [178]:
%%bash
#aoc -march=simulator -v -ghdl ../device/v1.3/conv1x1_NDRange.cl -o ../device/v1.3/bin_sim/conv1x1_NDRange.aocx -board=a10gx
aoc -march=simulator -v -ghdl ../device/v1.3/conv1x1_simple_task.cl -o ../device/v1.3/bin_sim/conv1x1_simple_task.aocx -board=a10gx

aoc: Environment checks completed successfully.
Quartus location: /home/joerock/intelFPGA_pro/21.1/quartus/bin/quartus_sh
aoc: Cached files in /var/tmp/aocl/joerock may be used to reduce compilation time
aoc: Selected target board package /home/joerock/intelFPGA_pro/21.1/hld/board/a10_ref
aoc: Selected target board a10gx
aoc: Running OpenCL parser....
aoc: OpenCL parser completed 
aoc: Linking Object files....
aoc: Optimizing and doing static analysis of code...
aoc: Linking with IP library ...
aoc: Checking if memory usage is larger than 100%...
aoc: Memory usage is not above 100.
aoc: First stage compilation completed successfully.
aoc: Compiling for Simulator.
Quartus location: /home/joerock/intelFPGA_pro/21.1/quartus/bin/quartus_sh
Creating simulation system...
Generating simulation system...
Compiling simulation...
aoc: Simulation generation done!
Simulator flow is successful.
To execute simulator, invoke host with 
	env CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 <host_program>


#### Step1: OpenCL preparation

In [12]:
platforms = cl.get_platforms()
context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, platforms[0])])
queue = cl.CommandQueue(context)

context

<pyopencl.Context at 0x4d12f78 on <pyopencl.Device '12th Gen Intel(R) Core(TM) i7-12650H' on 'Intel(R) OpenCL' at 0x4cc17e8>>

#### Step 2: creat kernels
Creat & build program

In [13]:
wksp = '../device/v1.3/'

file_dir = wksp + 'conv1x1_NDRange.cl'

kernelSource = open(file_dir).read()
program_NDR = cl.Program(context, kernelSource).build()

file_dir = wksp + 'conv1x1_simple_task.cl'

kernelSource = open(file_dir).read()
program_ST = cl.Program(context, kernelSource).build()

Creat kernels

In [14]:
conv1x1_NDR = program_NDR.conv2d1x1
conv1x1_NDR.set_scalar_arg_dtypes([np.int32, np.int32, None, None, None, None])

conv1x1_ST = program_ST.conv2d1x1
conv1x1_ST.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None, None, None])


#### OpenCL kernel: conv1x1_NDRange.cl

conv2d1x1: 2-D 1x1 convolution. kernel size 1, stride 1  

```C
//1x1 convolution layer
//output one feature map per kernel
__kernel void conv2d1x1(
	const int input_channels, const int input_size,
	__global float * restrict input_im,
	__global const float4* restrict filter_weight,
	__global const float* restrict filter_bias,
	__global float *restrict output_im)
{
	// Adding restrict keyword
    int filter_index = get_global_id(0); // 0 - (output_channels - 1)
	int i = get_global_id(1);

	filter_weight += filter_index * input_channels;

	float bias = filter_bias[filter_index];
	
	output_im += filter_index * input_size * input_size;//start_channel is for 1x1 feature map in fire layer

	//loop over output feature map
	//for(int i = 0; i < input_size; i++)
	{
		for(int j = 0; j < input_size; j++)
		{
			float tmp = bias;
			int loc = i * input_size + j;

			for(int k = 0; k < input_channels; k++)
			{
				//float8 weight = filter_weight[k];
				//float8 feature;
				tmp += input_im[((k << 2) + 0) * input_size * input_size + loc] * filter_weight[k].s0
				     + input_im[((k << 2) + 1) * input_size * input_size + loc] * filter_weight[k].s1
					 + input_im[((k << 2) + 2) * input_size * input_size + loc] * filter_weight[k].s2
					 + input_im[((k << 2) + 3) * input_size * input_size + loc] * filter_weight[k].s3;
			}
			//add relu after conv
			output_im[i * input_size + j] = (tmp > 0.0) ? tmp : 0.0;
		}
	}
}

```
#### OpenCL kernel: conv1x1_simple_task.cl

conv2d1x1: 2-D 1x1 convolution. kernel size 1, stride 1  

```C
//1x1 convolution layer as a single kernel
//output one feature map per kernel
__kernel void conv2d1x1(
	const int input_channels, 
    const int input_size,
    const int filter_size,
	__global float * restrict input_im,
	__global const float4* restrict filter_weight,
	__global const float* restrict filter_bias,
	__global float *restrict output_im)
{
	// Adding restrict keyword
    //loop over filters
	for(int f_i = 0; f_i < filter_size; f_i++)
	{
        //filter_weight += f_i * input_channels;

        float bias = filter_bias[f_i];
	
        // output_im += input_size * input_size;//start_channel is for 1x1 feature map in fire layer
	
		for(int ij = 0; ij < (input_size * input_size); ij++)
		{
			float tmp = bias;
			// int loc = i * input_size + j; // this is equal to ij

			for(int k = 0; k < input_channels; k++)
			{
				//float8 weight = filter_weight[k];
				//float8 feature;
				/*tmp += input_im[((k << 2) + 0) * input_size * input_size + ij] * filter_weight[k + f_i * input_channels].s0
				     + input_im[((k << 2) + 1) * input_size * input_size + ij] * filter_weight[k + f_i * input_channels].s1
					 + input_im[((k << 2) + 2) * input_size * input_size + ij] * filter_weight[k + f_i * input_channels].s2
					 + input_im[((k << 2) + 3) * input_size * input_size + ij] * filter_weight[k + f_i * input_channels].s3;*/
                tmp += input_im[((k << 2) + 0) * input_size * input_size + ij] * filter_weight[k].s0
				     + input_im[((k << 2) + 1) * input_size * input_size + ij] * filter_weight[k].s1
					 + input_im[((k << 2) + 2) * input_size * input_size + ij] * filter_weight[k].s2
					 + input_im[((k << 2) + 3) * input_size * input_size + ij] * filter_weight[k].s3;
			}
			//add relu after conv
			//output_im[ij + (input_size * input_size * f_i)] = (tmp > 0.0) ? tmp : 0.0;
            output_im[ij] = (tmp > 0.0) ? tmp : 0.0;
		}
        filter_weight += input_channels;	
        output_im += input_size * input_size;//start_channel is for 1x1 feature map in fire layer
	
	}
}
```

Run OpenCL implement  

In [15]:
tamanyo=55 #input_size
canales_iniciales=64 #input_channels
canales_contraidos=16 #filter_size
canales_finales = canales_iniciales

acumulado_pytorch=0

imagen = np.random.randint(10,size=(1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)
#imagen = np.ones((1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)

weights1=np.random.randint(10,size=(canales_contraidos, canales_iniciales,1,1)).astype(np.float32)
bias1=np.random.randint(10,size=(canales_contraidos,)).astype(np.float32)

#weights1=np.ones((canales_contraidos, canales_iniciales,1,1)).astype(np.float32)
#bias1=np.ones((canales_contraidos,)).astype(np.float32)

squeeze_activation = nn.ReLU(inplace=True)

tic=pc()
squeeze1=nn.Conv2d(canales_iniciales, canales_contraidos, kernel_size=1, bias=False)
squeeze1.weight = nn.Parameter(torch.from_numpy(weights1))
squeeze1.bias = nn.Parameter(torch.from_numpy(bias1))

imagen1  = torch.from_numpy(imagen).float()

salida1=squeeze1(imagen1)
salida1_activation=squeeze_activation(salida1)

salida1_a_numpy=salida1_activation.detach().numpy()

toc=pc()
acumulado_pytorch=toc-tic+acumulado_pytorch

####### OPENCL COMPARISON #######


In [16]:
# NDRANGE

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

fire1_squeeze_weight = weights1.reshape(-1)
fire1_squeeze_bias = bias1

d_fire1_squeeze_weight1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
d_fire1_squeeze_bias1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)

h_result_fire1_squeeze = np.empty(1 * canales_contraidos * tamanyo * tamanyo).astype(np.float32)
d_result_fire1_squeeze1 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)

tic2 = pc()

conv1x1_NDR(queue,(canales_contraidos, tamanyo), None, np.int32(canales_iniciales/4), tamanyo, d_sample1, d_fire1_squeeze_weight1, d_fire1_squeeze_bias1, d_result_fire1_squeeze1)

queue.finish()

cl.enqueue_copy(queue, h_result_fire1_squeeze, d_result_fire1_squeeze1)

queue.finish()

veamos = h_result_fire1_squeeze.reshape(-1,tamanyo,tamanyo)

rtime = pc() - tic2


In [17]:
# Simple task

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

fire1_squeeze_weight = weights1.reshape(-1)
fire1_squeeze_bias = bias1

d_fire1_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
d_fire1_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)

h_result_fire1_squeeze = np.empty(1 * canales_contraidos * tamanyo * tamanyo).astype(np.float32)
d_result_fire1_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)

tic3 = pc()

conv1x1_ST(queue,(1,), None, np.int32(canales_iniciales/4), tamanyo, canales_contraidos, d_sample, d_fire1_squeeze_weight, d_fire1_squeeze_bias, d_result_fire1_squeeze)

queue.finish()

cl.enqueue_copy(queue, h_result_fire1_squeeze, d_result_fire1_squeeze)

queue.finish()

veamos1 = h_result_fire1_squeeze.reshape(-1,tamanyo,tamanyo)

rtime1 = pc() - tic3


In [18]:
print ("tiempo en segundos con pytorch= ", toc-tic)
print ("tiempo en segundos con opencl (NDRANGE)=",rtime)
print ("tiempo en segundos con opencl (Simple Task)=",rtime1)

comparativa1=np.allclose(salida1_a_numpy, veamos,rtol=1e-01, atol=1e-01)
comparativa2=np.allclose(salida1_a_numpy, veamos1,rtol=1e-01, atol=1e-01)
comparativa3=np.allclose(veamos, veamos1,rtol=1e-01, atol=1e-01)

print("comparativa (pytorch == NDRange): ",comparativa1)
print("comparativa (pytorch == Simple Task): ",comparativa2)
print("comparativa (NDRange == Simple Task): ",comparativa3)

tiempo en segundos con pytorch=  0.003386429999864049
tiempo en segundos con opencl (NDRANGE)= 0.0014173159997881157
tiempo en segundos con opencl (Simple Task)= 0.0015757820001454093
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  True
comparativa (NDRange == Simple Task):  True


#### Step 3: emulation
Creat & build program

In [20]:
platforms = cl.get_platforms()
context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, platforms[1])])
device = platforms[1].get_devices()

queue = cl.CommandQueue(context)

context

<pyopencl.Context at 0x4cd57d8 on <pyopencl.Device 'Intel(R) FPGA Emulation Device' on 'Intel(R) FPGA Emulation Platform for OpenCL(TM)' at 0x4ccc448>>

In [21]:
wksp = '../device/v1.3/bin_em/'

file_dir = wksp + 'conv1x1_NDRange.aocx'

kernelSource = open(file_dir, mode='rb').read()
program_NDR = cl.Program(context, device, [kernelSource]).build()

file_dir = wksp + 'conv1x1_simple_task.aocx'

kernelSource = open(file_dir, mode='rb').read()
program_ST = cl.Program(context, device, [kernelSource]).build()

Creat kernels

In [22]:
conv1x1_NDR = program_NDR.conv2d1x1
conv1x1_NDR.set_scalar_arg_dtypes([np.int32, np.int32, None, None, None, None])

conv1x1_ST = program_ST.conv2d1x1
conv1x1_ST.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None, None, None])


Run OpenCL implement  

In [23]:
tamanyo=55 #input_size
canales_iniciales=64 #input_channels
canales_contraidos=16 #filter_size
canales_finales = canales_iniciales

acumulado_pytorch=0

imagen = np.random.randint(10,size=(1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)
#imagen = np.ones((1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)

weights1=np.random.randint(10,size=(canales_contraidos, canales_iniciales,1,1)).astype(np.float32)
bias1=np.random.randint(10,size=(canales_contraidos,)).astype(np.float32)

#weights1=np.ones((canales_contraidos, canales_iniciales,1,1)).astype(np.float32)
#bias1=np.ones((canales_contraidos,)).astype(np.float32)

squeeze_activation = nn.ReLU(inplace=True)

tic=pc()
squeeze1=nn.Conv2d(canales_iniciales, canales_contraidos, kernel_size=1, bias=False)
squeeze1.weight = nn.Parameter(torch.from_numpy(weights1))
squeeze1.bias = nn.Parameter(torch.from_numpy(bias1))

imagen1  = torch.from_numpy(imagen).float()

salida1=squeeze1(imagen1)
salida1_activation=squeeze_activation(salida1)

salida1_a_numpy=salida1_activation.detach().numpy()

toc=pc()
acumulado_pytorch=toc-tic+acumulado_pytorch

####### OPENCL COMPARISON #######


In [24]:
# NDRANGE

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

fire1_squeeze_weight = weights1.reshape(-1)
fire1_squeeze_bias = bias1

d_fire1_squeeze_weight1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
d_fire1_squeeze_bias1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)

h_result_fire1_squeeze = np.empty(1 * canales_contraidos * tamanyo * tamanyo).astype(np.float32)
d_result_fire1_squeeze1 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)

tic2 = pc()

conv1x1_NDR(queue,(canales_contraidos, tamanyo), None, np.int32(canales_iniciales/4), tamanyo, d_sample1, d_fire1_squeeze_weight1, d_fire1_squeeze_bias1, d_result_fire1_squeeze1)

queue.finish()

cl.enqueue_copy(queue, h_result_fire1_squeeze, d_result_fire1_squeeze1)

queue.finish()

veamos = h_result_fire1_squeeze.reshape(-1,tamanyo,tamanyo)

rtime = pc() - tic2


In [25]:
# Simple task

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

fire1_squeeze_weight = weights1.reshape(-1)
fire1_squeeze_bias = bias1

d_fire1_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
d_fire1_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)

h_result_fire1_squeeze = np.empty(1 * canales_contraidos * tamanyo * tamanyo).astype(np.float32)
d_result_fire1_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)

tic3 = pc()

conv1x1_ST(queue,(1,), None, np.int32(canales_iniciales/4), tamanyo, canales_contraidos, d_sample, d_fire1_squeeze_weight, d_fire1_squeeze_bias, d_result_fire1_squeeze)

queue.finish()

cl.enqueue_copy(queue, h_result_fire1_squeeze, d_result_fire1_squeeze)

queue.finish()

veamos1 = h_result_fire1_squeeze.reshape(-1,tamanyo,tamanyo)

rtime1 = pc() - tic3


In [26]:
print ("tiempo en segundos con pytorch= ", toc-tic)
print ("tiempo en segundos con opencl (NDRANGE)=",rtime)
print ("tiempo en segundos con opencl (Simple Task)=",rtime1)

comparativa1=np.allclose(salida1_a_numpy, veamos,rtol=1e-01, atol=1e-01)
comparativa2=np.allclose(salida1_a_numpy, veamos1,rtol=1e-01, atol=1e-01)
comparativa3=np.allclose(veamos, veamos1,rtol=1e-01, atol=1e-01)

print("comparativa (pytorch == NDRange): ",comparativa1)
print("comparativa (pytorch == Simple Task): ",comparativa2)
print("comparativa (NDRange == Simple Task): ",comparativa3)

tiempo en segundos con pytorch=  0.0006829490002928651
tiempo en segundos con opencl (NDRANGE)= 0.01670902699970611
tiempo en segundos con opencl (Simple Task)= 0.016361129999950208
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  True
comparativa (NDRange == Simple Task):  True


#### Step 4: simulación
Creat & build program

In [27]:
platforms = cl.get_platforms()
context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, platforms[2])])
device = platforms[2].get_devices()

queue = cl.CommandQueue(context)

context

<pyopencl.Context at 0x631bf90 on <pyopencl.Device 'SimulatorDevice : Multi-process Simulator (aclmsim0)' on 'Intel(R) FPGA SDK for OpenCL(TM)' at 0x7f5a5c0c70d8>>

In [28]:
wksp = '../device/v1.3/bin_sim/'

file_dir = wksp + 'conv1x1_NDRange.aocx'

kernelSource = open(file_dir, mode='rb').read()
program_NDR = cl.Program(context, device, [kernelSource]).build()

file_dir = wksp + 'conv1x1_simple_task.aocx'

kernelSource = open(file_dir, mode='rb').read()
program_ST = cl.Program(context, device, [kernelSource]).build()

Creat kernels

In [30]:
conv1x1_NDR = program_NDR.conv2d1x1
conv1x1_NDR.set_scalar_arg_dtypes([np.int32, np.int32, None, None, None, None])

conv1x1_ST = program_ST.conv2d1x1
conv1x1_ST.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None, None, None])


Run OpenCL implement  

In [31]:
tamanyo=5 #input_size
canales_iniciales=8 #input_channels
canales_contraidos=2 #filter_size
canales_finales = canales_iniciales

acumulado_pytorch=0

imagen = np.random.randint(10,size=(1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)
#imagen = np.ones((1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32)

weights1=np.random.randint(10,size=(canales_contraidos, canales_iniciales,1,1)).astype(np.float32)
bias1=np.random.randint(10,size=(canales_contraidos,)).astype(np.float32)

#weights1=np.ones((canales_contraidos, canales_iniciales,1,1)).astype(np.float32)
#bias1=np.ones((canales_contraidos,)).astype(np.float32)

squeeze_activation = nn.ReLU(inplace=True)

tic=pc()
squeeze1=nn.Conv2d(canales_iniciales, canales_contraidos, kernel_size=1, bias=False)
squeeze1.weight = nn.Parameter(torch.from_numpy(weights1))
squeeze1.bias = nn.Parameter(torch.from_numpy(bias1))

imagen1  = torch.from_numpy(imagen).float()

salida1=squeeze1(imagen1)
salida1_activation=squeeze_activation(salida1)

salida1_a_numpy=salida1_activation.detach().numpy()

toc=pc()
acumulado_pytorch=toc-tic+acumulado_pytorch

####### OPENCL COMPARISON #######


In [32]:
# NDRANGE

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

fire1_squeeze_weight = weights1.reshape(-1)
fire1_squeeze_bias = bias1

d_fire1_squeeze_weight1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
d_fire1_squeeze_bias1 = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)

h_result_fire1_squeeze = np.empty(1 * canales_contraidos * tamanyo * tamanyo).astype(np.float32)
d_result_fire1_squeeze1 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)

tic2 = pc()

conv1x1_NDR(queue,(canales_contraidos, tamanyo), None, np.int32(canales_iniciales/4), tamanyo, d_sample1, d_fire1_squeeze_weight1, d_fire1_squeeze_bias1, d_result_fire1_squeeze1)

queue.finish()

cl.enqueue_copy(queue, h_result_fire1_squeeze, d_result_fire1_squeeze1)

queue.finish()

veamos = h_result_fire1_squeeze.reshape(-1,tamanyo,tamanyo)

rtime = pc() - tic2


  h_result_fire1_squeeze = np.empty(1 * canales_contraidos * tamanyo * tamanyo).astype(np.float32)


In [None]:
# Simple task

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

fire1_squeeze_weight = weights1.reshape(-1)
fire1_squeeze_bias = bias1

d_fire1_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
d_fire1_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)

h_result_fire1_squeeze = np.empty(1 * canales_contraidos * tamanyo * tamanyo).astype(np.float32)
d_result_fire1_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)

tic3 = pc()

conv1x1_ST(queue,(1,), None, np.int32(canales_iniciales/4), tamanyo, canales_contraidos, d_sample, d_fire1_squeeze_weight, d_fire1_squeeze_bias, d_result_fire1_squeeze)

queue.finish()

cl.enqueue_copy(queue, h_result_fire1_squeeze, d_result_fire1_squeeze)

queue.finish()

veamos1 = h_result_fire1_squeeze.reshape(-1,tamanyo,tamanyo)

rtime1 = pc() - tic3


  h_result_fire1_squeeze = np.empty(1 * canales_contraidos * tamanyo * tamanyo).astype(np.float32)


In [None]:
print ("tiempo en segundos con pytorch= ", toc-tic)
print ("tiempo en segundos con opencl (NDRANGE)=",rtime)
print ("tiempo en segundos con opencl (Simple Task)=",rtime1)

comparativa1=np.allclose(salida1_a_numpy, veamos,rtol=1e-01, atol=1e-01)
comparativa2=np.allclose(salida1_a_numpy, veamos1,rtol=1e-01, atol=1e-01)
comparativa3=np.allclose(veamos, veamos1,rtol=1e-01, atol=1e-01)

print("comparativa (pytorch == NDRange): ",comparativa1)
print("comparativa (pytorch == Simple Task): ",comparativa2)
print("comparativa (NDRange == Simple Task): ",comparativa3)

In [19]:
print("pytorch", veamos.shape, salida1_a_numpy.reshape(-1,tamanyo,tamanyo)[4][0])
print("NDRange", veamos[4][0])
print("Simple task", veamos1[6][0])
print(imagen.shape, np.allclose(imagen, np.ones((1,canales_iniciales, tamanyo, tamanyo)).astype(np.float32), rtol=1e-01, atol=1e-01))
#print(weights1[6])
# print(fire1_squeeze_weight)
#print(bias1[6])
# print(imagen1_obtenida)
# print(veamos2)
print(np.allclose(salida1_a_numpy, veamos1,rtol=1e-01, atol=1e-01))

pytorch (16, 55, 55) [1193. 1265.  962. 1468. 1151. 1208. 1168. 1300. 1210. 1278. 1195. 1442.
 1170. 1086. 1594. 1191.  929. 1192. 1037. 1237. 1082. 1313. 1070. 1297.
 1253. 1303. 1348. 1179. 1189. 1286.  991. 1196. 1133. 1270. 1248. 1266.
 1213. 1182. 1212. 1323. 1340. 1210. 1350. 1394. 1361. 1346. 1275. 1347.
 1234. 1127. 1428. 1342. 1336. 1341. 1228.]
NDRange [1193. 1265.  962. 1468. 1151. 1208. 1168. 1300. 1210. 1278. 1195. 1442.
 1170. 1086. 1594. 1191.  929. 1192. 1037. 1237. 1082. 1313. 1070. 1297.
 1253. 1303. 1348. 1179. 1189. 1286.  991. 1196. 1133. 1270. 1248. 1266.
 1213. 1182. 1212. 1323. 1340. 1210. 1350. 1394. 1361. 1346. 1275. 1347.
 1234. 1127. 1428. 1342. 1336. 1341. 1228.]
Simple task [1563. 1504. 1345. 1593. 1302. 1332. 1512. 1648. 1425. 1572. 1470. 1658.
 1448. 1459. 1884. 1265. 1334. 1567. 1255. 1454. 1172. 1508. 1370. 1663.
 1480. 1619. 1577. 1413. 1639. 1476. 1307. 1433. 1395. 1539. 1374. 1340.
 1547. 1500. 1534. 1672. 1646. 1609. 1656. 1532. 1612. 1597. 1374. 1

In [206]:
for i in range(16):
    for j in range(55):
        for k in range(55):
            if (abs(salida1_a_numpy.reshape(-1,tamanyo,tamanyo)[i][j][k] - veamos1[i][j][k])) > 1e-01:
                print("i:", i, "j:", j, "k:", k, salida1_a_numpy.reshape(-1,tamanyo,tamanyo)[i][j][k], veamos1[i][j][k])