# Debug SqueezeNet v1.3 (Simple Task) OpenCL implementation with PyOpenCL and PyTorch
Parts of the code are copied from https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py  
SqueezeNet Paper:https://arxiv.org/abs/1602.07360  
SqueezeNet 1.1 model from https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1   
SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0, without sacrificing accuracy.

Tests with block 3

In [1]:
#some set up
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from PIL import Image
import math
import time
from time import sleep, perf_counter as pc
from matplotlib.pyplot import imshow
%matplotlib inline

In [3]:
# OpenCL setup
import pyopencl as cl
import sys
# The helper module lives in a sibling directory, so the search path has to be
# extended before `deviceinfo` can be imported.
sys.path.append('../python_common')
import deviceinfo
# NOTE(review): this rebinds the name `time` to the function time.time; any
# earlier `import time` in the same namespace is shadowed from here on.
from time import time

#wksp = '../device/v1.3/squeezenet'

## Now let's look at the fire modules with OpenCL

#### Step1: OpenCL preparation

In [42]:
## Devices and compute context
# Everything is taken from the first platform the OpenCL runtime reports.
platforms = cl.get_platforms()
device = platforms[0].get_devices()
context = cl.Context(
    properties=[(cl.context_properties.PLATFORM, platforms[0])],
    dev_type=cl.device_type.ALL,
)

# Two command queues on the same context, so that independent kernels can be
# submitted side by side.
queue, queue1 = cl.CommandQueue(context), cl.CommandQueue(context)

# Notebook display of the context that was created.
context

<pyopencl.Context at 0x687a4a8 on <pyopencl.Device '12th Gen Intel(R) Core(TM) i7-12650H' on 'Intel(R) OpenCL' at 0x4f7fab8>>

#### Step 2: create kernels
Create & build the programs

Create kernels

In [52]:
# Directory prefix of the precompiled kernel binaries ('' = current directory).
wksp = ''

# --- NDRange program ---------------------------------------------------
file_dir = wksp + 'squeezenet_NDRange.aocx'

# Read the binary inside a context manager so the file handle is closed.
with open(file_dir, mode='rb') as binary_file:
    kernelSource = binary_file.read()
# The binary constructor of cl.Program expects one binary per device, so the
# same image is repeated for every device of the platform.
program_NDR = cl.Program(context, device, [kernelSource] * len(device)).build()

# --- Single-task program -----------------------------------------------
file_dir = wksp + 'squeezenet_ST.aocx'

with open(file_dir, mode='rb') as binary_file:
    kernelSource = binary_file.read()
program_ST = cl.Program(context, device, [kernelSource] * len(device)).build()


Create kernels

In [53]:
# Kernel handles plus their scalar-argument layout.  In every list the leading
# np.int32 entries correspond to the kernel's `const int` parameters and the
# trailing None entries to its __global buffer parameters.

# --- NDRange program ---
conv3x3_NDR = program_NDR.conv2d3x3
conv3x3_NDR.set_scalar_arg_dtypes([np.int32] * 6 + [None] * 4)

maxpool_NDR = program_NDR.maxpool2d
maxpool_NDR.set_scalar_arg_dtypes([np.int32] * 2 + [None] * 2)

conv1x1_NDR = program_NDR.conv2d1x1
conv1x1_NDR.set_scalar_arg_dtypes([np.int32] * 2 + [None] * 4)

avgpool_NDR = program_NDR.avgpool2d
avgpool_NDR.set_scalar_arg_dtypes([None] * 2)

# --- Single-task program (each kernel takes one extra int: the loop count) ---
conv3x3_ST = program_ST.conv2d3x3
conv3x3_ST.set_scalar_arg_dtypes([np.int32] * 7 + [None] * 4)

maxpool_ST = program_ST.maxpool2d
maxpool_ST.set_scalar_arg_dtypes([np.int32] * 3 + [None] * 2)

conv1x1_ST = program_ST.conv2d1x1
conv1x1_ST.set_scalar_arg_dtypes([np.int32] * 3 + [None] * 4)

avgpool_ST = program_ST.avgpool2d
avgpool_ST.set_scalar_arg_dtypes([None] * 2)

#### OpenCL kernel: squeezenet_NDRange.cl
conv2d3x3: 2-D 3x3 convolution.  
conv2d1x1: 2-D 1x1 convolution. kernel size 1, stride 1  
maxpool2d: 2-D max pool. kernel size 3, stride 2  
avgpool2d: 2-D average pool. kernel size 13
```C
// maxpool2d (NDRange variant): max pooling with kernel_size=3, stride=2.
// Each work-item (get_global_id(0)) handles one channel: it reads that
// channel's input_size x input_size map and writes the matching
// output_size x output_size map.
//
// NOTE(review): the running maximum starts at 0.0, so a window that contains
// only negative values yields 0 instead of its true maximum. Presumably the
// input is always post-ReLU (non-negative) -- confirm.
// NOTE(review): there is no bounds check on the window; this assumes
// 2 * (output_size - 1) + 2 < input_size -- confirm against the caller.
__kernel void maxpool2d(
	const int input_size,
	const int output_size,
	__global const float* restrict input_im,
    __global float* restrict output_im)
{
	int channels = get_global_id(0);//get output channel index
	
	//advance both pointers to the start of this work-item's channel
	input_im += channels * input_size * input_size;
	output_im += channels * output_size * output_size;

	//loop over output feature map
	for(int i = 0; i < output_size; i++)//row
	{
		for(int j = 0; j < output_size; j++)//col
		{
			//find the max value in the 3x3 region
			//to be one element in the output feature map
			float tmp = 0.0;

			#pragma unroll 1
			for(int k = 0; k < 3; k++)//row
			{
				#pragma unroll 1
				for(int l = 0; l < 3; l++)//col
				{
					//window origin is (i * 2, j * 2) because of the stride of 2
					float value = input_im[(i * 2 + k) * input_size  + j * 2 + l ];
					if(value > tmp)
						tmp = value;
				}
			}
			//store the result to output feature map
			output_im[i * output_size + j] = tmp; 
		}
	}
}

// conv2d3x3 (NDRange variant): 3x3 convolution followed by ReLU.
// Work-item layout: get_global_id(0) selects the filter (output channel) and
// get_global_id(1) selects the output row; each work-item therefore produces
// one row of one output feature map.
//
// Parameters:
//   input_channels  number of input feature maps
//   input_size      width/height of each (square) input map
//   pad, stride     padding and stride applied to the input coordinates
//   start_channel   channel offset added to the filter index when writing
//   output_size     width/height of each (square) output map
//   input_im        input maps, indexed as [channel][row][col]
//   filter_weight   weights, indexed as [filter][channel][3][3]
//   filter_bias     one bias per filter
//   output_im       output maps, indexed as [channel][row][col]
__kernel void conv2d3x3(
	const int input_channels, const int input_size,
	const int pad, const int stride,
	const int start_channel, //start_channel is for 1x1 feature map in fire layer
	const int output_size,
	__global const float* restrict input_im,
	__global const float* restrict filter_weight,
	__global const float* restrict filter_bias,
	__global float *restrict output_im
	)
{
	int filter_index = get_global_id(0); //get output channel index
	int i =  get_global_id(1); //output row handled by this work-item

	//move to this filter's weights (input_channels * 9 floats per filter)
	filter_weight += filter_index * input_channels * 9;
	float bias = filter_bias[filter_index];
	//move to the output map of channel (start_channel + filter_index)
	output_im += (start_channel + filter_index) * output_size * output_size;
	
	//loop over output feature map
	//(the row loop below is replaced by get_global_id(1))
	//for(int i = 0; i < output_size; i++)
	{
		for(int j = 0; j < output_size; j++)
		{
			//compute one element in the output feature map
			float tmp = bias;
			
			//dot product of the input_channels x 3 x 3 input window
			//with the input_channels x 3 x 3 filter
			for(int k = 0; k < input_channels; k++)
			{
				#pragma unroll
				for(int l = 0; l < 3; l++)
				{
					int h = i * stride + l - pad;
					for(int m = 0; m < 3; m++)
					{
						int w = j * stride + m - pad;
						//taps that fall outside the input are skipped,
						//which is equivalent to zero padding
						if((h >= 0) && (h < input_size) && (w >= 0) && (w < input_size))
						{
							tmp += input_im[k * input_size * input_size + h * input_size + w] \
                               * filter_weight[9 * k + 3 * l + m];
						}
					}
				}
			}

			//add relu activation after conv
			output_im[i * output_size + j] = (tmp > 0.0) ? tmp : 0.0;
		}
	}
}

// conv2d1x1 (NDRange variant): 1x1 convolution followed by ReLU.
// Work-item layout: get_global_id(0) selects the filter (output channel) and
// get_global_id(1) selects the row; each work-item produces one row of one
// output feature map. Input and output maps have the same size.
//
// The weights are read as float4, so `input_channels` counts groups of four
// channels: group k covers input channels 4k .. 4k+3 (see the (k << 2) + n
// indexing below).
__kernel void conv2d1x1(
	const int input_channels, const int input_size,
	__global float *input_im,
	__global const float4* filter_weight,
	__global const float* filter_bias,
	__global float *restrict output_im)
{
	int filter_index = get_global_id(0); // 0 - (output_channels - 1)
	int i = get_global_id(1); // row handled by this work-item

	//move to this filter's weights (input_channels float4 values per filter)
	filter_weight += filter_index * input_channels;

	float bias = filter_bias[filter_index];
	
	//no start_channel offset in this kernel: the output channel is filter_index
	output_im += filter_index * input_size * input_size;

	//loop over output feature map
	//(the row loop below is replaced by get_global_id(1))
	//for(int i = 0; i < input_size; i++)
	{
		for(int j = 0; j < input_size; j++)
		{
			float tmp = bias;
			int loc = i * input_size + j; //offset of pixel (i, j) inside one map

			for(int k = 0; k < input_channels; k++)
			{
				//float8 weight = filter_weight[k];
				//float8 feature;
				//accumulate four input channels per iteration
				tmp += input_im[((k << 2) + 0) * input_size * input_size + loc] * filter_weight[k].s0
				     + input_im[((k << 2) + 1) * input_size * input_size + loc] * filter_weight[k].s1
					 + input_im[((k << 2) + 2) * input_size * input_size + loc] * filter_weight[k].s2
					 + input_im[((k << 2) + 3) * input_size * input_size + loc] * filter_weight[k].s3;
			}
			//add relu after conv
			output_im[i * input_size + j] = (tmp > 0.0) ? tmp : 0.0;
		}
	}
}

// avgpool2d (NDRange variant): the last layer uses a 13 x 13 average pool as
// the classifier. Each work-item (get_global_id(0)) averages the
// 169 (= 13 * 13) values of one feature map and writes one class score.
__kernel void avgpool2d(
	__global const float* restrict input_im,
	__global float* restrict output_im)
{
	int class_index = get_global_id(0);//get class score index

	//move to the feature map of this class
	input_im += 169 * class_index;
	
	float tmp = 0.0f;

	for(int i = 0; i < 169; i++)
	{
		tmp += input_im[i];
	}

	output_im[class_index] = tmp / 169.0;
}
```

#### OpenCL kernel: squeezenet_ST.cl
conv2d3x3: 2-D 3x3 convolution.  
conv2d1x1: 2-D 1x1 convolution. kernel size 1, stride 1  
maxpool2d: 2-D max pool. kernel size 3, stride 2  
avgpool2d: 2-D average pool. kernel size 13
```C
// maxpool2d (single-task variant): max pooling with kernel_size=3, stride=2.
// Unlike the NDRange version this kernel does not use get_global_id; it loops
// over all `channel_size` channels itself and advances the input/output
// pointers by one feature map after each channel.
//
// NOTE(review): the running maximum starts at 0.0, so a window that contains
// only negative values yields 0 instead of its true maximum. Presumably the
// input is always post-ReLU (non-negative) -- confirm.
__kernel void maxpool2d(
	const int input_size,
	const int output_size,
    const int channel_size,
	__global const float* restrict input_im,
    __global float* restrict output_im)
{

    for(int channel_index = 0; channel_index < channel_size; channel_index++)
    {
        //loop over output feature map
        for(int i = 0; i < output_size; i++)//row
        {
            for(int j = 0; j < output_size; j++)//col
            {
                //find the max value in the 3x3 region
                //to be one element in the output feature map
                float tmp = 0.0;

                #pragma unroll 1
                for(int k = 0; k < 3; k++)//row
                {
                    #pragma unroll 1
                    for(int l = 0; l < 3; l++)//col
                    {
                        //window origin is (i * 2, j * 2) because of the stride of 2
                        float value = input_im[(i * 2 + k) * input_size  + j * 2 + l ];
                        if(value > tmp)
                            tmp = value;
                    }
                }
                //store the result to output feature map
                output_im[i * output_size + j] = tmp;
            }
        }
    
        //step both pointers to the next channel
        input_im += input_size * input_size;
        output_im += output_size * output_size;
    }
}

// conv2d3x3 (single-task variant): 3x3 convolution followed by ReLU.
// This kernel does not use get_global_id; it loops over all `filter_size`
// filters and over every output pixel itself.
//
// Parameters:
//   input_channels  number of input feature maps
//   input_size      width/height of each (square) input map
//   pad, stride     padding and stride applied to the input coordinates
//   start_channel   first output channel written by this call
//   output_size     width/height of each (square) output map
//   filter_size     number of filters (output channels) to compute
//   input_im        input maps, indexed as [channel][row][col]
//   filter_weight   weights, indexed as [filter][channel][3][3]
//   filter_bias     one bias per filter
//   output_im       output maps, indexed as [channel][row][col]
__kernel void conv2d3x3(
	const int input_channels, const int input_size,
	const int pad, const int stride,
	const int start_channel, //start_channel is for 1x1 feature map in fire layer
	const int output_size,
    const int filter_size,
	__global const float* restrict input_im,
	__global const float* restrict filter_weight,
	__global const float* restrict filter_bias,
	__global float *restrict output_im
	)
{
	
	//filter_weight += filter_index * input_channels * 9;
	//skip the first start_channel output maps
	output_im += start_channel * output_size * output_size;
	
	//loop over the filters; each iteration produces one output feature map
	for(int filter_index = 0; filter_index < filter_size; filter_index++)
	{
        float bias = filter_bias[filter_index];

		for(int i = 0; i < output_size; i++)
		{
            for(int j = 0; j < output_size; j++)
            {
                //compute one element in the output feature map
                float tmp = bias;

                //dot product of the input_channels x 3 x 3 input window
                //with the input_channels x 3 x 3 filter
                for(int k = 0; k < input_channels; k++)
                {
                    #pragma unroll
                    for(int l = 0; l < 3; l++)
                    {
                        int h = i * stride + l - pad;
                        for(int m = 0; m < 3; m++)
                        {
                            int w = j * stride + m - pad;
                            //taps that fall outside the input are skipped,
                            //which is equivalent to zero padding
                            if((h >= 0) && (h < input_size) && (w >= 0) && (w < input_size))
                            {
                                tmp += input_im[k * input_size * input_size + h * input_size + w] \
                                   * filter_weight[9 * k + 3 * l + m];
                            }
                        }
                    }
                }

                //add relu activation after conv
                output_im[i * output_size + j] = (tmp > 0.0) ? tmp : 0.0;                 
            }
		}
        
        //step to the next filter's weights and the next output map
        filter_weight += input_channels * 9;
        output_im += output_size * output_size;
	}
}

// conv2d1x1 (single-task variant, V5): 1x1 convolution followed by ReLU.
// This kernel does not use get_global_id; it loops over all `filter_size`
// filters and over every pixel itself. Weights are plain floats here, so
// `input_channels` is the actual number of input channels. Input and output
// maps have the same size (input_size x input_size).

__kernel void conv2d1x1(
	const int input_channels, 
    const int input_size,
    const int filter_size,
	__global const float* restrict input_im,
	__global const float* restrict filter_weight,
	__global const float* restrict filter_bias,
	__global float *restrict output_im)
{
	// All buffer arguments carry the restrict keyword
    //loop over filters
	for(int f_i = 0; f_i < filter_size; f_i++)
	{
        //filter_weight += f_i * input_channels;

        float bias = filter_bias[f_i];
		
        // output_im += input_size * input_size;//start_channel is for 1x1 feature map in fire layer
	
		//ij is the flattened pixel index (row * input_size + col)
		for(int ij = 0; ij < (input_size * input_size); ij++)
		{
			float tmp = bias;
			// int loc = i * input_size + j; // this is equal to ij

			for(int k = 0; k < input_channels; k++)
			{
				//float8 weight = filter_weight[k];
				//float8 feature;
				//weights are indexed as [filter][channel]
				tmp += input_im[k * input_size * input_size + ij] * filter_weight[k + f_i * input_channels];
			}
			//add relu after conv; the output map of filter f_i starts at
			//offset input_size * input_size * f_i
			output_im[ij + (input_size * input_size * f_i)] = (tmp > 0.0) ? tmp : 0.0;
            //output_im[ij] = (tmp > 0.0) ? tmp : 0.0;
		}
        //filter_weight += input_channels;	
        //output_im += input_size * input_size;//start_channel is for 1x1 feature map in fire layer
	
	}
}

// avgpool2d (single-task variant): the last layer uses a 13 x 13 average pool
// as the classifier. This kernel does not use get_global_id; it loops over a
// hard-coded 1000 classes and averages the 169 (= 13 * 13) values of each
// feature map into one class score.
__kernel void avgpool2d(
	__global const float* restrict input_im,
	__global float* restrict output_im)
{
	// int class_index = get_global_id(0);//get class score index
    
    //Since it's the final layer, we know that there are only 1000 classes
    
	//input_im += 169 * class_index;

	for(int class_index = 0; class_index < 1000; class_index++)
    {
            
        float tmp = 0.0f;

        for(int i = 0; i < 169; i++)
        {
            //feature map of class_index starts at offset class_index * 169
            tmp += input_im[class_index * 169 + i];
        }

        output_im[class_index] = tmp / 169.0;
    }
}
```

Run the OpenCL implementation  

In [54]:
######## BLOCK 3 ########
# Incoming feature map: spatial size (square) and number of channels.
tamanyo, canales_iniciales = 14, 256
# Squeeze / expand channel widths used when shaping the fire-layer weights.
squeeze_factor_a, squeeze_factor_b = 48, 64
expand_factor_a, expand_factor_b = 192, 256

################

# The last block has no max-pool layer, so the spatial size does not change.
tamanyo_final = tamanyo

# One in-place ReLU instance, reused after every convolution.
squeeze_activation = nn.ReLU(inplace=True)

# Running total of the time spent in the PyTorch reference path.
acumulado_pytorch = 0

# Placeholder tensors: randint over the half-open range [0, 1) can only
# produce 0, so every array created here is filled with zeros.
imagen = np.random.randint(0, 1, size=(1, canales_iniciales, tamanyo, tamanyo))

#### FIRE 6 ####
weights1 = np.random.randint(0, 1, size=(squeeze_factor_a, canales_iniciales, 1, 1)).astype(np.float32)
bias1 = np.random.randint(0, 1, size=(squeeze_factor_a,)).astype(np.float32)

weights2a = np.random.randint(0, 1, size=(expand_factor_a, squeeze_factor_a, 1, 1)).astype(np.float32)
bias2a = np.random.randint(0, 1, size=(expand_factor_a,)).astype(np.float32)

weights2b = np.random.randint(0, 1, size=(expand_factor_a, squeeze_factor_a, 3, 3)).astype(np.float32)
bias2b = np.random.randint(0, 1, size=(expand_factor_a,)).astype(np.float32)

#### FIRE 7 ####
weights3 = np.random.randint(0, 1, size=(squeeze_factor_a, expand_factor_a * 2, 1, 1)).astype(np.float32)
bias3 = np.random.randint(0, 1, size=(squeeze_factor_a,)).astype(np.float32)

weights4a = np.random.randint(0, 1, size=(expand_factor_a, squeeze_factor_a, 1, 1)).astype(np.float32)
bias4a = np.random.randint(0, 1, size=(expand_factor_a,)).astype(np.float32)

weights4b = np.random.randint(0, 1, size=(expand_factor_a, squeeze_factor_a, 3, 3)).astype(np.float32)
bias4b = np.random.randint(0, 1, size=(expand_factor_a,)).astype(np.float32)

#### FIRE 8 ####
weights5 = np.random.randint(0, 1, size=(squeeze_factor_b, expand_factor_a * 2, 1, 1)).astype(np.float32)
bias5 = np.random.randint(0, 1, size=(squeeze_factor_b,)).astype(np.float32)

weights6a = np.random.randint(0, 1, size=(expand_factor_b, squeeze_factor_b, 1, 1)).astype(np.float32)
bias6a = np.random.randint(0, 1, size=(expand_factor_b,)).astype(np.float32)

weights6b = np.random.randint(0, 1, size=(expand_factor_b, squeeze_factor_b, 3, 3)).astype(np.float32)
bias6b = np.random.randint(0, 1, size=(expand_factor_b,)).astype(np.float32)

#### FIRE 9 ####
weights7 = np.random.randint(0, 1, size=(squeeze_factor_b, expand_factor_b * 2, 1, 1)).astype(np.float32)
bias7 = np.random.randint(0, 1, size=(squeeze_factor_b,)).astype(np.float32)

weights8a = np.random.randint(0, 1, size=(expand_factor_b, squeeze_factor_b, 1, 1)).astype(np.float32)
bias8a = np.random.randint(0, 1, size=(expand_factor_b,)).astype(np.float32)

weights8b = np.random.randint(0, 1, size=(expand_factor_b, squeeze_factor_b, 3, 3)).astype(np.float32)
bias8b = np.random.randint(0, 1, size=(expand_factor_b,)).astype(np.float32)

#####################################################
# All-ones test tensors.  The image keeps NumPy's default float dtype; the
# weights and biases are created directly as float32.
imagen = np.ones((1, canales_iniciales, tamanyo, tamanyo))

#### FIRE 6 ####
weights1 = np.ones((squeeze_factor_a, canales_iniciales, 1, 1), dtype=np.float32)
bias1 = np.ones((squeeze_factor_a,), dtype=np.float32)

weights2a = np.ones((expand_factor_a, squeeze_factor_a, 1, 1), dtype=np.float32)
bias2a = np.ones((expand_factor_a,), dtype=np.float32)

weights2b = np.ones((expand_factor_a, squeeze_factor_a, 3, 3), dtype=np.float32)
bias2b = np.ones((expand_factor_a,), dtype=np.float32)

#### FIRE 7 ####
weights3 = np.ones((squeeze_factor_a, expand_factor_a * 2, 1, 1), dtype=np.float32)
bias3 = np.ones((squeeze_factor_a,), dtype=np.float32)

weights4a = np.ones((expand_factor_a, squeeze_factor_a, 1, 1), dtype=np.float32)
bias4a = np.ones((expand_factor_a,), dtype=np.float32)

weights4b = np.ones((expand_factor_a, squeeze_factor_a, 3, 3), dtype=np.float32)
bias4b = np.ones((expand_factor_a,), dtype=np.float32)

#### FIRE 8 ####
weights5 = np.ones((squeeze_factor_b, expand_factor_a * 2, 1, 1), dtype=np.float32)
bias5 = np.ones((squeeze_factor_b,), dtype=np.float32)

weights6a = np.ones((expand_factor_b, squeeze_factor_b, 1, 1), dtype=np.float32)
bias6a = np.ones((expand_factor_b,), dtype=np.float32)

weights6b = np.ones((expand_factor_b, squeeze_factor_b, 3, 3), dtype=np.float32)
bias6b = np.ones((expand_factor_b,), dtype=np.float32)

#### FIRE 9 ####
weights7 = np.ones((squeeze_factor_b, expand_factor_b * 2, 1, 1), dtype=np.float32)
bias7 = np.ones((squeeze_factor_b,), dtype=np.float32)

weights8a = np.ones((expand_factor_b, squeeze_factor_b, 1, 1), dtype=np.float32)
bias8a = np.ones((expand_factor_b,), dtype=np.float32)

weights8b = np.ones((expand_factor_b, squeeze_factor_b, 3, 3), dtype=np.float32)
bias8b = np.ones((expand_factor_b,), dtype=np.float32)

#####################################################
# Random integer test image with values in [0, 239).
imagen = np.random.randint(239, size=(1, canales_iniciales, tamanyo, tamanyo))

# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
# map_location='cpu' keeps every tensor on the CPU even if the checkpoint was
# saved from a GPU, so the NumPy conversions below cannot fail on device tensors.
params = torch.load('squeezenet1_1.pth', map_location='cpu')

# .detach() is applied before .numpy() because .numpy() raises on tensors that
# require grad; for plain tensors it is a no-op view.

######## BLOCK 3 ########
#fire - fire - fire - fire block 3
#### FIRE 6 ####
weights1 = params['features.9.squeeze.weight'].detach().numpy()
bias1 = params['features.9.squeeze.bias'].detach().numpy()

weights2a = params['features.9.expand1x1.weight'].detach().numpy()
bias2a = params['features.9.expand1x1.bias'].detach().numpy()

weights2b = params['features.9.expand3x3.weight'].detach().numpy()
bias2b = params['features.9.expand3x3.bias'].detach().numpy()

#### FIRE 7 ####
weights3 = params['features.10.squeeze.weight'].detach().numpy()
bias3 = params['features.10.squeeze.bias'].detach().numpy()

weights4a = params['features.10.expand1x1.weight'].detach().numpy()
bias4a = params['features.10.expand1x1.bias'].detach().numpy()

weights4b = params['features.10.expand3x3.weight'].detach().numpy()
bias4b = params['features.10.expand3x3.bias'].detach().numpy()

#### FIRE 8 ####
weights5 = params['features.11.squeeze.weight'].detach().numpy()
bias5 = params['features.11.squeeze.bias'].detach().numpy()

weights6a = params['features.11.expand1x1.weight'].detach().numpy()
bias6a = params['features.11.expand1x1.bias'].detach().numpy()

weights6b = params['features.11.expand3x3.weight'].detach().numpy()
bias6b = params['features.11.expand3x3.bias'].detach().numpy()

#### FIRE 9 ####
weights7 = params['features.12.squeeze.weight'].detach().numpy()
bias7 = params['features.12.squeeze.bias'].detach().numpy()

weights8a = params['features.12.expand1x1.weight'].detach().numpy()
bias8a = params['features.12.expand1x1.bias'].detach().numpy()

weights8b = params['features.12.expand3x3.weight'].detach().numpy()
bias8b = params['features.12.expand3x3.bias'].detach().numpy()

def _conv_with_params(in_channels, out_channels, kernel_size, weight, bias, padding=0):
    """Build a Conv2d layer and install the given NumPy weight and bias arrays."""
    layer = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                      bias=False, padding=padding)
    layer.weight = nn.Parameter(torch.from_numpy(weight))
    # The layer is constructed with bias=False; the bias Parameter is attached
    # explicitly afterwards.
    layer.bias = nn.Parameter(torch.from_numpy(bias))
    return layer


# The measured interval covers layer construction as well as the forward pass.
tic = pc()

#### FIRE 6 ####
squeeze1 = _conv_with_params(canales_iniciales, squeeze_factor_a, 1, weights1, bias1)
squeeze2a = _conv_with_params(squeeze_factor_a, expand_factor_a, 1, weights2a, bias2a)
squeeze2b = _conv_with_params(squeeze_factor_a, expand_factor_a, 3, weights2b, bias2b, padding=1)

#### FIRE 7 ####
squeeze3 = _conv_with_params(expand_factor_a * 2, squeeze_factor_a, 1, weights3, bias3)
squeeze4a = _conv_with_params(squeeze_factor_a, expand_factor_a, 1, weights4a, bias4a)
squeeze4b = _conv_with_params(squeeze_factor_a, expand_factor_a, 3, weights4b, bias4b, padding=1)

#### FIRE 8 ####
squeeze5 = _conv_with_params(expand_factor_a * 2, squeeze_factor_b, 1, weights5, bias5)
squeeze6a = _conv_with_params(squeeze_factor_b, expand_factor_b, 1, weights6a, bias6a)
squeeze6b = _conv_with_params(squeeze_factor_b, expand_factor_b, 3, weights6b, bias6b, padding=1)

#### FIRE 9 ####
squeeze7 = _conv_with_params(expand_factor_b * 2, squeeze_factor_b, 1, weights7, bias7)
squeeze8a = _conv_with_params(squeeze_factor_b, expand_factor_b, 1, weights8a, bias8a)
squeeze8b = _conv_with_params(squeeze_factor_b, expand_factor_b, 3, weights8b, bias8b, padding=1)

# Input image as a float32 torch tensor.
imagen1 = torch.from_numpy(imagen).float()

# Fire 6: squeeze, then the 1x1 and 3x3 expand branches concatenated on the
# channel axis (1x1 branch first).
salida1 = squeeze1(imagen1)
salida1_activation = squeeze_activation(salida1)

salida2a = squeeze2a(salida1_activation)
salida2a_activation = squeeze_activation(salida2a)
salida2b = squeeze2b(salida1_activation)
salida2b_activation = squeeze_activation(salida2b)
salida2_total = torch.cat((salida2a_activation, salida2b_activation), dim=1)

# Fire 7
salida3 = squeeze3(salida2_total)
salida3_activation = squeeze_activation(salida3)

salida4a = squeeze4a(salida3_activation)
salida4a_activation = squeeze_activation(salida4a)
salida4b = squeeze4b(salida3_activation)
salida4b_activation = squeeze_activation(salida4b)
salida4_total = torch.cat((salida4a_activation, salida4b_activation), dim=1)

# Fire 8
salida5 = squeeze5(salida4_total)
salida5_activation = squeeze_activation(salida5)

salida6a = squeeze6a(salida5_activation)
salida6a_activation = squeeze_activation(salida6a)
salida6b = squeeze6b(salida5_activation)
salida6b_activation = squeeze_activation(salida6b)
salida6_total = torch.cat((salida6a_activation, salida6b_activation), dim=1)

# Fire 9
salida7 = squeeze7(salida6_total)
salida7_activation = squeeze_activation(salida7)

salida8a = squeeze8a(salida7_activation)
salida8a_activation = squeeze_activation(salida8a)
salida8b = squeeze8b(salida7_activation)
salida8b_activation = squeeze_activation(salida8b)
salida8_total = torch.cat((salida8a_activation, salida8b_activation), dim=1)

# Final block output as a NumPy array.
salida8_total_a_numpy = salida8_total.detach().numpy()

toc = pc()

# Add this run's elapsed time to the running PyTorch total.
acumulado_pytorch = acumulado_pytorch + (toc - tic)

####### OPENCL COMPARISON #######


In [55]:
# NDRANGE

def _readonly_device_copy(host_array):
    """Copy *host_array* into a new read-only OpenCL buffer on `context`."""
    flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    return cl.Buffer(context, flags, hostbuf=host_array)


def _device_result_buffer(host_array, flags):
    """Allocate an uninitialised device buffer with the byte size of *host_array*."""
    return cl.Buffer(context, flags, host_array.nbytes)


# Input image, flattened to a 1-D float32 array.
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = _readonly_device_copy(h_sample)

# Flattened host-side weights and biases of the four fire layers.
fire6_squeeze_weight = weights1.reshape(-1)
fire6_squeeze_bias = bias1
fire6_expand1x1_weight = weights2a.reshape(-1)
fire6_expand1x1_bias = bias2a
fire6_expand3x3_weight = weights2b.reshape(-1)
fire6_expand3x3_bias = bias2b

fire7_squeeze_weight = weights3.reshape(-1)
fire7_squeeze_bias = bias3
fire7_expand1x1_weight = weights4a.reshape(-1)
fire7_expand1x1_bias = bias4a
fire7_expand3x3_weight = weights4b.reshape(-1)
fire7_expand3x3_bias = bias4b

fire8_squeeze_weight = weights5.reshape(-1)
fire8_squeeze_bias = bias5
fire8_expand1x1_weight = weights6a.reshape(-1)
fire8_expand1x1_bias = bias6a
fire8_expand3x3_weight = weights6b.reshape(-1)
fire8_expand3x3_bias = bias6b

fire9_squeeze_weight = weights7.reshape(-1)
fire9_squeeze_bias = bias7
fire9_expand1x1_weight = weights8a.reshape(-1)
fire9_expand1x1_bias = bias8a
fire9_expand3x3_weight = weights8b.reshape(-1)
fire9_expand3x3_bias = bias8b

# Host-side result arrays (uninitialised; they only define the sizes and
# receive data copied back from the device).  The expand outputs hold both
# branches, hence the factor of 2.
h_result_fire6_squeeze = np.empty(1 * squeeze_factor_a * tamanyo * tamanyo, dtype=np.float32)
h_result_fire6_expand = np.empty(1 * expand_factor_a * 2 * tamanyo * tamanyo, dtype=np.float32)

h_result_fire7_squeeze = np.empty(1 * squeeze_factor_a * tamanyo * tamanyo, dtype=np.float32)
h_result_fire7_expand = np.empty(1 * expand_factor_a * 2 * tamanyo * tamanyo, dtype=np.float32)

h_result_fire8_squeeze = np.empty(1 * squeeze_factor_b * tamanyo * tamanyo, dtype=np.float32)
h_result_fire8_expand = np.empty(1 * expand_factor_b * 2 * tamanyo * tamanyo, dtype=np.float32)

h_result_fire9_squeeze = np.empty(1 * squeeze_factor_b * tamanyo * tamanyo, dtype=np.float32)
h_result_fire9_expand = np.empty(1 * expand_factor_b * 2 * tamanyo * tamanyo, dtype=np.float32)

# Device copies of the weights and biases.
d_fire6_squeeze_weight = _readonly_device_copy(fire6_squeeze_weight)
d_fire6_squeeze_bias = _readonly_device_copy(fire6_squeeze_bias)
d_fire6_expand1x1_weight = _readonly_device_copy(fire6_expand1x1_weight)
d_fire6_expand1x1_bias = _readonly_device_copy(fire6_expand1x1_bias)
d_fire6_expand3x3_weight = _readonly_device_copy(fire6_expand3x3_weight)
d_fire6_expand3x3_bias = _readonly_device_copy(fire6_expand3x3_bias)

d_fire7_squeeze_weight = _readonly_device_copy(fire7_squeeze_weight)
d_fire7_squeeze_bias = _readonly_device_copy(fire7_squeeze_bias)
d_fire7_expand1x1_weight = _readonly_device_copy(fire7_expand1x1_weight)
d_fire7_expand1x1_bias = _readonly_device_copy(fire7_expand1x1_bias)
d_fire7_expand3x3_weight = _readonly_device_copy(fire7_expand3x3_weight)
d_fire7_expand3x3_bias = _readonly_device_copy(fire7_expand3x3_bias)

d_fire8_squeeze_weight = _readonly_device_copy(fire8_squeeze_weight)
d_fire8_squeeze_bias = _readonly_device_copy(fire8_squeeze_bias)
d_fire8_expand1x1_weight = _readonly_device_copy(fire8_expand1x1_weight)
d_fire8_expand1x1_bias = _readonly_device_copy(fire8_expand1x1_bias)
d_fire8_expand3x3_weight = _readonly_device_copy(fire8_expand3x3_weight)
d_fire8_expand3x3_bias = _readonly_device_copy(fire8_expand3x3_bias)

d_fire9_squeeze_weight = _readonly_device_copy(fire9_squeeze_weight)
d_fire9_squeeze_bias = _readonly_device_copy(fire9_squeeze_bias)
d_fire9_expand1x1_weight = _readonly_device_copy(fire9_expand1x1_weight)
d_fire9_expand1x1_bias = _readonly_device_copy(fire9_expand1x1_bias)
d_fire9_expand3x3_weight = _readonly_device_copy(fire9_expand3x3_weight)
d_fire9_expand3x3_bias = _readonly_device_copy(fire9_expand3x3_bias)

# Intermediate results are written by one kernel and read as the input of the
# next one, so they must be READ_WRITE: reading a WRITE_ONLY buffer inside a
# kernel is undefined behaviour in OpenCL.
d_result_fire6_squeeze = _device_result_buffer(h_result_fire6_squeeze, cl.mem_flags.READ_WRITE)
d_result_fire6_expand = _device_result_buffer(h_result_fire6_expand, cl.mem_flags.READ_WRITE)

d_result_fire7_squeeze = _device_result_buffer(h_result_fire7_squeeze, cl.mem_flags.READ_WRITE)
d_result_fire7_expand = _device_result_buffer(h_result_fire7_expand, cl.mem_flags.READ_WRITE)

d_result_fire8_squeeze = _device_result_buffer(h_result_fire8_squeeze, cl.mem_flags.READ_WRITE)
d_result_fire8_expand = _device_result_buffer(h_result_fire8_expand, cl.mem_flags.READ_WRITE)

d_result_fire9_squeeze = _device_result_buffer(h_result_fire9_squeeze, cl.mem_flags.READ_WRITE)
# The final output is only written by kernels and then copied to the host.
d_result_fire9_expand = _device_result_buffer(h_result_fire9_expand, cl.mem_flags.WRITE_ONLY)

# Timed NDRange run of fire 6-9.  Each fire layer is a squeeze 1x1 convolution
# followed by two expand convolutions: the 1x1 branch (on queue1) writes from
# channel 0 and the 3x3 branch (on queue) writes from channel `expand_factor_*`
# of the same output buffer.  The conv1x1 kernel takes its input channel count
# divided by 4.
tic2 = pc()

#### FIRE 6 ####
conv1x1_NDR(
    queue, (squeeze_factor_a, tamanyo), None,
    np.int32(canales_iniciales // 4), tamanyo,
    d_sample, d_fire6_squeeze_weight, d_fire6_squeeze_bias, d_result_fire6_squeeze,
)
queue.finish()

conv1x1_NDR(
    queue1, (expand_factor_a, tamanyo), None,
    np.int32(squeeze_factor_a // 4), tamanyo,
    d_result_fire6_squeeze, d_fire6_expand1x1_weight, d_fire6_expand1x1_bias, d_result_fire6_expand,
)
conv3x3_NDR(
    queue, (expand_factor_a, tamanyo), None,
    squeeze_factor_a, tamanyo, 1, 1, expand_factor_a, tamanyo,
    d_result_fire6_squeeze, d_fire6_expand3x3_weight, d_fire6_expand3x3_bias, d_result_fire6_expand,
)
queue.finish()
queue1.finish()

#### FIRE 7 ####
conv1x1_NDR(
    queue, (squeeze_factor_a, tamanyo), None,
    np.int32(expand_factor_a * 2 // 4), tamanyo,
    d_result_fire6_expand, d_fire7_squeeze_weight, d_fire7_squeeze_bias, d_result_fire7_squeeze,
)
queue.finish()

conv1x1_NDR(
    queue1, (expand_factor_a, tamanyo), None,
    np.int32(squeeze_factor_a // 4), tamanyo,
    d_result_fire7_squeeze, d_fire7_expand1x1_weight, d_fire7_expand1x1_bias, d_result_fire7_expand,
)
conv3x3_NDR(
    queue, (expand_factor_a, tamanyo), None,
    squeeze_factor_a, tamanyo, 1, 1, expand_factor_a, tamanyo,
    d_result_fire7_squeeze, d_fire7_expand3x3_weight, d_fire7_expand3x3_bias, d_result_fire7_expand,
)
queue.finish()
queue1.finish()

#### FIRE 8 ####
conv1x1_NDR(
    queue, (squeeze_factor_b, tamanyo), None,
    np.int32(expand_factor_a * 2 // 4), tamanyo,
    d_result_fire7_expand, d_fire8_squeeze_weight, d_fire8_squeeze_bias, d_result_fire8_squeeze,
)
queue.finish()

conv1x1_NDR(
    queue1, (expand_factor_b, tamanyo), None,
    np.int32(squeeze_factor_b // 4), tamanyo,
    d_result_fire8_squeeze, d_fire8_expand1x1_weight, d_fire8_expand1x1_bias, d_result_fire8_expand,
)
conv3x3_NDR(
    queue, (expand_factor_b, tamanyo), None,
    squeeze_factor_b, tamanyo, 1, 1, expand_factor_b, tamanyo,
    d_result_fire8_squeeze, d_fire8_expand3x3_weight, d_fire8_expand3x3_bias, d_result_fire8_expand,
)
queue.finish()
queue1.finish()

#### FIRE 9 ####
conv1x1_NDR(
    queue, (squeeze_factor_b, tamanyo), None,
    np.int32(expand_factor_b * 2 // 4), tamanyo,
    d_result_fire8_expand, d_fire9_squeeze_weight, d_fire9_squeeze_bias, d_result_fire9_squeeze,
)
queue.finish()

conv1x1_NDR(
    queue1, (expand_factor_b, tamanyo), None,
    np.int32(squeeze_factor_b // 4), tamanyo,
    d_result_fire9_squeeze, d_fire9_expand1x1_weight, d_fire9_expand1x1_bias, d_result_fire9_expand,
)
conv3x3_NDR(
    queue, (expand_factor_b, tamanyo), None,
    squeeze_factor_b, tamanyo, 1, 1, expand_factor_b, tamanyo,
    d_result_fire9_squeeze, d_fire9_expand3x3_weight, d_fire9_expand3x3_bias, d_result_fire9_expand,
)
queue.finish()
queue1.finish()

# Copy the final fire-9 output back to the host.
cl.enqueue_copy(queue, h_result_fire9_expand, d_result_fire9_expand)
queue.finish()

# View the flat result as (channels, height, width).
veamos = h_result_fire9_expand.reshape(-1, tamanyo, tamanyo)

# Elapsed time of the whole NDRange run.
rtime = pc() - tic2


In [56]:
# Simple Task run: stage the input sample and the flattened layer parameters.

# Flatten the input image to a 1-D float32 host array and copy it into a
# read-only device buffer.
_sample_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, _sample_flags, hostbuf=h_sample)

# Each weight tensor is flattened to 1-D; biases are used as they are.

# fire6
fire6_squeeze_weight, fire6_squeeze_bias = weights1.reshape(-1), bias1
fire6_expand1x1_weight, fire6_expand1x1_bias = weights2a.reshape(-1), bias2a
fire6_expand3x3_weight, fire6_expand3x3_bias = weights2b.reshape(-1), bias2b

# fire7
fire7_squeeze_weight, fire7_squeeze_bias = weights3.reshape(-1), bias3
fire7_expand1x1_weight, fire7_expand1x1_bias = weights4a.reshape(-1), bias4a
fire7_expand3x3_weight, fire7_expand3x3_bias = weights4b.reshape(-1), bias4b

# fire8
fire8_squeeze_weight, fire8_squeeze_bias = weights5.reshape(-1), bias5
fire8_expand1x1_weight, fire8_expand1x1_bias = weights6a.reshape(-1), bias6a
fire8_expand3x3_weight, fire8_expand3x3_bias = weights6b.reshape(-1), bias6b

# fire9
fire9_squeeze_weight, fire9_squeeze_bias = weights7.reshape(-1), bias7
fire9_expand1x1_weight, fire9_expand1x1_bias = weights8a.reshape(-1), bias8a
fire9_expand3x3_weight, fire9_expand3x3_bias = weights8b.reshape(-1), bias8b

# Host-side result arrays, one flat float32 array per layer output
# (channels * tamanyo * tamanyo elements each).
# They are allocated directly as float32: np.empty(n).astype(np.float32)
# would first create an uninitialised float64 array and then cast its garbage
# contents, which wastes memory and can emit overflow-in-cast warnings.
h_result_fire6_squeeze = np.empty(1 * squeeze_factor_a * tamanyo * tamanyo, dtype=np.float32)
h_result_fire6_expand = np.empty(1 * expand_factor_a * 2 * tamanyo * tamanyo, dtype=np.float32)

h_result_fire7_squeeze = np.empty(1 * squeeze_factor_a * tamanyo * tamanyo, dtype=np.float32)
h_result_fire7_expand = np.empty(1 * expand_factor_a * 2 * tamanyo * tamanyo, dtype=np.float32)

h_result_fire8_squeeze = np.empty(1 * squeeze_factor_b * tamanyo * tamanyo, dtype=np.float32)
h_result_fire8_expand = np.empty(1 * expand_factor_b * 2 * tamanyo * tamanyo, dtype=np.float32)

h_result_fire9_squeeze = np.empty(1 * squeeze_factor_b * tamanyo * tamanyo, dtype=np.float32)
h_result_fire9_expand = np.empty(1 * expand_factor_b * 2 * tamanyo * tamanyo, dtype=np.float32)

def _upload_readonly(host_array):
    """Return a READ_ONLY device buffer created with COPY_HOST_PTR from host_array."""
    return cl.Buffer(
        context,
        cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=host_array,
    )


# fire6 parameters on the device
d_fire6_squeeze_weight = _upload_readonly(fire6_squeeze_weight)
d_fire6_squeeze_bias = _upload_readonly(fire6_squeeze_bias)
d_fire6_expand1x1_weight = _upload_readonly(fire6_expand1x1_weight)
d_fire6_expand1x1_bias = _upload_readonly(fire6_expand1x1_bias)
d_fire6_expand3x3_weight = _upload_readonly(fire6_expand3x3_weight)
d_fire6_expand3x3_bias = _upload_readonly(fire6_expand3x3_bias)

# fire7 parameters on the device
d_fire7_squeeze_weight = _upload_readonly(fire7_squeeze_weight)
d_fire7_squeeze_bias = _upload_readonly(fire7_squeeze_bias)
d_fire7_expand1x1_weight = _upload_readonly(fire7_expand1x1_weight)
d_fire7_expand1x1_bias = _upload_readonly(fire7_expand1x1_bias)
d_fire7_expand3x3_weight = _upload_readonly(fire7_expand3x3_weight)
d_fire7_expand3x3_bias = _upload_readonly(fire7_expand3x3_bias)

# fire8 parameters on the device
d_fire8_squeeze_weight = _upload_readonly(fire8_squeeze_weight)
d_fire8_squeeze_bias = _upload_readonly(fire8_squeeze_bias)
d_fire8_expand1x1_weight = _upload_readonly(fire8_expand1x1_weight)
d_fire8_expand1x1_bias = _upload_readonly(fire8_expand1x1_bias)
d_fire8_expand3x3_weight = _upload_readonly(fire8_expand3x3_weight)
d_fire8_expand3x3_bias = _upload_readonly(fire8_expand3x3_bias)

# fire9 parameters on the device
d_fire9_squeeze_weight = _upload_readonly(fire9_squeeze_weight)
d_fire9_squeeze_bias = _upload_readonly(fire9_squeeze_bias)
d_fire9_expand1x1_weight = _upload_readonly(fire9_expand1x1_weight)
d_fire9_expand1x1_bias = _upload_readonly(fire9_expand1x1_bias)
d_fire9_expand3x3_weight = _upload_readonly(fire9_expand3x3_weight)
d_fire9_expand3x3_bias = _upload_readonly(fire9_expand3x3_bias)

# Device buffers for the layer outputs, sized from the matching host arrays.
#
# Every intermediate result is written by one kernel launch and then passed
# as the input of a later launch in this cell, so it must be READ_WRITE:
# OpenCL leaves kernel reads from a CL_MEM_WRITE_ONLY buffer undefined.
d_result_fire6_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire6_squeeze.nbytes)
d_result_fire6_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire6_expand.nbytes)

d_result_fire7_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire7_squeeze.nbytes)
d_result_fire7_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire7_expand.nbytes)

d_result_fire8_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire8_squeeze.nbytes)
d_result_fire8_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire8_expand.nbytes)

d_result_fire9_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire9_squeeze.nbytes)
# The final output is only passed to kernels as their output argument and
# then copied back by the host, so WRITE_ONLY is kept for it.
d_result_fire9_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire9_expand.nbytes)

# Start of the timed region for the Simple Task pipeline (pc is perf_counter).
tic3 = pc()

# Every kernel below is launched with global size (1,) and no explicit local
# size, i.e. as a single work-item.

# fire6 squeeze (1x1): takes d_sample and the fire6 squeeze parameters; the
# last argument, d_result_fire6_squeeze, is presumably the output buffer --
# kernel source not visible here.
conv1x1_ST(queue,(1,), None, canales_iniciales, tamanyo, squeeze_factor_a, d_sample, d_fire6_squeeze_weight, d_fire6_squeeze_bias, d_result_fire6_squeeze)

# The squeeze result is an argument of both expand kernels, so wait for it.
queue.finish()

# fire6 expand: the 1x1 kernel goes to queue1 and the 3x3 kernel to queue.
# Both receive d_result_fire6_expand as their last argument -- presumably each
# fills its own part of that buffer; TODO confirm in the kernel source.
conv1x1_ST(queue1,(1,), None, squeeze_factor_a, tamanyo, expand_factor_a, d_result_fire6_squeeze, d_fire6_expand1x1_weight, d_fire6_expand1x1_bias, d_result_fire6_expand)
conv3x3_ST(queue,(1,), None, squeeze_factor_a, tamanyo, 1, 1, expand_factor_a, tamanyo, expand_factor_a, d_result_fire6_squeeze, d_fire6_expand3x3_weight, d_fire6_expand3x3_bias, d_result_fire6_expand)

# Wait for both expand kernels before the next fire module starts.
queue.finish()
queue1.finish()

# fire7 squeeze: the first scalar is expand_factor_a * 2, the combined channel
# count of the two fire6 expand branches.
conv1x1_ST(queue,(1,), None, np.int32(expand_factor_a * 2), tamanyo, squeeze_factor_a, d_result_fire6_expand, d_fire7_squeeze_weight, d_fire7_squeeze_bias, d_result_fire7_squeeze)

queue.finish()

# fire7 expand (1x1 on queue1, 3x3 on queue).
conv1x1_ST(queue1,(1,), None, squeeze_factor_a, tamanyo, expand_factor_a, d_result_fire7_squeeze, d_fire7_expand1x1_weight, d_fire7_expand1x1_bias, d_result_fire7_expand)
conv3x3_ST(queue,(1,), None, squeeze_factor_a, tamanyo, 1, 1, expand_factor_a, tamanyo, expand_factor_a, d_result_fire7_squeeze, d_fire7_expand3x3_weight, d_fire7_expand3x3_bias, d_result_fire7_expand)

queue.finish()
queue1.finish()

# fire8 squeeze: from here on the "_b" squeeze/expand factors are used.
conv1x1_ST(queue,(1,), None, np.int32(expand_factor_a * 2), tamanyo, squeeze_factor_b, d_result_fire7_expand, d_fire8_squeeze_weight, d_fire8_squeeze_bias, d_result_fire8_squeeze)

queue.finish()

# fire8 expand (1x1 on queue1, 3x3 on queue).
conv1x1_ST(queue1,(1,), None, squeeze_factor_b, tamanyo, expand_factor_b, d_result_fire8_squeeze, d_fire8_expand1x1_weight, d_fire8_expand1x1_bias, d_result_fire8_expand)
conv3x3_ST(queue,(1,), None, squeeze_factor_b, tamanyo, 1, 1, expand_factor_b, tamanyo, expand_factor_b, d_result_fire8_squeeze, d_fire8_expand3x3_weight, d_fire8_expand3x3_bias, d_result_fire8_expand)

queue.finish()
queue1.finish()

# fire9 squeeze.
conv1x1_ST(queue,(1,), None, np.int32(expand_factor_b * 2), tamanyo, squeeze_factor_b, d_result_fire8_expand, d_fire9_squeeze_weight, d_fire9_squeeze_bias, d_result_fire9_squeeze)

queue.finish()

# fire9 expand (1x1 on queue1, 3x3 on queue).
conv1x1_ST(queue1,(1,), None, squeeze_factor_b, tamanyo, expand_factor_b, d_result_fire9_squeeze, d_fire9_expand1x1_weight, d_fire9_expand1x1_bias, d_result_fire9_expand)
conv3x3_ST(queue,(1,), None, squeeze_factor_b, tamanyo, 1, 1, expand_factor_b, tamanyo, expand_factor_b, d_result_fire9_squeeze, d_fire9_expand3x3_weight, d_fire9_expand3x3_bias, d_result_fire9_expand)

queue.finish()
queue1.finish()

# Copy the final fire9 output from the device into the host array.
# NOTE(review): there is no queue.finish() after this copy; PyOpenCL's
# enqueue_copy is documented as blocking by default -- confirm for the
# installed version.
cl.enqueue_copy(queue, h_result_fire9_expand, d_result_fire9_expand)

# View the flat host result as (channels, tamanyo, tamanyo).
veamos1 = h_result_fire9_expand.reshape(-1,tamanyo,tamanyo)

# Elapsed time of the whole Simple Task pipeline, including the read-back.
rtime1 = pc() - tic3


In [57]:
# Elapsed times (seconds) of the PyTorch reference and the two OpenCL variants.
_t_pytorch = toc - tic
print("tiempo en segundos con pytorch= ", _t_pytorch)
print("tiempo en segundos con opencl (NDRANGE)=", rtime)
print("tiempo en segundos con opencl (Simple Task)=", rtime1)

# Compare the three outputs pairwise with the same loose tolerances.
# The PyTorch output is viewed as (channels, tamanyo, tamanyo) to match the
# layout of the OpenCL results.
_tol = {"rtol": 1e-01, "atol": 1e-01}
comparativa1 = np.allclose(
    salida8_total_a_numpy.reshape(-1, tamanyo, tamanyo), veamos, **_tol
)
comparativa2 = np.allclose(
    salida8_total_a_numpy.reshape(-1, tamanyo, tamanyo), veamos1, **_tol
)
comparativa3 = np.allclose(veamos, veamos1, **_tol)

print("comparativa (pytorch == NDRange): ", comparativa1)
print("comparativa (pytorch == Simple Task): ", comparativa2)
print("comparativa (NDRange == Simple Task): ", comparativa3)

tiempo en segundos con pytorch=  0.0030234180003390065
tiempo en segundos con opencl (NDRANGE)= 0.03991814599976351
tiempo en segundos con opencl (Simple Task)= 0.04235464699922886
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  True
comparativa (NDRange == Simple Task):  True


In [155]:
# Print every output element where the PyTorch reference and the Simple Task
# result differ by more than 0.1 in absolute value.
#
# The reference is reshaped once and the comparison is vectorised instead of
# visiting each element in a triple Python loop. np.argwhere returns the
# offending indices in row-major order, so lines appear in the same
# (i, j, k) order as with nested loops.
_reference = salida8_total_a_numpy.reshape(-1, tamanyo, tamanyo)
_n_channels = expand_factor_b * 2
_abs_diff = np.abs(_reference[:_n_channels] - veamos1[:_n_channels])
for i, j, k in np.argwhere(_abs_diff > 1e-01):
    print("i:", i, "j:", j, "k:", k, _reference[i][j][k], veamos1[i][j][k])

In [18]:
######## BLOCK 3 ########
# Dimensions of the block-3 fire modules.
tamanyo = 14  # input size
canales_iniciales = 256  # input channels
squeeze_factor_a, squeeze_factor_b = 48, 64
expand_factor_a, expand_factor_b = 192, 256

################

# The last block has no maxpool layer, so the output size equals the input size.
tamanyo_final = tamanyo

# Single in-place ReLU module reused after every convolution.
squeeze_activation = nn.ReLU(inplace=True)

# Benchmark state: number of repetitions, accumulated times, running verdict.
count = 100
acumulado_pytorch = acumulado_kernel = 0
comparativa = True

# Pretrained SqueezeNet 1.1 parameters, keyed by parameter name.
params = torch.load('squeezenet1_1.pth')

######## BLOCK 3 ########
# Block 3 is four consecutive fire modules (features.9 .. features.12).
# For each module the squeeze, expand1x1 and expand3x3 weight/bias pairs are
# extracted as NumPy arrays.

#### FIRE 6 ####
weights1, bias1 = (
    params['features.9.squeeze.weight'].numpy(),
    params['features.9.squeeze.bias'].numpy(),
)
weights2a, bias2a = (
    params['features.9.expand1x1.weight'].numpy(),
    params['features.9.expand1x1.bias'].numpy(),
)
weights2b, bias2b = (
    params['features.9.expand3x3.weight'].numpy(),
    params['features.9.expand3x3.bias'].numpy(),
)

#### FIRE 7 ####
weights3, bias3 = (
    params['features.10.squeeze.weight'].numpy(),
    params['features.10.squeeze.bias'].numpy(),
)
weights4a, bias4a = (
    params['features.10.expand1x1.weight'].numpy(),
    params['features.10.expand1x1.bias'].numpy(),
)
weights4b, bias4b = (
    params['features.10.expand3x3.weight'].numpy(),
    params['features.10.expand3x3.bias'].numpy(),
)

#### FIRE 8 ####
weights5, bias5 = (
    params['features.11.squeeze.weight'].numpy(),
    params['features.11.squeeze.bias'].numpy(),
)
weights6a, bias6a = (
    params['features.11.expand1x1.weight'].numpy(),
    params['features.11.expand1x1.bias'].numpy(),
)
weights6b, bias6b = (
    params['features.11.expand3x3.weight'].numpy(),
    params['features.11.expand3x3.bias'].numpy(),
)

#### FIRE 9 ####
weights7, bias7 = (
    params['features.12.squeeze.weight'].numpy(),
    params['features.12.squeeze.bias'].numpy(),
)
weights8a, bias8a = (
    params['features.12.expand1x1.weight'].numpy(),
    params['features.12.expand1x1.bias'].numpy(),
)
weights8b, bias8b = (
    params['features.12.expand3x3.weight'].numpy(),
    params['features.12.expand3x3.bias'].numpy(),
)

# Benchmark: run block 3 (fire6..fire9) `count` times on random input, once
# with PyTorch and once with the NDRange OpenCL kernels, accumulating the
# elapsed time of each and checking that the outputs agree.

def _conv_from_numpy(in_channels, out_channels, weight, bias, kernel_size=1, padding=0):
    """Return an nn.Conv2d whose weight and bias are taken from NumPy arrays."""
    layer = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                      bias=False, padding=padding)
    layer.weight = nn.Parameter(torch.from_numpy(weight))
    layer.bias = nn.Parameter(torch.from_numpy(bias))
    return layer


# ---- Loop-invariant OpenCL setup ----
# The network parameters do not change between iterations, so the flattened
# host views and their read-only device copies are created once here instead
# of re-uploading 24 buffers on every iteration.
fire6_squeeze_weight = weights1.reshape(-1)
fire6_squeeze_bias = bias1
fire6_expand1x1_weight = weights2a.reshape(-1)
fire6_expand1x1_bias = bias2a
fire6_expand3x3_weight = weights2b.reshape(-1)
fire6_expand3x3_bias = bias2b

fire7_squeeze_weight = weights3.reshape(-1)
fire7_squeeze_bias = bias3
fire7_expand1x1_weight = weights4a.reshape(-1)
fire7_expand1x1_bias = bias4a
fire7_expand3x3_weight = weights4b.reshape(-1)
fire7_expand3x3_bias = bias4b

fire8_squeeze_weight = weights5.reshape(-1)
fire8_squeeze_bias = bias5
fire8_expand1x1_weight = weights6a.reshape(-1)
fire8_expand1x1_bias = bias6a
fire8_expand3x3_weight = weights6b.reshape(-1)
fire8_expand3x3_bias = bias6b

fire9_squeeze_weight = weights7.reshape(-1)
fire9_squeeze_bias = bias7
fire9_expand1x1_weight = weights8a.reshape(-1)
fire9_expand1x1_bias = bias8a
fire9_expand3x3_weight = weights8b.reshape(-1)
fire9_expand3x3_bias = bias8b

_RO_COPY = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR

d_fire6_squeeze_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire6_squeeze_weight)
d_fire6_squeeze_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire6_squeeze_bias)
d_fire6_expand1x1_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire6_expand1x1_weight)
d_fire6_expand1x1_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire6_expand1x1_bias)
d_fire6_expand3x3_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire6_expand3x3_weight)
d_fire6_expand3x3_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire6_expand3x3_bias)

d_fire7_squeeze_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire7_squeeze_weight)
d_fire7_squeeze_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire7_squeeze_bias)
d_fire7_expand1x1_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire7_expand1x1_weight)
d_fire7_expand1x1_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire7_expand1x1_bias)
d_fire7_expand3x3_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire7_expand3x3_weight)
d_fire7_expand3x3_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire7_expand3x3_bias)

d_fire8_squeeze_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire8_squeeze_weight)
d_fire8_squeeze_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire8_squeeze_bias)
d_fire8_expand1x1_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire8_expand1x1_weight)
d_fire8_expand1x1_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire8_expand1x1_bias)
d_fire8_expand3x3_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire8_expand3x3_weight)
d_fire8_expand3x3_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire8_expand3x3_bias)

d_fire9_squeeze_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire9_squeeze_weight)
d_fire9_squeeze_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire9_squeeze_bias)
d_fire9_expand1x1_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire9_expand1x1_weight)
d_fire9_expand1x1_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire9_expand1x1_bias)
d_fire9_expand3x3_weight = cl.Buffer(context, _RO_COPY, hostbuf=fire9_expand3x3_weight)
d_fire9_expand3x3_bias = cl.Buffer(context, _RO_COPY, hostbuf=fire9_expand3x3_bias)

for i in range(count):

    # Fresh random integer input in [0, 239) for every repetition.
    imagen = np.random.randint(239, size=(1, canales_iniciales, tamanyo, tamanyo))

    ####### PYTORCH REFERENCE #######
    tic = pc()

    # NOTE(review): as in the original, the timed PyTorch region includes
    # building the layers (with their default random initialisation), while
    # the OpenCL timing below excludes buffer setup. Confirm this is intended
    # before comparing the two averages.
    squeeze1 = _conv_from_numpy(canales_iniciales, squeeze_factor_a, weights1, bias1)
    squeeze2a = _conv_from_numpy(squeeze_factor_a, expand_factor_a, weights2a, bias2a)
    squeeze2b = _conv_from_numpy(squeeze_factor_a, expand_factor_a, weights2b, bias2b,
                                 kernel_size=3, padding=1)

    squeeze3 = _conv_from_numpy(expand_factor_a * 2, squeeze_factor_a, weights3, bias3)
    squeeze4a = _conv_from_numpy(squeeze_factor_a, expand_factor_a, weights4a, bias4a)
    squeeze4b = _conv_from_numpy(squeeze_factor_a, expand_factor_a, weights4b, bias4b,
                                 kernel_size=3, padding=1)

    squeeze5 = _conv_from_numpy(expand_factor_a * 2, squeeze_factor_b, weights5, bias5)
    squeeze6a = _conv_from_numpy(squeeze_factor_b, expand_factor_b, weights6a, bias6a)
    squeeze6b = _conv_from_numpy(squeeze_factor_b, expand_factor_b, weights6b, bias6b,
                                 kernel_size=3, padding=1)

    squeeze7 = _conv_from_numpy(expand_factor_b * 2, squeeze_factor_b, weights7, bias7)
    squeeze8a = _conv_from_numpy(squeeze_factor_b, expand_factor_b, weights8a, bias8a)
    squeeze8b = _conv_from_numpy(squeeze_factor_b, expand_factor_b, weights8b, bias8b,
                                 kernel_size=3, padding=1)

    imagen1 = torch.from_numpy(imagen).float()

    # fire6: squeeze, then the two expand branches concatenated on the
    # channel axis (dim 1).
    salida1 = squeeze1(imagen1)
    salida1_activation = squeeze_activation(salida1)

    salida2a = squeeze2a(salida1_activation)
    salida2a_activation = squeeze_activation(salida2a)
    salida2b = squeeze2b(salida1_activation)
    salida2b_activation = squeeze_activation(salida2b)
    salida2_total = torch.cat([salida2a_activation, salida2b_activation], 1)

    # fire7
    salida3 = squeeze3(salida2_total)
    salida3_activation = squeeze_activation(salida3)

    salida4a = squeeze4a(salida3_activation)
    salida4a_activation = squeeze_activation(salida4a)
    salida4b = squeeze4b(salida3_activation)
    salida4b_activation = squeeze_activation(salida4b)
    salida4_total = torch.cat([salida4a_activation, salida4b_activation], 1)

    # fire8
    salida5 = squeeze5(salida4_total)
    salida5_activation = squeeze_activation(salida5)

    salida6a = squeeze6a(salida5_activation)
    salida6a_activation = squeeze_activation(salida6a)
    salida6b = squeeze6b(salida5_activation)
    salida6b_activation = squeeze_activation(salida6b)
    salida6_total = torch.cat([salida6a_activation, salida6b_activation], 1)

    # fire9
    salida7 = squeeze7(salida6_total)
    salida7_activation = squeeze_activation(salida7)

    salida8a = squeeze8a(salida7_activation)
    salida8a_activation = squeeze_activation(salida8a)
    salida8b = squeeze8b(salida7_activation)
    salida8b_activation = squeeze_activation(salida8b)
    salida8_total = torch.cat([salida8a_activation, salida8b_activation], 1)

    salida8_total_a_numpy = salida8_total.detach().numpy()

    toc = pc()

    acumulado_pytorch = toc - tic + acumulado_pytorch

    ####### OPENCL COMPARISON #######

    # NDRANGE
    # Per-iteration input: flatten the image to float32 and upload it.
    h_sample = imagen.reshape(-1).astype(np.float32)
    d_sample = cl.Buffer(context, _RO_COPY, hostbuf=h_sample)

    # Host result arrays, allocated directly as float32 (np.empty(n).astype()
    # would build and cast an uninitialised float64 array first).
    h_result_fire6_squeeze = np.empty(1 * squeeze_factor_a * tamanyo * tamanyo, dtype=np.float32)
    h_result_fire6_expand = np.empty(1 * expand_factor_a * 2 * tamanyo * tamanyo, dtype=np.float32)

    h_result_fire7_squeeze = np.empty(1 * squeeze_factor_a * tamanyo * tamanyo, dtype=np.float32)
    h_result_fire7_expand = np.empty(1 * expand_factor_a * 2 * tamanyo * tamanyo, dtype=np.float32)

    h_result_fire8_squeeze = np.empty(1 * squeeze_factor_b * tamanyo * tamanyo, dtype=np.float32)
    h_result_fire8_expand = np.empty(1 * expand_factor_b * 2 * tamanyo * tamanyo, dtype=np.float32)

    h_result_fire9_squeeze = np.empty(1 * squeeze_factor_b * tamanyo * tamanyo, dtype=np.float32)
    h_result_fire9_expand = np.empty(1 * expand_factor_b * 2 * tamanyo * tamanyo, dtype=np.float32)

    # Device result buffers are still created per iteration, as before.
    # Intermediate results are inputs of later kernel launches, so they are
    # READ_WRITE: kernel reads from a CL_MEM_WRITE_ONLY buffer are undefined
    # in OpenCL. Only the final output keeps WRITE_ONLY.
    d_result_fire6_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire6_squeeze.nbytes)
    d_result_fire6_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire6_expand.nbytes)

    d_result_fire7_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire7_squeeze.nbytes)
    d_result_fire7_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire7_expand.nbytes)

    d_result_fire8_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire8_squeeze.nbytes)
    d_result_fire8_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire8_expand.nbytes)

    d_result_fire9_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire9_squeeze.nbytes)
    d_result_fire9_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire9_expand.nbytes)

    # Timed OpenCL region: kernel launches plus the final read-back.
    tic5 = pc()

    # fire6 squeeze; its result feeds both expand kernels, hence the finish().
    conv1x1_NDR(queue,(squeeze_factor_a, tamanyo), None, np.int32(canales_iniciales/4), tamanyo, d_sample, d_fire6_squeeze_weight, d_fire6_squeeze_bias, d_result_fire6_squeeze)

    queue.finish()

    # fire6 expand: 1x1 on queue1 and 3x3 on queue, both given the same
    # result buffer.
    conv1x1_NDR(queue1,(expand_factor_a, tamanyo), None, np.int32(squeeze_factor_a/4), tamanyo, d_result_fire6_squeeze, d_fire6_expand1x1_weight, d_fire6_expand1x1_bias, d_result_fire6_expand)
    conv3x3_NDR(queue,(expand_factor_a, tamanyo), None, squeeze_factor_a, tamanyo, 1, 1, expand_factor_a, tamanyo, d_result_fire6_squeeze, d_fire6_expand3x3_weight, d_fire6_expand3x3_bias, d_result_fire6_expand)

    queue.finish()
    queue1.finish()

    # fire7
    conv1x1_NDR(queue,(squeeze_factor_a, tamanyo), None, np.int32(expand_factor_a*2/4), tamanyo, d_result_fire6_expand, d_fire7_squeeze_weight, d_fire7_squeeze_bias, d_result_fire7_squeeze)

    queue.finish()

    conv1x1_NDR(queue1,(expand_factor_a, tamanyo), None, np.int32(squeeze_factor_a/4), tamanyo, d_result_fire7_squeeze, d_fire7_expand1x1_weight, d_fire7_expand1x1_bias, d_result_fire7_expand)
    conv3x3_NDR(queue,(expand_factor_a, tamanyo), None, squeeze_factor_a, tamanyo, 1, 1, expand_factor_a, tamanyo, d_result_fire7_squeeze, d_fire7_expand3x3_weight, d_fire7_expand3x3_bias, d_result_fire7_expand)

    queue.finish()
    queue1.finish()

    # fire8
    conv1x1_NDR(queue,(squeeze_factor_b, tamanyo), None, np.int32(expand_factor_a*2/4), tamanyo, d_result_fire7_expand, d_fire8_squeeze_weight, d_fire8_squeeze_bias, d_result_fire8_squeeze)

    queue.finish()

    conv1x1_NDR(queue1,(expand_factor_b, tamanyo), None, np.int32(squeeze_factor_b/4), tamanyo, d_result_fire8_squeeze, d_fire8_expand1x1_weight, d_fire8_expand1x1_bias, d_result_fire8_expand)
    conv3x3_NDR(queue,(expand_factor_b, tamanyo), None, squeeze_factor_b, tamanyo, 1, 1, expand_factor_b, tamanyo, d_result_fire8_squeeze, d_fire8_expand3x3_weight, d_fire8_expand3x3_bias, d_result_fire8_expand)

    queue.finish()
    queue1.finish()

    # fire9
    conv1x1_NDR(queue,(squeeze_factor_b, tamanyo), None, np.int32(expand_factor_b*2/4), tamanyo, d_result_fire8_expand, d_fire9_squeeze_weight, d_fire9_squeeze_bias, d_result_fire9_squeeze)

    queue.finish()

    conv1x1_NDR(queue1,(expand_factor_b, tamanyo), None, np.int32(squeeze_factor_b/4), tamanyo, d_result_fire9_squeeze, d_fire9_expand1x1_weight, d_fire9_expand1x1_bias, d_result_fire9_expand)
    conv3x3_NDR(queue,(expand_factor_b, tamanyo), None, squeeze_factor_b, tamanyo, 1, 1, expand_factor_b, tamanyo, d_result_fire9_squeeze, d_fire9_expand3x3_weight, d_fire9_expand3x3_bias, d_result_fire9_expand)

    queue.finish()
    queue1.finish()

    # Read the final output back to the host.
    cl.enqueue_copy(queue, h_result_fire9_expand, d_result_fire9_expand)

    queue.finish()

    veamos3 = h_result_fire9_expand.reshape(-1, tamanyo, tamanyo)

    toc5 = pc()

    acumulado_kernel = toc5 - tic5 + acumulado_kernel

    # Stays True only while every repetition matches the PyTorch reference.
    comparativa &= np.allclose(salida8_total_a_numpy, veamos3, rtol=1e-01, atol=1e-01)
    
# Average time per repetition and the overall comparison verdict.
_media_pytorch = acumulado_pytorch / count
_media_kernel = acumulado_kernel / count
print("tiempo en segundos con pytorch= ", _media_pytorch)
print("tiempo en segundos con opencl (NDRange)=", _media_kernel)
print("comparativa (pytorch == NDRange): ", comparativa)

tiempo en segundos con pytorch=  0.05108882825000819
tiempo en segundos con opencl (Simple Task)= 0.028037996070006555
comparativa (pytorch == Simple Task):  True


In [1]:
######## BLOCK 3 ########
# Dimensions of the block-3 fire modules.
tamanyo = 14  # input size
canales_iniciales = 256  # input channels
squeeze_factor_a, squeeze_factor_b = 48, 64
expand_factor_a, expand_factor_b = 192, 256

################

# The last block has no maxpool layer, so the output size equals the input size.
tamanyo_final = tamanyo

# Single in-place ReLU module reused after every convolution.
squeeze_activation = nn.ReLU(inplace=True)

# Benchmark state: number of repetitions, accumulated times, running verdict.
count = 100
acumulado_pytorch = acumulado_kernel = 0
comparativa = True

# Pretrained SqueezeNet 1.1 parameters, keyed by parameter name.
params = torch.load('squeezenet1_1.pth')

######## BLOCK 3 ########
# Block 3 is four consecutive fire modules (features.9 .. features.12).
# For each module the squeeze, expand1x1 and expand3x3 weight/bias pairs are
# extracted as NumPy arrays.

#### FIRE 6 ####
weights1, bias1 = (
    params['features.9.squeeze.weight'].numpy(),
    params['features.9.squeeze.bias'].numpy(),
)
weights2a, bias2a = (
    params['features.9.expand1x1.weight'].numpy(),
    params['features.9.expand1x1.bias'].numpy(),
)
weights2b, bias2b = (
    params['features.9.expand3x3.weight'].numpy(),
    params['features.9.expand3x3.bias'].numpy(),
)

#### FIRE 7 ####
weights3, bias3 = (
    params['features.10.squeeze.weight'].numpy(),
    params['features.10.squeeze.bias'].numpy(),
)
weights4a, bias4a = (
    params['features.10.expand1x1.weight'].numpy(),
    params['features.10.expand1x1.bias'].numpy(),
)
weights4b, bias4b = (
    params['features.10.expand3x3.weight'].numpy(),
    params['features.10.expand3x3.bias'].numpy(),
)

#### FIRE 8 ####
weights5, bias5 = (
    params['features.11.squeeze.weight'].numpy(),
    params['features.11.squeeze.bias'].numpy(),
)
weights6a, bias6a = (
    params['features.11.expand1x1.weight'].numpy(),
    params['features.11.expand1x1.bias'].numpy(),
)
weights6b, bias6b = (
    params['features.11.expand3x3.weight'].numpy(),
    params['features.11.expand3x3.bias'].numpy(),
)

#### FIRE 9 ####
weights7, bias7 = (
    params['features.12.squeeze.weight'].numpy(),
    params['features.12.squeeze.bias'].numpy(),
)
weights8a, bias8a = (
    params['features.12.expand1x1.weight'].numpy(),
    params['features.12.expand1x1.bias'].numpy(),
)
weights8b, bias8b = (
    params['features.12.expand3x3.weight'].numpy(),
    params['features.12.expand3x3.bias'].numpy(),
)

for i in range(count):
   

    imagen = np.random.randint(239,size=(1,canales_iniciales, tamanyo, tamanyo))

    tic=pc()

    squeeze1=nn.Conv2d(canales_iniciales, squeeze_factor_a, kernel_size=1, bias=False)
    squeeze1.weight = nn.Parameter(torch.from_numpy(weights1))
    squeeze1.bias = nn.Parameter(torch.from_numpy(bias1))    

    squeeze2a=nn.Conv2d(squeeze_factor_a, expand_factor_a, kernel_size=1, bias=False)
    squeeze2a.weight = nn.Parameter(torch.from_numpy(weights2a))
    squeeze2a.bias = nn.Parameter(torch.from_numpy(bias2a))

    squeeze2b=nn.Conv2d(squeeze_factor_a, expand_factor_a, kernel_size=3, bias=False, padding=1)
    squeeze2b.weight = nn.Parameter(torch.from_numpy(weights2b))
    squeeze2b.bias = nn.Parameter(torch.from_numpy(bias2b))

    squeeze3=nn.Conv2d(expand_factor_a * 2, squeeze_factor_a, kernel_size=1, bias=False)
    squeeze3.weight = nn.Parameter(torch.from_numpy(weights3))
    squeeze3.bias = nn.Parameter(torch.from_numpy(bias3))    

    squeeze4a=nn.Conv2d(squeeze_factor_a, expand_factor_a, kernel_size=1, bias=False)
    squeeze4a.weight = nn.Parameter(torch.from_numpy(weights4a))
    squeeze4a.bias = nn.Parameter(torch.from_numpy(bias4a))

    squeeze4b=nn.Conv2d(squeeze_factor_a, expand_factor_a, kernel_size=3, bias=False, padding=1)
    squeeze4b.weight = nn.Parameter(torch.from_numpy(weights4b))
    squeeze4b.bias = nn.Parameter(torch.from_numpy(bias4b))

    squeeze5=nn.Conv2d(expand_factor_a * 2, squeeze_factor_b, kernel_size=1, bias=False)
    squeeze5.weight = nn.Parameter(torch.from_numpy(weights5))
    squeeze5.bias = nn.Parameter(torch.from_numpy(bias5))    

    squeeze6a=nn.Conv2d(squeeze_factor_b, expand_factor_b, kernel_size=1, bias=False)
    squeeze6a.weight = nn.Parameter(torch.from_numpy(weights6a))
    squeeze6a.bias = nn.Parameter(torch.from_numpy(bias6a))

    squeeze6b=nn.Conv2d(squeeze_factor_b, expand_factor_b, kernel_size=3, bias=False, padding=1)
    squeeze6b.weight = nn.Parameter(torch.from_numpy(weights6b))
    squeeze6b.bias = nn.Parameter(torch.from_numpy(bias6b))

    squeeze7=nn.Conv2d(expand_factor_b * 2, squeeze_factor_b, kernel_size=1, bias=False)
    squeeze7.weight = nn.Parameter(torch.from_numpy(weights7))
    squeeze7.bias = nn.Parameter(torch.from_numpy(bias7))    

    squeeze8a=nn.Conv2d(squeeze_factor_b, expand_factor_b, kernel_size=1, bias=False)
    squeeze8a.weight = nn.Parameter(torch.from_numpy(weights8a))
    squeeze8a.bias = nn.Parameter(torch.from_numpy(bias8a))

    squeeze8b=nn.Conv2d(squeeze_factor_b, expand_factor_b, kernel_size=3, bias=False, padding=1)
    squeeze8b.weight = nn.Parameter(torch.from_numpy(weights8b))
    squeeze8b.bias = nn.Parameter(torch.from_numpy(bias8b))

    imagen1  = torch.from_numpy(imagen).float()

    salida1=squeeze1(imagen1)
    salida1_activation=squeeze_activation(salida1)

    salida2a=squeeze2a(salida1_activation)
    salida2a_activation=squeeze_activation(salida2a)
    salida2b=squeeze2b(salida1_activation)
    salida2b_activation=squeeze_activation(salida2b)    
    salida2_total=torch.cat([salida2a_activation,salida2b_activation], 1)

    salida3=squeeze3(salida2_total)
    salida3_activation=squeeze_activation(salida3)

    salida4a=squeeze4a(salida3_activation)
    salida4a_activation=squeeze_activation(salida4a)
    salida4b=squeeze4b(salida3_activation)
    salida4b_activation=squeeze_activation(salida4b)    
    salida4_total=torch.cat([salida4a_activation,salida4b_activation], 1)

    salida5=squeeze5(salida4_total)
    salida5_activation=squeeze_activation(salida5)

    salida6a=squeeze6a(salida5_activation)
    salida6a_activation=squeeze_activation(salida6a)
    salida6b=squeeze6b(salida5_activation)
    salida6b_activation=squeeze_activation(salida6b)    
    salida6_total=torch.cat([salida6a_activation,salida6b_activation], 1)

    salida7=squeeze7(salida6_total)
    salida7_activation=squeeze_activation(salida7)

    salida8a=squeeze8a(salida7_activation)
    salida8a_activation=squeeze_activation(salida8a)
    salida8b=squeeze8b(salida7_activation)
    salida8b_activation=squeeze_activation(salida8b)    
    salida8_total=torch.cat([salida8a_activation,salida8b_activation], 1)

    salida8_total_a_numpy=salida8_total.detach().numpy()

    toc=pc()

    acumulado_pytorch=toc-tic+acumulado_pytorch

    ####### OPENCL COMPARISON #######

    # Simple Task: flatten the input image to a 1-D float32 vector and upload
    # it into a read-only device buffer initialised from host memory.
    h_sample = imagen.reshape(-1).astype(np.float32)
    d_sample = cl.Buffer(
        context,
        cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=h_sample,
    )

    # Bind the reference parameters to fire6..fire9 names. Weights are
    # flattened to 1-D; biases are used as they are. Each line pairs the
    # weight and bias of one layer.

    # fire6 <- weights/bias 1, 2a, 2b
    fire6_squeeze_weight, fire6_squeeze_bias = weights1.reshape(-1), bias1
    fire6_expand1x1_weight, fire6_expand1x1_bias = weights2a.reshape(-1), bias2a
    fire6_expand3x3_weight, fire6_expand3x3_bias = weights2b.reshape(-1), bias2b

    # fire7 <- weights/bias 3, 4a, 4b
    fire7_squeeze_weight, fire7_squeeze_bias = weights3.reshape(-1), bias3
    fire7_expand1x1_weight, fire7_expand1x1_bias = weights4a.reshape(-1), bias4a
    fire7_expand3x3_weight, fire7_expand3x3_bias = weights4b.reshape(-1), bias4b

    # fire8 <- weights/bias 5, 6a, 6b
    fire8_squeeze_weight, fire8_squeeze_bias = weights5.reshape(-1), bias5
    fire8_expand1x1_weight, fire8_expand1x1_bias = weights6a.reshape(-1), bias6a
    fire8_expand3x3_weight, fire8_expand3x3_bias = weights6b.reshape(-1), bias6b

    # fire9 <- weights/bias 7, 8a, 8b
    fire9_squeeze_weight, fire9_squeeze_bias = weights7.reshape(-1), bias7
    fire9_expand1x1_weight, fire9_expand1x1_bias = weights8a.reshape(-1), bias8a
    fire9_expand3x3_weight, fire9_expand3x3_bias = weights8b.reshape(-1), bias8b

    # Host-side result arrays, one per squeeze/expand stage. They are
    # allocated directly as float32: np.empty(...).astype(np.float32) would
    # first allocate a float64 array and then copy-convert it. The contents
    # are uninitialised either way; in this cell the arrays are used for
    # their byte size and, for fire9_expand, as the device-to-host copy target.
    # The expand arrays hold 2 * expand_factor channels because the 1x1 and
    # 3x3 expand kernels share one output buffer.
    h_result_fire6_squeeze = np.empty(1 * squeeze_factor_a * tamanyo * tamanyo, dtype=np.float32)
    h_result_fire6_expand = np.empty(1 * expand_factor_a * 2 * tamanyo * tamanyo, dtype=np.float32)

    h_result_fire7_squeeze = np.empty(1 * squeeze_factor_a * tamanyo * tamanyo, dtype=np.float32)
    h_result_fire7_expand = np.empty(1 * expand_factor_a * 2 * tamanyo * tamanyo, dtype=np.float32)

    h_result_fire8_squeeze = np.empty(1 * squeeze_factor_b * tamanyo * tamanyo, dtype=np.float32)
    h_result_fire8_expand = np.empty(1 * expand_factor_b * 2 * tamanyo * tamanyo, dtype=np.float32)

    h_result_fire9_squeeze = np.empty(1 * squeeze_factor_b * tamanyo * tamanyo, dtype=np.float32)
    h_result_fire9_expand = np.empty(1 * expand_factor_b * 2 * tamanyo * tamanyo, dtype=np.float32)

    # Upload every weight and bias array into its own device buffer. All of
    # them use the same flags: read-only for kernels, initialised by copying
    # from the given host array.
    param_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR

    # fire6 parameters
    d_fire6_squeeze_weight = cl.Buffer(context, param_flags, hostbuf=fire6_squeeze_weight)
    d_fire6_squeeze_bias = cl.Buffer(context, param_flags, hostbuf=fire6_squeeze_bias)
    d_fire6_expand1x1_weight = cl.Buffer(context, param_flags, hostbuf=fire6_expand1x1_weight)
    d_fire6_expand1x1_bias = cl.Buffer(context, param_flags, hostbuf=fire6_expand1x1_bias)
    d_fire6_expand3x3_weight = cl.Buffer(context, param_flags, hostbuf=fire6_expand3x3_weight)
    d_fire6_expand3x3_bias = cl.Buffer(context, param_flags, hostbuf=fire6_expand3x3_bias)

    # fire7 parameters
    d_fire7_squeeze_weight = cl.Buffer(context, param_flags, hostbuf=fire7_squeeze_weight)
    d_fire7_squeeze_bias = cl.Buffer(context, param_flags, hostbuf=fire7_squeeze_bias)
    d_fire7_expand1x1_weight = cl.Buffer(context, param_flags, hostbuf=fire7_expand1x1_weight)
    d_fire7_expand1x1_bias = cl.Buffer(context, param_flags, hostbuf=fire7_expand1x1_bias)
    d_fire7_expand3x3_weight = cl.Buffer(context, param_flags, hostbuf=fire7_expand3x3_weight)
    d_fire7_expand3x3_bias = cl.Buffer(context, param_flags, hostbuf=fire7_expand3x3_bias)

    # fire8 parameters
    d_fire8_squeeze_weight = cl.Buffer(context, param_flags, hostbuf=fire8_squeeze_weight)
    d_fire8_squeeze_bias = cl.Buffer(context, param_flags, hostbuf=fire8_squeeze_bias)
    d_fire8_expand1x1_weight = cl.Buffer(context, param_flags, hostbuf=fire8_expand1x1_weight)
    d_fire8_expand1x1_bias = cl.Buffer(context, param_flags, hostbuf=fire8_expand1x1_bias)
    d_fire8_expand3x3_weight = cl.Buffer(context, param_flags, hostbuf=fire8_expand3x3_weight)
    d_fire8_expand3x3_bias = cl.Buffer(context, param_flags, hostbuf=fire8_expand3x3_bias)

    # fire9 parameters
    d_fire9_squeeze_weight = cl.Buffer(context, param_flags, hostbuf=fire9_squeeze_weight)
    d_fire9_squeeze_bias = cl.Buffer(context, param_flags, hostbuf=fire9_squeeze_bias)
    d_fire9_expand1x1_weight = cl.Buffer(context, param_flags, hostbuf=fire9_expand1x1_weight)
    d_fire9_expand1x1_bias = cl.Buffer(context, param_flags, hostbuf=fire9_expand1x1_bias)
    d_fire9_expand3x3_weight = cl.Buffer(context, param_flags, hostbuf=fire9_expand3x3_weight)
    d_fire9_expand3x3_bias = cl.Buffer(context, param_flags, hostbuf=fire9_expand3x3_bias)

    # Device buffers for the per-stage results, sized from the matching host
    # arrays.
    #
    # Every buffer except the last is written by one kernel and then passed
    # as the input of the following kernel(s) in the launch sequence below,
    # so it must be READ_WRITE: the OpenCL specification leaves reading a
    # CL_MEM_WRITE_ONLY buffer from inside a kernel undefined.
    d_result_fire6_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire6_squeeze.nbytes)
    d_result_fire6_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire6_expand.nbytes)

    d_result_fire7_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire7_squeeze.nbytes)
    d_result_fire7_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire7_expand.nbytes)

    d_result_fire8_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire8_squeeze.nbytes)
    d_result_fire8_expand = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire8_expand.nbytes)

    d_result_fire9_squeeze = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_result_fire9_squeeze.nbytes)
    # The final expand output is only written by kernels and then copied back
    # to the host, so WRITE_ONLY (a kernel-side restriction) remains valid.
    d_result_fire9_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire9_expand.nbytes)

    # Start the OpenCL timer (pc is time.perf_counter). Buffer creation above
    # is excluded; the kernel launches, the finish() waits, the device-to-host
    # copy and the reshape below are all included.
    tic5 = pc()

    # Every kernel is launched with global size (1,) and no explicit local
    # size, i.e. as a single work-item.
    # NOTE(review): the scalar arguments of conv1x1_ST look like
    # (input channels, feature-map size, output channels), and those of
    # conv3x3_ST additionally carry two literal 1s and what may be an output
    # channel offset -- confirm against the kernel source.

    # fire6 squeeze: d_sample -> d_result_fire6_squeeze
    conv1x1_ST(queue,(1,), None, canales_iniciales, tamanyo, squeeze_factor_a, d_sample, d_fire6_squeeze_weight, d_fire6_squeeze_bias, d_result_fire6_squeeze)

    # Wait for the squeeze result before launching the kernels that read it.
    queue.finish()

    # fire6 expand: the 1x1 kernel (on queue1) and the 3x3 kernel (on queue)
    # both read the squeeze output and both write into d_result_fire6_expand.
    # NOTE(review): presumably each kernel fills its own half of the output
    # channels, so the two launches can overlap -- confirm in the kernels.
    conv1x1_ST(queue1,(1,), None, squeeze_factor_a, tamanyo, expand_factor_a, d_result_fire6_squeeze, d_fire6_expand1x1_weight, d_fire6_expand1x1_bias, d_result_fire6_expand)
    conv3x3_ST(queue,(1,), None, squeeze_factor_a, tamanyo, 1, 1, expand_factor_a, tamanyo, expand_factor_a, d_result_fire6_squeeze, d_fire6_expand3x3_weight, d_fire6_expand3x3_bias, d_result_fire6_expand)

    # Both expand kernels must complete before the next stage consumes them.
    queue.finish()
    queue1.finish()

    # fire7 squeeze: input is the fire6 expand output, passed with
    # expand_factor_a * 2 as its first scalar argument.
    conv1x1_ST(queue,(1,), None, np.int32(expand_factor_a * 2), tamanyo, squeeze_factor_a, d_result_fire6_expand, d_fire7_squeeze_weight, d_fire7_squeeze_bias, d_result_fire7_squeeze)

    queue.finish()

    # fire7 expand: same two-queue pattern as fire6.
    conv1x1_ST(queue1,(1,), None, squeeze_factor_a, tamanyo, expand_factor_a, d_result_fire7_squeeze, d_fire7_expand1x1_weight, d_fire7_expand1x1_bias, d_result_fire7_expand)
    conv3x3_ST(queue,(1,), None, squeeze_factor_a, tamanyo, 1, 1, expand_factor_a, tamanyo, expand_factor_a, d_result_fire7_squeeze, d_fire7_expand3x3_weight, d_fire7_expand3x3_bias, d_result_fire7_expand)

    queue.finish()
    queue1.finish()

    # fire8 squeeze: reads the fire7 expand output; from here on the "b"
    # squeeze/expand factors are used.
    conv1x1_ST(queue,(1,), None, np.int32(expand_factor_a * 2), tamanyo, squeeze_factor_b, d_result_fire7_expand, d_fire8_squeeze_weight, d_fire8_squeeze_bias, d_result_fire8_squeeze)

    queue.finish()

    # fire8 expand: same two-queue pattern.
    conv1x1_ST(queue1,(1,), None, squeeze_factor_b, tamanyo, expand_factor_b, d_result_fire8_squeeze, d_fire8_expand1x1_weight, d_fire8_expand1x1_bias, d_result_fire8_expand)
    conv3x3_ST(queue,(1,), None, squeeze_factor_b, tamanyo, 1, 1, expand_factor_b, tamanyo, expand_factor_b, d_result_fire8_squeeze, d_fire8_expand3x3_weight, d_fire8_expand3x3_bias, d_result_fire8_expand)

    queue.finish()
    queue1.finish()

    # fire9 squeeze: reads the fire8 expand output.
    conv1x1_ST(queue,(1,), None, np.int32(expand_factor_b * 2), tamanyo, squeeze_factor_b, d_result_fire8_expand, d_fire9_squeeze_weight, d_fire9_squeeze_bias, d_result_fire9_squeeze)

    queue.finish()

    # fire9 expand: same two-queue pattern; this is the final output.
    conv1x1_ST(queue1,(1,), None, squeeze_factor_b, tamanyo, expand_factor_b, d_result_fire9_squeeze, d_fire9_expand1x1_weight, d_fire9_expand1x1_bias, d_result_fire9_expand)
    conv3x3_ST(queue,(1,), None, squeeze_factor_b, tamanyo, 1, 1, expand_factor_b, tamanyo, expand_factor_b, d_result_fire9_squeeze, d_fire9_expand3x3_weight, d_fire9_expand3x3_bias, d_result_fire9_expand)

    queue.finish()
    queue1.finish()

    # Copy the final expand output from the device into the host array.
    cl.enqueue_copy(queue, h_result_fire9_expand, d_result_fire9_expand)

    # View the flat result as (channels, tamanyo, tamanyo).
    veamos3 = h_result_fire9_expand.reshape(-1,tamanyo,tamanyo)
    
    toc5 = pc()

    # Add this iteration's elapsed time to the running total.
    acumulado_kernel = toc5 - tic5 + acumulado_kernel
    
    # AND this iteration's match into the overall flag. The tolerances are
    # loose (rtol = atol = 0.1).
    comparativa &= np.allclose(salida8_total_a_numpy, veamos3,rtol=1e-01, atol=1e-01)
    
# Report the accumulated PyTorch and OpenCL times, each divided by `count`,
# followed by the accumulated comparison flag.
media_pytorch = acumulado_pytorch/count
media_kernel = acumulado_kernel/count
print("tiempo en segundos con pytorch= ", media_pytorch)
print("tiempo en segundos con opencl (Simple Task)=", media_kernel)
print("comparativa (pytorch == Simple Task): ", comparativa)

SyntaxError: invalid syntax (675052290.py, line 72)