# Debug SqueezeNet v1.3 (Simple Task) OpenCL implement with PyOpenCL and PyTorch
Partial code are copied heavily from https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py  
SqueezeNet Paper:https://arxiv.org/abs/1602.07360  
SqueezeNet 1.1 model from https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1   
SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0, without sacrificing accuracy.

Tests arquitecura completa

In [1]:
#some set up
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from PIL import Image
import math
import time
from time import sleep, perf_counter as pc
from matplotlib.pyplot import imshow
%matplotlib inline

In [3]:
# OpenCL setup
import pyopencl as cl
import sys
sys.path.append('../python_common')
import deviceinfo
from time import time

#wksp = '../device/v1.3/squeezenet'

## Veamos ahora fire con opencl

#### Step1: OpenCL preparation

In [42]:
## Devices and compute context
platforms = cl.get_platforms()
context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, platforms[0])])
device = platforms[0].get_devices()

# Create a command queue
queue = cl.CommandQueue(context)
queue1 = cl.CommandQueue(context)

context

<pyopencl.Context at 0x687a4a8 on <pyopencl.Device '12th Gen Intel(R) Core(TM) i7-12650H' on 'Intel(R) OpenCL' at 0x4f7fab8>>

#### Step 2: creat kernels
Creat & build program

Create kernels

In [52]:
wksp = ''

file_dir = wksp + 'maxpool_NDRange.aocx'

file_dir = wksp + 'squeezenet_NDRange.aocx'

kernelSource = open(file_dir, mode='rb').read()
program_NDR = cl.Program(context, device, [kernelSource]).build()

file_dir = wksp + 'squeezenet_ST.aocx'

kernelSource = open(file_dir, mode='rb').read()
program_ST = cl.Program(context, device, [kernelSource]).build()


Creat kernels

In [53]:
conv3x3_NDR = program_NDR.conv2d3x3
conv3x3_NDR.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, None, None, None, None])

maxpool_NDR = program_NDR.maxpool2d
maxpool_NDR.set_scalar_arg_dtypes([np.int32, np.int32, None, None])

conv1x1_NDR = program_NDR.conv2d1x1
conv1x1_NDR.set_scalar_arg_dtypes([np.int32, np.int32, None, None, None, None])

avgpool_NDR = program_NDR.avgpool2d
avgpool_NDR.set_scalar_arg_dtypes([None, None])

conv3x3_ST = program_ST.conv2d3x3
conv3x3_ST.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, None, None, None, None])

maxpool_ST = program_ST.maxpool2d
maxpool_ST.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None])

conv1x1_ST = program_ST.conv2d1x1
conv1x1_ST.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None, None, None])

avgpool_ST = program_ST.avgpool2d
avgpool_ST.set_scalar_arg_dtypes([None, None])

#### OpenCL kernel: squeezenet_NDRange.cl
conv2d3x3: 2-D 3x3 convolution.  
conv2d1x1: 2-D 1x1 convolution. kerner size 1, stride 1  
maxpool2d: 2-D max pool. kerner size 3, stride 2  
avgpool2d: 2-D average pool. kernel size 13
```C
//maxPool2d 
//kernel_size=3 stride=2
//output one feature map per kernel
__kernel void maxpool2d(
	const int input_size,
	const int output_size,
	__global const float* restrict input_im,
    __global float* restrict output_im)
{
	int channels = get_global_id(0);//get output channel index
	
	input_im += channels * input_size * input_size;
	output_im += channels * output_size * output_size;

	//loop over output feature map
	for(int i = 0; i < output_size; i++)//row
	{
		for(int j = 0; j < output_size; j++)//col
		{
			//find the max value in 3x3 reigon 
			//to be one element in the output feature map
			float tmp = 0.0;

			#pragma unroll 1
			for(int k = 0; k < 3; k++)//row
			{
				#pragma unroll 1
				for(int l = 0; l < 3; l++)//col
				{
					float value = input_im[(i * 2 + k) * input_size  + j * 2 + l ];
					if(value > tmp)
						tmp = value;
				}
			}
			//store the result to output feature map
			output_im[i * output_size + j] = tmp; 
		}
	}
}

//3x3 convolution layer
//output one feature map per kernel
__kernel void conv2d3x3(
	const int input_channels, const int input_size,
	const int pad, const int stride,
	const int start_channel, //start_channel is for 1x1 feature map in fire layer
	const int output_size,
	__global const float* restrict input_im,
	__global const float* restrict filter_weight,
	__global const float* restrict filter_bias,
	__global float *restrict output_im
	)
{
	int filter_index = get_global_id(0); //get output channel index
	int i =  get_global_id(1);

	filter_weight += filter_index * input_channels * 9;
	float bias = filter_bias[filter_index];
	output_im += (start_channel + filter_index) * output_size * output_size;
	
	//loop over output feature map
	//for(int i = 0; i < output_size; i++)
	{
		for(int j = 0; j < output_size; j++)
		{
			//compute one element in the output feature map
			float tmp = bias;
			
			//compute dot product of 2 input_channels x 3 x 3 matrix
			for(int k = 0; k < input_channels; k++)
			{
				#pragma unroll
				for(int l = 0; l < 3; l++)
				{
					int h = i * stride + l - pad;
					for(int m = 0; m < 3; m++)
					{
						int w = j * stride + m - pad;
						if((h >= 0) && (h < input_size) && (w >= 0) && (w < input_size))
						{
							tmp += input_im[k * input_size * input_size + h * input_size + w] \
                               * filter_weight[9 * k + 3 * l + m];
						}
					}
				}
			}

			//add relu activation after conv
			output_im[i * output_size + j] = (tmp > 0.0) ? tmp : 0.0;
		}
	}
}

//1x1 convolution layer
//output one feature map per kernel
__kernel void conv2d1x1(
	const int input_channels, const int input_size,
	__global float *input_im,
	__global const float4* filter_weight,
	__global const float* filter_bias,
	__global float *restrict output_im)
{
	int filter_index = get_global_id(0); // 0 - (output_channels - 1)
	int i = get_global_id(1);

	filter_weight += filter_index * input_channels;

	float bias = filter_bias[filter_index];
	
	output_im += filter_index * input_size * input_size;//start_channel is for 1x1 feature map in fire layer

	//loop over output feature map
	//for(int i = 0; i < input_size; i++)
	{
		for(int j = 0; j < input_size; j++)
		{
			float tmp = bias;
			int loc = i * input_size + j;

			for(int k = 0; k < input_channels; k++)
			{
				//float8 weight = filter_weight[k];
				//float8 feature;
				tmp += input_im[((k << 2) + 0) * input_size * input_size + loc] * filter_weight[k].s0
				     + input_im[((k << 2) + 1) * input_size * input_size + loc] * filter_weight[k].s1
					 + input_im[((k << 2) + 2) * input_size * input_size + loc] * filter_weight[k].s2
					 + input_im[((k << 2) + 3) * input_size * input_size + loc] * filter_weight[k].s3;
			}
			//add relu after conv
			output_im[i * input_size + j] = (tmp > 0.0) ? tmp : 0.0;
		}
	}
}

//last layer use a 13 x 13 avgPool layer as classifier
//one class score per kernel
__kernel void avgpool2d(
	__global const float* restrict input_im,
	__global float* restrict output_im)
{
	int class_index = get_global_id(0);//get class score index

	input_im += 169 * class_index;
	
	float tmp = 0.0f;

	for(int i = 0; i < 169; i++)
	{
		tmp += input_im[i];
	}

	output_im[class_index] = tmp / 169.0;
}
```

#### OpenCL kernel: squeezenet_ST.cl
conv2d3x3: 2-D 3x3 convolution.  
conv2d1x1: 2-D 1x1 convolution. kerner size 1, stride 1  
maxpool2d: 2-D max pool. kerner size 3, stride 2  
avgpool2d: 2-D average pool. kernel size 13
```C
//maxPool2d 
//kernel_size=3 stride=2
//output one feature map per kernel
__kernel void maxpool2d(
	const int input_size,
	const int output_size,
    const int channel_size,
	__global const float* restrict input_im,
    __global float* restrict output_im)
{

    for(int channel_index = 0; channel_index < channel_size; channel_index++)
    {
        //loop over output feature map
        for(int i = 0; i < output_size; i++)//row
        {
            for(int j = 0; j < output_size; j++)//col
            {
                //find the max value in 3x3 reigon 
                //to be one element in the output feature map
                float tmp = 0.0;

                #pragma unroll 1
                for(int k = 0; k < 3; k++)//row
                {
                    #pragma unroll 1
                    for(int l = 0; l < 3; l++)//col
                    {
                        float value = input_im[(i * 2 + k) * input_size  + j * 2 + l ];
                        if(value > tmp)
                            tmp = value;
                    }
                }
                //store the result to output feature map
                output_im[i * output_size + j] = tmp;
            }
        }
    
        input_im += input_size * input_size;
        output_im += output_size * output_size;
    }
}

//3x3 convolution layer
//output one feature map per kernel
__kernel void conv2d3x3(
	const int input_channels, const int input_size,
	const int pad, const int stride,
	const int start_channel, //start_channel is for 1x1 feature map in fire layer
	const int output_size,
    const int filter_size,
	__global const float* restrict input_im,
	__global const float* restrict filter_weight,
	__global const float* restrict filter_bias,
	__global float *restrict output_im
	)
{
	
	//filter_weight += filter_index * input_channels * 9;
	output_im += start_channel * output_size * output_size;
	
	//loop over output feature map
	for(int filter_index = 0; filter_index < filter_size; filter_index++)
	{
        float bias = filter_bias[filter_index];

		for(int i = 0; i < output_size; i++)
		{
            for(int j = 0; j < output_size; j++)
            {
                //compute one element in the output feature map
                float tmp = bias;

                //compute dot product of 2 input_channels x 3 x 3 matrix
                for(int k = 0; k < input_channels; k++)
                {
                    #pragma unroll
                    for(int l = 0; l < 3; l++)
                    {
                        int h = i * stride + l - pad;
                        for(int m = 0; m < 3; m++)
                        {
                            int w = j * stride + m - pad;
                            if((h >= 0) && (h < input_size) && (w >= 0) && (w < input_size))
                            {
                                tmp += input_im[k * input_size * input_size + h * input_size + w] \
                                   * filter_weight[9 * k + 3 * l + m];
                            }
                        }
                    }
                }

                //add relu activation after conv
                output_im[i * output_size + j] = (tmp > 0.0) ? tmp : 0.0;                 
            }
		}
        
        filter_weight += input_channels * 9;
        output_im += output_size * output_size;
	}
}

//1x1 convolution layer as a single kernel (V5)
//output one feature map per kernel

__kernel void conv2d1x1(
	const int input_channels, 
    const int input_size,
    const int filter_size,
	__global const float* restrict input_im,
	__global const float* restrict filter_weight,
	__global const float* restrict filter_bias,
	__global float *restrict output_im)
{
	// Adding restrict keyword
    //loop over filters
	for(int f_i = 0; f_i < filter_size; f_i++)
	{
        //filter_weight += f_i * input_channels;

        float bias = filter_bias[f_i];
		
        // output_im += input_size * input_size;//start_channel is for 1x1 feature map in fire layer
	
		for(int ij = 0; ij < (input_size * input_size); ij++)
		{
			float tmp = bias;
			// int loc = i * input_size + j; // this is equal to ij

			for(int k = 0; k < input_channels; k++)
			{
				//float8 weight = filter_weight[k];
				//float8 feature;
				tmp += input_im[k * input_size * input_size + ij] * filter_weight[k + f_i * input_channels];
			}
			//add relu after conv
			output_im[ij + (input_size * input_size * f_i)] = (tmp > 0.0) ? tmp : 0.0;
            //output_im[ij] = (tmp > 0.0) ? tmp : 0.0;
		}
        //filter_weight += input_channels;	
        //output_im += input_size * input_size;//start_channel is for 1x1 feature map in fire layer
	
	}
}

//last layer use a 13 x 13 avgPool layer as classifier
//one class score per kernel
__kernel void avgpool2d(
	__global const float* restrict input_im,
	__global float* restrict output_im)
{
	// int class_index = get_global_id(0);//get class score index
    
    //Since it's the final layer, we know that there are only 1000 classes
    
	//input_im += 169 * class_index;

	for(int class_index = 0; class_index < 1000; class_index++)
    {
            
        float tmp = 0.0f;

        for(int i = 0; i < 169; i++)
        {
            tmp += input_im[class_index * 169 + i];
        }

        output_im[class_index] = tmp / 169.0;
    }
}
```

Run OpenCL implement  

In [54]:
squeeze_activation = nn.ReLU(inplace=True)

acumulado_pytorch = 0
 
transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ],
                     std = [ 0.229, 0.224, 0.225 ]),])

imagen = np.random.rand(3, 224, 224).astype(np.float32)
imagen = Image.fromarray(imagen,'RGB')
imagen = transform(imagen).numpy()
imagen = imagen[np.newaxis,:]

#imagen = np.ones((1, 3, 224, 224))
#print(imagen.shape)

### First Conv3x3 and maxpool

weights0=np.random.rand(64, 3,3,3).astype(np.float32)
bias0=np.random.rand(64,).astype(np.float32)

#### FIRE 1 ####
weights1=np.random.rand(16, 64,1,1).astype(np.float32)
bias1=np.random.rand(16,).astype(np.float32)

weights2a=np.random.rand(64, 16,1,1).astype(np.float32)  
bias2a=np.random.rand(64,).astype(np.float32)    

weights2b=np.random.rand(64, 16,3,3).astype(np.float32)    
bias2b=np.random.rand(64,).astype(np.float32)

#### FIRE 2 ####
weights3=np.random.rand(16, 128,1,1).astype(np.float32)
bias3=np.random.rand(16,).astype(np.float32)

weights4a=np.random.rand(64, 16,1,1).astype(np.float32)  
bias4a=np.random.rand(64,).astype(np.float32)    

weights4b=np.random.rand(64, 16,3,3).astype(np.float32)    
bias4b=np.random.rand(64,).astype(np.float32)

#### FIRE 3 ####
weights5=np.random.rand(32, 128,1,1).astype(np.float32)
bias5=np.random.rand(32,).astype(np.float32)

weights6a=np.random.rand(128, 32,1,1).astype(np.float32)  
bias6a=np.random.rand(128,).astype(np.float32)    

weights6b=np.random.rand(128, 32,3,3).astype(np.float32)    
bias6b=np.random.rand(128,).astype(np.float32)

#### FIRE 4 ####
weights7=np.random.rand(32, 256,1,1).astype(np.float32)
bias7=np.random.rand(32,).astype(np.float32)

weights8a=np.random.rand(128, 32,1,1).astype(np.float32)  
bias8a=np.random.rand(128,).astype(np.float32)    

weights8b=np.random.rand(128, 32,3,3).astype(np.float32)    
bias8b=np.random.rand(128,).astype(np.float32)

#### FIRE 5 ####
weights9=np.random.rand(48, 256,1,1).astype(np.float32)
bias9=np.random.rand(48,).astype(np.float32)

weights10a=np.random.rand(192, 48,1,1).astype(np.float32)  
bias10a=np.random.rand(192,).astype(np.float32)    

weights10b=np.random.rand(192, 48,3,3).astype(np.float32)    
bias10b=np.random.rand(192,).astype(np.float32)

#### FIRE 6 ####
weights11=np.random.rand(48, 384,1,1).astype(np.float32)
bias11=np.random.rand(48,).astype(np.float32)

weights12a=np.random.rand(192, 48,1,1).astype(np.float32)  
bias12a=np.random.rand(192,).astype(np.float32)    

weights12b=np.random.rand(192, 48,3,3).astype(np.float32)    
bias12b=np.random.rand(192,).astype(np.float32)

#### FIRE 7 ####
weights13=np.random.rand(64, 384,1,1).astype(np.float32)
bias13=np.random.rand(64,).astype(np.float32)

weights14a=np.random.rand(256, 64,1,1).astype(np.float32)  
bias14a=np.random.rand(256,).astype(np.float32)    

weights14b=np.random.rand(256, 64,3,3).astype(np.float32)    
bias14b=np.random.rand(256,).astype(np.float32)

#### FIRE 8 ####
weights15=np.random.rand(64, 512,1,1).astype(np.float32)
bias15=np.random.rand(64,).astype(np.float32)

weights16a=np.random.rand(256, 64,1,1).astype(np.float32)  
bias16a=np.random.rand(256,).astype(np.float32)    

weights16b=np.random.rand(256, 64,3,3).astype(np.float32)    
bias16b=np.random.rand(256,).astype(np.float32)

### Classifier Conv3x3 and maxpool

weights17=np.random.rand(1000, 512,1,1).astype(np.float32)
bias17=np.random.rand(1000,).astype(np.float32)

#####################################################

params = torch.load('squeezenet1_1.pth')

### First Conv3x3 and maxpool
weights0=params['features.0.weight'].numpy()
bias0=params['features.0.bias'].numpy()

######## BLOCK 1 ########
#fire - fire - maxpool
#### FIRE 1 ####
weights1=params['features.3.squeeze.weight'].numpy()
bias1=params['features.3.squeeze.bias'].numpy()

weights2a=params['features.3.expand1x1.weight'].numpy()
bias2a=params['features.3.expand1x1.bias'].numpy()    

weights2b=params['features.3.expand3x3.weight'].numpy()   
bias2b=params['features.3.expand3x3.bias'].numpy()

#### FIRE 2 ####
weights3=params['features.4.squeeze.weight'].numpy()
bias3=params['features.4.squeeze.bias'].numpy()

weights4a=params['features.4.expand1x1.weight'].numpy()
bias4a=params['features.4.expand1x1.bias'].numpy()    

weights4b=params['features.4.expand3x3.weight'].numpy()   
bias4b=params['features.4.expand3x3.bias'].numpy()

######## BLOCK 2 ########
#fire - fire - maxpool
#### FIRE 3 ####
weights5=params['features.6.squeeze.weight'].numpy()
bias5=params['features.6.squeeze.bias'].numpy()

weights6a=params['features.6.expand1x1.weight'].numpy()
bias6a=params['features.6.expand1x1.bias'].numpy()    

weights6b=params['features.6.expand3x3.weight'].numpy()   
bias6b=params['features.6.expand3x3.bias'].numpy()

#### FIRE 4 ####
weights7=params['features.7.squeeze.weight'].numpy()
bias7=params['features.7.squeeze.bias'].numpy()

weights8a=params['features.7.expand1x1.weight'].numpy()
bias8a=params['features.7.expand1x1.bias'].numpy()    

weights8b=params['features.7.expand3x3.weight'].numpy()  
bias8b=params['features.7.expand3x3.bias'].numpy()

######## BLOCK 3 ########
#fire - fire - fire - fire
#### FIRE 5 ####
weights9=params['features.9.squeeze.weight'].numpy()
bias9=params['features.9.squeeze.bias'].numpy()

weights10a=params['features.9.expand1x1.weight'].numpy()
bias10a=params['features.9.expand1x1.bias'].numpy()    

weights10b=params['features.9.expand3x3.weight'].numpy()
bias10b=params['features.9.expand3x3.bias'].numpy()

#### FIRE 6 ####
weights11=params['features.10.squeeze.weight'].numpy()
bias11=params['features.10.squeeze.bias'].numpy()

weights12a=params['features.10.expand1x1.weight'].numpy()
bias12a=params['features.10.expand1x1.bias'].numpy()    

weights12b=params['features.10.expand3x3.weight'].numpy()
bias12b=params['features.10.expand3x3.bias'].numpy()

#### FIRE 7 ####
weights13=params['features.11.squeeze.weight'].numpy()
bias13=params['features.11.squeeze.bias'].numpy()

weights14a=params['features.11.expand1x1.weight'].numpy()
bias14a=params['features.11.expand1x1.bias'].numpy()    

weights14b=params['features.11.expand3x3.weight'].numpy()
bias14b=params['features.11.expand3x3.bias'].numpy()

#### FIRE 8 ####
weights15=params['features.12.squeeze.weight'].numpy()
bias15=params['features.12.squeeze.bias'].numpy()

weights16a=params['features.12.expand1x1.weight'].numpy()
bias16a=params['features.12.expand1x1.bias'].numpy()    

weights16b=params['features.12.expand3x3.weight'].numpy()
bias16b=params['features.12.expand3x3.bias'].numpy()

######## Classifier ########
#conv3x3 - avgpool
### Classifier Conv3x3 and avgpool
weights17=params['classifier.1.weight'].numpy()
bias17=params['classifier.1.bias'].numpy()

tic=pc()

squeeze0=nn.Conv2d(3, 64, kernel_size=3, bias=False, stride=2)
squeeze0.weight = nn.Parameter(torch.from_numpy(weights0))
squeeze0.bias = nn.Parameter(torch.from_numpy(bias0))

maxpool=nn.MaxPool2d(3, stride=2)

squeeze1=nn.Conv2d(64, 16, kernel_size=1, bias=False)
squeeze1.weight = nn.Parameter(torch.from_numpy(weights1))
squeeze1.bias = nn.Parameter(torch.from_numpy(bias1))    

squeeze2a=nn.Conv2d(16, 64, kernel_size=1, bias=False)
squeeze2a.weight = nn.Parameter(torch.from_numpy(weights2a))
squeeze2a.bias = nn.Parameter(torch.from_numpy(bias2a))

squeeze2b=nn.Conv2d(16, 64, kernel_size=3, bias=False, padding=1)
squeeze2b.weight = nn.Parameter(torch.from_numpy(weights2b))
squeeze2b.bias = nn.Parameter(torch.from_numpy(bias2b))

squeeze3=nn.Conv2d(128, 16, kernel_size=1, bias=False)
squeeze3.weight = nn.Parameter(torch.from_numpy(weights3))
squeeze3.bias = nn.Parameter(torch.from_numpy(bias3))    

squeeze4a=nn.Conv2d(16, 64, kernel_size=1, bias=False)
squeeze4a.weight = nn.Parameter(torch.from_numpy(weights4a))
squeeze4a.bias = nn.Parameter(torch.from_numpy(bias4a))

squeeze4b=nn.Conv2d(16, 64, kernel_size=3, bias=False, padding=1)
squeeze4b.weight = nn.Parameter(torch.from_numpy(weights4b))
squeeze4b.bias = nn.Parameter(torch.from_numpy(bias4b))

squeeze5=nn.Conv2d(128, 32, kernel_size=1, bias=False)
squeeze5.weight = nn.Parameter(torch.from_numpy(weights5))
squeeze5.bias = nn.Parameter(torch.from_numpy(bias5))    

squeeze6a=nn.Conv2d(32, 128, kernel_size=1, bias=False)
squeeze6a.weight = nn.Parameter(torch.from_numpy(weights6a))
squeeze6a.bias = nn.Parameter(torch.from_numpy(bias6a))

squeeze6b=nn.Conv2d(32, 128, kernel_size=3, bias=False, padding=1)
squeeze6b.weight = nn.Parameter(torch.from_numpy(weights6b))
squeeze6b.bias = nn.Parameter(torch.from_numpy(bias6b))

squeeze7=nn.Conv2d(256, 32, kernel_size=1, bias=False)
squeeze7.weight = nn.Parameter(torch.from_numpy(weights7))
squeeze7.bias = nn.Parameter(torch.from_numpy(bias7))    

squeeze8a=nn.Conv2d(32, 128, kernel_size=1, bias=False)
squeeze8a.weight = nn.Parameter(torch.from_numpy(weights8a))
squeeze8a.bias = nn.Parameter(torch.from_numpy(bias8a))

squeeze8b=nn.Conv2d(32, 128, kernel_size=3, bias=False, padding=1)
squeeze8b.weight = nn.Parameter(torch.from_numpy(weights8b))
squeeze8b.bias = nn.Parameter(torch.from_numpy(bias8b))

squeeze9=nn.Conv2d(256, 48, kernel_size=1, bias=False)
squeeze9.weight = nn.Parameter(torch.from_numpy(weights9))
squeeze9.bias = nn.Parameter(torch.from_numpy(bias9))    

squeeze10a=nn.Conv2d(48, 192, kernel_size=1, bias=False)
squeeze10a.weight = nn.Parameter(torch.from_numpy(weights10a))
squeeze10a.bias = nn.Parameter(torch.from_numpy(bias10a))

squeeze10b=nn.Conv2d(48, 192, kernel_size=3, bias=False, padding=1)
squeeze10b.weight = nn.Parameter(torch.from_numpy(weights10b))
squeeze10b.bias = nn.Parameter(torch.from_numpy(bias10b))

squeeze11=nn.Conv2d(384, 48, kernel_size=1, bias=False)
squeeze11.weight = nn.Parameter(torch.from_numpy(weights11))
squeeze11.bias = nn.Parameter(torch.from_numpy(bias11))    

squeeze12a=nn.Conv2d(48, 192, kernel_size=1, bias=False)
squeeze12a.weight = nn.Parameter(torch.from_numpy(weights12a))
squeeze12a.bias = nn.Parameter(torch.from_numpy(bias12a))

squeeze12b=nn.Conv2d(48, 192, kernel_size=3, bias=False, padding=1)
squeeze12b.weight = nn.Parameter(torch.from_numpy(weights12b))
squeeze12b.bias = nn.Parameter(torch.from_numpy(bias12b))

squeeze13=nn.Conv2d(384, 64, kernel_size=1, bias=False)
squeeze13.weight = nn.Parameter(torch.from_numpy(weights13))
squeeze13.bias = nn.Parameter(torch.from_numpy(bias13))    

squeeze14a=nn.Conv2d(64, 256, kernel_size=1, bias=False)
squeeze14a.weight = nn.Parameter(torch.from_numpy(weights14a))
squeeze14a.bias = nn.Parameter(torch.from_numpy(bias14a))

squeeze14b=nn.Conv2d(64, 256, kernel_size=3, bias=False, padding=1)
squeeze14b.weight = nn.Parameter(torch.from_numpy(weights14b))
squeeze14b.bias = nn.Parameter(torch.from_numpy(bias14b))

squeeze15=nn.Conv2d(512, 64, kernel_size=1, bias=False)
squeeze15.weight = nn.Parameter(torch.from_numpy(weights15))
squeeze15.bias = nn.Parameter(torch.from_numpy(bias15))    

squeeze16a=nn.Conv2d(64, 256, kernel_size=1, bias=False)
squeeze16a.weight = nn.Parameter(torch.from_numpy(weights16a))
squeeze16a.bias = nn.Parameter(torch.from_numpy(bias16a))

squeeze16b=nn.Conv2d(64, 256, kernel_size=3, bias=False, padding=1)
squeeze16b.weight = nn.Parameter(torch.from_numpy(weights16b))
squeeze16b.bias = nn.Parameter(torch.from_numpy(bias16b))

conv_class=nn.Conv2d(512, 1000, kernel_size=1, bias=False)
conv_class.weight = nn.Parameter(torch.from_numpy(weights17))
conv_class.bias = nn.Parameter(torch.from_numpy(bias17))

avgpool=nn.AvgPool2d(13)

imagen1  = torch.from_numpy(imagen).float()

salida0=squeeze0(imagen1)
salida0_activation=squeeze_activation(salida0)

salida_pool1 = maxpool(salida0_activation)

salida1=squeeze1(salida_pool1)
salida1_activation=squeeze_activation(salida1)

salida2a=squeeze2a(salida1_activation)
salida2a_activation=squeeze_activation(salida2a)
salida2b=squeeze2b(salida1_activation)
salida2b_activation=squeeze_activation(salida2b)    
salida2_total=torch.cat([salida2a_activation,salida2b_activation], 1)

salida3=squeeze3(salida2_total)
salida3_activation=squeeze_activation(salida3)

salida4a=squeeze4a(salida3_activation)
salida4a_activation=squeeze_activation(salida4a)
salida4b=squeeze4b(salida3_activation)
salida4b_activation=squeeze_activation(salida4b)    
salida4_total=torch.cat([salida4a_activation,salida4b_activation], 1)

salida_pool2 = maxpool(salida4_total)

salida5=squeeze5(salida_pool2)
salida5_activation=squeeze_activation(salida5)

salida6a=squeeze6a(salida5_activation)
salida6a_activation=squeeze_activation(salida6a)
salida6b=squeeze6b(salida5_activation)
salida6b_activation=squeeze_activation(salida6b)    
salida6_total=torch.cat([salida6a_activation,salida6b_activation], 1)

salida7=squeeze7(salida6_total)
salida7_activation=squeeze_activation(salida7)

salida8a=squeeze8a(salida7_activation)
salida8a_activation=squeeze_activation(salida8a)
salida8b=squeeze8b(salida7_activation)
salida8b_activation=squeeze_activation(salida8b)    
salida8_total=torch.cat([salida8a_activation,salida8b_activation], 1)

salida_pool3 = maxpool(salida8_total)

salida9=squeeze9(salida_pool3)
salida9_activation=squeeze_activation(salida9)

salida10a=squeeze10a(salida9_activation)
salida10a_activation=squeeze_activation(salida10a)
salida10b=squeeze10b(salida9_activation)
salida10b_activation=squeeze_activation(salida10b)    
salida10_total=torch.cat([salida10a_activation,salida10b_activation], 1)

salida11=squeeze11(salida10_total)
salida11_activation=squeeze_activation(salida11)

salida12a=squeeze12a(salida11_activation)
salida12a_activation=squeeze_activation(salida12a)
salida12b=squeeze12b(salida11_activation)
salida12b_activation=squeeze_activation(salida12b)    
salida12_total=torch.cat([salida12a_activation,salida12b_activation], 1)

salida13=squeeze13(salida12_total)
salida13_activation=squeeze_activation(salida13)

salida14a=squeeze14a(salida13_activation)
salida14a_activation=squeeze_activation(salida14a)
salida14b=squeeze14b(salida13_activation)
salida14b_activation=squeeze_activation(salida14b)    
salida14_total=torch.cat([salida14a_activation,salida14b_activation], 1)

salida15=squeeze15(salida14_total)
salida15_activation=squeeze_activation(salida15)

salida16a=squeeze16a(salida15_activation)
salida16a_activation=squeeze_activation(salida16a)
salida16b=squeeze16b(salida15_activation)
salida16b_activation=squeeze_activation(salida16b)    
salida16_total=torch.cat([salida16a_activation,salida16b_activation], 1)

salida17=conv_class(salida16_total)
salida17_activation=squeeze_activation(salida17)
salida18=avgpool(salida17_activation)

salida18_a_numpy=salida18.detach().numpy()

toc=pc()

acumulado_pytorch=toc-tic+acumulado_pytorch

####### OPENCL COMPARISON #######


In [55]:
## NDRANGE

h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

conv1_weight = weights0.reshape(-1)
conv1_bias = bias0

fire1_squeeze_weight = weights1.reshape(-1)
fire1_squeeze_bias = bias1
fire1_expand1x1_weight = weights2a.reshape(-1)
fire1_expand1x1_bias = bias2a
fire1_expand3x3_weight =weights2b.reshape(-1)
fire1_expand3x3_bias = bias2b

fire2_squeeze_weight = weights3.reshape(-1)
fire2_squeeze_bias = bias3
fire2_expand1x1_weight = weights4a.reshape(-1)
fire2_expand1x1_bias = bias4a
fire2_expand3x3_weight =weights4b.reshape(-1)
fire2_expand3x3_bias = bias4b

fire3_squeeze_weight = weights5.reshape(-1)
fire3_squeeze_bias = bias5
fire3_expand1x1_weight = weights6a.reshape(-1)
fire3_expand1x1_bias = bias6a
fire3_expand3x3_weight =weights6b.reshape(-1)
fire3_expand3x3_bias = bias6b

fire4_squeeze_weight = weights7.reshape(-1)
fire4_squeeze_bias = bias7
fire4_expand1x1_weight = weights8a.reshape(-1)
fire4_expand1x1_bias = bias8a
fire4_expand3x3_weight =weights8b.reshape(-1)
fire4_expand3x3_bias = bias8b

fire5_squeeze_weight = weights9.reshape(-1)
fire5_squeeze_bias = bias9
fire5_expand1x1_weight = weights10a.reshape(-1)
fire5_expand1x1_bias = bias10a
fire5_expand3x3_weight =weights10b.reshape(-1)
fire5_expand3x3_bias = bias10b

fire6_squeeze_weight = weights11.reshape(-1)
fire6_squeeze_bias = bias11
fire6_expand1x1_weight = weights12a.reshape(-1)
fire6_expand1x1_bias = bias12a
fire6_expand3x3_weight =weights12b.reshape(-1)
fire6_expand3x3_bias = bias12b

fire7_squeeze_weight = weights13.reshape(-1)
fire7_squeeze_bias = bias13
fire7_expand1x1_weight = weights14a.reshape(-1)
fire7_expand1x1_bias = bias14a
fire7_expand3x3_weight =weights14b.reshape(-1)
fire7_expand3x3_bias = bias14b

fire8_squeeze_weight = weights15.reshape(-1)
fire8_squeeze_bias = bias15
fire8_expand1x1_weight = weights16a.reshape(-1)
fire8_expand1x1_bias = bias16a
fire8_expand3x3_weight =weights16b.reshape(-1)
fire8_expand3x3_bias = bias16b

classifier_conv_weight = weights17.reshape(-1)
classifier_conv_bias = bias17

h_result_conv = np.empty(1 * 64 * 111 * 111).astype(np.float32)
h_result_pool1 = np.empty(1 * 64 * 55 * 55).astype(np.float32)

h_result_fire1_squeeze = np.empty(1 * 16 * 55 * 55).astype(np.float32)
h_result_fire1_expand = np.empty(1 * 128 * 55 * 55).astype(np.float32)
h_result_fire2_squeeze = np.empty(1 * 16 * 55 * 55).astype(np.float32)
h_result_fire2_expand = np.empty(1 * 128 * 55 * 55).astype(np.float32)
h_result_pool2 = np.empty(1 * 128 * 27 * 27).astype(np.float32)

h_result_fire3_squeeze = np.empty(1 * 32 * 27 * 27).astype(np.float32)
h_result_fire3_expand = np.empty(1 * 256 * 27 * 27).astype(np.float32)
h_result_fire4_squeeze = np.empty(1 * 32 * 27 * 27).astype(np.float32)
h_result_fire4_expand = np.empty(1 * 256 * 27 * 27).astype(np.float32)
h_result_pool3 = np.empty(1 * 256 * 13 * 13).astype(np.float32)

h_result_fire5_squeeze = np.empty(1 * 48 * 13 * 13).astype(np.float32)
h_result_fire5_expand = np.empty(1 * 384 * 13 * 13).astype(np.float32)
h_result_fire6_squeeze = np.empty(1 * 48 * 13 * 13).astype(np.float32)
h_result_fire6_expand = np.empty(1 * 384 * 13 * 13).astype(np.float32)
h_result_fire7_squeeze = np.empty(1 * 64 * 13 * 13).astype(np.float32)
h_result_fire7_expand = np.empty(1 * 512 * 13 * 13).astype(np.float32)
h_result_fire8_squeeze = np.empty(1 * 64 * 13 * 13).astype(np.float32)
h_result_fire8_expand = np.empty(1 * 512 * 13 * 13).astype(np.float32)

h_result_classifier_conv = np.empty(1 * 1000 * 13 * 13).astype(np.float32)
h_result_classifier = np.empty(1 * 1000).astype(np.float32)

d_conv1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=conv1_weight)
d_conv1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=conv1_bias)

d_fire1_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
d_fire1_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)
d_fire1_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand1x1_weight)
d_fire1_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand1x1_bias)
d_fire1_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand3x3_weight)
d_fire1_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand3x3_bias)

d_fire2_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_squeeze_weight)
d_fire2_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_squeeze_bias)
d_fire2_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand1x1_weight)
d_fire2_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand1x1_bias)
d_fire2_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand3x3_weight)
d_fire2_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand3x3_bias)

d_fire3_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_squeeze_weight)
d_fire3_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_squeeze_bias)
d_fire3_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand1x1_weight)
d_fire3_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand1x1_bias)
d_fire3_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand3x3_weight)
d_fire3_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand3x3_bias)

d_fire4_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_squeeze_weight)
d_fire4_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_squeeze_bias)
d_fire4_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand1x1_weight)
d_fire4_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand1x1_bias)
d_fire4_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand3x3_weight)
d_fire4_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand3x3_bias)

d_fire5_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_squeeze_weight)
d_fire5_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_squeeze_bias)
d_fire5_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand1x1_weight)
d_fire5_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand1x1_bias)
d_fire5_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand3x3_weight)
d_fire5_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand3x3_bias)

d_fire6_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_squeeze_weight)
d_fire6_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_squeeze_bias)
d_fire6_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand1x1_weight)
d_fire6_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand1x1_bias)
d_fire6_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand3x3_weight)
d_fire6_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand3x3_bias)

d_fire7_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_squeeze_weight)
d_fire7_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_squeeze_bias)
d_fire7_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand1x1_weight)
d_fire7_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand1x1_bias)
d_fire7_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand3x3_weight)
d_fire7_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand3x3_bias)

d_fire8_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_squeeze_weight)
d_fire8_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_squeeze_bias)
d_fire8_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand1x1_weight)
d_fire8_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand1x1_bias)
d_fire8_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand3x3_weight)
d_fire8_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand3x3_bias)

d_classifier_conv_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=classifier_conv_weight)
d_classifier_conv_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=classifier_conv_bias)

d_result_conv = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_conv.nbytes)
d_result_pool1 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool1.nbytes)

d_result_fire1_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)
d_result_fire1_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_expand.nbytes)
d_result_fire2_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire2_squeeze.nbytes)
d_result_fire2_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire2_expand.nbytes)
d_result_pool2 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool2.nbytes)

d_result_fire3_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire3_squeeze.nbytes)
d_result_fire3_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire3_expand.nbytes)
d_result_fire4_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire4_squeeze.nbytes)
d_result_fire4_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire4_expand.nbytes)
d_result_pool3 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool3.nbytes)

d_result_fire5_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire5_squeeze.nbytes)
d_result_fire5_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire5_expand.nbytes)
d_result_fire6_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire6_squeeze.nbytes)
d_result_fire6_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire6_expand.nbytes)
d_result_fire7_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire7_squeeze.nbytes)
d_result_fire7_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire7_expand.nbytes)
d_result_fire8_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire8_squeeze.nbytes)
d_result_fire8_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire8_expand.nbytes)

d_result_classifier_conv = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_classifier_conv.nbytes)
d_result_classifier = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_classifier.nbytes)

tic2 = pc()

#first conv layer
conv3x3_NDR(queue,(64, 111), None, 3, 224, 0, 2, 0, 111, d_sample, d_conv1_weight, d_conv1_bias, d_result_conv)
maxpool_NDR(queue, (64, ), None, 111, 55, d_result_conv, d_result_pool1)

#block1
conv1x1_NDR(queue,(16, 55), None, np.int32(64/4), 55, d_result_pool1, d_fire1_squeeze_weight, d_fire1_squeeze_bias, d_result_fire1_squeeze)
queue.finish()
conv1x1_NDR(queue1,(64, 55), None, np.int32(16/4), 55, d_result_fire1_squeeze, d_fire1_expand1x1_weight, d_fire1_expand1x1_bias, d_result_fire1_expand)
conv3x3_NDR(queue,(64, 55), None, 16, 55, 1, 1, 64, 55, d_result_fire1_squeeze, d_fire1_expand3x3_weight, d_fire1_expand3x3_bias, d_result_fire1_expand)
queue.finish()
queue1.finish()

conv1x1_NDR(queue,(16, 55), None, np.int32(128/4), 55, d_result_fire1_expand, d_fire2_squeeze_weight, d_fire2_squeeze_bias, d_result_fire2_squeeze)
queue.finish()
conv1x1_NDR(queue1,(64, 55), None, np.int32(16/4), 55, d_result_fire2_squeeze, d_fire2_expand1x1_weight, d_fire2_expand1x1_bias, d_result_fire2_expand)
conv3x3_NDR(queue,(64, 55), None, 16, 55, 1, 1, 64, 55, d_result_fire2_squeeze, d_fire2_expand3x3_weight, d_fire2_expand3x3_bias, d_result_fire2_expand)
queue.finish()
queue1.finish()

maxpool_NDR(queue, (128, ), None, 55, 27, d_result_fire2_expand, d_result_pool2)

#block2
conv1x1_NDR(queue,(32, 27), None, np.int32(128/4), 27, d_result_pool2, d_fire3_squeeze_weight, d_fire3_squeeze_bias, d_result_fire3_squeeze)
queue.finish()
conv1x1_NDR(queue1,(128, 27), None, np.int32(32/4), 27, d_result_fire3_squeeze, d_fire3_expand1x1_weight, d_fire3_expand1x1_bias, d_result_fire3_expand)
conv3x3_NDR(queue,(128, 27), None, 32, 27, 1, 1, 128, 27, d_result_fire3_squeeze, d_fire3_expand3x3_weight, d_fire3_expand3x3_bias, d_result_fire3_expand)
queue.finish()
queue1.finish()

conv1x1_NDR(queue,(32, 27), None, np.int32(256/4), 27, d_result_fire3_expand, d_fire4_squeeze_weight, d_fire4_squeeze_bias, d_result_fire4_squeeze)
queue.finish()
conv1x1_NDR(queue1,(128, 27), None, np.int32(32/4), 27, d_result_fire4_squeeze, d_fire4_expand1x1_weight, d_fire4_expand1x1_bias, d_result_fire4_expand)
conv3x3_NDR(queue,(128, 27), None, 32, 27, 1, 1, 128, 27, d_result_fire4_squeeze, d_fire4_expand3x3_weight, d_fire4_expand3x3_bias, d_result_fire4_expand)
queue.finish()
queue1.finish()

maxpool_NDR(queue, (256, ), None, 27, 13, d_result_fire4_expand, d_result_pool3)

#block3
conv1x1_NDR(queue,(48, 13), None, np.int32(256/4), 13, d_result_pool3, d_fire5_squeeze_weight, d_fire5_squeeze_bias, d_result_fire5_squeeze)
queue.finish()
conv1x1_NDR(queue1,(192, 13), None, np.int32(48/4), 13, d_result_fire5_squeeze, d_fire5_expand1x1_weight, d_fire5_expand1x1_bias, d_result_fire5_expand)
conv3x3_NDR(queue,(192, 13), None, 48, 13, 1, 1, 192, 13, d_result_fire5_squeeze, d_fire5_expand3x3_weight, d_fire5_expand3x3_bias, d_result_fire5_expand)
queue.finish()
queue1.finish()

conv1x1_NDR(queue,(48, 13), None, np.int32(384/4), 13, d_result_fire5_expand, d_fire6_squeeze_weight, d_fire6_squeeze_bias, d_result_fire6_squeeze)
queue.finish()
conv1x1_NDR(queue1,(192, 13), None, np.int32(48/4), 13, d_result_fire6_squeeze, d_fire6_expand1x1_weight, d_fire6_expand1x1_bias, d_result_fire6_expand)
conv3x3_NDR(queue,(192, 13), None, 48, 13, 1, 1, 192, 13, d_result_fire6_squeeze, d_fire6_expand3x3_weight, d_fire6_expand3x3_bias, d_result_fire6_expand)
queue.finish()
queue1.finish()

conv1x1_NDR(queue,(64, 13), None, np.int32(384/4), 13, d_result_fire6_expand, d_fire7_squeeze_weight, d_fire7_squeeze_bias, d_result_fire7_squeeze)
queue.finish()
conv1x1_NDR(queue1,(256, 13), None, np.int32(64/4), 13, d_result_fire7_squeeze, d_fire7_expand1x1_weight, d_fire7_expand1x1_bias, d_result_fire7_expand)
conv3x3_NDR(queue,(256, 13), None, 64, 13, 1, 1, 256, 13, d_result_fire7_squeeze, d_fire7_expand3x3_weight, d_fire7_expand3x3_bias, d_result_fire7_expand)
queue.finish()
queue1.finish()

conv1x1_NDR(queue,(64, 13), None, np.int32(512/4), 13, d_result_fire7_expand, d_fire8_squeeze_weight, d_fire8_squeeze_bias, d_result_fire8_squeeze)
queue.finish()
conv1x1_NDR(queue1,(256, 13), None, np.int32(64/4), 13, d_result_fire8_squeeze, d_fire8_expand1x1_weight, d_fire8_expand1x1_bias, d_result_fire8_expand)
conv3x3_NDR(queue,(256, 13), None, 64, 13, 1, 1, 256, 13, d_result_fire8_squeeze, d_fire8_expand3x3_weight, d_fire8_expand3x3_bias, d_result_fire8_expand)
queue.finish()
queue1.finish()

# classifier
conv1x1_NDR(queue,(1000, 13), None, np.int32(512/4), 13, d_result_fire8_expand, d_classifier_conv_weight, d_classifier_conv_bias, d_result_classifier_conv)

avgpool_NDR(queue,(1000, ), None, d_result_classifier_conv, d_result_classifier)

cl.enqueue_copy(queue, h_result_classifier, d_result_classifier)

queue.finish()

veamos = h_result_classifier

rtime = pc() - tic2


In [56]:
# Simple task
h_sample = imagen.reshape(-1).astype(np.float32)
d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

conv1_weight = weights0.reshape(-1)
conv1_bias = bias0

fire1_squeeze_weight = weights1.reshape(-1)
fire1_squeeze_bias = bias1
fire1_expand1x1_weight = weights2a.reshape(-1)
fire1_expand1x1_bias = bias2a
fire1_expand3x3_weight =weights2b.reshape(-1)
fire1_expand3x3_bias = bias2b

fire2_squeeze_weight = weights3.reshape(-1)
fire2_squeeze_bias = bias3
fire2_expand1x1_weight = weights4a.reshape(-1)
fire2_expand1x1_bias = bias4a
fire2_expand3x3_weight =weights4b.reshape(-1)
fire2_expand3x3_bias = bias4b

fire3_squeeze_weight = weights5.reshape(-1)
fire3_squeeze_bias = bias5
fire3_expand1x1_weight = weights6a.reshape(-1)
fire3_expand1x1_bias = bias6a
fire3_expand3x3_weight =weights6b.reshape(-1)
fire3_expand3x3_bias = bias6b

fire4_squeeze_weight = weights7.reshape(-1)
fire4_squeeze_bias = bias7
fire4_expand1x1_weight = weights8a.reshape(-1)
fire4_expand1x1_bias = bias8a
fire4_expand3x3_weight =weights8b.reshape(-1)
fire4_expand3x3_bias = bias8b

fire5_squeeze_weight = weights9.reshape(-1)
fire5_squeeze_bias = bias9
fire5_expand1x1_weight = weights10a.reshape(-1)
fire5_expand1x1_bias = bias10a
fire5_expand3x3_weight =weights10b.reshape(-1)
fire5_expand3x3_bias = bias10b

fire6_squeeze_weight = weights11.reshape(-1)
fire6_squeeze_bias = bias11
fire6_expand1x1_weight = weights12a.reshape(-1)
fire6_expand1x1_bias = bias12a
fire6_expand3x3_weight =weights12b.reshape(-1)
fire6_expand3x3_bias = bias12b

fire7_squeeze_weight = weights13.reshape(-1)
fire7_squeeze_bias = bias13
fire7_expand1x1_weight = weights14a.reshape(-1)
fire7_expand1x1_bias = bias14a
fire7_expand3x3_weight =weights14b.reshape(-1)
fire7_expand3x3_bias = bias14b

fire8_squeeze_weight = weights15.reshape(-1)
fire8_squeeze_bias = bias15
fire8_expand1x1_weight = weights16a.reshape(-1)
fire8_expand1x1_bias = bias16a
fire8_expand3x3_weight =weights16b.reshape(-1)
fire8_expand3x3_bias = bias16b

classifier_conv_weight = weights17.reshape(-1)
classifier_conv_bias = bias17

h_result_conv = np.empty(1 * 64 * 111 * 111).astype(np.float32)
h_result_pool1 = np.empty(1 * 64 * 55 * 55).astype(np.float32)

h_result_fire1_squeeze = np.empty(1 * 16 * 55 * 55).astype(np.float32)
h_result_fire1_expand = np.empty(1 * 128 * 55 * 55).astype(np.float32)
h_result_fire2_squeeze = np.empty(1 * 16 * 55 * 55).astype(np.float32)
h_result_fire2_expand = np.empty(1 * 128 * 55 * 55).astype(np.float32)
h_result_pool2 = np.empty(1 * 128 * 27 * 27).astype(np.float32)

h_result_fire3_squeeze = np.empty(1 * 32 * 27 * 27).astype(np.float32)
h_result_fire3_expand = np.empty(1 * 256 * 27 * 27).astype(np.float32)
h_result_fire4_squeeze = np.empty(1 * 32 * 27 * 27).astype(np.float32)
h_result_fire4_expand = np.empty(1 * 256 * 27 * 27).astype(np.float32)
h_result_pool3 = np.empty(1 * 256 * 13 * 13).astype(np.float32)

h_result_fire5_squeeze = np.empty(1 * 48 * 13 * 13).astype(np.float32)
h_result_fire5_expand = np.empty(1 * 384 * 13 * 13).astype(np.float32)
h_result_fire6_squeeze = np.empty(1 * 48 * 13 * 13).astype(np.float32)
h_result_fire6_expand = np.empty(1 * 384 * 13 * 13).astype(np.float32)
h_result_fire7_squeeze = np.empty(1 * 64 * 13 * 13).astype(np.float32)
h_result_fire7_expand = np.empty(1 * 512 * 13 * 13).astype(np.float32)
h_result_fire8_squeeze = np.empty(1 * 64 * 13 * 13).astype(np.float32)
h_result_fire8_expand = np.empty(1 * 512 * 13 * 13).astype(np.float32)

h_result_classifier_conv = np.empty(1 * 1000 * 13 * 13).astype(np.float32)
h_result_classifier = np.empty(1 * 1000).astype(np.float32)

d_conv1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=conv1_weight)
d_conv1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=conv1_bias)

d_fire1_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
d_fire1_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)
d_fire1_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand1x1_weight)
d_fire1_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand1x1_bias)
d_fire1_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand3x3_weight)
d_fire1_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand3x3_bias)

d_fire2_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_squeeze_weight)
d_fire2_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_squeeze_bias)
d_fire2_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand1x1_weight)
d_fire2_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand1x1_bias)
d_fire2_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand3x3_weight)
d_fire2_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand3x3_bias)

d_fire3_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_squeeze_weight)
d_fire3_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_squeeze_bias)
d_fire3_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand1x1_weight)
d_fire3_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand1x1_bias)
d_fire3_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand3x3_weight)
d_fire3_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand3x3_bias)

d_fire4_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_squeeze_weight)
d_fire4_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_squeeze_bias)
d_fire4_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand1x1_weight)
d_fire4_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand1x1_bias)
d_fire4_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand3x3_weight)
d_fire4_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand3x3_bias)

d_fire5_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_squeeze_weight)
d_fire5_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_squeeze_bias)
d_fire5_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand1x1_weight)
d_fire5_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand1x1_bias)
d_fire5_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand3x3_weight)
d_fire5_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand3x3_bias)

d_fire6_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_squeeze_weight)
d_fire6_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_squeeze_bias)
d_fire6_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand1x1_weight)
d_fire6_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand1x1_bias)
d_fire6_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand3x3_weight)
d_fire6_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand3x3_bias)

d_fire7_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_squeeze_weight)
d_fire7_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_squeeze_bias)
d_fire7_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand1x1_weight)
d_fire7_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand1x1_bias)
d_fire7_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand3x3_weight)
d_fire7_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand3x3_bias)

d_fire8_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_squeeze_weight)
d_fire8_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_squeeze_bias)
d_fire8_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand1x1_weight)
d_fire8_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand1x1_bias)
d_fire8_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand3x3_weight)
d_fire8_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand3x3_bias)

d_classifier_conv_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=classifier_conv_weight)
d_classifier_conv_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=classifier_conv_bias)

d_result_conv = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_conv.nbytes)
d_result_pool1 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool1.nbytes)

d_result_fire1_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)
d_result_fire1_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_expand.nbytes)
d_result_fire2_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire2_squeeze.nbytes)
d_result_fire2_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire2_expand.nbytes)
d_result_pool2 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool2.nbytes)

d_result_fire3_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire3_squeeze.nbytes)
d_result_fire3_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire3_expand.nbytes)
d_result_fire4_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire4_squeeze.nbytes)
d_result_fire4_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire4_expand.nbytes)
d_result_pool3 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool3.nbytes)

d_result_fire5_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire5_squeeze.nbytes)
d_result_fire5_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire5_expand.nbytes)
d_result_fire6_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire6_squeeze.nbytes)
d_result_fire6_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire6_expand.nbytes)
d_result_fire7_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire7_squeeze.nbytes)
d_result_fire7_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire7_expand.nbytes)
d_result_fire8_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire8_squeeze.nbytes)
d_result_fire8_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire8_expand.nbytes)

d_result_classifier_conv = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_classifier_conv.nbytes)
d_result_classifier = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_classifier.nbytes)

tic3 = pc()

#first conv layer
conv3x3_ST(queue,(1,), None, 3, 224, 0, 2, 0, 111, 64, d_sample, d_conv1_weight, d_conv1_bias, d_result_conv)
maxpool_ST(queue, (1, ), None, 111, 55, 64, d_result_conv, d_result_pool1)

#block1
conv1x1_ST(queue,(1,), None, 64, 55, 16, d_result_pool1, d_fire1_squeeze_weight, d_fire1_squeeze_bias, d_result_fire1_squeeze)
queue.finish()
conv1x1_ST(queue1,(1,), None, 16, 55, 64, d_result_fire1_squeeze, d_fire1_expand1x1_weight, d_fire1_expand1x1_bias, d_result_fire1_expand)
conv3x3_ST(queue,(1,), None, 16, 55, 1, 1, 64, 55, 64, d_result_fire1_squeeze, d_fire1_expand3x3_weight, d_fire1_expand3x3_bias, d_result_fire1_expand)
queue.finish()
queue1.finish()

conv1x1_ST(queue,(1,), None, 128, 55, 16, d_result_fire1_expand, d_fire2_squeeze_weight, d_fire2_squeeze_bias, d_result_fire2_squeeze)
queue.finish()
conv1x1_ST(queue1,(1,), None, 16, 55, 64, d_result_fire2_squeeze, d_fire2_expand1x1_weight, d_fire2_expand1x1_bias, d_result_fire2_expand)
conv3x3_ST(queue,(1,), None, 16, 55, 1, 1, 64, 55, 64, d_result_fire2_squeeze, d_fire2_expand3x3_weight, d_fire2_expand3x3_bias, d_result_fire2_expand)
queue.finish()
queue1.finish()

maxpool_ST(queue, (1, ), None, 55, 27, 128, d_result_fire2_expand, d_result_pool2)

#block2
conv1x1_ST(queue,(1,), None, 128, 27, 32, d_result_pool2, d_fire3_squeeze_weight, d_fire3_squeeze_bias, d_result_fire3_squeeze)
queue.finish()
conv1x1_ST(queue1,(1,), None, 32, 27, 128, d_result_fire3_squeeze, d_fire3_expand1x1_weight, d_fire3_expand1x1_bias, d_result_fire3_expand)
conv3x3_ST(queue,(1,), None, 32, 27, 1, 1, 128, 27, 128, d_result_fire3_squeeze, d_fire3_expand3x3_weight, d_fire3_expand3x3_bias, d_result_fire3_expand)
queue.finish()
queue1.finish()

conv1x1_ST(queue,(1,), None, 256, 27, 32, d_result_fire3_expand, d_fire4_squeeze_weight, d_fire4_squeeze_bias, d_result_fire4_squeeze)
queue.finish()
conv1x1_ST(queue1,(1,), None, 32, 27, 128, d_result_fire4_squeeze, d_fire4_expand1x1_weight, d_fire4_expand1x1_bias, d_result_fire4_expand)
conv3x3_ST(queue,(1,), None, 32, 27, 1, 1, 128, 27, 128, d_result_fire4_squeeze, d_fire4_expand3x3_weight, d_fire4_expand3x3_bias, d_result_fire4_expand)
queue.finish()
queue1.finish()

maxpool_ST(queue, (1, ), None, 27, 13, 256, d_result_fire4_expand, d_result_pool3)

#block3
conv1x1_ST(queue,(1,), None, 256, 13, 48, d_result_pool3, d_fire5_squeeze_weight, d_fire5_squeeze_bias, d_result_fire5_squeeze)
queue.finish()
conv1x1_ST(queue1,(1,), None, 48, 13, 192, d_result_fire5_squeeze, d_fire5_expand1x1_weight, d_fire5_expand1x1_bias, d_result_fire5_expand)
conv3x3_ST(queue,(1,), None, 48, 13, 1, 1, 192, 13, 192, d_result_fire5_squeeze, d_fire5_expand3x3_weight, d_fire5_expand3x3_bias, d_result_fire5_expand)
queue.finish()
queue1.finish()

conv1x1_ST(queue,(1,), None, 384, 13, 48, d_result_fire5_expand, d_fire6_squeeze_weight, d_fire6_squeeze_bias, d_result_fire6_squeeze)
queue.finish()
conv1x1_ST(queue1,(1,), None, 48, 13, 192, d_result_fire6_squeeze, d_fire6_expand1x1_weight, d_fire6_expand1x1_bias, d_result_fire6_expand)
conv3x3_ST(queue,(1,), None, 48, 13, 1, 1, 192, 13, 192, d_result_fire6_squeeze, d_fire6_expand3x3_weight, d_fire6_expand3x3_bias, d_result_fire6_expand)
queue.finish()
queue1.finish()

conv1x1_ST(queue,(1,), None, 384, 13, 64, d_result_fire6_expand, d_fire7_squeeze_weight, d_fire7_squeeze_bias, d_result_fire7_squeeze)
queue.finish()
conv1x1_ST(queue1,(1,), None, 64, 13, 256, d_result_fire7_squeeze, d_fire7_expand1x1_weight, d_fire7_expand1x1_bias, d_result_fire7_expand)
conv3x3_ST(queue,(1,), None, 64, 13, 1, 1, 256, 13, 256, d_result_fire7_squeeze, d_fire7_expand3x3_weight, d_fire7_expand3x3_bias, d_result_fire7_expand)
queue.finish()
queue1.finish()

conv1x1_ST(queue,(1,), None, 512, 13, 64, d_result_fire7_expand, d_fire8_squeeze_weight, d_fire8_squeeze_bias, d_result_fire8_squeeze)
queue.finish()
conv1x1_ST(queue1,(1,), None, 64, 13, 256, d_result_fire8_squeeze, d_fire8_expand1x1_weight, d_fire8_expand1x1_bias, d_result_fire8_expand)
conv3x3_ST(queue,(1,), None, 64, 13, 1, 1, 256, 13, 256, d_result_fire8_squeeze, d_fire8_expand3x3_weight, d_fire8_expand3x3_bias, d_result_fire8_expand)
queue.finish()
queue1.finish()

# classifier
conv1x1_ST(queue,(1,), None, 512, 13, 1000, d_result_fire8_expand, d_classifier_conv_weight, d_classifier_conv_bias, d_result_classifier_conv)

avgpool_ST(queue, (1, ), None, d_result_classifier_conv, d_result_classifier)

cl.enqueue_copy(queue, h_result_classifier, d_result_classifier)

veamos1 = h_result_classifier

rtime1 = pc() - tic3


In [57]:
print ("tiempo en segundos con pytorch= ", toc-tic)
print ("tiempo en segundos con opencl (NDRANGE)=",rtime)
print ("tiempo en segundos con opencl (Simple Task)=",rtime1)

comparativa1=np.allclose(salida18_a_numpy.reshape(-1), veamos,rtol=1e-01, atol=1e-01)
comparativa2=np.allclose(salida18_a_numpy.reshape(-1), veamos1,rtol=1e-01, atol=1e-01)
comparativa3=np.allclose(veamos, veamos1,rtol=1e-01, atol=1e-01)

print("comparativa (pytorch == NDRange): ",comparativa1)
print("comparativa (pytorch == Simple Task): ",comparativa2)
print("comparativa (NDRange == Simple Task): ",comparativa3)

tiempo en segundos con pytorch=  0.0030234180003390065
tiempo en segundos con opencl (NDRANGE)= 0.03991814599976351
tiempo en segundos con opencl (Simple Task)= 0.04235464699922886
comparativa (pytorch == NDRange):  True
comparativa (pytorch == Simple Task):  True
comparativa (NDRange == Simple Task):  True


In [155]:
for i in range(1000):
    if (abs(salida18_a_numpy.reshape(-1)[i] - veamos1[i])) > 1e-01:
        print("i:", i, salida18_a_numpy.reshape(-1)[i], veamos1[i])

In [18]:
squeeze_activation = nn.ReLU(inplace=True)

count = 100
acumulado_pytorch=0
acumulado_kernel=0
comparativa = True

#####################################################

params = torch.load('squeezenet1_1.pth')

### First Conv3x3 and maxpool
weights0=params['features.0.weight'].numpy()
bias0=params['features.0.bias'].numpy()

######## BLOCK 1 ########
#fire - fire - maxpool
#### FIRE 1 ####
weights1=params['features.3.squeeze.weight'].numpy()
bias1=params['features.3.squeeze.bias'].numpy()

weights2a=params['features.3.expand1x1.weight'].numpy()
bias2a=params['features.3.expand1x1.bias'].numpy()    

weights2b=params['features.3.expand3x3.weight'].numpy()   
bias2b=params['features.3.expand3x3.bias'].numpy()

#### FIRE 2 ####
weights3=params['features.4.squeeze.weight'].numpy()
bias3=params['features.4.squeeze.bias'].numpy()

weights4a=params['features.4.expand1x1.weight'].numpy()
bias4a=params['features.4.expand1x1.bias'].numpy()    

weights4b=params['features.4.expand3x3.weight'].numpy()   
bias4b=params['features.4.expand3x3.bias'].numpy()

######## BLOCK 2 ########
#fire - fire - maxpool
#### FIRE 3 ####
weights5=params['features.6.squeeze.weight'].numpy()
bias5=params['features.6.squeeze.bias'].numpy()

weights6a=params['features.6.expand1x1.weight'].numpy()
bias6a=params['features.6.expand1x1.bias'].numpy()    

weights6b=params['features.6.expand3x3.weight'].numpy()   
bias6b=params['features.6.expand3x3.bias'].numpy()

#### FIRE 4 ####
weights7=params['features.7.squeeze.weight'].numpy()
bias7=params['features.7.squeeze.bias'].numpy()

weights8a=params['features.7.expand1x1.weight'].numpy()
bias8a=params['features.7.expand1x1.bias'].numpy()    

weights8b=params['features.7.expand3x3.weight'].numpy()  
bias8b=params['features.7.expand3x3.bias'].numpy()

######## BLOCK 3 ########
#fire - fire - fire - fire
#### FIRE 5 ####
weights9=params['features.9.squeeze.weight'].numpy()
bias9=params['features.9.squeeze.bias'].numpy()

weights10a=params['features.9.expand1x1.weight'].numpy()
bias10a=params['features.9.expand1x1.bias'].numpy()    

weights10b=params['features.9.expand3x3.weight'].numpy()
bias10b=params['features.9.expand3x3.bias'].numpy()

#### FIRE 6 ####
weights11=params['features.10.squeeze.weight'].numpy()
bias11=params['features.10.squeeze.bias'].numpy()

weights12a=params['features.10.expand1x1.weight'].numpy()
bias12a=params['features.10.expand1x1.bias'].numpy()    

weights12b=params['features.10.expand3x3.weight'].numpy()
bias12b=params['features.10.expand3x3.bias'].numpy()

#### FIRE 7 ####
weights13=params['features.11.squeeze.weight'].numpy()
bias13=params['features.11.squeeze.bias'].numpy()

weights14a=params['features.11.expand1x1.weight'].numpy()
bias14a=params['features.11.expand1x1.bias'].numpy()    

weights14b=params['features.11.expand3x3.weight'].numpy()
bias14b=params['features.11.expand3x3.bias'].numpy()

#### FIRE 8 ####
weights15=params['features.12.squeeze.weight'].numpy()
bias15=params['features.12.squeeze.bias'].numpy()

weights16a=params['features.12.expand1x1.weight'].numpy()
bias16a=params['features.12.expand1x1.bias'].numpy()    

weights16b=params['features.12.expand3x3.weight'].numpy()
bias16b=params['features.12.expand3x3.bias'].numpy()

######## Classifier ########
#conv3x3 - avgpool
### Classifier Conv3x3 and avgpool
weights17=params['classifier.1.weight'].numpy()
bias17=params['classifier.1.bias'].numpy()

for i in range(count):
   

    transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ],
                         std = [ 0.229, 0.224, 0.225 ]),])

    imagen = np.random.rand(3, 224, 224).astype(np.float32)
    imagen = Image.fromarray(imagen,'RGB')
    imagen = transform(imagen).numpy()
    imagen = imagen[np.newaxis,:]

    #imagen = np.ones((1, 3, 224, 224))
    #print(imagen.shape)

    tic=pc()

    squeeze0=nn.Conv2d(3, 64, kernel_size=3, bias=False, stride=2)
    squeeze0.weight = nn.Parameter(torch.from_numpy(weights0))
    squeeze0.bias = nn.Parameter(torch.from_numpy(bias0))

    maxpool=nn.MaxPool2d(3, stride=2)

    squeeze1=nn.Conv2d(64, 16, kernel_size=1, bias=False)
    squeeze1.weight = nn.Parameter(torch.from_numpy(weights1))
    squeeze1.bias = nn.Parameter(torch.from_numpy(bias1))    

    squeeze2a=nn.Conv2d(16, 64, kernel_size=1, bias=False)
    squeeze2a.weight = nn.Parameter(torch.from_numpy(weights2a))
    squeeze2a.bias = nn.Parameter(torch.from_numpy(bias2a))

    squeeze2b=nn.Conv2d(16, 64, kernel_size=3, bias=False, padding=1)
    squeeze2b.weight = nn.Parameter(torch.from_numpy(weights2b))
    squeeze2b.bias = nn.Parameter(torch.from_numpy(bias2b))

    squeeze3=nn.Conv2d(128, 16, kernel_size=1, bias=False)
    squeeze3.weight = nn.Parameter(torch.from_numpy(weights3))
    squeeze3.bias = nn.Parameter(torch.from_numpy(bias3))    

    squeeze4a=nn.Conv2d(16, 64, kernel_size=1, bias=False)
    squeeze4a.weight = nn.Parameter(torch.from_numpy(weights4a))
    squeeze4a.bias = nn.Parameter(torch.from_numpy(bias4a))

    squeeze4b=nn.Conv2d(16, 64, kernel_size=3, bias=False, padding=1)
    squeeze4b.weight = nn.Parameter(torch.from_numpy(weights4b))
    squeeze4b.bias = nn.Parameter(torch.from_numpy(bias4b))

    squeeze5=nn.Conv2d(128, 32, kernel_size=1, bias=False)
    squeeze5.weight = nn.Parameter(torch.from_numpy(weights5))
    squeeze5.bias = nn.Parameter(torch.from_numpy(bias5))    

    squeeze6a=nn.Conv2d(32, 128, kernel_size=1, bias=False)
    squeeze6a.weight = nn.Parameter(torch.from_numpy(weights6a))
    squeeze6a.bias = nn.Parameter(torch.from_numpy(bias6a))

    squeeze6b=nn.Conv2d(32, 128, kernel_size=3, bias=False, padding=1)
    squeeze6b.weight = nn.Parameter(torch.from_numpy(weights6b))
    squeeze6b.bias = nn.Parameter(torch.from_numpy(bias6b))

    squeeze7=nn.Conv2d(256, 32, kernel_size=1, bias=False)
    squeeze7.weight = nn.Parameter(torch.from_numpy(weights7))
    squeeze7.bias = nn.Parameter(torch.from_numpy(bias7))    

    squeeze8a=nn.Conv2d(32, 128, kernel_size=1, bias=False)
    squeeze8a.weight = nn.Parameter(torch.from_numpy(weights8a))
    squeeze8a.bias = nn.Parameter(torch.from_numpy(bias8a))

    squeeze8b=nn.Conv2d(32, 128, kernel_size=3, bias=False, padding=1)
    squeeze8b.weight = nn.Parameter(torch.from_numpy(weights8b))
    squeeze8b.bias = nn.Parameter(torch.from_numpy(bias8b))

    squeeze9=nn.Conv2d(256, 48, kernel_size=1, bias=False)
    squeeze9.weight = nn.Parameter(torch.from_numpy(weights9))
    squeeze9.bias = nn.Parameter(torch.from_numpy(bias9))    

    squeeze10a=nn.Conv2d(48, 192, kernel_size=1, bias=False)
    squeeze10a.weight = nn.Parameter(torch.from_numpy(weights10a))
    squeeze10a.bias = nn.Parameter(torch.from_numpy(bias10a))

    squeeze10b=nn.Conv2d(48, 192, kernel_size=3, bias=False, padding=1)
    squeeze10b.weight = nn.Parameter(torch.from_numpy(weights10b))
    squeeze10b.bias = nn.Parameter(torch.from_numpy(bias10b))

    squeeze11=nn.Conv2d(384, 48, kernel_size=1, bias=False)
    squeeze11.weight = nn.Parameter(torch.from_numpy(weights11))
    squeeze11.bias = nn.Parameter(torch.from_numpy(bias11))    

    squeeze12a=nn.Conv2d(48, 192, kernel_size=1, bias=False)
    squeeze12a.weight = nn.Parameter(torch.from_numpy(weights12a))
    squeeze12a.bias = nn.Parameter(torch.from_numpy(bias12a))

    squeeze12b=nn.Conv2d(48, 192, kernel_size=3, bias=False, padding=1)
    squeeze12b.weight = nn.Parameter(torch.from_numpy(weights12b))
    squeeze12b.bias = nn.Parameter(torch.from_numpy(bias12b))

    squeeze13=nn.Conv2d(384, 64, kernel_size=1, bias=False)
    squeeze13.weight = nn.Parameter(torch.from_numpy(weights13))
    squeeze13.bias = nn.Parameter(torch.from_numpy(bias13))    

    squeeze14a=nn.Conv2d(64, 256, kernel_size=1, bias=False)
    squeeze14a.weight = nn.Parameter(torch.from_numpy(weights14a))
    squeeze14a.bias = nn.Parameter(torch.from_numpy(bias14a))

    squeeze14b=nn.Conv2d(64, 256, kernel_size=3, bias=False, padding=1)
    squeeze14b.weight = nn.Parameter(torch.from_numpy(weights14b))
    squeeze14b.bias = nn.Parameter(torch.from_numpy(bias14b))

    squeeze15=nn.Conv2d(512, 64, kernel_size=1, bias=False)
    squeeze15.weight = nn.Parameter(torch.from_numpy(weights15))
    squeeze15.bias = nn.Parameter(torch.from_numpy(bias15))    

    squeeze16a=nn.Conv2d(64, 256, kernel_size=1, bias=False)
    squeeze16a.weight = nn.Parameter(torch.from_numpy(weights16a))
    squeeze16a.bias = nn.Parameter(torch.from_numpy(bias16a))

    squeeze16b=nn.Conv2d(64, 256, kernel_size=3, bias=False, padding=1)
    squeeze16b.weight = nn.Parameter(torch.from_numpy(weights16b))
    squeeze16b.bias = nn.Parameter(torch.from_numpy(bias16b))

    conv_class=nn.Conv2d(512, 1000, kernel_size=1, bias=False)
    conv_class.weight = nn.Parameter(torch.from_numpy(weights17))
    conv_class.bias = nn.Parameter(torch.from_numpy(bias17))

    avgpool=nn.AvgPool2d(13)

    imagen1  = torch.from_numpy(imagen).float()

    salida0=squeeze0(imagen1)
    salida0_activation=squeeze_activation(salida0)

    salida_pool1 = maxpool(salida0_activation)

    salida1=squeeze1(salida_pool1)
    salida1_activation=squeeze_activation(salida1)

    salida2a=squeeze2a(salida1_activation)
    salida2a_activation=squeeze_activation(salida2a)
    salida2b=squeeze2b(salida1_activation)
    salida2b_activation=squeeze_activation(salida2b)    
    salida2_total=torch.cat([salida2a_activation,salida2b_activation], 1)

    salida3=squeeze3(salida2_total)
    salida3_activation=squeeze_activation(salida3)

    salida4a=squeeze4a(salida3_activation)
    salida4a_activation=squeeze_activation(salida4a)
    salida4b=squeeze4b(salida3_activation)
    salida4b_activation=squeeze_activation(salida4b)    
    salida4_total=torch.cat([salida4a_activation,salida4b_activation], 1)

    salida_pool2 = maxpool(salida4_total)

    salida5=squeeze5(salida_pool2)
    salida5_activation=squeeze_activation(salida5)

    salida6a=squeeze6a(salida5_activation)
    salida6a_activation=squeeze_activation(salida6a)
    salida6b=squeeze6b(salida5_activation)
    salida6b_activation=squeeze_activation(salida6b)    
    salida6_total=torch.cat([salida6a_activation,salida6b_activation], 1)

    salida7=squeeze7(salida6_total)
    salida7_activation=squeeze_activation(salida7)

    salida8a=squeeze8a(salida7_activation)
    salida8a_activation=squeeze_activation(salida8a)
    salida8b=squeeze8b(salida7_activation)
    salida8b_activation=squeeze_activation(salida8b)    
    salida8_total=torch.cat([salida8a_activation,salida8b_activation], 1)

    salida_pool3 = maxpool(salida8_total)

    salida9=squeeze9(salida_pool3)
    salida9_activation=squeeze_activation(salida9)

    salida10a=squeeze10a(salida9_activation)
    salida10a_activation=squeeze_activation(salida10a)
    salida10b=squeeze10b(salida9_activation)
    salida10b_activation=squeeze_activation(salida10b)    
    salida10_total=torch.cat([salida10a_activation,salida10b_activation], 1)

    salida11=squeeze11(salida10_total)
    salida11_activation=squeeze_activation(salida11)

    salida12a=squeeze12a(salida11_activation)
    salida12a_activation=squeeze_activation(salida12a)
    salida12b=squeeze12b(salida11_activation)
    salida12b_activation=squeeze_activation(salida12b)    
    salida12_total=torch.cat([salida12a_activation,salida12b_activation], 1)

    salida13=squeeze13(salida12_total)
    salida13_activation=squeeze_activation(salida13)

    salida14a=squeeze14a(salida13_activation)
    salida14a_activation=squeeze_activation(salida14a)
    salida14b=squeeze14b(salida13_activation)
    salida14b_activation=squeeze_activation(salida14b)    
    salida14_total=torch.cat([salida14a_activation,salida14b_activation], 1)

    salida15=squeeze15(salida14_total)
    salida15_activation=squeeze_activation(salida15)

    salida16a=squeeze16a(salida15_activation)
    salida16a_activation=squeeze_activation(salida16a)
    salida16b=squeeze16b(salida15_activation)
    salida16b_activation=squeeze_activation(salida16b)    
    salida16_total=torch.cat([salida16a_activation,salida16b_activation], 1)

    salida17=conv_class(salida16_total)
    salida17_activation=squeeze_activation(salida17)
    salida18=avgpool(salida17_activation)

    salida18_a_numpy=salida18.detach().numpy()

    toc=pc()

    acumulado_pytorch=toc-tic+acumulado_pytorch

    ####### OPENCL COMPARISON #######
                               
    # NDRANGE
    h_sample = imagen.reshape(-1).astype(np.float32)
    d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

    conv1_weight = weights0.reshape(-1)
    conv1_bias = bias0

    fire1_squeeze_weight = weights1.reshape(-1)
    fire1_squeeze_bias = bias1
    fire1_expand1x1_weight = weights2a.reshape(-1)
    fire1_expand1x1_bias = bias2a
    fire1_expand3x3_weight =weights2b.reshape(-1)
    fire1_expand3x3_bias = bias2b

    fire2_squeeze_weight = weights3.reshape(-1)
    fire2_squeeze_bias = bias3
    fire2_expand1x1_weight = weights4a.reshape(-1)
    fire2_expand1x1_bias = bias4a
    fire2_expand3x3_weight =weights4b.reshape(-1)
    fire2_expand3x3_bias = bias4b

    fire3_squeeze_weight = weights5.reshape(-1)
    fire3_squeeze_bias = bias5
    fire3_expand1x1_weight = weights6a.reshape(-1)
    fire3_expand1x1_bias = bias6a
    fire3_expand3x3_weight =weights6b.reshape(-1)
    fire3_expand3x3_bias = bias6b

    fire4_squeeze_weight = weights7.reshape(-1)
    fire4_squeeze_bias = bias7
    fire4_expand1x1_weight = weights8a.reshape(-1)
    fire4_expand1x1_bias = bias8a
    fire4_expand3x3_weight =weights8b.reshape(-1)
    fire4_expand3x3_bias = bias8b

    fire5_squeeze_weight = weights9.reshape(-1)
    fire5_squeeze_bias = bias9
    fire5_expand1x1_weight = weights10a.reshape(-1)
    fire5_expand1x1_bias = bias10a
    fire5_expand3x3_weight =weights10b.reshape(-1)
    fire5_expand3x3_bias = bias10b

    fire6_squeeze_weight = weights11.reshape(-1)
    fire6_squeeze_bias = bias11
    fire6_expand1x1_weight = weights12a.reshape(-1)
    fire6_expand1x1_bias = bias12a
    fire6_expand3x3_weight =weights12b.reshape(-1)
    fire6_expand3x3_bias = bias12b

    fire7_squeeze_weight = weights13.reshape(-1)
    fire7_squeeze_bias = bias13
    fire7_expand1x1_weight = weights14a.reshape(-1)
    fire7_expand1x1_bias = bias14a
    fire7_expand3x3_weight =weights14b.reshape(-1)
    fire7_expand3x3_bias = bias14b

    fire8_squeeze_weight = weights15.reshape(-1)
    fire8_squeeze_bias = bias15
    fire8_expand1x1_weight = weights16a.reshape(-1)
    fire8_expand1x1_bias = bias16a
    fire8_expand3x3_weight =weights16b.reshape(-1)
    fire8_expand3x3_bias = bias16b

    classifier_conv_weight = weights17.reshape(-1)
    classifier_conv_bias = bias17

    h_result_conv = np.empty(1 * 64 * 111 * 111).astype(np.float32)
    h_result_pool1 = np.empty(1 * 64 * 55 * 55).astype(np.float32)

    h_result_fire1_squeeze = np.empty(1 * 16 * 55 * 55).astype(np.float32)
    h_result_fire1_expand = np.empty(1 * 128 * 55 * 55).astype(np.float32)
    h_result_fire2_squeeze = np.empty(1 * 16 * 55 * 55).astype(np.float32)
    h_result_fire2_expand = np.empty(1 * 128 * 55 * 55).astype(np.float32)
    h_result_pool2 = np.empty(1 * 128 * 27 * 27).astype(np.float32)

    h_result_fire3_squeeze = np.empty(1 * 32 * 27 * 27).astype(np.float32)
    h_result_fire3_expand = np.empty(1 * 256 * 27 * 27).astype(np.float32)
    h_result_fire4_squeeze = np.empty(1 * 32 * 27 * 27).astype(np.float32)
    h_result_fire4_expand = np.empty(1 * 256 * 27 * 27).astype(np.float32)
    h_result_pool3 = np.empty(1 * 256 * 13 * 13).astype(np.float32)

    h_result_fire5_squeeze = np.empty(1 * 48 * 13 * 13).astype(np.float32)
    h_result_fire5_expand = np.empty(1 * 384 * 13 * 13).astype(np.float32)
    h_result_fire6_squeeze = np.empty(1 * 48 * 13 * 13).astype(np.float32)
    h_result_fire6_expand = np.empty(1 * 384 * 13 * 13).astype(np.float32)
    h_result_fire7_squeeze = np.empty(1 * 64 * 13 * 13).astype(np.float32)
    h_result_fire7_expand = np.empty(1 * 512 * 13 * 13).astype(np.float32)
    h_result_fire8_squeeze = np.empty(1 * 64 * 13 * 13).astype(np.float32)
    h_result_fire8_expand = np.empty(1 * 512 * 13 * 13).astype(np.float32)

    h_result_classifier_conv = np.empty(1 * 1000 * 13 * 13).astype(np.float32)
    h_result_classifier = np.empty(1 * 1000).astype(np.float32)

    d_conv1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=conv1_weight)
    d_conv1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=conv1_bias)

    d_fire1_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
    d_fire1_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)
    d_fire1_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand1x1_weight)
    d_fire1_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand1x1_bias)
    d_fire1_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand3x3_weight)
    d_fire1_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand3x3_bias)

    d_fire2_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_squeeze_weight)
    d_fire2_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_squeeze_bias)
    d_fire2_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand1x1_weight)
    d_fire2_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand1x1_bias)
    d_fire2_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand3x3_weight)
    d_fire2_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand3x3_bias)

    d_fire3_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_squeeze_weight)
    d_fire3_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_squeeze_bias)
    d_fire3_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand1x1_weight)
    d_fire3_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand1x1_bias)
    d_fire3_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand3x3_weight)
    d_fire3_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand3x3_bias)

    d_fire4_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_squeeze_weight)
    d_fire4_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_squeeze_bias)
    d_fire4_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand1x1_weight)
    d_fire4_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand1x1_bias)
    d_fire4_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand3x3_weight)
    d_fire4_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand3x3_bias)

    d_fire5_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_squeeze_weight)
    d_fire5_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_squeeze_bias)
    d_fire5_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand1x1_weight)
    d_fire5_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand1x1_bias)
    d_fire5_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand3x3_weight)
    d_fire5_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand3x3_bias)

    d_fire6_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_squeeze_weight)
    d_fire6_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_squeeze_bias)
    d_fire6_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand1x1_weight)
    d_fire6_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand1x1_bias)
    d_fire6_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand3x3_weight)
    d_fire6_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand3x3_bias)

    d_fire7_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_squeeze_weight)
    d_fire7_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_squeeze_bias)
    d_fire7_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand1x1_weight)
    d_fire7_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand1x1_bias)
    d_fire7_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand3x3_weight)
    d_fire7_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand3x3_bias)

    d_fire8_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_squeeze_weight)
    d_fire8_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_squeeze_bias)
    d_fire8_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand1x1_weight)
    d_fire8_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand1x1_bias)
    d_fire8_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand3x3_weight)
    d_fire8_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand3x3_bias)

    d_classifier_conv_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=classifier_conv_weight)
    d_classifier_conv_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=classifier_conv_bias)

    d_result_conv = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_conv.nbytes)
    d_result_pool1 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool1.nbytes)

    d_result_fire1_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)
    d_result_fire1_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_expand.nbytes)
    d_result_fire2_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire2_squeeze.nbytes)
    d_result_fire2_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire2_expand.nbytes)
    d_result_pool2 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool2.nbytes)

    d_result_fire3_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire3_squeeze.nbytes)
    d_result_fire3_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire3_expand.nbytes)
    d_result_fire4_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire4_squeeze.nbytes)
    d_result_fire4_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire4_expand.nbytes)
    d_result_pool3 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool3.nbytes)

    d_result_fire5_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire5_squeeze.nbytes)
    d_result_fire5_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire5_expand.nbytes)
    d_result_fire6_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire6_squeeze.nbytes)
    d_result_fire6_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire6_expand.nbytes)
    d_result_fire7_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire7_squeeze.nbytes)
    d_result_fire7_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire7_expand.nbytes)
    d_result_fire8_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire8_squeeze.nbytes)
    d_result_fire8_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire8_expand.nbytes)

    d_result_classifier_conv = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_classifier_conv.nbytes)
    d_result_classifier = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_classifier.nbytes)

    tic5 = pc()

    #first conv layer
    conv3x3_NDR(queue,(64, 111), None, 3, 224, 0, 2, 0, 111, d_sample, d_conv1_weight, d_conv1_bias, d_result_conv)
    maxpool_NDR(queue, (64, ), None, 111, 55, d_result_conv, d_result_pool1)

    #block1
    conv1x1_NDR(queue,(16, 55), None, np.int32(64/4), 55, d_result_pool1, d_fire1_squeeze_weight, d_fire1_squeeze_bias, d_result_fire1_squeeze)
    queue.finish()
    conv1x1_NDR(queue1,(64, 55), None, np.int32(16/4), 55, d_result_fire1_squeeze, d_fire1_expand1x1_weight, d_fire1_expand1x1_bias, d_result_fire1_expand)
    conv3x3_NDR(queue,(64, 55), None, 16, 55, 1, 1, 64, 55, d_result_fire1_squeeze, d_fire1_expand3x3_weight, d_fire1_expand3x3_bias, d_result_fire1_expand)
    queue.finish()
    queue1.finish()

    conv1x1_NDR(queue,(16, 55), None, np.int32(128/4), 55, d_result_fire1_expand, d_fire2_squeeze_weight, d_fire2_squeeze_bias, d_result_fire2_squeeze)
    queue.finish()
    conv1x1_NDR(queue1,(64, 55), None, np.int32(16/4), 55, d_result_fire2_squeeze, d_fire2_expand1x1_weight, d_fire2_expand1x1_bias, d_result_fire2_expand)
    conv3x3_NDR(queue,(64, 55), None, 16, 55, 1, 1, 64, 55, d_result_fire2_squeeze, d_fire2_expand3x3_weight, d_fire2_expand3x3_bias, d_result_fire2_expand)
    queue.finish()
    queue1.finish()

    maxpool_NDR(queue, (128, ), None, 55, 27, d_result_fire2_expand, d_result_pool2)

    #block2
    conv1x1_NDR(queue,(32, 27), None, np.int32(128/4), 27, d_result_pool2, d_fire3_squeeze_weight, d_fire3_squeeze_bias, d_result_fire3_squeeze)
    queue.finish()
    conv1x1_NDR(queue1,(128, 27), None, np.int32(32/4), 27, d_result_fire3_squeeze, d_fire3_expand1x1_weight, d_fire3_expand1x1_bias, d_result_fire3_expand)
    conv3x3_NDR(queue,(128, 27), None, 32, 27, 1, 1, 128, 27, d_result_fire3_squeeze, d_fire3_expand3x3_weight, d_fire3_expand3x3_bias, d_result_fire3_expand)
    queue.finish()
    queue1.finish()

    conv1x1_NDR(queue,(32, 27), None, np.int32(256/4), 27, d_result_fire3_expand, d_fire4_squeeze_weight, d_fire4_squeeze_bias, d_result_fire4_squeeze)
    queue.finish()
    conv1x1_NDR(queue1,(128, 27), None, np.int32(32/4), 27, d_result_fire4_squeeze, d_fire4_expand1x1_weight, d_fire4_expand1x1_bias, d_result_fire4_expand)
    conv3x3_NDR(queue,(128, 27), None, 32, 27, 1, 1, 128, 27, d_result_fire4_squeeze, d_fire4_expand3x3_weight, d_fire4_expand3x3_bias, d_result_fire4_expand)
    queue.finish()
    queue1.finish()

    maxpool_NDR(queue, (256, ), None, 27, 13, d_result_fire4_expand, d_result_pool3)

    #block3
    conv1x1_NDR(queue,(48, 13), None, np.int32(256/4), 13, d_result_pool3, d_fire5_squeeze_weight, d_fire5_squeeze_bias, d_result_fire5_squeeze)
    queue.finish()
    conv1x1_NDR(queue1,(192, 13), None, np.int32(48/4), 13, d_result_fire5_squeeze, d_fire5_expand1x1_weight, d_fire5_expand1x1_bias, d_result_fire5_expand)
    conv3x3_NDR(queue,(192, 13), None, 48, 13, 1, 1, 192, 13, d_result_fire5_squeeze, d_fire5_expand3x3_weight, d_fire5_expand3x3_bias, d_result_fire5_expand)
    queue.finish()
    queue1.finish()

    conv1x1_NDR(queue,(48, 13), None, np.int32(384/4), 13, d_result_fire5_expand, d_fire6_squeeze_weight, d_fire6_squeeze_bias, d_result_fire6_squeeze)
    queue.finish()
    conv1x1_NDR(queue1,(192, 13), None, np.int32(48/4), 13, d_result_fire6_squeeze, d_fire6_expand1x1_weight, d_fire6_expand1x1_bias, d_result_fire6_expand)
    conv3x3_NDR(queue,(192, 13), None, 48, 13, 1, 1, 192, 13, d_result_fire6_squeeze, d_fire6_expand3x3_weight, d_fire6_expand3x3_bias, d_result_fire6_expand)
    queue.finish()
    queue1.finish()

    conv1x1_NDR(queue,(64, 13), None, np.int32(384/4), 13, d_result_fire6_expand, d_fire7_squeeze_weight, d_fire7_squeeze_bias, d_result_fire7_squeeze)
    queue.finish()
    conv1x1_NDR(queue1,(256, 13), None, np.int32(64/4), 13, d_result_fire7_squeeze, d_fire7_expand1x1_weight, d_fire7_expand1x1_bias, d_result_fire7_expand)
    conv3x3_NDR(queue,(256, 13), None, 64, 13, 1, 1, 256, 13, d_result_fire7_squeeze, d_fire7_expand3x3_weight, d_fire7_expand3x3_bias, d_result_fire7_expand)
    queue.finish()
    queue1.finish()

    conv1x1_NDR(queue,(64, 13), None, np.int32(512/4), 13, d_result_fire7_expand, d_fire8_squeeze_weight, d_fire8_squeeze_bias, d_result_fire8_squeeze)
    queue.finish()
    conv1x1_NDR(queue1,(256, 13), None, np.int32(64/4), 13, d_result_fire8_squeeze, d_fire8_expand1x1_weight, d_fire8_expand1x1_bias, d_result_fire8_expand)
    conv3x3_NDR(queue,(256, 13), None, 64, 13, 1, 1, 256, 13, d_result_fire8_squeeze, d_fire8_expand3x3_weight, d_fire8_expand3x3_bias, d_result_fire8_expand)
    queue.finish()
    queue1.finish()

    # classifier
    conv1x1_NDR(queue,(1000, 13), None, np.int32(512/4), 13, d_result_fire8_expand, d_classifier_conv_weight, d_classifier_conv_bias, d_result_classifier_conv)

    avgpool_NDR(queue,(1000, ), None, d_result_classifier_conv, d_result_classifier)

    cl.enqueue_copy(queue, h_result_classifier, d_result_classifier)

    queue.finish()

    veamos3 = h_result_classifier
    
    toc5 = pc()

    acumulado_kernel = toc5 - tic5 + acumulado_kernel
    
    comparativa &= np.allclose(salida18_a_numpy.reshape(-1), veamos3,rtol=1e-01, atol=1e-01)
    
print ("tiempo en segundos con pytorch= ", acumulado_pytorch/count)
print ("tiempo en segundos con opencl (NDRange)=",acumulado_kernel/count)
print("comparativa (pytorch == NDRange): ",comparativa)

tiempo en segundos con pytorch=  0.05108882825000819
tiempo en segundos con opencl (Simple Task)= 0.028037996070006555
comparativa (pytorch == Simple Task):  True


In [1]:
queeze_activation = nn.ReLU(inplace=True)

count = 100
acumulado_pytorch=0
acumulado_kernel=0
comparativa = True

#####################################################

params = torch.load('squeezenet1_1.pth')

### First Conv3x3 and maxpool
weights0=params['features.0.weight'].numpy()
bias0=params['features.0.bias'].numpy()

######## BLOCK 1 ########
#fire - fire - maxpool
#### FIRE 1 ####
weights1=params['features.3.squeeze.weight'].numpy()
bias1=params['features.3.squeeze.bias'].numpy()

weights2a=params['features.3.expand1x1.weight'].numpy()
bias2a=params['features.3.expand1x1.bias'].numpy()    

weights2b=params['features.3.expand3x3.weight'].numpy()   
bias2b=params['features.3.expand3x3.bias'].numpy()

#### FIRE 2 ####
weights3=params['features.4.squeeze.weight'].numpy()
bias3=params['features.4.squeeze.bias'].numpy()

weights4a=params['features.4.expand1x1.weight'].numpy()
bias4a=params['features.4.expand1x1.bias'].numpy()    

weights4b=params['features.4.expand3x3.weight'].numpy()   
bias4b=params['features.4.expand3x3.bias'].numpy()

######## BLOCK 2 ########
#fire - fire - maxpool
#### FIRE 3 ####
weights5=params['features.6.squeeze.weight'].numpy()
bias5=params['features.6.squeeze.bias'].numpy()

weights6a=params['features.6.expand1x1.weight'].numpy()
bias6a=params['features.6.expand1x1.bias'].numpy()    

weights6b=params['features.6.expand3x3.weight'].numpy()   
bias6b=params['features.6.expand3x3.bias'].numpy()

#### FIRE 4 ####
weights7=params['features.7.squeeze.weight'].numpy()
bias7=params['features.7.squeeze.bias'].numpy()

weights8a=params['features.7.expand1x1.weight'].numpy()
bias8a=params['features.7.expand1x1.bias'].numpy()    

weights8b=params['features.7.expand3x3.weight'].numpy()  
bias8b=params['features.7.expand3x3.bias'].numpy()

######## BLOCK 3 ########
#fire - fire - fire - fire
#### FIRE 5 ####
weights9=params['features.9.squeeze.weight'].numpy()
bias9=params['features.9.squeeze.bias'].numpy()

weights10a=params['features.9.expand1x1.weight'].numpy()
bias10a=params['features.9.expand1x1.bias'].numpy()    

weights10b=params['features.9.expand3x3.weight'].numpy()
bias10b=params['features.9.expand3x3.bias'].numpy()

#### FIRE 6 ####
weights11=params['features.10.squeeze.weight'].numpy()
bias11=params['features.10.squeeze.bias'].numpy()

weights12a=params['features.10.expand1x1.weight'].numpy()
bias12a=params['features.10.expand1x1.bias'].numpy()    

weights12b=params['features.10.expand3x3.weight'].numpy()
bias12b=params['features.10.expand3x3.bias'].numpy()

#### FIRE 7 ####
weights13=params['features.11.squeeze.weight'].numpy()
bias13=params['features.11.squeeze.bias'].numpy()

weights14a=params['features.11.expand1x1.weight'].numpy()
bias14a=params['features.11.expand1x1.bias'].numpy()    

weights14b=params['features.11.expand3x3.weight'].numpy()
bias14b=params['features.11.expand3x3.bias'].numpy()

#### FIRE 8 ####
weights15=params['features.12.squeeze.weight'].numpy()
bias15=params['features.12.squeeze.bias'].numpy()

weights16a=params['features.12.expand1x1.weight'].numpy()
bias16a=params['features.12.expand1x1.bias'].numpy()    

weights16b=params['features.12.expand3x3.weight'].numpy()
bias16b=params['features.12.expand3x3.bias'].numpy()

######## Classifier ########
#conv3x3 - avgpool
### Classifier Conv3x3 and avgpool
weights17=params['classifier.1.weight'].numpy()
bias17=params['classifier.1.bias'].numpy()

for i in range(count):
   

    transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ],
                         std = [ 0.229, 0.224, 0.225 ]),])

    imagen = np.random.rand(3, 224, 224).astype(np.float32)
    imagen = Image.fromarray(imagen,'RGB')
    imagen = transform(imagen).numpy()
    imagen = imagen[np.newaxis,:]

    #imagen = np.ones((1, 3, 224, 224))
    #print(imagen.shape)

    tic=pc()

    squeeze0=nn.Conv2d(3, 64, kernel_size=3, bias=False, stride=2)
    squeeze0.weight = nn.Parameter(torch.from_numpy(weights0))
    squeeze0.bias = nn.Parameter(torch.from_numpy(bias0))

    maxpool=nn.MaxPool2d(3, stride=2)

    squeeze1=nn.Conv2d(64, 16, kernel_size=1, bias=False)
    squeeze1.weight = nn.Parameter(torch.from_numpy(weights1))
    squeeze1.bias = nn.Parameter(torch.from_numpy(bias1))    

    squeeze2a=nn.Conv2d(16, 64, kernel_size=1, bias=False)
    squeeze2a.weight = nn.Parameter(torch.from_numpy(weights2a))
    squeeze2a.bias = nn.Parameter(torch.from_numpy(bias2a))

    squeeze2b=nn.Conv2d(16, 64, kernel_size=3, bias=False, padding=1)
    squeeze2b.weight = nn.Parameter(torch.from_numpy(weights2b))
    squeeze2b.bias = nn.Parameter(torch.from_numpy(bias2b))

    squeeze3=nn.Conv2d(128, 16, kernel_size=1, bias=False)
    squeeze3.weight = nn.Parameter(torch.from_numpy(weights3))
    squeeze3.bias = nn.Parameter(torch.from_numpy(bias3))    

    squeeze4a=nn.Conv2d(16, 64, kernel_size=1, bias=False)
    squeeze4a.weight = nn.Parameter(torch.from_numpy(weights4a))
    squeeze4a.bias = nn.Parameter(torch.from_numpy(bias4a))

    squeeze4b=nn.Conv2d(16, 64, kernel_size=3, bias=False, padding=1)
    squeeze4b.weight = nn.Parameter(torch.from_numpy(weights4b))
    squeeze4b.bias = nn.Parameter(torch.from_numpy(bias4b))

    squeeze5=nn.Conv2d(128, 32, kernel_size=1, bias=False)
    squeeze5.weight = nn.Parameter(torch.from_numpy(weights5))
    squeeze5.bias = nn.Parameter(torch.from_numpy(bias5))    

    squeeze6a=nn.Conv2d(32, 128, kernel_size=1, bias=False)
    squeeze6a.weight = nn.Parameter(torch.from_numpy(weights6a))
    squeeze6a.bias = nn.Parameter(torch.from_numpy(bias6a))

    squeeze6b=nn.Conv2d(32, 128, kernel_size=3, bias=False, padding=1)
    squeeze6b.weight = nn.Parameter(torch.from_numpy(weights6b))
    squeeze6b.bias = nn.Parameter(torch.from_numpy(bias6b))

    squeeze7=nn.Conv2d(256, 32, kernel_size=1, bias=False)
    squeeze7.weight = nn.Parameter(torch.from_numpy(weights7))
    squeeze7.bias = nn.Parameter(torch.from_numpy(bias7))    

    squeeze8a=nn.Conv2d(32, 128, kernel_size=1, bias=False)
    squeeze8a.weight = nn.Parameter(torch.from_numpy(weights8a))
    squeeze8a.bias = nn.Parameter(torch.from_numpy(bias8a))

    squeeze8b=nn.Conv2d(32, 128, kernel_size=3, bias=False, padding=1)
    squeeze8b.weight = nn.Parameter(torch.from_numpy(weights8b))
    squeeze8b.bias = nn.Parameter(torch.from_numpy(bias8b))

    squeeze9=nn.Conv2d(256, 48, kernel_size=1, bias=False)
    squeeze9.weight = nn.Parameter(torch.from_numpy(weights9))
    squeeze9.bias = nn.Parameter(torch.from_numpy(bias9))    

    squeeze10a=nn.Conv2d(48, 192, kernel_size=1, bias=False)
    squeeze10a.weight = nn.Parameter(torch.from_numpy(weights10a))
    squeeze10a.bias = nn.Parameter(torch.from_numpy(bias10a))

    squeeze10b=nn.Conv2d(48, 192, kernel_size=3, bias=False, padding=1)
    squeeze10b.weight = nn.Parameter(torch.from_numpy(weights10b))
    squeeze10b.bias = nn.Parameter(torch.from_numpy(bias10b))

    squeeze11=nn.Conv2d(384, 48, kernel_size=1, bias=False)
    squeeze11.weight = nn.Parameter(torch.from_numpy(weights11))
    squeeze11.bias = nn.Parameter(torch.from_numpy(bias11))    

    squeeze12a=nn.Conv2d(48, 192, kernel_size=1, bias=False)
    squeeze12a.weight = nn.Parameter(torch.from_numpy(weights12a))
    squeeze12a.bias = nn.Parameter(torch.from_numpy(bias12a))

    squeeze12b=nn.Conv2d(48, 192, kernel_size=3, bias=False, padding=1)
    squeeze12b.weight = nn.Parameter(torch.from_numpy(weights12b))
    squeeze12b.bias = nn.Parameter(torch.from_numpy(bias12b))

    squeeze13=nn.Conv2d(384, 64, kernel_size=1, bias=False)
    squeeze13.weight = nn.Parameter(torch.from_numpy(weights13))
    squeeze13.bias = nn.Parameter(torch.from_numpy(bias13))    

    squeeze14a=nn.Conv2d(64, 256, kernel_size=1, bias=False)
    squeeze14a.weight = nn.Parameter(torch.from_numpy(weights14a))
    squeeze14a.bias = nn.Parameter(torch.from_numpy(bias14a))

    squeeze14b=nn.Conv2d(64, 256, kernel_size=3, bias=False, padding=1)
    squeeze14b.weight = nn.Parameter(torch.from_numpy(weights14b))
    squeeze14b.bias = nn.Parameter(torch.from_numpy(bias14b))

    squeeze15=nn.Conv2d(512, 64, kernel_size=1, bias=False)
    squeeze15.weight = nn.Parameter(torch.from_numpy(weights15))
    squeeze15.bias = nn.Parameter(torch.from_numpy(bias15))    

    squeeze16a=nn.Conv2d(64, 256, kernel_size=1, bias=False)
    squeeze16a.weight = nn.Parameter(torch.from_numpy(weights16a))
    squeeze16a.bias = nn.Parameter(torch.from_numpy(bias16a))

    squeeze16b=nn.Conv2d(64, 256, kernel_size=3, bias=False, padding=1)
    squeeze16b.weight = nn.Parameter(torch.from_numpy(weights16b))
    squeeze16b.bias = nn.Parameter(torch.from_numpy(bias16b))

    conv_class=nn.Conv2d(512, 1000, kernel_size=1, bias=False)
    conv_class.weight = nn.Parameter(torch.from_numpy(weights17))
    conv_class.bias = nn.Parameter(torch.from_numpy(bias17))

    avgpool=nn.AvgPool2d(13)

    imagen1  = torch.from_numpy(imagen).float()

    salida0=squeeze0(imagen1)
    salida0_activation=squeeze_activation(salida0)

    salida_pool1 = maxpool(salida0_activation)

    salida1=squeeze1(salida_pool1)
    salida1_activation=squeeze_activation(salida1)

    salida2a=squeeze2a(salida1_activation)
    salida2a_activation=squeeze_activation(salida2a)
    salida2b=squeeze2b(salida1_activation)
    salida2b_activation=squeeze_activation(salida2b)    
    salida2_total=torch.cat([salida2a_activation,salida2b_activation], 1)

    salida3=squeeze3(salida2_total)
    salida3_activation=squeeze_activation(salida3)

    salida4a=squeeze4a(salida3_activation)
    salida4a_activation=squeeze_activation(salida4a)
    salida4b=squeeze4b(salida3_activation)
    salida4b_activation=squeeze_activation(salida4b)    
    salida4_total=torch.cat([salida4a_activation,salida4b_activation], 1)

    salida_pool2 = maxpool(salida4_total)

    salida5=squeeze5(salida_pool2)
    salida5_activation=squeeze_activation(salida5)

    salida6a=squeeze6a(salida5_activation)
    salida6a_activation=squeeze_activation(salida6a)
    salida6b=squeeze6b(salida5_activation)
    salida6b_activation=squeeze_activation(salida6b)    
    salida6_total=torch.cat([salida6a_activation,salida6b_activation], 1)

    salida7=squeeze7(salida6_total)
    salida7_activation=squeeze_activation(salida7)

    salida8a=squeeze8a(salida7_activation)
    salida8a_activation=squeeze_activation(salida8a)
    salida8b=squeeze8b(salida7_activation)
    salida8b_activation=squeeze_activation(salida8b)    
    salida8_total=torch.cat([salida8a_activation,salida8b_activation], 1)

    salida_pool3 = maxpool(salida8_total)

    salida9=squeeze9(salida_pool3)
    salida9_activation=squeeze_activation(salida9)

    salida10a=squeeze10a(salida9_activation)
    salida10a_activation=squeeze_activation(salida10a)
    salida10b=squeeze10b(salida9_activation)
    salida10b_activation=squeeze_activation(salida10b)    
    salida10_total=torch.cat([salida10a_activation,salida10b_activation], 1)

    salida11=squeeze11(salida10_total)
    salida11_activation=squeeze_activation(salida11)

    salida12a=squeeze12a(salida11_activation)
    salida12a_activation=squeeze_activation(salida12a)
    salida12b=squeeze12b(salida11_activation)
    salida12b_activation=squeeze_activation(salida12b)    
    salida12_total=torch.cat([salida12a_activation,salida12b_activation], 1)

    salida13=squeeze13(salida12_total)
    salida13_activation=squeeze_activation(salida13)

    salida14a=squeeze14a(salida13_activation)
    salida14a_activation=squeeze_activation(salida14a)
    salida14b=squeeze14b(salida13_activation)
    salida14b_activation=squeeze_activation(salida14b)    
    salida14_total=torch.cat([salida14a_activation,salida14b_activation], 1)

    salida15=squeeze15(salida14_total)
    salida15_activation=squeeze_activation(salida15)

    salida16a=squeeze16a(salida15_activation)
    salida16a_activation=squeeze_activation(salida16a)
    salida16b=squeeze16b(salida15_activation)
    salida16b_activation=squeeze_activation(salida16b)    
    salida16_total=torch.cat([salida16a_activation,salida16b_activation], 1)

    salida17=conv_class(salida16_total)
    salida17_activation=squeeze_activation(salida17)
    salida18=avgpool(salida17_activation)

    salida18_a_numpy=salida18.detach().numpy()

    toc=pc()

    acumulado_pytorch=toc-tic+acumulado_pytorch

    ####### OPENCL COMPARISON #######
                               
    # Simple task
    h_sample = imagen.reshape(-1).astype(np.float32)
    d_sample = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_sample)

    conv1_weight = weights0.reshape(-1)
    conv1_bias = bias0

    fire1_squeeze_weight = weights1.reshape(-1)
    fire1_squeeze_bias = bias1
    fire1_expand1x1_weight = weights2a.reshape(-1)
    fire1_expand1x1_bias = bias2a
    fire1_expand3x3_weight =weights2b.reshape(-1)
    fire1_expand3x3_bias = bias2b

    fire2_squeeze_weight = weights3.reshape(-1)
    fire2_squeeze_bias = bias3
    fire2_expand1x1_weight = weights4a.reshape(-1)
    fire2_expand1x1_bias = bias4a
    fire2_expand3x3_weight =weights4b.reshape(-1)
    fire2_expand3x3_bias = bias4b

    fire3_squeeze_weight = weights5.reshape(-1)
    fire3_squeeze_bias = bias5
    fire3_expand1x1_weight = weights6a.reshape(-1)
    fire3_expand1x1_bias = bias6a
    fire3_expand3x3_weight =weights6b.reshape(-1)
    fire3_expand3x3_bias = bias6b

    fire4_squeeze_weight = weights7.reshape(-1)
    fire4_squeeze_bias = bias7
    fire4_expand1x1_weight = weights8a.reshape(-1)
    fire4_expand1x1_bias = bias8a
    fire4_expand3x3_weight =weights8b.reshape(-1)
    fire4_expand3x3_bias = bias8b

    fire5_squeeze_weight = weights9.reshape(-1)
    fire5_squeeze_bias = bias9
    fire5_expand1x1_weight = weights10a.reshape(-1)
    fire5_expand1x1_bias = bias10a
    fire5_expand3x3_weight =weights10b.reshape(-1)
    fire5_expand3x3_bias = bias10b

    fire6_squeeze_weight = weights11.reshape(-1)
    fire6_squeeze_bias = bias11
    fire6_expand1x1_weight = weights12a.reshape(-1)
    fire6_expand1x1_bias = bias12a
    fire6_expand3x3_weight =weights12b.reshape(-1)
    fire6_expand3x3_bias = bias12b

    fire7_squeeze_weight = weights13.reshape(-1)
    fire7_squeeze_bias = bias13
    fire7_expand1x1_weight = weights14a.reshape(-1)
    fire7_expand1x1_bias = bias14a
    fire7_expand3x3_weight =weights14b.reshape(-1)
    fire7_expand3x3_bias = bias14b

    fire8_squeeze_weight = weights15.reshape(-1)
    fire8_squeeze_bias = bias15
    fire8_expand1x1_weight = weights16a.reshape(-1)
    fire8_expand1x1_bias = bias16a
    fire8_expand3x3_weight =weights16b.reshape(-1)
    fire8_expand3x3_bias = bias16b

    classifier_conv_weight = weights17.reshape(-1)
    classifier_conv_bias = bias17

    h_result_conv = np.empty(1 * 64 * 111 * 111).astype(np.float32)
    h_result_pool1 = np.empty(1 * 64 * 55 * 55).astype(np.float32)

    h_result_fire1_squeeze = np.empty(1 * 16 * 55 * 55).astype(np.float32)
    h_result_fire1_expand = np.empty(1 * 128 * 55 * 55).astype(np.float32)
    h_result_fire2_squeeze = np.empty(1 * 16 * 55 * 55).astype(np.float32)
    h_result_fire2_expand = np.empty(1 * 128 * 55 * 55).astype(np.float32)
    h_result_pool2 = np.empty(1 * 128 * 27 * 27).astype(np.float32)

    h_result_fire3_squeeze = np.empty(1 * 32 * 27 * 27).astype(np.float32)
    h_result_fire3_expand = np.empty(1 * 256 * 27 * 27).astype(np.float32)
    h_result_fire4_squeeze = np.empty(1 * 32 * 27 * 27).astype(np.float32)
    h_result_fire4_expand = np.empty(1 * 256 * 27 * 27).astype(np.float32)
    h_result_pool3 = np.empty(1 * 256 * 13 * 13).astype(np.float32)

    h_result_fire5_squeeze = np.empty(1 * 48 * 13 * 13).astype(np.float32)
    h_result_fire5_expand = np.empty(1 * 384 * 13 * 13).astype(np.float32)
    h_result_fire6_squeeze = np.empty(1 * 48 * 13 * 13).astype(np.float32)
    h_result_fire6_expand = np.empty(1 * 384 * 13 * 13).astype(np.float32)
    h_result_fire7_squeeze = np.empty(1 * 64 * 13 * 13).astype(np.float32)
    h_result_fire7_expand = np.empty(1 * 512 * 13 * 13).astype(np.float32)
    h_result_fire8_squeeze = np.empty(1 * 64 * 13 * 13).astype(np.float32)
    h_result_fire8_expand = np.empty(1 * 512 * 13 * 13).astype(np.float32)

    h_result_classifier_conv = np.empty(1 * 1000 * 13 * 13).astype(np.float32)
    h_result_classifier = np.empty(1 * 1000).astype(np.float32)

    d_conv1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=conv1_weight)
    d_conv1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=conv1_bias)

    d_fire1_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_weight)
    d_fire1_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_squeeze_bias)
    d_fire1_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand1x1_weight)
    d_fire1_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand1x1_bias)
    d_fire1_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand3x3_weight)
    d_fire1_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire1_expand3x3_bias)

    d_fire2_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_squeeze_weight)
    d_fire2_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_squeeze_bias)
    d_fire2_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand1x1_weight)
    d_fire2_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand1x1_bias)
    d_fire2_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand3x3_weight)
    d_fire2_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire2_expand3x3_bias)

    d_fire3_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_squeeze_weight)
    d_fire3_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_squeeze_bias)
    d_fire3_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand1x1_weight)
    d_fire3_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand1x1_bias)
    d_fire3_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand3x3_weight)
    d_fire3_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire3_expand3x3_bias)

    d_fire4_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_squeeze_weight)
    d_fire4_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_squeeze_bias)
    d_fire4_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand1x1_weight)
    d_fire4_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand1x1_bias)
    d_fire4_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand3x3_weight)
    d_fire4_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire4_expand3x3_bias)

    d_fire5_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_squeeze_weight)
    d_fire5_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_squeeze_bias)
    d_fire5_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand1x1_weight)
    d_fire5_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand1x1_bias)
    d_fire5_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand3x3_weight)
    d_fire5_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire5_expand3x3_bias)

    d_fire6_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_squeeze_weight)
    d_fire6_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_squeeze_bias)
    d_fire6_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand1x1_weight)
    d_fire6_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand1x1_bias)
    d_fire6_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand3x3_weight)
    d_fire6_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire6_expand3x3_bias)

    d_fire7_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_squeeze_weight)
    d_fire7_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_squeeze_bias)
    d_fire7_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand1x1_weight)
    d_fire7_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand1x1_bias)
    d_fire7_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand3x3_weight)
    d_fire7_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire7_expand3x3_bias)

    d_fire8_squeeze_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_squeeze_weight)
    d_fire8_squeeze_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_squeeze_bias)
    d_fire8_expand1x1_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand1x1_weight)
    d_fire8_expand1x1_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand1x1_bias)
    d_fire8_expand3x3_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand3x3_weight)
    d_fire8_expand3x3_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=fire8_expand3x3_bias)

    d_classifier_conv_weight = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=classifier_conv_weight)
    d_classifier_conv_bias = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=classifier_conv_bias)

    d_result_conv = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_conv.nbytes)
    d_result_pool1 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool1.nbytes)

    d_result_fire1_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_squeeze.nbytes)
    d_result_fire1_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire1_expand.nbytes)
    d_result_fire2_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire2_squeeze.nbytes)
    d_result_fire2_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire2_expand.nbytes)
    d_result_pool2 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool2.nbytes)

    d_result_fire3_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire3_squeeze.nbytes)
    d_result_fire3_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire3_expand.nbytes)
    d_result_fire4_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire4_squeeze.nbytes)
    d_result_fire4_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire4_expand.nbytes)
    d_result_pool3 = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_pool3.nbytes)

    d_result_fire5_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire5_squeeze.nbytes)
    d_result_fire5_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire5_expand.nbytes)
    d_result_fire6_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire6_squeeze.nbytes)
    d_result_fire6_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire6_expand.nbytes)
    d_result_fire7_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire7_squeeze.nbytes)
    d_result_fire7_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire7_expand.nbytes)
    d_result_fire8_squeeze = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire8_squeeze.nbytes)
    d_result_fire8_expand = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_fire8_expand.nbytes)

    d_result_classifier_conv = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_classifier_conv.nbytes)
    d_result_classifier = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_result_classifier.nbytes)

    tic5 = pc()

    #first conv layer
    conv3x3_ST(queue,(1,), None, 3, 224, 0, 2, 0, 111, 64, d_sample, d_conv1_weight, d_conv1_bias, d_result_conv)
    maxpool_ST(queue, (1, ), None, 111, 55, 64, d_result_conv, d_result_pool1)

    #block1
    conv1x1_ST(queue,(1,), None, 64, 55, 16, d_result_pool1, d_fire1_squeeze_weight, d_fire1_squeeze_bias, d_result_fire1_squeeze)
    queue.finish()
    conv1x1_ST(queue1,(1,), None, 16, 55, 64, d_result_fire1_squeeze, d_fire1_expand1x1_weight, d_fire1_expand1x1_bias, d_result_fire1_expand)
    conv3x3_ST(queue,(1,), None, 16, 55, 1, 1, 64, 55, 64, d_result_fire1_squeeze, d_fire1_expand3x3_weight, d_fire1_expand3x3_bias, d_result_fire1_expand)
    queue.finish()
    queue1.finish()

    conv1x1_ST(queue,(1,), None, 128, 55, 16, d_result_fire1_expand, d_fire2_squeeze_weight, d_fire2_squeeze_bias, d_result_fire2_squeeze)
    queue.finish()
    conv1x1_ST(queue1,(1,), None, 16, 55, 64, d_result_fire2_squeeze, d_fire2_expand1x1_weight, d_fire2_expand1x1_bias, d_result_fire2_expand)
    conv3x3_ST(queue,(1,), None, 16, 55, 1, 1, 64, 55, 64, d_result_fire2_squeeze, d_fire2_expand3x3_weight, d_fire2_expand3x3_bias, d_result_fire2_expand)
    queue.finish()
    queue1.finish()

    maxpool_ST(queue, (1, ), None, 55, 27, 128, d_result_fire2_expand, d_result_pool2)

    #block2
    conv1x1_ST(queue,(1,), None, 128, 27, 32, d_result_pool2, d_fire3_squeeze_weight, d_fire3_squeeze_bias, d_result_fire3_squeeze)
    queue.finish()
    conv1x1_ST(queue1,(1,), None, 32, 27, 128, d_result_fire3_squeeze, d_fire3_expand1x1_weight, d_fire3_expand1x1_bias, d_result_fire3_expand)
    conv3x3_ST(queue,(1,), None, 32, 27, 1, 1, 128, 27, 128, d_result_fire3_squeeze, d_fire3_expand3x3_weight, d_fire3_expand3x3_bias, d_result_fire3_expand)
    queue.finish()
    queue1.finish()

    conv1x1_ST(queue,(1,), None, 256, 27, 32, d_result_fire3_expand, d_fire4_squeeze_weight, d_fire4_squeeze_bias, d_result_fire4_squeeze)
    queue.finish()
    conv1x1_ST(queue1,(1,), None, 32, 27, 128, d_result_fire4_squeeze, d_fire4_expand1x1_weight, d_fire4_expand1x1_bias, d_result_fire4_expand)
    conv3x3_ST(queue,(1,), None, 32, 27, 1, 1, 128, 27, 128, d_result_fire4_squeeze, d_fire4_expand3x3_weight, d_fire4_expand3x3_bias, d_result_fire4_expand)
    queue.finish()
    queue1.finish()

    maxpool_ST(queue, (1, ), None, 27, 13, 256, d_result_fire4_expand, d_result_pool3)

    #block3
    conv1x1_ST(queue,(1,), None, 256, 13, 48, d_result_pool3, d_fire5_squeeze_weight, d_fire5_squeeze_bias, d_result_fire5_squeeze)
    queue.finish()
    conv1x1_ST(queue1,(1,), None, 48, 13, 192, d_result_fire5_squeeze, d_fire5_expand1x1_weight, d_fire5_expand1x1_bias, d_result_fire5_expand)
    conv3x3_ST(queue,(1,), None, 48, 13, 1, 1, 192, 13, 192, d_result_fire5_squeeze, d_fire5_expand3x3_weight, d_fire5_expand3x3_bias, d_result_fire5_expand)
    queue.finish()
    queue1.finish()

    conv1x1_ST(queue,(1,), None, 384, 13, 48, d_result_fire5_expand, d_fire6_squeeze_weight, d_fire6_squeeze_bias, d_result_fire6_squeeze)
    queue.finish()
    conv1x1_ST(queue1,(1,), None, 48, 13, 192, d_result_fire6_squeeze, d_fire6_expand1x1_weight, d_fire6_expand1x1_bias, d_result_fire6_expand)
    conv3x3_ST(queue,(1,), None, 48, 13, 1, 1, 192, 13, 192, d_result_fire6_squeeze, d_fire6_expand3x3_weight, d_fire6_expand3x3_bias, d_result_fire6_expand)
    queue.finish()
    queue1.finish()

    conv1x1_ST(queue,(1,), None, 384, 13, 64, d_result_fire6_expand, d_fire7_squeeze_weight, d_fire7_squeeze_bias, d_result_fire7_squeeze)
    queue.finish()
    conv1x1_ST(queue1,(1,), None, 64, 13, 256, d_result_fire7_squeeze, d_fire7_expand1x1_weight, d_fire7_expand1x1_bias, d_result_fire7_expand)
    conv3x3_ST(queue,(1,), None, 64, 13, 1, 1, 256, 13, 256, d_result_fire7_squeeze, d_fire7_expand3x3_weight, d_fire7_expand3x3_bias, d_result_fire7_expand)
    queue.finish()
    queue1.finish()

    conv1x1_ST(queue,(1,), None, 512, 13, 64, d_result_fire7_expand, d_fire8_squeeze_weight, d_fire8_squeeze_bias, d_result_fire8_squeeze)
    queue.finish()
    conv1x1_ST(queue1,(1,), None, 64, 13, 256, d_result_fire8_squeeze, d_fire8_expand1x1_weight, d_fire8_expand1x1_bias, d_result_fire8_expand)
    conv3x3_ST(queue,(1,), None, 64, 13, 1, 1, 256, 13, 256, d_result_fire8_squeeze, d_fire8_expand3x3_weight, d_fire8_expand3x3_bias, d_result_fire8_expand)
    queue.finish()
    queue1.finish()

    # classifier
    conv1x1_ST(queue,(1,), None, 512, 13, 1000, d_result_fire8_expand, d_classifier_conv_weight, d_classifier_conv_bias, d_result_classifier_conv)

    avgpool_ST(queue, (1, ), None, d_result_classifier_conv, d_result_classifier)

    cl.enqueue_copy(queue, h_result_classifier, d_result_classifier)

    veamos3 = h_result_classifier
    
    toc5 = pc()

    acumulado_kernel = toc5 - tic5 + acumulado_kernel
    
    comparativa &= np.allclose(salida18_a_numpy.reshape(-1), veamos3,rtol=1e-01, atol=1e-01)
    
print ("tiempo en segundos con pytorch= ", acumulado_pytorch/count)
print ("tiempo en segundos con opencl (Simple Task)=",acumulado_kernel/count)
print("comparativa (pytorch == Simple Task): ",comparativa)

SyntaxError: invalid syntax (675052290.py, line 72)