[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giulianogrossi/GPUcomputing/blob/master/lab2/CUDA_lab2.ipynb)

# CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

## NVCC Plugin for Jupyter notebook

*Usage*:


*   Load Extension `%load_ext nvcc_plugin`
*   Mark a cell to be treated as cuda cell
`%%cuda --name example.cu --compile false`

**NOTE**: The cell must contain either code or comments to be run successfully. It accepts 2 arguments. The first, `-n | --name`, is the name of either the CUDA source or the header file; the name must have the extension `.cu` or `.h`. The second, `-c | --compile` (default value `false`), is a flag that specifies whether the cell is compiled and run right away or not. It might be useful if you're playing in the main function.

*  We are ready to run CUDA C/C++ code right in the notebook. To do so, we need to tell the interpreter explicitly that we want to use the extension, by adding `%%cu` at the beginning of each cell with CUDA code.




In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
%load_ext nvcc_plugin

# Image flip - CPU (multithreading)

In [None]:
%%writefile /content/src/ImageStuff.h

#ifndef IMAGESTUFF_H
#define IMAGESTUFF_H

/* Properties of the BMP image being processed; ReadBMP fills them in. */
struct ImgProp {
	int Hpixels;                   // image width in pixels
	int Vpixels;                   // image height in pixels
	unsigned char HeaderInfo[54];  // copy of the 54-byte BMP header, re-used by WriteBMP
	unsigned long int Hbytes;      // bytes per row, rounded up to a multiple of 4
};

/* One pixel; used by the flip routines as the temporary of a swap. */
struct Pixel {
	unsigned char R;
	unsigned char G;
	unsigned char B;
};

typedef unsigned char pel;    // pixel element

pel** ReadBMP(char*);         // Load a BMP image
void WriteBMP(pel**, char*);  // Store a BMP image

// Defined once by the program that uses these routines
extern struct ImgProp ip;

#endif // IMAGESTUFF_H


In [None]:
%%writefile /content/src/ImageStuff.c

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#include "ImageStuff.h"

/*
 * Load a BMP image.
 *
 * Reads the 54-byte header and then the pixel rows of `filename`, assuming
 * 3 bytes per pixel and rows padded to a multiple of 4 bytes. The header and
 * the image geometry are stored in the global `ip`.
 *
 * Returns an array of ip.Vpixels row pointers, each row ip.Hbytes long.
 * The caller must free() every row and then the array itself.
 * Terminates the program with exit(1) when the file cannot be opened, is
 * truncated, reports a non-positive size, or memory cannot be allocated.
 */
pel** ReadBMP(char* filename) {
	FILE* f = fopen(filename, "rb");
	if (f == NULL) {
		printf("\n\n%s NOT FOUND\n\n", filename);
		exit(1);
	}

	// read the 54-byte header; a shorter file cannot hold an image
	pel HeaderInfo[54];
	if (fread(HeaderInfo, sizeof(pel), 54, f) != 54) {
		printf("\n\n%s: CANNOT READ BMP HEADER\n\n", filename);
		fclose(f);
		exit(1);
	}

	// width and height are little-endian 32-bit fields at offsets 18 and 22;
	// assemble them byte by byte to avoid unaligned / type-punned loads
	int width = (int) ((unsigned int) HeaderInfo[18]
			| ((unsigned int) HeaderInfo[19] << 8)
			| ((unsigned int) HeaderInfo[20] << 16)
			| ((unsigned int) HeaderInfo[21] << 24));
	int height = (int) ((unsigned int) HeaderInfo[22]
			| ((unsigned int) HeaderInfo[23] << 8)
			| ((unsigned int) HeaderInfo[24] << 16)
			| ((unsigned int) HeaderInfo[25] << 24));
	if (width <= 0 || height <= 0) {
		printf("\n\n%s: UNSUPPORTED BMP SIZE (%d x %d)\n\n", filename, width, height);
		fclose(f);
		exit(1);
	}

	// copy header for re-use
	for (int i = 0; i < 54; i++)
		ip.HeaderInfo[i] = HeaderInfo[i];

	ip.Vpixels = height;
	ip.Hpixels = width;
	int RowBytes = (width * 3 + 3) & (~3);   // row size rounded up to a multiple of 4
	ip.Hbytes = RowBytes;

	printf("\n   Input BMP File name: %20s  (%d x %d)", filename, ip.Hpixels, ip.Vpixels);

	pel **TheImage = (pel **) malloc(height * sizeof(pel*));
	if (TheImage == NULL) {
		printf("\n\n%s: OUT OF MEMORY\n\n", filename);
		fclose(f);
		exit(1);
	}

	// allocate and fill one row at a time
	for (int i = 0; i < height; i++) {
		TheImage[i] = (pel *) malloc(RowBytes * sizeof(pel));
		if (TheImage[i] == NULL) {
			printf("\n\n%s: OUT OF MEMORY\n\n", filename);
			fclose(f);
			exit(1);
		}
		if (fread(TheImage[i], sizeof(unsigned char), RowBytes, f) != (size_t) RowBytes) {
			printf("\n\n%s: TRUNCATED BMP DATA\n\n", filename);
			fclose(f);
			exit(1);
		}
	}

	fclose(f);
	return TheImage;  // remember to free() it in caller!
}

/*
 * Store a BMP image: the header saved in `ip` followed by every row of `img`.
 * Terminates the program with exit(1) when the output file cannot be created.
 */
void WriteBMP(pel** img, char* filename) {
	FILE* out = fopen(filename, "wb");
	if (out == NULL) {
		printf("\n\nFILE CREATION ERROR: %s\n\n", filename);
		exit(1);
	}

	// header first, in one shot
	fwrite(ip.HeaderInfo, sizeof(unsigned char), 54, out);

	// then the pixel data, one padded row per call
	unsigned int row = 0;
	while (row < ip.Vpixels) {
		fwrite(img[row], sizeof(pel), ip.Hbytes, out);
		row++;
	}

	printf("\n  Output BMP File name: %20s  (%u x %u)", filename, ip.Hpixels,
			ip.Vpixels);

	fclose(out);
}

In [None]:
%%writefile /content/src/Imflip.c

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#include "ImageStuff.h"

struct ImgProp ip;

/*
 * Flip the image vertically, in place, by swapping each pixel of row r with
 * the pixel in the same column of row (Vpixels - 1 - r).
 * Returns the same `img` pointer it was given.
 */
pel** FlipImageV(pel** img) {
	struct Pixel pix; //temp swap pixel
	int row, col;
	// Only the real pixel bytes are visited. Walking up to the padded row size
	// (ip.Hbytes) in steps of 3 would touch bytes past the end of the row
	// whenever the padding is 1 or 2 bytes long.
	const int pixelBytes = ip.Hpixels * 3;
	const int halfRows = ip.Vpixels / 2;

	//vertical flip
	for (col = 0; col < pixelBytes; col += 3) {
		for (row = 0; row < halfRows; row++) {
			pel* upper = &img[row][col];
			pel* lower = &img[ip.Vpixels - (row + 1)][col];

			pix.B = upper[0];
			pix.G = upper[1];
			pix.R = upper[2];

			upper[0] = lower[0];
			upper[1] = lower[1];
			upper[2] = lower[2];

			lower[0] = pix.B;
			lower[1] = pix.G;
			lower[2] = pix.R;
		}
	}
	return img;
}

/*
 * Flip the image horizontally, in place: within every row, each pixel of the
 * left half is exchanged with its mirror pixel in the right half.
 * Returns the same `img` pointer it was given.
 */
pel** FlipImageH(pel** img) {
	struct Pixel saved; // holds the left pixel during the exchange
	int row, left;

	for (row = 0; row < ip.Vpixels; row++) {
		pel* line = img[row];
		for (left = 0; left < (ip.Hpixels * 3) / 2; left += 3) {
			// byte offset of the mirror pixel on the right side of the row
			int right = ip.Hpixels * 3 - (left + 3);

			saved.B = line[left];
			saved.G = line[left + 1];
			saved.R = line[left + 2];

			line[left] = line[right];
			line[left + 1] = line[right + 1];
			line[left + 2] = line[right + 2];

			line[right] = saved.B;
			line[right + 1] = saved.G;
			line[right + 2] = saved.R;
		}
	}
	return img;
}

/*
 * imflip [input] [output] [V | H]
 *
 * Loads the input BMP, flips it vertically or horizontally on the CPU,
 * writes the result and reports the CPU time spent in the flip alone.
 */
int main(int argc, char** argv) {
	if (argc != 4) {
		printf("\n\nUsage: imflip [input] [output] [V | H]");
		printf("\n\nExample: imflip square.bmp square_h.bmp h\n\n");
		return 0;
	}

	pel** data = ReadBMP(argv[1]);
	double timer;
	clock_t start, stop;

	start = clock();
	switch (argv[3][0]) {
	case 'v':
	case 'V':
		data = FlipImageV(data);
		break;
	case 'h':
	case 'H':
		data = FlipImageH(data);
		break;
	default:
		printf("\nINVALID OPTION\n");
		// the image was already loaded: release it before leaving
		for (int i = 0; i < ip.Vpixels; i++)
			free(data[i]);
		free(data);
		return 0;
	}
	stop = clock();
	timer = ((double)(stop-start))/(double)CLOCKS_PER_SEC;   // seconds

	// merge with header and write to file
	WriteBMP(data, argv[2]);

	// free() the allocated memory for the image
	for (int i = 0; i < ip.Vpixels; i++)
		free(data[i]);
	free(data);

	printf("\n\nTotal execution time: %9.4f sec", timer);
	// seconds -> nanoseconds is a factor of 1e9; the pixel count is computed
	// in double so that large images cannot overflow an int
	printf(" (%7.3f ns per pixel)\n",
			1e9 * timer / ((double) ip.Hpixels * (double) ip.Vpixels));

	return 0;
}

In [None]:
!gcc -o imflip src/ImageStuff.c src/Imflip.c 

In [None]:
# Run the CPU flip twice: once vertical (V) and once horizontal (H).
# NOTE: both commands read from and write to /content/drive/MyDrive, so Google Drive
# must already be mounted; the cell that mounts it appears after this one in the notebook.
!./imflip /content/drive/MyDrive/dog.bmp /content/drive/MyDrive/dogV.bmp V
!./imflip /content/drive/MyDrive/dog.bmp /content/drive/MyDrive/dogH.bmp H

In [None]:
# Mount Google Drive at /content/drive; the image paths used in this notebook live under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Librerie python per lettura/scrittura file di immagini e loro display: [openCV](https://docs.opencv.org/master/index.html) e [matplotlib](https://matplotlib.org/). Le immagini vengono rappresentate come array multidimensionali tratti dalla libreria fondamentale per il calcolo scientifico [NumPy](https://numpy.org/)

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

def load_rgb(path):
    """Read an image with OpenCV and return it as an RGB NumPy array.

    OpenCV loads images as row (height) x column (width) x color (3) arrays in
    BGR order, so the channels are converted to RGB for matplotlib.

    Raises:
        FileNotFoundError: if OpenCV cannot read `path` (cv.imread returns None
            instead of raising, which would otherwise fail later with an
            unrelated-looking error).
    """
    bgr = cv.imread(path)
    if bgr is None:
        raise FileNotFoundError(f'Cannot read image: {path}')
    return cv.cvtColor(bgr, cv.COLOR_BGR2RGB)


dog = load_rgb('/content/drive/MyDrive/dog.bmp')
print('Image size: ', dog.shape)
dogV = load_rgb('/content/drive/MyDrive/dogV.bmp')
dogH = load_rgb('/content/drive/MyDrive/dogH.bmp')

# original, vertically flipped and horizontally flipped image, one figure each
for image in (dog, dogV, dogH):
    plt.imshow(image)
    plt.show()

In [None]:
%%writefile /content/src/ImflipPth.c

#include <pthread.h>
#include <stdint.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include "ImageStuff.h"

#define MAXTHREADS   128               // size of the per-thread arrays below

int NumThreads;         		       // Total number of threads working in parallel
int ThParam[MAXTHREADS];		       // Thread parameters: main() stores each thread's index here
pthread_t ThHandle[MAXTHREADS];	   // Thread handles
pthread_attr_t ThAttr;			       // Pthread attributes
void (*FlipFunc)(pel** img);	      // Function pointer to flip the image, serial version
void* (*MTFlipFunc)(void *arg);	   // Function pointer to flip the image, multi-threaded version
pel** TheImage;       			       // This is the main image; the thread functions work on it directly
struct ImgProp ip;                 // Image properties declared in ImageStuff.h

/*
 * Serial vertical flip, in place: each pixel of row r is swapped with the
 * pixel in the same column of row (Vpixels - 1 - r).
 */
void FlipImageV(pel** img) {
	struct Pixel pix; //temp swap pixel
	int row, col;
	// Only the real pixel bytes are visited. Walking up to the padded row size
	// (ip.Hbytes) in steps of 3 would touch bytes past the end of the row
	// whenever the padding is 1 or 2 bytes long.
	const int pixelBytes = ip.Hpixels * 3;
	const int halfRows = ip.Vpixels / 2;

	//vertical flip
	for (col = 0; col < pixelBytes; col += 3) {
		for (row = 0; row < halfRows; row++) {
			pel* upper = &img[row][col];
			pel* lower = &img[ip.Vpixels - (row + 1)][col];

			pix.B = upper[0];
			pix.G = upper[1];
			pix.R = upper[2];

			upper[0] = lower[0];
			upper[1] = lower[1];
			upper[2] = lower[2];

			lower[0] = pix.B;
			lower[1] = pix.G;
			lower[2] = pix.R;
		}
	}
}

/*
 * Serial horizontal flip, in place: within every row, each pixel of the left
 * half is swapped with its mirror pixel in the right half. Same algorithm as
 * the multi-threaded MTFlipH, applied to all rows.
 */
void FlipImageH(pel** img) {
	struct Pixel pix; //temp swap pixel
	int row, col;

	//horizontal flip
	for (row = 0; row < ip.Vpixels; row++) {
		for (col = 0; col < (ip.Hpixels * 3) / 2; col += 3) {
			// byte offset of the mirror pixel on the right side of the row
			int mirror = ip.Hpixels * 3 - (col + 3);

			pix.B = img[row][col];
			pix.G = img[row][col + 1];
			pix.R = img[row][col + 2];

			img[row][col] = img[row][mirror];
			img[row][col + 1] = img[row][mirror + 1];
			img[row][col + 2] = img[row][mirror + 2];

			img[row][mirror] = pix.B;
			img[row][mirror + 1] = pix.G;
			img[row][mirror + 2] = pix.R;
		}
	}
}

void *MTFlipV(void* tid) {
	struct Pixel pix; //temp swap pixel
	int row, col;

	long ts = *((int *) tid);                 	// My thread ID is stored here
	ts *= ip.Hbytes / NumThreads;               // start index
	long te = ts + ip.Hbytes / NumThreads - 1; 	// end index

	for (col = ts; col <= te; col += 3) {
		row = 0;
		while (row < ip.Vpixels / 2) {
			pix.B = TheImage[row][col];
			pix.G = TheImage[row][col + 1];
			pix.R = TheImage[row][col + 2];

			TheImage[row][col] = TheImage[ip.Vpixels - (row + 1)][col];
			TheImage[row][col + 1] = TheImage[ip.Vpixels - (row + 1)][col + 1];
			TheImage[row][col + 2] = TheImage[ip.Vpixels - (row + 1)][col + 2];

			TheImage[ip.Vpixels - (row + 1)][col] = pix.B;
			TheImage[ip.Vpixels - (row + 1)][col + 1] = pix.G;
			TheImage[ip.Vpixels - (row + 1)][col + 2] = pix.R;

			row++;
		}
	}
	pthread_exit(0);
}

void *MTFlipH(void* tid) {
	struct Pixel pix; //temp swap pixel
	int row, col;

	long ts = *((int *) tid);       	// My thread ID is stored here
	ts *= ip.Vpixels / NumThreads;			// start index
	long te = ts + ip.Vpixels / NumThreads - 1; 	// end index

	for (row = ts; row <= te; row++) {
		col = 0;
		while (col < ip.Hpixels * 3 / 2) {
			pix.B = TheImage[row][col];
			pix.G = TheImage[row][col + 1];
			pix.R = TheImage[row][col + 2];

			TheImage[row][col] = TheImage[row][ip.Hpixels * 3 - (col + 3)];
			TheImage[row][col + 1] = TheImage[row][ip.Hpixels * 3 - (col + 2)];
			TheImage[row][col + 2] = TheImage[row][ip.Hpixels * 3 - (col + 1)];

			TheImage[row][ip.Hpixels * 3 - (col + 3)] = pix.B;
			TheImage[row][ip.Hpixels * 3 - (col + 2)] = pix.G;
			TheImage[row][ip.Hpixels * 3 - (col + 1)] = pix.R;

			col += 3;
		}
	}
	pthread_exit(NULL);
}

int main(int argc, char** argv) {
	char Flip;
	int a, i, ThErr;
	struct timeval t;
	double StartTime, EndTime;
	double TimeElapsed;

	switch (argc) {
	case 3:
		NumThreads = 1;
		Flip = 'V';
		break;
	case 4:
		NumThreads = 1;
		Flip = toupper(argv[3][0]);
		break;
	case 5:
		NumThreads = atoi(argv[4]);
		Flip = toupper(argv[3][0]);
		break;
	default:
		printf("\n\nUsage: imflipP input output [v/h] [thread count]");
		printf("\n\nExample: imflipP infilename.bmp outname.bmp h 8\n\n");
		return 0;
	}

  if (NumThreads != 1) {
    printf("\nExecuting the multi-threaded version with %d threads ...\n",NumThreads);
    MTFlipFunc = (Flip == 'V') ? MTFlipV : MTFlipH;
  } else {
    printf("\nExecuting the serial version ...\n");
    FlipFunc = (Flip == 'V') ? FlipImageV : FlipImageH;
  }
	

	// load image
	TheImage = ReadBMP(argv[1]);

	gettimeofday(&t, NULL);
	StartTime = (double) t.tv_sec * 1000000.0 + ((double) t.tv_usec);

	if (NumThreads > 1) {
		pthread_attr_init(&ThAttr);
		pthread_attr_setdetachstate(&ThAttr, PTHREAD_CREATE_JOINABLE);
    for (i = 0; i < NumThreads; i++) {
      ThParam[i] = i;
      ThErr = pthread_create(&ThHandle[i], &ThAttr, MTFlipFunc, (void *) &ThParam[i]);
      if (ThErr != 0) {
        printf("\nThread Creation Error %d. Exiting abruptly... \n", ThErr);
        exit(EXIT_FAILURE);
      }
    }
    pthread_attr_destroy(&ThAttr);
    for (i = 0; i < NumThreads; i++) {
      pthread_join(ThHandle[i], NULL);
    }
	} else
		(*FlipFunc)(TheImage);

	gettimeofday(&t, NULL);
	EndTime = (double) t.tv_sec * 1000000.0 + ((double) t.tv_usec);
	TimeElapsed = (EndTime - StartTime) / 1000000.00;

	//merge with header and write to file
	WriteBMP(TheImage, argv[2]);

	// free() the allocated memory for the image
	for (i = 0; i < ip.Vpixels; i++) {
		free(TheImage[i]);
	}
	free(TheImage);

	printf("\n\nTotal execution time: %9.4f sec (%s flip)", TimeElapsed,
			Flip == 'V' ? "Vertical" : "Horizontal");
	printf(" (%6.3f ns/pixel)\n",
			1000000 * TimeElapsed / (double) (ip.Hpixels * ip.Vpixels));

	return (EXIT_SUCCESS);
}

In [None]:
!gcc -o imflip src/ImageStuff.c src/ImflipPth.c -pthread

In [None]:
# Vertical flip with 4 threads.
# NOTE(review): the paths are relative, so dog.bmp is expected in the current working directory,
# whereas the other cells of this notebook use /content/drive/MyDrive — confirm which is intended.
!./imflip dog.bmp dogV.bmp V 4

# Blocks and grids

**Grid 1D**: stampa DIMs e IDs di grid, block e thread


In [None]:
%%cu
#include <stdio.h>

/*
 * Each thread prints its own thread/block indices together with the
 * block and grid dimensions of the launch.
 */
__global__ void checkIndex(void) {
	const uint3 tIdx = threadIdx;
	const uint3 bIdx = blockIdx;
	const dim3 bDim = blockDim;
	const dim3 gDim = gridDim;

	// a single printf call, so that one thread's line is emitted as a unit
	printf("threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) "
			"blockDim:(%d, %d, %d) gridDim:(%d, %d, %d)\n",
			tIdx.x, tIdx.y, tIdx.z,
			bIdx.x, bIdx.y, bIdx.z,
			bDim.x, bDim.y, bDim.z,
			gDim.x, gDim.y, gDim.z);
}

int main(int argc, char **argv) {

	// definisce grid e struttura dei blocchi
	dim3 block(4);
	dim3 grid(3);

	// controlla dim. dal lato host
	printf("CHECK lato host:\n");
	printf("grid.x = %d\t grid.y = %d\t grid.z = %d\n", grid.x, grid.y, grid.z);
	printf("block.x = %d\t block.y = %d\t block.z %d\n\n", block.x, block.y, block.z);

	// controlla dim. dal lato device
	printf("CHECK lato device:\n");
	checkIndex<<<grid, block>>>();

	// reset device
	cudaDeviceReset();
	return(0);
}

In [None]:
# NOTE(review): no cell visible in this notebook builds an executable named `grid1D`;
# this command presumably relies on a binary compiled elsewhere — confirm or remove.
!./grid1D

### **TODO**: definire un kernel con block 2D e grid 2D e stampare a video solo i thread la cui somma degli ID (`threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y`) è pari a un numero della sequenza di Fibonacci ([Fibonacci-wikipedia](https://it.wikipedia.org/wiki/Successione_di_Fibonacci))

$$
\begin{aligned}
F_0 &= 0,\\
F_1 &= 1,\\
F_{n} &= F_{n-1} + F_{n-2},\quad \text{(per ogni } n>1\text{)}
\end{aligned}
$$


In [None]:
%%cu
#include <stdio.h>

/*
 * Print the indices of the calling thread only when the sum
 * threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y
 * is a number of the Fibonacci sequence (0, 1, 1, 2, 3, 5, ...).
 */
__global__ void checkIndex(void) {

  uint sum = threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y;

  // walk the sequence until it reaches or passes the sum
  int current = 0;
  for (int previous = 1; current < sum; ) {
      int next = current + previous;
      previous = current;
      current = next;
  }

  // the walk stopped beyond the sum: not a Fibonacci number, nothing to print
  if (current != sum)
    return;

  printf("ID sum = %d  --  threadIdx:(%d, %d) blockIdx:(%d, %d)\n", sum, 
          threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y);
}

int main(int argc, char **argv) {

	// grid and block structure
	dim3 block(10,10);
	dim3 grid(1,1);

	// check for host
	printf("CHECK for host:\n");
	printf("grid.x = %d\t grid.y = %d\t grid.z = %d\n", grid.x, grid.y, grid.z);
	printf("block.x = %d\t block.y = %d\t block.z %d\n\n", block.x, block.y, block.z);

	// check for device
	printf("CHECK for device:\n");
	checkIndex<<<grid, block>>>();

	// reset device
	cudaDeviceReset();
	return (0);
}

# Image flip - GPU

In [None]:
%%writefile /content/src/bmpUtil.h

#ifndef BMPUTIL_H
#define BMPUTIL_H

/*
 * Geometry and header of the BMP image being processed.
 * NOTE: `img` is a variable DEFINED by this header, so the header is meant to
 * be included by a single source file; the guard only protects against
 * repeated inclusion within that file.
 */
struct imgBMP {
	int width;                     // image width in pixels
	int height;                    // image height in pixels
	unsigned char headInfo[54];    // copy of the 54-byte BMP header
	unsigned long int rowByte;     // bytes per row
} img;

#define	WIDTHB		img.rowByte
#define	WIDTH		img.width
#define	HEIGHT		img.height
#define	IMAGESIZE	(WIDTHB*HEIGHT)    // total size of the pixel data in bytes

struct pixel {
	unsigned char R;
	unsigned char G;
	unsigned char B;
};

typedef unsigned long ulong;
typedef unsigned int uint;
typedef unsigned char pel;    // pixel element

pel *ReadBMPlin(char*);         // Load a BMP image
void WriteBMPlin(pel *, char*); // Store a BMP image

#endif // BMPUTIL_H


In [None]:
%%writefile /content/src/common.h

#include <sys/time.h>

#ifndef _COMMON_H
#define _COMMON_H

/*
 * Evaluate a CUDA runtime call once and, when it does not return cudaSuccess,
 * report the source location, the error code and its description on stderr.
 * Execution continues after the report.
 * Wrapped in do { } while (0) so that `CHECK(x);` is a single statement and
 * can be used safely inside an unbraced if/else.
 */
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t error = (call);                                          \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", (int) error,                 \
                cudaGetErrorString(error));                                    \
    }                                                                          \
} while (0)

/*
 * Current wall-clock time in seconds, with microsecond resolution, as
 * reported by gettimeofday(). Meant to be called twice to time an interval.
 */
inline double seconds() {
    struct timeval tp;
    // the timezone argument of gettimeofday() is obsolete: pass NULL
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

inline void device_name() {
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));
}

#endif // _COMMON_H


In [None]:
%%writefile /content/src/ImgFlipCUDA.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bmpUtil.h"
#include "common.h"

/*
 * Kernel 1D that flips the given image vertically
 * each thread only flips a single pixel (R,G,B)
 *
 * Expected launch: a 1D grid in which every image row is covered by
 * m = ceil(w / blockDim.x) consecutive blocks, i.e. h * m blocks in total.
 *
 * imgDst  destination image (device memory)
 * imgSrc  source image (device memory)
 * w, h    image width and height in pixels
 */
__global__ void Vflip(pel *imgDst, const pel *imgSrc, const uint w, const uint h) {
	// ** pixel granularity **
	const uint b = blockDim.x;                 // block dim
	const uint m = (w + b - 1) / b;            // num of blocks in a row
	const uint r = blockIdx.x / m;             // row of the source pixel
	// The column is the offset inside the row's own group of m blocks.
	// (Deriving it from the global thread index minus r * w is only right
	// when w is a multiple of the block size: each row spans m * b threads.)
	const uint c = (blockIdx.x % m) * b + threadIdx.x;   // col of the source pixel

	if (r >= h || c >= w)                      // row or col out of range
		return;

	//  ** byte granularity **
	const size_t s = ((size_t) w * 3 + 3) & ~(size_t) 3;   // num bytes x row (mult. 4)
	const size_t r1 = h - 1 - r;                           // dest. row (mirror)
	const size_t p = s * r + 3 * (size_t) c;               // src byte position of the pixel
	const size_t q = s * r1 + 3 * (size_t) c;              // dst byte position of the pixel

	// copy the three color bytes of the pixel to the mirrored row
	imgDst[q] = imgSrc[p];
	imgDst[q + 1] = imgSrc[p + 1];
	imgDst[q + 2] = imgSrc[p + 2];
}

/*
 *  Kernel that flips the given image horizontally
 *  each thread only flips a single pixel (R,G,B)
 *
 *  Expected launch: a 1D grid in which every image row is covered by
 *  m = ceil(width / blockDim.x) consecutive blocks. The launch must not
 *  contain more row groups than the image has rows, because the height is
 *  not known to the kernel.
 *
 *  ImgDst  destination image (device memory)
 *  ImgSrc  source image (device memory)
 *  width   image width in pixels
 */
__global__ void Hflip(pel *ImgDst, pel *ImgSrc, uint width) {
	// ** pixel granularity **
	const uint b = blockDim.x;                 // block dim
	const uint m = (width + b - 1) / b;        // num of blocks in a row
	const uint r = blockIdx.x / m;             // row of the source pixel
	// The column is the offset inside the row's own group of m blocks.
	// (Deriving it from the global thread index minus r * width is only right
	// when width is a multiple of the block size: each row spans m * b threads.)
	const uint c = (blockIdx.x % m) * b + threadIdx.x;   // col of the source pixel

	if (c >= width)                            // col out of range
		return;

	//  ** byte granularity **
	const size_t s = ((size_t) width * 3 + 3) & ~(size_t) 3;   // num bytes x row (mult. 4)
	const size_t c1 = width - 1 - c;                           // dest. col (mirror)
	const size_t p = s * r + 3 * (size_t) c;                   // src byte position of the pixel
	const size_t q = s * r + 3 * c1;                           // dst byte position of the pixel

	// copy the three color bytes of the pixel to the mirrored column
	ImgDst[q] = ImgSrc[p];
	ImgDst[q + 1] = ImgSrc[p + 1];
	ImgDst[q + 2] = ImgSrc[p + 2];
}


/*
 *  Read a 24-bit/pixel BMP file into a 1D linear array.
 *  Allocate memory to store the 1D image and return its pointer
 *
 *  The image geometry and a copy of the 54-byte header are stored in the
 *  global `img`. Returns NULL when the image buffer cannot be allocated; the
 *  caller owns the returned buffer and must free() it.
 *  Terminates the program when the file cannot be opened, is truncated or
 *  reports a non-positive size.
 */
pel *ReadBMPlin(char* fn) {
	pel *Img;
	FILE* f = fopen(fn, "rb");
	if (f == NULL) {
		printf("\n\n%s NOT FOUND\n\n", fn);
		exit(EXIT_FAILURE);
	}

	// read the 54-byte header; a shorter file cannot hold an image
	pel HeaderInfo[54];
	if (fread(HeaderInfo, sizeof(pel), 54, f) != 54) {
		printf("\n\n%s: CANNOT READ BMP HEADER\n\n", fn);
		fclose(f);
		exit(EXIT_FAILURE);
	}

	// extract image height and width from header (offsets 18 and 22);
	// memcpy avoids an unaligned, type-punned load
	int width, height;
	memcpy(&width, &HeaderInfo[18], sizeof(width));
	memcpy(&height, &HeaderInfo[22], sizeof(height));
	if (width <= 0 || height <= 0) {
		printf("\n\n%s: UNSUPPORTED BMP SIZE (%d x %d)\n\n", fn, width, height);
		fclose(f);
		exit(EXIT_FAILURE);
	}
	img.width = width;
	img.height = height;
	int RowBytes = (width * 3 + 3) & (~3);  // row size rounded up to a multiple of 4 bytes
	img.rowByte = RowBytes;
	//save header for re-use
	memcpy(img.headInfo, HeaderInfo, 54);
	printf("\n Input File name: %5s  (%d x %d)   File Size=%lu", fn, img.width,
			img.height, IMAGESIZE);

	// allocate memory to store the main image (1 Dimensional array)
	Img = (pel *) malloc(IMAGESIZE);
	if (Img == NULL) {
		fclose(f);       // do not leak the file handle
		return Img;      // Cannot allocate memory
	}

	// read the image from disk
	if (fread(Img, sizeof(pel), IMAGESIZE, f) != IMAGESIZE) {
		printf("\n\n%s: TRUNCATED BMP DATA\n\n", fn);
		free(Img);
		fclose(f);
		exit(EXIT_FAILURE);
	}
	fclose(f);
	return Img;
}

/*
 *  Write the 1D linear-memory stored image into file
 *
 *  The file consists of the 54-byte header saved in the global `img`
 *  followed by IMAGESIZE bytes of pixel data taken from `Img`.
 *  Terminates the program when the file cannot be created or written.
 */
void WriteBMPlin(pel *Img, char* fn) {
	FILE* f = fopen(fn, "wb");
	if (f == NULL) {
		printf("\n\nFILE CREATION ERROR: %s\n\n", fn);
		exit(1);
	}
	//write header
	size_t headerWritten = fwrite(img.headInfo, sizeof(pel), 54, f);
	//write data
	size_t dataWritten = fwrite(Img, sizeof(pel), IMAGESIZE, f);
	// a short write (e.g. disk full) would otherwise leave a corrupt file unnoticed
	if (headerWritten != 54 || dataWritten != IMAGESIZE) {
		printf("\n\nFILE WRITE ERROR: %s\n\n", fn);
		fclose(f);
		exit(1);
	}
	printf("\nOutput File name: %5s  (%d x %d)   File Size=%lu", fn, img.width,
			img.height, IMAGESIZE);
	fclose(f);
}

/*
 * MAIN
 */
int main(int argc, char **argv) {
	char flip = 'V';
	uint dimBlock = 256, dimGrid;
	pel *imgSrc, *imgDst;		 // Where images are stored in CPU
	pel *imgSrcGPU, *imgDstGPU;	 // Where images are stored in GPU

	if (argc > 4) {
		dimBlock = atoi(argv[4]);
		flip = argv[3][0];
	}
	else if (argc > 3) {
		flip = argv[3][0];
	}
	else if (argc < 3) {
		printf("\n\nUsage:   imflipGPU InputFilename OutputFilename [V/H] [dimBlock]");
		exit(EXIT_FAILURE);
	}
	if ((flip != 'V') && (flip != 'H')) {
		printf("Invalid flip option '%c'. Must be 'V','H'... \n",flip);
		exit(EXIT_FAILURE);
	}

	// Create CPU memory to store the input and output images
	imgSrc = ReadBMPlin(argv[1]); // Read the input image if memory can be allocated
	if (imgSrc == NULL) {
		printf("Cannot allocate memory for the input image...\n");
		exit(EXIT_FAILURE);
	}
	imgDst = (pel *) malloc(IMAGESIZE);
	if (imgDst == NULL) {
		free(imgSrc);
		printf("Cannot allocate memory for the input image...\n");
		exit(EXIT_FAILURE);
	}

	// Allocate GPU buffer for the input and output images
	CHECK(cudaMalloc((void**) &imgSrcGPU, IMAGESIZE));
	CHECK(cudaMalloc((void**) &imgDstGPU, IMAGESIZE));

	// Copy input vectors from host memory to GPU buffers.
	CHECK(cudaMemcpy(imgSrcGPU, imgSrc, IMAGESIZE, cudaMemcpyHostToDevice));

	// invoke kernels (define grid and block sizes)
	int rowBlock = (WIDTH + dimBlock - 1) / dimBlock;
	dimGrid = HEIGHT * rowBlock;

	double start = seconds();   // start time

	switch (flip) {
	case 'H':
		Hflip<<<dimGrid, dimBlock>>>(imgDstGPU, imgSrcGPU, WIDTH);
		break;
	case 'V':
		Vflip<<<dimGrid, dimBlock>>>(imgDstGPU, imgSrcGPU, WIDTH, HEIGHT);
		break;
	}
	// cudaDeviceSynchronize waits for the kernel to finish, and returns
	// any errors encountered during the launch.
	CHECK(cudaDeviceSynchronize());

	double stop = seconds();   // elapsed time

	// Copy output (results) from GPU buffer to host (CPU) memory.
	CHECK(cudaMemcpy(imgDst, imgDstGPU, IMAGESIZE, cudaMemcpyDeviceToHost));

	// Write the flipped image back to disk
	WriteBMPlin(imgDst, argv[2]);

	printf("\nKernel elapsed time %f sec \n\n", stop - start);

	// Deallocate CPU, GPU memory and destroy events.
	cudaFree(imgSrcGPU);
	cudaFree(imgDstGPU);

	// cudaDeviceReset must be called before exiting in order for profiling and
	// tracing tools spel as Parallel Nsight and Visual Profiler to show complete traces.
	cudaError_t	cudaStatus = cudaDeviceReset();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceReset failed!");
		free(imgSrc);
		free(imgDst);
		exit(EXIT_FAILURE);
	}
	free(imgSrc);
	free(imgDst);
	return (EXIT_SUCCESS);
}



In [None]:
!nvcc src/ImgFlipCUDA.cu -o imfpliGPU

In [None]:
!./imfpliGPU /content/drive/MyDrive/dog.bmp /content/drive/MyDrive/dogV.bmp V
!./imfpliGPU /content/drive/MyDrive/dog.bmp /content/drive/MyDrive/dogH.bmp H

In [None]:
# OpenCV returns a NumPy array: row (height) x column (width) x color (3), in BGR order
dog_bgr = cv.imread('/content/drive/MyDrive/dog.bmp')
print('Image size: ', dog_bgr.shape)
# matplotlib expects RGB, so every image is converted after loading
dog = cv.cvtColor(dog_bgr, cv.COLOR_BGR2RGB)
dogV, dogH = [
    cv.cvtColor(cv.imread(path), cv.COLOR_BGR2RGB)
    for path in ('/content/drive/MyDrive/dogV.bmp', '/content/drive/MyDrive/dogH.bmp')
]

# original, vertically flipped and horizontally flipped image, one figure each
for image in (dog, dogV, dogH):
    plt.imshow(image)
    plt.show()

In [None]:
# Path kept as a note only: a bare path is not valid Python, so running it as code raises a SyntaxError.
# /content/drive/MyDrive/GPU_github/GPUcomputing/lab2/images/dog.bmp