# Installing environment


## Installing CUDA library

In [1]:
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'nvidia-kernel-common-418-server' for glob 'nvidia*'
Note, selecting 'nvidia-325-updates' for glob 'nvidia*'
Note, selecting 'nvidia-346-updates' for glob 'nvidia*'
Note, selecting 'nvidia-driver-binary' for glob 'nvidia*'
Note, selecting 'nvidia-331-dev' for glob 'nvidia*'
Note, selecting 'nvidia-304-updates-dev' for glob 'nvidia*'
Note, selecting 'nvidia-compute-utils-418-server' for glob 'nvidia*'
Note, selecting 'nvidia-384-dev' for glob 'nvidia*'
Note, selecting 'nvidia-libopencl1-346-updates' for glob 'nvidia*'
Note, selecting 'nvidia-driver-440-server' for glob 'nvidia*'
Note, selecting 'nvidia-340-updates-uvm' for glob 'nvidia*'
Note, selecting 'nvidia-dkms-450-server' for glob 'nvidia*'
Note, selecting 'nvidia-kernel-common' for glob 'nvidia*'
Note, selecting 'nvidia-kernel-source-440-server' for glob 'nvidia*'
Note, selecting 'nvidia-331-updates-uvm' for glob 'nvidi

In [2]:
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

--2021-05-25 04:20:58--  https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64
Resolving developer.nvidia.com (developer.nvidia.com)... 152.195.57.194
Connecting to developer.nvidia.com (developer.nvidia.com)|152.195.57.194|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://developer.nvidia.com/compute/cuda/9.2/prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 [following]
--2021-05-25 04:20:58--  https://developer.nvidia.com/compute/cuda/9.2/prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64
Reusing existing connection to developer.nvidia.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://developer.download.nvidia.com/compute/cuda/9.2/secure/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb?ywyTJY57SXM9DyHZ4dhAOTHICcPbhoGkIL56n-vjhIZZSuKChzvIvLaS7TrqykMB64gt-_lX5Y8WmKCYB0W-q-Ukw2SG9AhAY5WfF6uZiOMA_

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-6gs26fm9
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-6gs26fm9
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=2252483a18db2aa9a1d8d2aaa26b2de8c2c34af1c4ef05cb2325fe6c60c2a60d
  Stored in directory: /tmp/pip-ephem-wheel-cache-7bfn1kdd/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


#Installing the file with helper functions.

In [None]:
%%cuda --name header.h 
#ifndef HEADER_H
#define HEADER_H

#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>

#define N 			10
#define DISPLAY		100
#define MAX_VALUE	10000

struct timeval startTime, stopTime;
int started = 0;

void start_timer() {
	started = 1;
	gettimeofday(&startTime, NULL);
}

double stop_timer() {
	long seconds, useconds;
	double duration = -1;

	if (started) {
		gettimeofday(&stopTime, NULL);
		seconds  = stopTime.tv_sec  - startTime.tv_sec;
		useconds = stopTime.tv_usec - startTime.tv_usec;
		duration = (seconds * 1000.0) + (useconds / 1000.0);
		started = 0;
	}
	return duration;
}

void random_array(int *array, int size) {
	int i;

	srand(time(0));
	for (i = 0; i < size; i++) {
		array[i] = (rand() % 100) + 1;
	}
}

void fill_array(int *array, int size) {
	int i;

	for (i = 0; i < size; i++) {
		array[i] = (i % MAX_VALUE) + 1;
	}
}

void display_array(const char *text, int *array) {
	int i;

	printf("%s = [%4i", text, array[0]);
	for (i = 1; i < DISPLAY; i++) {
		printf(",%4i", array[i]);
	}
	printf(", ... ,]\n");
}

#endif /* HEADER_H */


'File written in /content/src/header.h'

#Basic examples

## intro0.cu

In [None]:
%%cu
#include <stdio.h>

int main(int argc, char* argv[]) {
	int i, count;
	cudaDeviceProp prop;
	
	cudaGetDeviceCount(&count);
	for (i = 0; i < count; i++) {
		cudaGetDeviceProperties(&prop, i);
		printf("Device name: %s\n", prop.name);
	}
	return 0;
}


Device name: Tesla P100-PCIE-16GB



## intro1.cu

In [None]:
%%cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void kernel(void) {
	printf("GPU[%i, %i]: Hello world\n", blockIdx.x, threadIdx.x);
}

int main(int argc, char* argv[]) {
	kernel<<<2, 4>>>();
	cudaDeviceSynchronize();

	return 0;
}

GPU[1, 0]: Hello world
GPU[1, 1]: Hello world
GPU[1, 2]: Hello world
GPU[1, 3]: Hello world
GPU[0, 0]: Hello world
GPU[0, 1]: Hello world
GPU[0, 2]: Hello world
GPU[0, 3]: Hello world



## intro2.cu

In [None]:
%%cu
#include <stdio.h>
#include <cuda_runtime.h>

__device__ float fx(float x, float y) {
	return x + y;
}

__global__ void gpu_main(void) {
	printf("GPU[%i, %i] res = %f\n", blockIdx.x, threadIdx.x, fx(1.0, 2.0));
}

int main(int argc, char* argv[]) {
	gpu_main<<<1, 2>>>();
	cudaDeviceSynchronize();

	return 0;
}

GPU[0, 0] res = 3.000000
GPU[0, 1] res = 3.000000



## intro3.cu

In [None]:
%%cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void add(int *a, int *b, int *c) {
	*c = *a + *b;
}

int main(int argc, char* argv[]) {
	int a, b, c;
	int *d_a, *d_b, *d_c;
	
	a = 10;
	b = 10;

	cudaMalloc((void**) &d_a, sizeof(int));
	cudaMalloc((void**) &d_b, sizeof(int));
	cudaMalloc((void**) &d_c, sizeof(int));
	
	cudaMemcpy(d_a, &a, sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice);
	
	add<<<1, 1>>>(d_a, d_b, d_c);

	cudaMemcpy(&c, d_c, sizeof(int), cudaMemcpyDeviceToHost);
	
	printf("c = %i\n", c);
	
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	
	return 0;
}

c = 20



## intro4.cu

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include "/content/src/header.h"

#define SIZE 512

__global__ void add(int *a, int *b, int *c) {
	c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

int main(int argc, char* argv[]) {
	int *a, *b, *c;
	int *d_a, *d_b, *d_c;
	double ms;
	
	a = (int*) malloc(SIZE * sizeof(int));
	fill_array(a, SIZE);
	display_array("a", a);
	
	b = (int*) malloc(SIZE * sizeof(int));
	fill_array(b, SIZE);
	display_array("b", b);
	
	c = (int*) malloc(SIZE * sizeof(int));

	cudaMalloc((void**) &d_a, SIZE * sizeof(int));
	cudaMalloc((void**) &d_b, SIZE * sizeof(int));
	cudaMalloc((void**) &d_c, SIZE * sizeof(int));
	
	cudaMemcpy(d_a, a, SIZE * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SIZE * sizeof(int), cudaMemcpyHostToDevice);
	
	start_timer();
	add<<<1, SIZE>>>(d_a, d_b, d_c);
	ms += stop_timer();
	
	cudaMemcpy(c, d_c, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
	display_array("c", c);
	printf("time = %.5lf\n", ms);
	
	cudaFree(d_c);
	cudaFree(d_b);
	cudaFree(d_a);
	
	free(c);
	free(b);
	free(a);
	
	return 0;
}

a = [   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, ... ,]
b = [   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  

## intro5.cu

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include "/content/src/header.h"

#define SIZE 512

__global__ void add(int *a, int *b, int *c) {
	c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

int main(int argc, char* argv[]) {
	int *a, *b, *c;
	int *d_a, *d_b, *d_c;
	double ms;

	a = (int*) malloc(SIZE * sizeof(int));
	fill_array(a, SIZE);
	display_array("a", a);
	
	b = (int*) malloc(SIZE * sizeof(int));
	fill_array(b, SIZE);
	display_array("b", b);
	
	c = (int*) malloc(SIZE * sizeof(int));

	cudaMalloc((void**) &d_a, SIZE * sizeof(int));
	cudaMalloc((void**) &d_b, SIZE * sizeof(int));
	cudaMalloc((void**) &d_c, SIZE * sizeof(int));
	
	cudaMemcpy(d_a, a, SIZE * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SIZE * sizeof(int), cudaMemcpyHostToDevice);
	
	start_timer();
	add<<<SIZE, 1>>>(d_a, d_b, d_c);
	ms += stop_timer();
	
	cudaMemcpy(c, d_c, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
	display_array("c", c);
	printf("time = %.5lf\n", ms);
	
	cudaFree(d_c);
	cudaFree(d_b);
	cudaFree(d_a);
	
	free(c);
	free(b);
	free(a);
	
	return 0;
}

a = [   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, ... ,]
b = [   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  

## intro6.cu

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include "/content/src/header.h"

#define SIZE 	1e6
#define THREADS 512

__global__ void add(int *a, int *b, int *c) {
	int i = threadIdx.x + (blockIdx.x * blockDim.x);
	if (i < SIZE) {
		c[i] = a[i] + b[i];
	}
}

int main(int argc, char* argv[]) {
	int *a, *b, *c;
	int *d_a, *d_b, *d_c;
	double ms;
	
	a = (int*) malloc(SIZE * sizeof(int));
	fill_array(a, SIZE);
	display_array("a", a);
	
	b = (int*) malloc(SIZE * sizeof(int));
	fill_array(b, SIZE);
	display_array("b", b);
	
	c = (int*) malloc(SIZE * sizeof(int));

	cudaMalloc((void**) &d_a, SIZE * sizeof(int));
	cudaMalloc((void**) &d_b, SIZE * sizeof(int));
	cudaMalloc((void**) &d_c, SIZE * sizeof(int));
	
	cudaMemcpy(d_a, a, SIZE * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SIZE * sizeof(int), cudaMemcpyHostToDevice);
	
	start_timer();
	add<<<SIZE/THREADS + 1, THREADS>>>(d_a, d_b, d_c);
	ms = stop_timer();
	
	cudaMemcpy(c, d_c, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
	display_array("c", c);
	printf("time = %.5lf\n", ms);
	
	cudaFree(d_c);
	cudaFree(d_b);
	cudaFree(d_a);
	
	free(c);
	free(b);
	free(a);
	
	return 0;
}


a = [   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, ... ,]
b = [   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  

# Example 1

## Sequential (C)

In [None]:
%%cu
// =================================================================
//
// File: example1.c
// Author: Pedro Perez
// Description: This file contains the code that adds all the 
//				elements of an integer array. The time this 
//				implementation takes will be used as the basis to 
//				calculate the improvement obtained with parallel 
//				technologies.
//
// Copyright (c) 2020 by Tecnologico de Monterrey.
// All Rights Reserved. May be reproduced for any non-commercial
// purpose.
//
// =================================================================

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "/content/src/header.h"

#define SIZE 1000000 //1e6

double sum_array(int *array, int size) {
	double acum = 0;
	int i;
	
	for (i = 0; i < size; i++) {
		acum += array[i];
	}
	return acum;
}

int main(int argc, char* argv[]) {
	int i, j, *a;
	double ms, result;
	
	a = (int *) malloc(sizeof(int) * SIZE);
	fill_array(a, SIZE);
	display_array("a", a);
	
	printf("Starting...\n");
	ms = 0;
	for (i = 0; i < N; i++) {
		start_timer();
		
		result = sum_array(a, SIZE);
		
		ms += stop_timer();
	}
	printf("sum = %lf\n", result);
	printf("avg time = %.5lf ms\n", (ms / N));
	
	free(a);
	return 0;
}

a = [   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, ... ,]
Starting...
sum = 5000500000.000000
avg time = 3.07320 ms



## Parallel (CUDA)

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include "/content/src/header.h"

#define MIN(a,b) (a<b?a:b)

#define SIZE	1e6
#define THREADS	256
#define BLOCKS	MIN(32, (SIZE + THREADS - 1)/ THREADS)

__global__ void sum(int *array, long *result) {
	__shared__ long cache[THREADS];
	
	int tid = threadIdx.x + (blockIdx.x * blockDim.x);
	int cacheIndex = threadIdx.x;
	
	long acum = 0;
	while (tid < SIZE) {
		acum += array[tid];
		tid += blockDim.x * gridDim.x;
	}
	
	cache[cacheIndex] = acum;
	
	__syncthreads();
	
	int i = blockDim.x / 2;
	while (i > 0) {
		if (cacheIndex < i) {
			cache[cacheIndex] += cache[cacheIndex + i];
		}
		__syncthreads();
		i /= 2;
	}
	
	if (cacheIndex == 0) {
		result[blockIdx.x] = cache[cacheIndex];
	}
}

int main(int argc, char* argv[]) {
	int i, *array, *d_a;
	long *results, *d_r;
	double ms;
	
	array = (int*) malloc( SIZE * sizeof(int) );
	fill_array(array, SIZE);
	display_array("array", array);
	
	results = (long*) malloc( BLOCKS * sizeof(long) );
	
	cudaMalloc( (void**) &d_a, SIZE * sizeof(int) );
	cudaMalloc( (void**) &d_r, BLOCKS * sizeof(long) );
	
	cudaMemcpy(d_a, array, SIZE * sizeof(int), cudaMemcpyHostToDevice);
	
	printf("Starting...\n");
	ms = 0;
	for (i = 1; i <= N; i++) {
		start_timer();
		sum<<<BLOCKS, THREADS>>> (d_a, d_r);
		ms += stop_timer();
	}
	
	cudaMemcpy(results, d_r, BLOCKS * sizeof(long), cudaMemcpyDeviceToHost);
	
	long acum = 0;
	for (i = 0; i < BLOCKS; i++) {
		acum += results[i];
	}
	
	printf("sum = %li\n", acum);
	printf("avg time = %.5lf\n", (ms / N));
	
	cudaFree(d_r);
	cudaFree(d_a);
	
	free(array);
	free(results);
	return 0;
}


array = [   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, ... ,]
Starting...
sum = 5000500000
avg time = 0.00590



# Example 2

## Sequential (C)

In [None]:
%%cu
// =================================================================
//
// File: example2.c
// Author: Pedro Perez
// Description: This file contains the code to perform the numerical 
//				integration of a function within a defined interval. 
//				The time this implementation takes will be used as 
//				the basis to calculate the improvement obtained with 
//				parallel technologies.
//
// Copyright (c) 2020 by Tecnologico de Monterrey.
// All Rights Reserved. May be reproduced for any non-commercial
// purpose.
//
// =================================================================

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "/content/src/header.h"

#define PI 3.14159265
#define RECTS 1000000 //1e6

#define MIN(a,b) (((a)<(b))?(a):(b))
#define MAX(a,b) (((a)>(b))?(a):(b))

double function(double x) {
	return sin(x);
}

double integration(double a, double b, double (*fn) (double)) {
	double high, dx, acum, x;
	int i;

	x = MIN(a, b);
	dx = (MAX(a, b) - MIN(a, b)) / RECTS;
	acum = 0;
	for (i = 0; i < RECTS; i++) {
		acum += fn(x + (i * dx));
	}
	acum = acum * dx;
}

int main(int argc, char* argv[]) {
	int i, j;
	double ms, result;
	
	printf("Starting...\n");
	ms = 0;
	for (i = 0; i < N; i++) {
		start_timer();
		
		result = integration(0, PI, function);
		
		ms += stop_timer();
	}
	printf("sum = %lf\n", result);
	printf("avg time = %.5lf ms\n", (ms / N));
	
	return 0;
}

Starting...
sum = 2.000000
avg time = 34.57860 ms



## Parallel (CUDA)

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include "/content/src/header.h"

#define MIN(a,b) (((a)<(b))?(a):(b))
#define MAX(a,b) (((a)>(b))?(a):(b))

#define SIZE	1e6
#define THREADS	256
#define BLOCKS MIN(32, (SIZE + THREADS - 1)/ THREADS)

#define PI 3.14159265
#define RECTS 1000000 //1e6
#define START 0.0
#define END PI

__global__ void integration(double *x, double *dx, double *results) {
	__shared__ double cache[THREADS];
	
	int tid = threadIdx.x + (blockIdx.x * blockDim.x);
	int cacheIndex = threadIdx.x;
	
	double acum = 0;
	while (tid < SIZE) {
    acum += sin( (*x) + (tid * (*dx)) );
		tid += blockDim.x * gridDim.x;
	}
	
	cache[cacheIndex] = acum;
	
	__syncthreads();
	
	int i = blockDim.x / 2;
	while (i > 0) {
		if (cacheIndex < i) {
			cache[cacheIndex] += cache[cacheIndex + i];
		}
		__syncthreads();
		i /= 2;
	}
	
	if (cacheIndex == 0) {
		results[blockIdx.x] = cache[cacheIndex];
	}
}

int main(int argc, char* argv[]) {
	double x, dx, *results;
  double *d_x, *d_dx, *d_r;
	double ms;
  int i;

  // vid integration(double *x, double *dx, double *results) {
  x = START;
  dx = (END - START) / RECTS;
  
	results = (double*) malloc( BLOCKS * sizeof(double) );
	
	cudaMalloc( (void**) &d_x, sizeof(double));
  cudaMalloc( (void**) &d_dx, sizeof(double));
	cudaMalloc( (void**) &d_r, BLOCKS * sizeof(double) );
	
	cudaMemcpy(d_x, &x, sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_dx, &dx, sizeof(double), cudaMemcpyHostToDevice);
	
	printf("Starting...\n");
	ms = 0;
	for (i = 1; i <= N; i++) {
		start_timer();
		integration<<<BLOCKS, THREADS>>> (d_x, d_dx, d_r);
		ms += stop_timer();
	}
	
	cudaMemcpy(results, d_r, BLOCKS * sizeof(double), cudaMemcpyDeviceToHost);
	
	double acum = 0;
	for (i = 0; i < BLOCKS; i++) {
		acum += results[i];
	}
	
	printf("area = %.5lf\n", (acum * dx));
	printf("avg time = %.5lf\n", (ms / N));
	
  cudaFree(d_x);
  cudaFree(d_dx);
	cudaFree(d_r);
	
	free(results);
	return 0;
}


Starting...
area = 2.00000
avg time = 0.00490



# Example 3

## Sequential (C)

In [None]:
%%cu
// =================================================================
//
// File: example3.c
// Author: Pedro Perez
// Description: This file contains the code that searches for the 
// 				smallest value stored in an array. The time this 
//				implementation takes will be used as the basis to 
//				calculate the improvement obtained with parallel 
//				technologies.
//
// Copyright (c) 2020 by Tecnologico de Monterrey.
// All Rights Reserved. May be reproduced for any non-commercial
// purpose.
//
// =================================================================

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <limits.h>
#include "/content/src/header.h"

#define SIZE 1000000 //1e6

#define MIN(a,b) (((a)<(b))?(a):(b))
#define MAX(a,b) (((a)>(b))?(a):(b))

int min_value(int *array, int size) {
	int i, result;
	
	result = INT_MAX;
	for (i = 0; i < size; i++) {
		result = MIN(result, array[i]);
	}
	return result;
}

int main(int argc, char* argv[]) {
	int i, *a, pos, result;
	double ms;
	
	a = (int *) malloc(sizeof(int) * SIZE);
	random_array(a, SIZE);
	display_array("a", a);
	
	printf("Starting...\n");
	ms = 0;
	for (i = 0; i < N; i++) {
		start_timer();
		
		result = min_value(a, SIZE);
		
		ms += stop_timer();
	}
	printf("result = %i\n", result);
	printf("avg time = %.5lf ms\n", (ms / N));
	
	free(a);
	return 0;
}

a = [  77,  79,  34,  56,  34,  42,  14,  95,  34,  99,  48,  13,  80,  50,   9,   1,  44,  64,  86,  77,  69,  17,  87,  22,  95,  57,  74,  98,  60,  61,   2,  88,  40,  87,  44,  25,  29,   9,  71,  14,   7,  19,  26,  86,  20,  86,  39,  16,  49,  24,  92,  69,  93,  78,  91,  87,  35,  16,  36,  46,  76,  89,  85,  67,  76,  28,  44,  56,  36,  14,  21,  94,  84,  46,  32,  56,  83,  22,  71,  84,  97,  62,  52,  89,  91,  42,  28,  77,  57,  63,  74,  85,  52,  59,  51,  79,  86,  94,  86,  74, ... ,]
Starting...
result = 1
avg time = 2.84380 ms



## Parallel (CUDA)

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <limits.h>
#include "/content/src/header.h"

#define MIN(a,b) (a<b?a:b)

#define SIZE	1e6
#define THREADS	256
#define BLOCKS	MIN(32, (SIZE + THREADS - 1)/ THREADS)

__global__ void minimum(int *array, int *results) {
	__shared__ int cache[THREADS];
	
	int tid = threadIdx.x + (blockIdx.x * blockDim.x);
	int cacheIndex = threadIdx.x;
	
	int aux = 2147483647;
	while (tid < SIZE) {
		aux = (aux < array[tid])? aux : array[tid];
		tid += blockDim.x * gridDim.x;
	}
	
	cache[cacheIndex] = aux;
	
	__syncthreads();
	
	int i = blockDim.x / 2;
	while (i > 0) {
		if (cacheIndex < i) {
			cache[cacheIndex] = (cache[cacheIndex] < cache[cacheIndex + i])? cache[cacheIndex] : cache[cacheIndex + 1];
		}
		__syncthreads();
		i /= 2;
	}
	
	if (cacheIndex == 0) {
		results[blockIdx.x] = cache[cacheIndex];
	}
}

int main(int argc, char* argv[]) {
	int i, *a, pos, *results;
  int *d_a, *d_r;
	double ms;
	
	a = (int *) malloc(sizeof(int) * SIZE);
	random_array(a, SIZE);
	display_array("a", a);

  results = (int *) malloc(sizeof(int) * BLOCKS);
	
	cudaMalloc( (void**) &d_a, SIZE * sizeof(int) );
	cudaMalloc( (void**) &d_r, BLOCKS * sizeof(int) );
	
	cudaMemcpy(d_a, a, SIZE * sizeof(int), cudaMemcpyHostToDevice);
	
	printf("Starting...\n");
	ms = 0;
	for (i = 1; i <= N; i++) {
		start_timer();
		minimum<<<BLOCKS, THREADS>>> (d_a, d_r);
		ms += stop_timer();
	}
	
	cudaMemcpy(results, d_r, BLOCKS * sizeof(int), cudaMemcpyDeviceToHost);
	
	int aux = INT_MAX;
	for (i = 0; i < BLOCKS; i++) {
		aux = MIN(aux, results[i]);
	}
	
	printf("minimum = %i\n", aux);
	printf("avg time = %.5lf\n", (ms / N));
	
	cudaFree(d_r);
	cudaFree(d_a);
	
	free(a);
  free(results);
	return 0;
}

a = [  89,  71,  82,  18,  62,  29,   2,  10,  18,  46,  31,  96,  16,  18,  71,  18,  51,  94,  32,  87,  54,  90,  47,  92,  81,  64,  76,  24,  61,   2,   7,   1,  24,  40,  19,  86,  21,  20,  47,  90,  17,  78,  85,  32,  47,   8,   2,  98,  53,  33,  84,   6,  22,  82,  49,  54,  97,  25,  78,   9,  78,  36,  10,   2,  27,  80,  39,  47,  51,  85,  88,  67,  14,  73,  98,  61,  32,  51,  10,  84,  35,  45,  90,   9,  78,  38,  62,  74,  14,  91,  35,  92,  78,  96,  93,  57,  75,  31,   3,  25, ... ,]
Starting...
minimum = 1
avg time = 0.00510



# Example 4

## Sequential (C)

In [None]:
%%cu
// =================================================================
//
// File: example4.c
// Author: Pedro Perez
// Description: This file implements the multiplication of a matrix 
//				by a vector. The time this implementation takes will
//				be used as the basis to calculate the improvement 
//				obtained with parallel technologies.
//
// Copyright (c) 2020 by Tecnologico de Monterrey.
// All Rights Reserved. May be reproduced for any non-commercial
// purpose.
//
// =================================================================

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "/content/src/header.h"

#define RENS 3000
#define COLS 3000 

void matrix_vector(int *m, int *b, int *c) {
	int i, j, acum;
	
	for (i = 0; i < RENS; i++) {
		acum = 0;
		for (j = 0; j < COLS; j++) {
			acum += (m[(i * COLS) + j] * b[i]);
		}
		c[i] = acum;
	}
}

int main(int argc, char* argv[]) {
	int i, j, *m, *b, *c;
	double ms;
	
	m = (int*) malloc(sizeof(int) * RENS* COLS);
	b = (int*) malloc(sizeof(int) * RENS);
	c = (int*) malloc(sizeof(int) * RENS);
	
	for (i = 0; i < RENS; i++) {
		for (j = 0; j < COLS; j++) {
			m[(i * COLS) + j] = (j + 1);
		}
		b[i] = 1;
	}
	
	printf("Starting...\n");
	ms = 0;
	for (i = 0; i < N; i++) {
		start_timer();
		
		matrix_vector(m, b, c);
		
		ms += stop_timer();
	}
	display_array("c:", c);
	printf("avg time = %.5lf ms\n", (ms / N));
	
	free(m); free(b); free(c);
	return 0;
}

Starting...
c: = [4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500,4501500, ... ,]
avg time = 21.94690 ms



## Parallel (CUDA)

In [None]:
%%cu
// =================================================================
//
// File: example4.c
// Author: Pedro Perez
// Description: This file implements the multiplication of a matrix 
//				by a vector. The time this implementation takes will
//				be used as the basis to calculate the improvement 
//				obtained with parallel technologies.
//
// Copyright (c) 2020 by Tecnologico de Monterrey.
// All Rights Reserved. May be reproduced for any non-commercial
// purpose.
//
// =================================================================

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "/content/src/header.h"

#define RENS    3000
#define COLS    3000 
#define THREADS 256
#define BLOCKS	((COLS / THREADS) + 1)

__global__ void matrix_vector(int *m, int *b, int *c) {
	int tid = threadIdx.x + (blockIdx.x * blockDim.x);
  int i, sum = 0;

  if (tid < COLS){
    sum = 0;
    for(i = 0; i < RENS; i++) {
          sum += b[i] * m[(i * COLS) + tid];
    }
    c[tid] = sum;
  }
}

int main(int argc, char* argv[]) {
	int i, j, *m, *b, *c;
  int *d_m, *d_b, *d_c;
	double ms;
	
	m = (int*) malloc(sizeof(int) * RENS* COLS);
	b = (int*) malloc(sizeof(int) * RENS);
	c = (int*) malloc(sizeof(int) * RENS);

  for (i = 0; i < RENS; i++) {
		for (j = 0; j < COLS; j++) {
			m[(i * COLS) + j] = (j + 1);
		}
		b[i] = 1;
	}

  cudaMalloc((void**)&d_m, sizeof(int) * RENS* COLS);
  cudaMalloc((void**)&d_b, sizeof(int) * RENS);
  cudaMalloc((void**)&d_c, sizeof(int) * RENS);

  cudaMemcpy(d_m, m, sizeof(int) * RENS* COLS, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, sizeof(int) * RENS, cudaMemcpyHostToDevice);
	
	printf("Starting...\n");
	ms = 0;
	for (i = 0; i < N; i++) {
		start_timer();
		
		matrix_vector<<<BLOCKS, THREADS>>>(d_m, d_b, d_c);
		
		ms += stop_timer();
	}

  cudaMemcpy(c, d_c, sizeof(int) * RENS, cudaMemcpyDeviceToHost);

	display_array("c:", c);
	printf("avg time = %.5lf ms\n", (ms / N));
	
  cudaFree(d_m); cudaFree(d_b); cudaFree(d_c);
	free(m); free(b); free(c);
	return 0;
}

Starting...
c: = [3000,6000,9000,12000,15000,18000,21000,24000,27000,30000,33000,36000,39000,42000,45000,48000,51000,54000,57000,60000,63000,66000,69000,72000,75000,78000,81000,84000,87000,90000,93000,96000,99000,102000,105000,108000,111000,114000,117000,120000,123000,126000,129000,132000,135000,138000,141000,144000,147000,150000,153000,156000,159000,162000,165000,168000,171000,174000,177000,180000,183000,186000,189000,192000,195000,198000,201000,204000,207000,210000,213000,216000,219000,222000,225000,228000,231000,234000,237000,240000,243000,246000,249000,252000,255000,258000,261000,264000,267000,270000,273000,276000,279000,282000,285000,288000,291000,294000,297000,300000, ... ,]
avg time = 0.00820 ms

