<a href="https://colab.research.google.com/github/Leonardpepa/High-Performance-Computing/blob/main/Leonard_Pepa_ics20033_hpc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [33]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-_tf2zqgl
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-_tf2zqgl
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [34]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [35]:
!base64 /dev/urandom | head -c 1000000000 > file.txt

In [36]:
%%cuda --name char_freq_gpu.cu
#include <stdio.h> 
#include <stdlib.h> 
#include <cuda.h>

#define N 128
#define base 0

__global__ void character_frequency_gpu(char *buffer, int *freq, long size){
    long index = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (index > size) return;
	
	atomicAdd(&(freq[buffer[index] - base]), 1);
	
}



int main (int argc, char *argv[]) {
	
	FILE *pFile;
	long file_size;
	char * buffer;
	char * filename;
	size_t result;
	int freq[N];

    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;

	if (argc != 3) {
		printf ("Usage : %s <file_name> <number of threads per block>\n", argv[0]);
		return 1;
	}

	filename = argv[1];
	pFile = fopen ( filename , "rb" );

	if (pFile==NULL) {
		printf ("File error\n");
		return 2;
	}

	int THREADS_PER_BLOCK = strtol(argv[2], NULL, 10);

	// obtain file size:
	fseek (pFile , 0 , SEEK_END);
	file_size = ftell (pFile);
	rewind (pFile);
	printf("file size is %ld\n", file_size);

	// allocate memory to contain the file:
	buffer = (char*) malloc (sizeof(char)*file_size);
	if (buffer == NULL) {
		printf ("Memory error\n");
	 	return 3;
	}

	// copy the file into the buffer:
	result = fread (buffer,1,file_size,pFile);

	if (result != file_size) {
		printf ("Reading error\n");
		return 4;
	} 

	for (int j=0; j<N; j++){
		freq[j]=0;
	}

    cudaEventCreate(&total_start);
  	cudaEventCreate(&total_stop);
  	cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);

    cudaEventRecord(total_start);

    char *device_buffer;
    int *device_freq;

    cudaMalloc((void **)&device_freq, N * sizeof(int));
    cudaMalloc((void **)&device_buffer, file_size * sizeof(char));

    cudaMemcpy(device_buffer, buffer, file_size * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(device_freq, freq, N * sizeof(int), cudaMemcpyHostToDevice);

    cudaEventRecord(comp_start);

    long BLOCKS = ((file_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK); 
    character_frequency_gpu<<<BLOCKS, THREADS_PER_BLOCK>>>(device_buffer, device_freq, file_size);

    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    cudaMemcpy(freq, device_freq, N * sizeof(int), cudaMemcpyDeviceToHost);


    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    cudaFree(device_freq);
    cudaFree(device_buffer);
	fclose(pFile);
	free(buffer);

    for (int j=0; j<N; j++){
		printf("%d = %d\n", j+base, freq[j]);
	}

	/*
    * GPU timing
    */
   
	printf("\n");
    printf("File size: %ld, blocks: %ld, threads per block: %d, total_threads: %ld\n", file_size, BLOCKS, THREADS_PER_BLOCK, THREADS_PER_BLOCK*BLOCKS);
    printf("Total time (seconds): %f\n", total_time / 1000);
    printf("Kernel time (seconds): %f\n", comp_time / 1000);
    printf("Data transfer time (seconds): %f\n", (total_time-comp_time) / 1000); 
	printf("\n");
    return 0;
}

'File written in /content/src/char_freq_gpu.cu'

In [37]:
# Character frequency
!nvcc src/char_freq_gpu.cu -O2 -o char_freq

# 256 threads per block
!./char_freq file.txt 256

# 512 threads per block
!./char_freq file.txt 512

# 1024 threads per block
!./char_freq file.txt 1024

file size is 1000000000
0 = 0
1 = 0
2 = 0
3 = 0
4 = 0
5 = 0
6 = 0
7 = 0
8 = 0
9 = 0
10 = 12987012
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 15420429
44 = 0
45 = 0
46 = 0
47 = 15418585
48 = 15425432
49 = 15428062
50 = 15428860
51 = 15425447
52 = 15419372
53 = 15422854
54 = 15423150
55 = 15423900
56 = 15420965
57 = 15425734
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 15427983
66 = 15417402
67 = 15428154
68 = 15420801
69 = 15419227
70 = 15423340
71 = 15420656
72 = 15415580
73 = 15425498
74 = 15420501
75 = 15422065
76 = 15422480
77 = 15421634
78 = 15418857
79 = 15422355
80 = 15421851
81 = 15423078
82 = 15421395
83 = 15427645
84 = 15420416
85 = 15420454
86 = 15421621
87 = 15422840
88 = 15416178
89 = 15421503
90 = 15419640
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 15417816
98 = 15425694
99 = 1

In [38]:
%%cuda --name string_matching_gpu.cu
#include <stdio.h> 
#include <stdlib.h> 
#include <string.h>
#include <cuda.h>

__global__ void string_matching_gpu(char* buffer, int* match, char* pattern, int* total_matches, long match_size, long pattern_size){

    long index = threadIdx.x + blockIdx.x * blockDim.x;

    if(index > match_size) return;

    int i;

    for (i = 0; i < pattern_size && pattern[i] == buffer[i + index]; ++i){

    }

    if (i >= pattern_size){
        match[index] = 1;
        atomicAdd(total_matches, 1);
    } 

}


int main (int argc, char *argv[]) {
	
	FILE *pFile;
	long file_size, match_size, pattern_size;
	char *buffer;
	char *filename, *pattern;
	size_t result;
	int  *match, total_matches;

    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;

    if (argc != 4) {
        printf ("Usage : %s <file_name> <string> <number of threads per block>\n", argv[0]);
        return 1;
    }
	
    filename = argv[1];
	pattern = argv[2];
    int THREADS_PER_BLOCK = strtol(argv[3], NULL, 10);
	
	pFile = fopen ( filename , "rb" );
	if (pFile==NULL) {printf ("File error\n"); return 2;}

	// obtain file size:
	fseek (pFile , 0 , SEEK_END);
	file_size = ftell (pFile);
	rewind (pFile);
	
	// allocate memory to contain the file:
	buffer = (char*) malloc (sizeof(char)*file_size);
	if (buffer == NULL) {printf ("Memory error\n"); return 3;}

	// copy the file into the buffer:
	result = fread (buffer,1,file_size,pFile);
	if (result != file_size) {printf ("Reading error\n"); return 4;} 
	
	pattern_size = strlen(pattern);
	match_size = file_size - pattern_size + 1;
	
	match = (int *) malloc (sizeof(int)*match_size);
	if (match == NULL) {printf ("Malloc error\n"); return 5;}
	
	total_matches = 0;
	
    for (int j = 0; j < match_size; j++){
		match[j]=0;
	}

    cudaEventCreate(&total_start);
  	cudaEventCreate(&total_stop);
  	cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);

    cudaEventRecord(total_start);
    

    char *device_buffer;
    int  *device_match;
    char *device_pattern;
    int  *device_total_matches;

    cudaMalloc((void **)&device_buffer, sizeof(char)*file_size);
    cudaMalloc((void **)&device_match, sizeof(int)*match_size);
    cudaMalloc((void **)&device_pattern, sizeof(char)*pattern_size);
    cudaMalloc((void **)&device_total_matches, sizeof(int));

    cudaMemcpy(device_buffer, buffer, file_size * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(device_match, match, match_size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(device_pattern, pattern, pattern_size * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(device_total_matches, &total_matches, sizeof(int), cudaMemcpyHostToDevice);
    
    cudaEventRecord(comp_start);

    long BLOCKS = (file_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    string_matching_gpu<<<BLOCKS, THREADS_PER_BLOCK>>>(device_buffer, device_match, device_pattern, device_total_matches, match_size, pattern_size);
    
    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    cudaMemcpy(match, device_match, match_size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&total_matches, device_total_matches, sizeof(int), cudaMemcpyDeviceToHost);
    
    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);
    
    /*
    * GPU timing
    */
   
    printf("\n");
	printf("File size: %ld, blocks: %ld, threads per block: %d, total_threads: %ld\n", file_size, BLOCKS, THREADS_PER_BLOCK, THREADS_PER_BLOCK*BLOCKS);
    printf("Total time (seconds): %f\n", total_time / 1000);
    printf("Kernel time (seconds): %f\n", comp_time / 1000);
    printf("Data transfer time(seconds): %f\n", (total_time-comp_time) / 1000); 
    printf("\nTotal matches = %d\n", total_matches);
    printf("\n");
    
    cudaFree(device_buffer);
    cudaFree(device_match);
    cudaFree(device_pattern);
    cudaFree(device_total_matches);

    fclose (pFile);
	free (buffer);
	free (match);

    return 0;
}

'File written in /content/src/string_matching_gpu.cu'

In [39]:
# String matching
!nvcc src/string_matching_gpu.cu -O2 -o string_matching

# 256 threads per block
!./string_matching file.txt Atz 256

# 512 threads per block
!./string_matching file.txt Atz 512

# 1024 threads per block
!./string_matching file.txt Atz 1024



File size: 1000000000, blocks: 3906250, threads per block: 256, total_threads: 1000000000
Total time (seconds): 1.956160
Kernel time (seconds): 0.030951
Data transfer time(seconds): 1.925208

Total matches = 3617


File size: 1000000000, blocks: 1953125, threads per block: 512, total_threads: 1000000000
Total time (seconds): 1.964289
Kernel time (seconds): 0.033179
Data transfer time(seconds): 1.931110

Total matches = 3617


File size: 1000000000, blocks: 976563, threads per block: 1024, total_threads: 1000000512
Total time (seconds): 1.950082
Kernel time (seconds): 0.037969
Data transfer time(seconds): 1.912112

Total matches = 3617



In [40]:
%%cuda --name count_sort_gpu.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define UPPER 1000
#define LOWER 0

__global__ void count_sort_gpu(int *x, int *y, int n){

    int index = threadIdx.x + blockIdx.x * blockDim.x;


    if (index > n) return;

    int my_num = x[index];
    int my_place = 0;

    for (int i=0; i<n; i++){
        if ((my_num > x[i]) || ((my_num == x[i]) && (index < i))){
            my_place++;
        }
    }

    y[my_place] = my_num;

}

int main(int argc, char *argv[]){
    int *x, *y;
    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;

    if (argc != 3) {
        printf ("Usage : %s <array_size> <number of threads per block>\n", argv[0]);
        return 1;
    }

    int n = strtol(argv[1], NULL, 10);
    x = ( int * ) malloc ( n * sizeof ( int ) );
    y = ( int * ) malloc ( n * sizeof ( int ) );

    for (int i=0; i<n; i++){
        x[i] = (rand() % (UPPER - LOWER + 1)) + LOWER;
    }

    int THREADS_PER_BLOCK = strtol(argv[2], NULL, 10);

    cudaEventCreate(&total_start);
  	cudaEventCreate(&total_stop);
  	cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);

    cudaEventRecord(total_start);
    
    int *device_array, *device_sorted_array;

    cudaMalloc((void **)&device_array, n*sizeof(int));
    cudaMalloc((void **)&device_sorted_array, n*sizeof(int));

    cudaMemcpy(device_array, x, n*sizeof(int), cudaMemcpyHostToDevice);

    cudaEventRecord(comp_start);

    long BLOCKS = (n + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
    count_sort_gpu<<< BLOCKS, THREADS_PER_BLOCK >>>(device_array, device_sorted_array, n);

    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    cudaMemcpy(y, device_sorted_array, n*sizeof(int), cudaMemcpyDeviceToHost);


    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    for (int i=0; i<n-1; i++){
        if (y[i] <= y[i + 1]){
            continue;
        }

        printf("Error | Array is not sorted");
        exit(1);
    }

    /*
    * GPU timing
    */

    printf("\n");
    printf("Array size: %d, blocks: %ld, threads per block: %d, total_threads: %ld\n", n, BLOCKS, THREADS_PER_BLOCK, THREADS_PER_BLOCK*BLOCKS);
    printf("Total time (seconds): %f\n", total_time / 1000);
    printf("Kernel time (seconds): %f\n", comp_time / 1000);
    printf("Data transfer time (seconds): %f\n", (total_time-comp_time) / 1000); 
    printf("Array is sorted!");
    printf("\n");

    cudaFree(device_array);
    cudaFree(device_sorted_array);
    
    free(x);
    free(y);
    return 0;
}

'File written in /content/src/count_sort_gpu.cu'

In [41]:
# Count Sort
!nvcc src/count_sort_gpu.cu -O2 -o count_sort

# 256 threads per block
!./count_sort 900000 256

# 512 threads per block
!./count_sort 900000 512

# 1024 threads per block
!./count_sort 900000 1024




Array size: 900000, blocks: 3516, threads per block: 256, total_threads: 900096
Total time (seconds): 1.449746
Kernel time (seconds): 1.445932
Data transfer time (seconds): 0.003814
Array is sorted!

Array size: 900000, blocks: 1758, threads per block: 512, total_threads: 900096
Total time (seconds): 1.443927
Kernel time (seconds): 1.439657
Data transfer time (seconds): 0.004271
Array is sorted!

Array size: 900000, blocks: 879, threads per block: 1024, total_threads: 900096
Total time (seconds): 1.453651
Kernel time (seconds): 1.449799
Data transfer time (seconds): 0.003852
Array is sorted!
