In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Wed_Nov_22_10:17:15_PST_2023
Cuda compilation tools, release 12.3, V12.3.107
Build cuda_12.3.r12.3/compiler.33567101_0


In [2]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [3]:
%load_ext nvcc4jupyter

Detected platform "Kaggle". Running its setup...
Updating the package lists...
Installing nvidia-cuda-toolkit, this may take a few minutes...
Source files will be saved in "/tmp/tmp9ohynh5m".


# Week7 Codes

### q1

In [18]:
%%writefile cuda_program.cu
#include <stdio.h>
#include <cuda.h>

__global__ void block_size_add(float *a, float *b, float *c) {
    int i = blockIdx.x;
    c[i] = a[i] + b[i];
}

__global__ void thread_add(float *a, float *b, float *c) {
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

__global__ void varying_block_add(float *a, float *b, float *c, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    int n = 10;
    // Allocating and initializing memory
    float a[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    float b[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    float c[10];
    float *da, *db, *dc;

    // Allocating device memory and transferring data to the device
    cudaMalloc((void**)&da, n * sizeof(float));
    cudaMalloc((void**)&db, n * sizeof(float));
    cudaMalloc((void**)&dc, n * sizeof(float));
    cudaMemcpy(da, a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, n * sizeof(float), cudaMemcpyHostToDevice);

    // Launching the block_size_add kernel
    block_size_add<<<n, 1>>>(da, db, dc);
    cudaMemcpy(c, dc, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("Block add result:\n");
    for (int i = 0; i < n; i++) {
        printf("%f \n", c[i]);
    }

    // Launching the thread_add kernel
    thread_add<<<1, n>>>(da, db, dc);
    cudaMemcpy(c, dc, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("Thread add result:\n");
    for (int i = 0; i < n; i++) {
        printf("%f \n", c[i]);
    }

    // Launching the varying_block_add kernel
    varying_block_add<<<(n + 255) / 256, 256>>>(da, db, dc, n);
    cudaMemcpy(c, dc, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("Varying block add result:\n");
    for (int i = 0; i < n; i++) {
        printf("%f \n", c[i]);
    }

    // Freeing device memory
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);

    return 0;
}


Overwriting cuda_program.cu


In [19]:
!nvcc cuda_program.cu -o cuda_program

In [20]:
!./cuda_program

Block add result:
2.000000 
4.000000 
6.000000 
8.000000 
10.000000 
12.000000 
14.000000 
16.000000 
18.000000 
20.000000 
Thread add result:
2.000000 
4.000000 
6.000000 
8.000000 
10.000000 
12.000000 
14.000000 
16.000000 
18.000000 
20.000000 
Varying block add result:
2.000000 
4.000000 
6.000000 
8.000000 
10.000000 
12.000000 
14.000000 
16.000000 
18.000000 
20.000000 


### q2 ->pending

### q3

In [29]:
%%writefile cuda_program.cu
#include <stdio.h>
#include <cuda.h>

__global__ void linearalgebra(int *x, int *y, int *c, int n, int m) {
    int i = threadIdx.x;
    if (i < n) {  // Ensure that the thread index is within bounds
        c[i] = (m * x[i]) + y[i];
    }
}

int main() {
    int n = 10;
    int x[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    int y[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    int c[10];
    int m = 2;
    int *d_x, *d_y, *d_c;

    // Allocate memory on the device
    cudaMalloc((void**)&d_x, n * sizeof(int));
    cudaMalloc((void**)&d_y, n * sizeof(int));
    cudaMalloc((void**)&d_c, n * sizeof(int));

    // Copy data from host to device
    cudaMemcpy(d_x, x, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, n * sizeof(int), cudaMemcpyHostToDevice);

    // Launch the kernel with one block and n threads
    linearalgebra<<<1, n>>>(d_x, d_y, d_c, n, m);

    // Copy the result back to host memory
    cudaMemcpy(c, d_c, n * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the result
    printf("Result:\n");
    for (int i = 0; i < n; i++) {
        printf("%d ", c[i]);
    }
    printf("\n");

    // Free the device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_c);

    return 0;
}


Overwriting cuda_program.cu


In [30]:
!nvcc -o cuda_program cuda_program.cu

In [31]:
!./cuda_program

Result:
3 6 9 12 15 18 21 24 27 30 


### q4

In [32]:
%%writefile q.cu
#include <stdio.h>
#include <cuda.h>
#include <math.h>

__global__ void sine_angle(float* angle, float* sine) {
    int i = threadIdx.x;
    sine[i] = sinf(angle[i]); // Calculate sine of the angle
}

int main() {
    int n = 10;
    float a[10] = {0.0, 0.5236, 0.7854, 1.0472, 1.5708, 2.0944, 2.3562, 2.6180, 3.1416, 4.7124};//radians of pi/4 and all
    float b[n];
    float *d_angle, *d_sine;

    // Allocate memory on the device
    cudaMalloc((void**)&d_angle, n * sizeof(float));
    cudaMalloc((void**)&d_sine, n * sizeof(float));

    // Copy the data from host to device
    cudaMemcpy(d_angle, a, n * sizeof(float), cudaMemcpyHostToDevice);

    // Launch the kernel with one block and n threads
    sine_angle<<<1, n>>>(d_angle, d_sine);

    // Copy the result back to host memory
    cudaMemcpy(b, d_sine, n * sizeof(float), cudaMemcpyDeviceToHost);

    // Display the sine values of the angles
    printf("The sine of the angles in radians are:\n");
    for (int i = 0; i < n; i++) {
        printf("sin(%f) = %f\n", a[i], b[i]);
    }

    // Free the device memory
    cudaFree(d_angle);
    cudaFree(d_sine);

    return 0;
}


Writing q.cu


In [34]:
!nvcc -o q q.cu

In [35]:
!./q

The sine of the angles in radians are:
sin(0.000000) = 0.000000
sin(0.523600) = 0.500001
sin(0.785400) = 0.707108
sin(1.047200) = 0.866027
sin(1.570800) = 1.000000
sin(2.094400) = 0.866023
sin(2.356200) = 0.707103
sin(2.618000) = 0.499995
sin(3.141600) = -0.000007
sin(4.712400) = -1.000000


## week 8

### q1

In [43]:
%%writefile q.cu
#include <stdio.h>
#include <cuda.h>
#include <string.h>

__global__ void n_times(char *string, char *result, int string_len, int n) {
    int i = threadIdx.x; // Each thread handles a character in the string
    if (i < string_len) {
        for (int j = 0; j < n; j++) {
            result[j * string_len + i] = string[i]; // Copy each character n times
        }
    }
}

int main() {
    char string[] = "hello";
    int n = 3;
    int string_len = strlen(string);
    char result[3 * 5 + 1] = {0}; // +1 to accommodate the null terminator
    char *d_string, *d_result;

    // Allocate memory on the device
    cudaMalloc((void**)&d_string, string_len * sizeof(char));
    cudaMalloc((void**)&d_result, n * string_len * sizeof(char));

    // Copy the string to device memory
    cudaMemcpy(d_string, string, string_len * sizeof(char), cudaMemcpyHostToDevice);

    // Launch the kernel with string_len threads
    n_times<<<1, string_len>>>(d_string, d_result, string_len, n);

    // Copy the result back to the host
    cudaMemcpy(result, d_result, n * string_len * sizeof(char), cudaMemcpyDeviceToHost);

    // Add a null terminator to the result
    result[n * string_len] = '\0';

    printf("Result is:\n%s\n", result);

    // Free the device memory
    cudaFree(d_string);
    cudaFree(d_result);

    return 0;
}


Overwriting q.cu


In [44]:
!nvcc -o q q.cu

In [45]:
!./q

Result is:
hellohellohello


### q2

In [100]:
%%writefile q.cu
#include <stdio.h>
#include <cuda.h>
#include <string.h>

__global__ void rev_words_parallel(char* str, char* rev_str, int* words, int num_words, int str_length) {
    int i = threadIdx.x;
    if (i < num_words) { // Check for boundary
        int start = words[i];
        int end = (i == num_words - 1) ? str_length : words[i + 1]; // End of current word

        // Reverse the current word
        for (int x = start; x < end; x++) {
            rev_str[start + (end - 1 - x)] = str[x]; // Reverse the word
        }
        rev_str[end] = ' '; // Add a space after each reversed word
    }
}

int main() {
    char str[] = "my name is keerthan";
    char rev_str[100] = {0}; // Initialize to zero
    int words[5]; // Enough to hold indices for 4 words + 1
    int num_words = 0;

    words[num_words++] = 0; // Start of the first word
    // Fill words array with indices of spaces
    for (int x = 0; x < strlen(str); x++) {
        if (str[x] == ' ') {
            words[num_words++] = x + 1; // Store the index of the character after the space
        }
    }
    words[num_words++] = strlen(str); // End index for the last word

    char *d_str, *d_rev_str;
    int *d_words;

    // Allocate memory on the device
    cudaMalloc((void**)&d_str, strlen(str) * sizeof(char));
    cudaMalloc((void**)&d_rev_str, strlen(str) * sizeof(char));
    cudaMalloc((void**)&d_words, num_words * sizeof(int));

    // Copy the string and words to device memory
    cudaMemcpy(d_str, str, strlen(str) * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(d_words, words, num_words * sizeof(int), cudaMemcpyHostToDevice);

    // Launch the kernel with enough threads for the number of words
    rev_words_parallel<<<1, num_words>>>(d_str, d_rev_str, d_words, num_words, strlen(str));

    // Copy the result back to the host
    cudaMemcpy(rev_str, d_rev_str, strlen(str) * sizeof(char), cudaMemcpyDeviceToHost);

    // Add null terminator at the end, remove the last space
    rev_str[strlen(str)] = '\0';

    printf("The reversed words string is:\n");
    printf("%s\n", rev_str);

    // Free device memory
    cudaFree(d_str);
    cudaFree(d_rev_str);
    cudaFree(d_words);
    
    return 0;
}


Overwriting q.cu


In [101]:
!nvcc -o q q.cu

In [102]:
!./q

The reversed words string is:
 ym eman si ahtreek


### q3

In [15]:
%%writefile q.cu
#include <stdio.h>
#include <cuda.h>
#include <string.h>

// Kernel function to count the occurrence of a specific word
__global__ void count_word(char *str, char *word, int *count, int words[]) {
    int i = threadIdx.x;
    int start = words[i];
    int end = words[i + 1] - 1;
    int j = 0;

    for (int x = start; x < end; x++) {
        if (str[x] != word[j]) {
            return; // If any character does not match, return immediately
        }
        j++;
    }
    atomicAdd(count, 1); // Increase count atomically if a word match is found
}

int main() {
    char str[] = "wise man is wise man";
    char word[] = "wise";
    int words[10] = {0}; // Adjusted array size to handle more words
    int count = 0; // Corrected to an integer variable instead of a pointer
    int i = 0;

    // Finding the positions of words in the string
    for (int x = 0; x < strlen(str); x++) {
        if (str[x] == ' ') {
            i = i + 1;
            words[i] = x + 1; // Storing the location of the next word's start
        }
    }
    i = i + 1;
    words[i] = strlen(str) + 1;

    // Device memory pointers
    char *d_str, *d_word;
    int *d_words;
    int *d_count;

    // Allocate device memory
    cudaMalloc((void**)&d_str, (strlen(str) + 1) * sizeof(char));
    cudaMalloc((void**)&d_word, (strlen(word) + 1) * sizeof(char));
    cudaMalloc((void**)&d_words, (i + 1) * sizeof(int));
    cudaMalloc((void**)&d_count, sizeof(int));

    // Copy data to device memory
    cudaMemcpy(d_str, str, (strlen(str) + 1) * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(d_word, word, (strlen(word) + 1) * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(d_words, words, (i + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(d_count, 0, sizeof(int)); // Initialize count to zero on the device

    // Launch the kernel
    count_word<<<1, i>>>(d_str, d_word, d_count, d_words);

    // Copy the result back to host memory
    cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost);

    printf("The number of occurrences of the word '%s' is: %d\n", word, count);

    // Free device memory
    cudaFree(d_str);
    cudaFree(d_word);
    cudaFree(d_words);
    cudaFree(d_count);

    return 0;
}


Overwriting q.cu


In [16]:
!nvcc -o q q.cu

In [17]:
!./q

The number of occurrences of the word 'wise' is: 2


### q4
normal reverse string so not doing

# Week 9

### q1

In [28]:
%%writefile q.cu
#include<stdio.h>
#include<cuda.h>
#include<math.h>

__global__ void replace_matrix(int *matrix,int width){
    //when we have matrix we take it as array and do like this and passed as 1 block with mxn threads in 2d
    int i = threadIdx.x;
    int j = threadIdx.y;
    if(i==0){
        return;
    }
    if (i < width && j < width) {
        matrix[i * width + j] = pow(matrix[i * width + j],i+1);//row major access
    }
}
int main(){
    int matrix[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    int width = 4;
    int *d_matrix;
    cudaMalloc((void**)&d_matrix, 16 * sizeof(int));
    cudaMemcpy(d_matrix, matrix, 16 * sizeof(int), cudaMemcpyHostToDevice);
    replace_matrix<<<1,dim3(width,width)>>>(d_matrix, width);//it is a 4x4 matrix i.e actually taken m and n
    cudaMemcpy(matrix,d_matrix,16*sizeof(int),cudaMemcpyDeviceToHost);
    printf("the resultant matrix is \n");
    for(int i=0;i<width;i++){
        for(int j=0;j<width;j++){
            printf("%d ",matrix[i*width+j]);
        }
        printf("\n");
    }
    cudaFree(d_matrix);
    return 0;
}

Overwriting q.cu


In [29]:
!nvcc -o q q.cu
!./q

the resultant matrix is 
1 2 3 4 
25 36 49 64 
729 1000 1331 1728 
28561 38416 50625 65536 


### q2

In [33]:
%%writefile q.cu
#include<stdio.h>
#include<cuda.h>
#include<math.h>

__global__ void row_add(int *m,int *n,int *sum,int row,int col){
    int i= threadIdx.x;//this is the row number
    for(int j=0;j<col;j++){
        sum[i*col+j]=m[i*col+j]+n[i*col+j];//adding using row major
    }
}
__global__ void column_add(int *m,int *n,int *sum,int row,int col){
    int i=threadIdx.x;//this is x but is passed as no of columns so
    for(int j=0;j<row;j++){
        sum[i*row+j]=m[i*row+j]+n[i*row+j];//adding using column major
    }
}

__global__ void one_thread_add(int *m,int *n,int *sum,int row,int col){
    int i=threadIdx.x;
    int j=threadIdx.y;
    sum[i*row+j]=m[i*row+j]+n[i*row+j];//doing one thread addition
}

int main(){
    int matrix1[9] = {1,2,3,4,5,6,7,8,9};
    int matrix2[9] = {1,2,3,4,5,6,7,8,9};
    int matrix3[9];
    int *d_matrix1;
    int *d_matrix2;
    int *d_matrix3;
    cudaMalloc((void**)&d_matrix1, 9 * sizeof(int));
    cudaMalloc((void**)&d_matrix2, 9 * sizeof(int));
    cudaMalloc((void**)&d_matrix3, 9 * sizeof(int));
    cudaMemcpy(d_matrix1, matrix1, 9 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix2, matrix2, 9 * sizeof(int), cudaMemcpyHostToDevice);
    row_add<<<1,3>>>(d_matrix1,d_matrix2,d_matrix3,3,3);//here 3 is row
    cudaMemcpy(matrix3,d_matrix3,9*sizeof(int),cudaMemcpyDeviceToHost);
    printf("the resultant matrix by row add is \n");
    for(int i=0;i<3;i++){
        for(int j=0;j<3;j++){
            printf("%d ",matrix3[i*3+j]);
        }
        printf("\n");
    }
    column_add<<<1,3>>>(d_matrix1,d_matrix2,d_matrix3,3,3);//here 3 is col
    cudaMemcpy(matrix3,d_matrix3,9*sizeof(int),cudaMemcpyDeviceToHost);
    printf("the resultant matrix by col add is \n");
    for(int i=0;i<3;i++){
        for(int j=0;j<3;j++){
            printf("%d ",matrix3[i*3+j]);
        }
        printf("\n");
    }
    one_thread_add<<<1,dim3(3,3)>>>(d_matrix1,d_matrix2,d_matrix3,3,3);//here passed as 1 block with dim3
    cudaMemcpy(matrix3,d_matrix3,9*sizeof(int),cudaMemcpyDeviceToHost);
    printf("the resultant matrix by one thread addition is \n");
    for(int i=0;i<3;i++){
        for(int j=0;j<3;j++){
            printf("%d ",matrix3[i*3+j]);
        }
        printf("\n");
    }
    cudaFree(d_matrix3);
    cudaFree(d_matrix1);
    cudaFree(d_matrix2);
    return 0;
}

Overwriting q.cu


In [34]:
!nvcc -o q q.cu
!./q

the resultant matrix by row add is 
2 4 6 
8 10 12 
14 16 18 
the resultant matrix by col add is 
2 4 6 
8 10 12 
14 16 18 
the resultant matrix by one thread addition is 
2 4 6 
8 10 12 
14 16 18 


### q3

In [35]:
%%writefile q.cu
#include <stdio.h>
#include <cuda.h>

// Kernel for row-wise matrix multiplication
__global__ void row_multiply(int *m, int *n, int *product, int row, int col, int common_dim) {
    int i = threadIdx.x; // Row index
    for (int j = 0; j < col; j++) { // Loop over columns of the result matrix
        int sum = 0;
        for (int k = 0; k < common_dim; k++) { // Loop over the shared dimension
            sum += m[i * common_dim + k] * n[k * col + j];
        }
        product[i * col + j] = sum;
    }
}

// Kernel for column-wise matrix multiplication
__global__ void column_multiply(int *m, int *n, int *product, int row, int col, int common_dim) {
    int j = threadIdx.x; // Column index
    for (int i = 0; i < row; i++) { // Loop over rows of the result matrix
        int sum = 0;
        for (int k = 0; k < common_dim; k++) { // Loop over the shared dimension
            sum += m[i * common_dim + k] * n[k * col + j];
        }
        product[i * col + j] = sum;
    }
}

// Kernel for matrix multiplication using one thread per element
__global__ void one_thread_multiply(int *m, int *n, int *product, int row, int col, int common_dim) {
    int i = threadIdx.x; // Row index
    int j = threadIdx.y; // Column index
    int sum = 0;
    for (int k = 0; k < common_dim; k++) { // Loop over the shared dimension
        sum += m[i * common_dim + k] * n[k * col + j];
    }
    product[i * col + j] = sum;
}

int main() {
    int matrix1[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // 3x3 matrix
    int matrix2[9] = {9, 8, 7, 6, 5, 4, 3, 2, 1};  // 3x3 matrix
    int matrix3[9];  // Resultant matrix

    int *d_matrix1, *d_matrix2, *d_matrix3;
    cudaMalloc((void**)&d_matrix1, 9 * sizeof(int));
    cudaMalloc((void**)&d_matrix2, 9 * sizeof(int));
    cudaMalloc((void**)&d_matrix3, 9 * sizeof(int));

    cudaMemcpy(d_matrix1, matrix1, 9 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix2, matrix2, 9 * sizeof(int), cudaMemcpyHostToDevice);

    // Row-wise matrix multiplication
    row_multiply<<<1, 3>>>(d_matrix1, d_matrix2, d_matrix3, 3, 3, 3);
    cudaMemcpy(matrix3, d_matrix3, 9 * sizeof(int), cudaMemcpyDeviceToHost);
    printf("The resultant matrix by row-wise multiplication is:\n");
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            printf("%d ", matrix3[i * 3 + j]);
        }
        printf("\n");
    }

    // Column-wise matrix multiplication
    column_multiply<<<1, 3>>>(d_matrix1, d_matrix2, d_matrix3, 3, 3, 3);
    cudaMemcpy(matrix3, d_matrix3, 9 * sizeof(int), cudaMemcpyDeviceToHost);
    printf("The resultant matrix by column-wise multiplication is:\n");
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            printf("%d ", matrix3[i * 3 + j]);
        }
        printf("\n");
    }

    // One thread per element matrix multiplication
    one_thread_multiply<<<1, dim3(3, 3)>>>(d_matrix1, d_matrix2, d_matrix3, 3, 3, 3);
    cudaMemcpy(matrix3, d_matrix3, 9 * sizeof(int), cudaMemcpyDeviceToHost);
    printf("The resultant matrix by one thread multiplication is:\n");
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            printf("%d ", matrix3[i * 3 + j]);
        }
        printf("\n");
    }

    cudaFree(d_matrix1);
    cudaFree(d_matrix2);
    cudaFree(d_matrix3);

    return 0;
}


Overwriting q.cu


In [36]:
!nvcc -o q q.cu
!./q

The resultant matrix by row-wise multiplication is:
30 24 18 
84 69 54 
138 114 90 
The resultant matrix by column-wise multiplication is:
30 24 18 
84 69 54 
138 114 90 
The resultant matrix by one thread multiplication is:
30 24 18 
84 69 54 
138 114 90 


### q4

In [43]:
%%writefile q.cu
#include <stdio.h>
#include <cuda.h>

// CUDA kernel to compute 1's complement for non-border elements
__global__ void complement(int *a, int rows, int cols) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int row = idx / cols;
    int col = idx % cols;//just for easy way to check the for non border ele

    // Check if the element is a non-border element
    if (row == 0 || row == rows - 1 || col == 0 || col == cols - 1) return;

    // Calculate the smallest power of 2 greater than the value of a[idx]
    int pow2 = 1;
    while (pow2 <= a[idx]) pow2 *= 2;

    // Perform the 1's complement operation
    int complementValue = a[idx] ^ (pow2 - 1);//taking pow2-1 as mask and do bitwise xor we get

    // Convert the 1's complement value to its binary representation (in decimal format)
    int binary = 0, offset = 1;
    while (complementValue > 0) {
        binary += (complementValue % 2) * offset;
        offset *= 10;
        complementValue /= 2;
    }

    // Store the binary representation back into the matrix
    a[idx] = binary;
}

int main() {
    // Define the dimensions of the matrix
    int rows = 4;
    int cols = 4;

    // Example matrix A of size 4x4
    int h_matrix[] = {
        1, 2, 3, 4,
        6, 5, 8, 3,
        2, 4, 10,1,
        9, 1, 2, 5
    };

    int size = rows * cols * sizeof(int);

    // Allocate device memory for the matrix
    int *d_matrix;
    cudaMalloc((void**)&d_matrix, size);

    // Copy the matrix from host to device
    cudaMemcpy(d_matrix, h_matrix, size, cudaMemcpyHostToDevice);

    // Launch the kernel with one thread per element
    complement<<<1, rows * cols>>>(d_matrix, rows, cols);

    // Copy the result back to the host
    cudaMemcpy(h_matrix, d_matrix, size, cudaMemcpyDeviceToHost);

    // Display the resultant matrix
    printf("Resultant matrix with 1's complement for non-border elements:\n");
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%d\t", h_matrix[i * cols + j]);
        }
        printf("\n");
    }

    // Free device memory
    cudaFree(d_matrix);

    return 0;
}



Overwriting q.cu


In [44]:
!nvcc -o q q.cu
!./q


Resultant matrix with 1's complement for non-border elements:
1	2	3	4	
6	10	111	3	
2	11	101	1	
9	1	2	5	


# Week 10

### q1

In [9]:
%%writefile q.cu
#include <stdio.h>
#include <cuda.h>

// CUDA kernel for matrix multiplication
__global__ void matrix_multiply(int *a, int *b, int *c, int width) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row < width && col < width) {
        int sum = 0;
        for (int i = 0; i < width; i++) {
            sum += a[row * width + i] * b[i * width + col];
        }
        c[row * width + col] = sum;
    }
}

int main() {
    int width = 4;
    int a[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    int b[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    int c[16];

    int *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, width * width * sizeof(int));
    cudaMalloc((void**)&d_b, width * width * sizeof(int));
    cudaMalloc((void**)&d_c, width * width * sizeof(int));

    cudaMemcpy(d_a, a, width * width * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, width * width * sizeof(int), cudaMemcpyHostToDevice);

    // Define a 2D block and grid size for matrix multiplication
    dim3 block(2, 2, 1);  // 2x2 threads per block
    //dim3 grid((width + block.x - 1) / block.x, (width + block.y - 1) / block.y, 1);->is one way else
    dim3 grid(ceil(width / 2.0), ceil(width / 2.0), 1);

    // Launch the kernel
    matrix_multiply<<<grid, block>>>(d_a, d_b, d_c, width);

    // Copy the result back to the host
    cudaMemcpy(c, d_c, width * width * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the resultant matrix
    printf("The resultant matrix is:\n");
    for (int i = 0; i < width; i++) {
        for (int j = 0; j < width; j++) {
            printf("%d ", c[i * width + j]);
        }
        printf("\n");
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


Overwriting q.cu


In [10]:
!nvcc -o q q.cu
!./q

The resultant matrix is:
80 70 60 50 
240 214 188 162 
400 358 316 274 
560 502 444 386 
