In [10]:
%%writefile Matrix_transpose.cu
#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>
#include <vector>

#define WIDTH 4
#define HEIGHT 4

// Out-of-place transpose of a row-major matrix.
//
// `input` is read as `height` rows of `width` elements; `output` is written as
// `width` rows of `height` elements, so output[x][y] = input[y][x].
//
// Launch layout: a 2D grid of 2D blocks in which every (x, y) with
// x < width and y < height is covered by exactly one thread. Threads that fall
// outside the matrix do nothing, so the grid may overshoot the matrix extents.
// No shared memory is used.
//
// Preconditions: `input` and `output` must each hold width * height floats and
// must not overlap (both are declared __restrict__).
__global__ void transposeMatrix(const float* __restrict__ input, float* __restrict__ output, int width, int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the launch rounds the grid up to whole blocks.
    if (x >= width || y >= height) {
        return;
    }

    // Build the flat offsets in size_t: y * width (or x * height) evaluated in
    // 32-bit int overflows once the matrix has more than INT_MAX elements.
    const size_t inputIndex = static_cast<size_t>(y) * static_cast<size_t>(width) + static_cast<size_t>(x);
    const size_t outputIndex = static_cast<size_t>(x) * static_cast<size_t>(height) + static_cast<size_t>(y);
    output[outputIndex] = input[inputIndex];
}

// Queries the CUDA runtime for its last recorded error (cudaGetLastError reads
// and clears it). If an error is pending, prints `message` together with the
// CUDA error string to stderr and terminates the process with EXIT_FAILURE;
// otherwise returns normally.
void checkCudaError(const char* message) {
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;  // nothing pending, carry on
    }

    std::cerr << message << " - CUDA Error: " << cudaGetErrorString(status) << std::endl;
    exit(EXIT_FAILURE);
}

// Writes a row-major host matrix to stdout.
//
// Emits "<name>:" on its own line, then `height` lines each holding `width`
// values followed by a tab, and finally a blank line (which also flushes the
// stream).
void printMatrix(const float* matrix, int width, int height, const char* name) {
    std::cout << name << ":\n";

    for (int row = 0; row < height; ++row) {
        // Start of the current row inside the flat buffer.
        const float* rowStart = matrix + row * width;
        for (int col = 0; col < width; ++col) {
            std::cout << rowStart[col] << "\t";
        }
        std::cout << "\n";
    }

    std::cout << std::endl;
}

// Demo driver: fills a WIDTH x HEIGHT row-major matrix with 0, 1, 2, ...,
// transposes it on the GPU, prints both matrices and verifies the result on the
// host. Returns EXIT_SUCCESS when the transposed data matches, EXIT_FAILURE
// otherwise. Any CUDA failure terminates the process via checkCudaError.
int main() {
    const int width = WIDTH;
    const int height = HEIGHT;

    // Do the element-count arithmetic in size_t so width * height cannot
    // overflow int before being widened.
    const size_t count = static_cast<size_t>(width) * static_cast<size_t>(height);
    const size_t size = count * sizeof(float);

    // Host buffers are owned by std::vector, so there is no unchecked malloc.
    std::vector<float> h_input(count);
    std::vector<float> h_output(count, 0.0f);
    for (size_t i = 0; i < count; i++) {
        h_input[i] = static_cast<float>(i);
    }

    // Check each device allocation on its own so a failure is reported at the
    // call that caused it rather than at the following memcpy.
    float* d_input = nullptr;
    float* d_output = nullptr;
    cudaMalloc((void**)&d_input, size);
    checkCudaError("Failed to allocate device input buffer");
    cudaMalloc((void**)&d_output, size);
    checkCudaError("Failed to allocate device output buffer");

    cudaMemcpy(d_input, h_input.data(), size, cudaMemcpyHostToDevice);
    checkCudaError("Failed to copy input data to device");

    // 2x2 blocks; the grid is rounded up so every element gets a thread.
    dim3 blockSize(2, 2);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                  (height + blockSize.y - 1) / blockSize.y);

    transposeMatrix<<<gridSize, blockSize>>>(d_input, d_output, width, height);
    // Launch-configuration errors surface immediately after the launch...
    checkCudaError("Kernel launch failed");
    // ...while faults inside the kernel surface at the next synchronizing call.
    cudaDeviceSynchronize();
    checkCudaError("Kernel execution failed");

    cudaMemcpy(h_output.data(), d_output, size, cudaMemcpyDeviceToHost);
    checkCudaError("Failed to copy output data to host");

    printMatrix(h_input.data(), width, height, "Input Matrix");
    printMatrix(h_output.data(), height, width, "Transposed Matrix");

    // The transpose is a pure copy, so exact float equality is the right check.
    // Both loops stop at the first mismatch.
    bool success = true;
    for (int i = 0; i < width && success; i++) {
        for (int j = 0; j < height; j++) {
            const size_t outIdx = static_cast<size_t>(i) * height + j;
            const size_t inIdx = static_cast<size_t>(j) * width + i;
            if (h_output[outIdx] != h_input[inIdx]) {
                success = false;
                break;
            }
        }
    }

    std::cout << (success ? "Matrix transposition succeeded!" : "Matrix transposition failed!") << std::endl;

    cudaFree(d_input);
    checkCudaError("Failed to free device input buffer");
    cudaFree(d_output);
    checkCudaError("Failed to free device output buffer");

    // Report verification failure through the exit status as well as the message.
    return success ? EXIT_SUCCESS : EXIT_FAILURE;
}


Overwriting Matrix_transpose.cu


In [11]:
# Compile with the specified architecture
!nvcc Matrix_transpose.cu -o Matrix_transpose -gencode arch=compute_75,code=sm_75

# Run the executable
!./Matrix_transpose


Input Matrix:
0	1	2	3	
4	5	6	7	
8	9	10	11	
12	13	14	15	

Transposed Matrix:
0	4	8	12	
1	5	9	13	
2	6	10	14	
3	7	11	15	

Matrix transposition succeeded!
