In [None]:
code = r"""
#include <iostream>
#include <cstdlib>
using namespace std;

// Kernel: square integer matrix product, C = A * B.
// Expects a 2D launch in which x indexes columns and y indexes rows; each
// thread produces at most one element of C. All three matrices are indexed
// as flat row-major arrays of size * size ints, and the pointers are
// dereferenced inside the kernel, so they must be valid in device code.
__global__ void multiply(int* A, int* B, int* C, int size) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // The grid may be larger than the matrix; surplus threads do nothing.
    if (row >= size || col >= size) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    const int rowStart = row * size;
    int acc = 0;
    for (int k = 0; k < size; ++k) {
        acc += A[rowStart + k] * B[k * size + col];
    }
    C[rowStart + col] = acc;
}

// Fills a size x size matrix (stored as one flat array) with pseudo-random
// digits in the range 0..9 drawn from rand(), in ascending index order.
void initialize(int* matrix, int size) {
    const int total = size * size;
    int idx = 0;
    while (idx < total) {
        matrix[idx] = rand() % 10;
        ++idx;
    }
}

// Writes a size x size matrix (flat, row-major) to stdout: one row per line,
// every element followed by a single space, then one blank line at the end.
void print(int* matrix, int size) {
    for (int r = 0; r < size; ++r) {
        const int* rowPtr = matrix + r * size;
        for (int c = 0; c < size; ++c) {
            cout << rowPtr[c] << " ";
        }
        cout << '\n';
    }
    cout << '\n';
}

// Reports a failed CUDA runtime call on stderr.
// `what` names the operation for the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << '\n';
    return false;
}

// Computes C = A * B for N x N host matrices with the multiply kernel.
// Allocates three device buffers, copies A and B over, launches the kernel,
// and copies the product back into C. Every runtime call is checked and the
// device buffers are released on every path.
// Returns true on success; on failure C must be treated as unspecified.
static bool multiplyOnDevice(int* A, int* B, int* C, int N) {
    if (N <= 0) {
        return true;  // Empty matrix: nothing to compute, and a 0-block launch is invalid.
    }

    size_t matrixBytes = (size_t)N * N * sizeof(int);

    // Start from null so cleanup can free unconditionally (cudaFree(0) does nothing).
    int* X = 0;
    int* Y = 0;
    int* Z = 0;

    bool ok = cudaOk(cudaMalloc(&X, matrixBytes), "cudaMalloc for A")
           && cudaOk(cudaMalloc(&Y, matrixBytes), "cudaMalloc for B")
           && cudaOk(cudaMalloc(&Z, matrixBytes), "cudaMalloc for C")
           && cudaOk(cudaMemcpy(X, A, matrixBytes, cudaMemcpyHostToDevice), "copy A to device")
           && cudaOk(cudaMemcpy(Y, B, matrixBytes, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // Set up block and grid dimensions
        int THREADS = 2;                           // Thread block is THREADS x THREADS
        int BLOCKS = (N + THREADS - 1) / THREADS;  // Round up so the grid covers all elements

        dim3 threads(THREADS, THREADS);
        dim3 blocks(BLOCKS, BLOCKS);

        multiply<<<blocks, threads>>>(X, Y, Z, N);

        // A launch returns no status itself: configuration errors show up in
        // cudaGetLastError(), execution faults at the next synchronizing call.
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaDeviceSynchronize(), "kernel execution")
          && cudaOk(cudaMemcpy(C, Z, matrixBytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // Free device memory
    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return ok;
}

// Builds two random N x N matrices, prints them, multiplies them on the GPU
// and prints the product. Exits with 0 on success and 1 if any CUDA call failed.
int main() {
    int N = 2;  // Matrix size (N x N)
    int matrixSize = N * N;

    // Allocate memory for matrices A, B, and C
    int* A = new int[matrixSize];
    int* B = new int[matrixSize];
    int* C = new int[matrixSize];

    // Initialize matrices A and B
    initialize(A, N);
    initialize(B, N);

    // Print matrices A and B
    cout << "Matrix A: \n";
    print(A, N);

    cout << "Matrix B: \n";
    print(B, N);

    bool ok = multiplyOnDevice(A, B, C, N);

    // Only print C when the GPU actually produced it.
    if (ok) {
        cout << "Multiplication of matrix A and B: \n";
        print(C, N);
    }

    // Free host memory
    delete[] A;
    delete[] B;
    delete[] C;

    return ok ? 0 : 1;
}

"""




# Save the CUDA source held in `code` as main.cu so nvcc can compile it.
with open("main.cu", "w") as source_file:
    source_file.write(code)

In [None]:
code = r"""
#include <iostream>
using namespace std;

// Kernel: element-wise integer vector sum, C[i] = A[i] + B[i] for i < size.
// Expects a 1D launch; each thread handles at most one element. The pointers
// are dereferenced inside the kernel, so they must be valid in device code.
__global__
void add(int* A, int* B, int* C, int size) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // The grid may hold more threads than elements; surplus threads do nothing.
    if (idx >= size) {
        return;
    }

    C[idx] = A[idx] + B[idx];
}

// Fills the first `size` entries of `vector` with pseudo-random digits in
// the range 0..9 drawn from rand(), in ascending index order.
void initialize(int* vector, int size) {
    int filled = 0;
    while (filled < size) {
        vector[filled] = rand() % 10;
        ++filled;
    }
}

// Writes the first `size` entries of `vector` to stdout on a single line,
// each followed by a space, then ends the line with endl (which flushes).
void print(int* vector, int size) {
    int idx = 0;
    while (idx < size) {
        cout << vector[idx] << " ";
        ++idx;
    }
    cout << endl;
}

// Reports a failed CUDA runtime call on stderr.
// `what` names the operation for the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Computes C[i] = A[i] + B[i] for N-element host vectors with the add kernel.
// Allocates three device buffers, copies A and B over, launches the kernel,
// waits for it, and copies the sum back into C. Every runtime call is checked
// and the device buffers are released on every path.
// Returns true on success; on failure C must be treated as unspecified.
static bool addOnDevice(int* A, int* B, int* C, int N) {
    if (N <= 0) {
        return true;  // Empty vector: nothing to compute, and a 0-block launch is invalid.
    }

    size_t vectorBytes = (size_t)N * sizeof(int);

    // Start from null so cleanup can free unconditionally (cudaFree(0) does nothing).
    int* X = 0;
    int* Y = 0;
    int* Z = 0;

    bool ok = cudaOk(cudaMalloc(&X, vectorBytes), "cudaMalloc for A")
           && cudaOk(cudaMalloc(&Y, vectorBytes), "cudaMalloc for B")
           && cudaOk(cudaMalloc(&Z, vectorBytes), "cudaMalloc for C")
           && cudaOk(cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice), "copy A to device")
           && cudaOk(cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;  // Round up

        add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);

        // A launch returns no status itself: configuration errors show up in
        // cudaGetLastError(), execution faults when the device is synchronized.
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaDeviceSynchronize(), "kernel execution")
          && cudaOk(cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return ok;
}

// Builds two random N-element vectors, prints them, adds them on the GPU and
// prints the sum. Exits with 0 on success and -1 if any CUDA call failed.
int main() {
    int N = 4;  // Number of elements
    int vectorSize = N;

    // Allocate memory on host
    int* A = new int[vectorSize];
    int* B = new int[vectorSize];
    int* C = new int[vectorSize];

    // Initialize vectors A and B
    initialize(A, vectorSize);
    initialize(B, vectorSize);

    cout << "Vector A: ";
    print(A, N);
    cout << "Vector B: ";
    print(B, N);

    bool ok = addOnDevice(A, B, C, N);

    // Only print C when the GPU actually produced it.
    if (ok) {
        cout << "Addition: ";
        print(C, N);
    }

    // Host memory is released on the failure path as well.
    delete[] A;
    delete[] B;
    delete[] C;

    return ok ? 0 : -1;
}
"""




# Save the CUDA source held in `code` as main.cu so nvcc can compile it.
with open("main.cu", "w") as source_file:
    source_file.write(code)

In [None]:
!nvcc -arch=sm_75 main.cu -o main

In [None]:
!./main