In [3]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [12]:
%%writefile parallel_merge.cu
#include <stdio.h>
#include <cuda.h>
#include <algorithm> // For max and min
#include <limits.h>  // For INT_MIN and INT_MAX

// CPU reference merge, used as the fallback when the GPU path is unavailable.
//
// Interleaves the N values of A and the M values of B into C (which must have
// room for N + M values) by repeatedly taking the smaller of the two current
// heads. When the heads are equal, the value from A is taken first.
void sequential_merge(const int* A, const int* B, int* C, int N, int M) {
    int ia = 0;   // next unread position in A
    int ib = 0;   // next unread position in B
    int out = 0;  // next position to write in C

    // Both inputs still have data: B wins only when it is strictly smaller.
    for (; ia < N && ib < M; ++out) {
        if (B[ib] < A[ia]) {
            C[out] = B[ib];
            ++ib;
        } else {
            C[out] = A[ia];
            ++ia;
        }
    }

    // At most one of these loops runs: drain whichever input is left over.
    for (; ia < N; ++ia, ++out) {
        C[out] = A[ia];
    }
    for (; ib < M; ++ib, ++out) {
        C[out] = B[ib];
    }
}

// Co-rank search: for output position k of the merge of A and B, find a split
// (i, j) with i + j == k such that the first k merged values are exactly
// A[0..i) together with B[0..j).
//
// Parameters:
//   A, B         input arrays of length N and M; the search compares
//                neighbouring elements, so both are expected to be sorted
//                in ascending order.
//   k            output position, expected to satisfy 0 <= k <= N + M.
//   N, M         element counts of A and B.
//   i_out, j_out receive the split point; i_out + j_out == k on return.
//
// The function is callable from both host and device code so that it can be
// exercised on the CPU as well as from kernels.
__host__ __device__ void co_rank(const int* A, const int* B, int k, int N, int M, int &i_out, int &j_out) {
    // i is bounded below by k - M (B can contribute at most M values) and
    // above by min(k, N) (A can contribute at most N values, and at most k).
    int low = (k - M > 0) ? (k - M) : 0;
    int high = (k < N) ? k : N;

    while (low <= high) {
        // Midpoint written as low + half-distance so the sum cannot overflow
        // int for very large inputs (low + high may exceed INT_MAX).
        const int i = low + (high - low) / 2;
        const int j = k - i;

        // Neighbours of the candidate split. Out-of-range neighbours are
        // replaced by sentinels that can never trigger a move in that
        // direction (nothing is > INT_MAX, INT_MIN is > nothing).
        const int Ai_1 = (i > 0) ? A[i - 1] : INT_MIN;
        const int Bj_1 = (j > 0) ? B[j - 1] : INT_MIN;
        const int Ai = (i < N) ? A[i] : INT_MAX;
        const int Bj = (j < M) ? B[j] : INT_MAX;

        if (Ai_1 > Bj) {
            // Too many values taken from A: search the lower half.
            high = i - 1;
        } else if (Bj_1 > Ai) {
            // Too many values taken from B: search the upper half.
            low = i + 1;
        } else {
            // A[i-1] <= B[j] and B[j-1] <= A[i]: valid split.
            i_out = i;
            j_out = j;
            return;
        }
    }

    // Fallback when the loop ends without accepting a split (e.g. k outside
    // the expected range): take as many values from A as possible.
    i_out = (k < N) ? k : N;
    j_out = k - i_out;
}

// Merges A (N values) and B (M values) into C (N + M values), one output
// element at a time: for each output position the co-rank split is computed
// and the smaller of the two candidate heads is written.
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// each thread walks the output with a stride of the total thread count, so
// launches with fewer than N + M threads still produce the full result.
// No shared memory is used.
//
// Preconditions: N + M must fit in an int (co_rank takes an int position);
// A and B are expected to be sorted ascending, as required by co_rank.
__global__ void parallel_merge_kernel(const int* A, const int* B, int* C, int N, int M) {
    // 64-bit arithmetic so that neither the global index nor the stride
    // wraps around for large grids.
    const long long total = (long long)N + (long long)M;
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long pos = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         pos < total;
         pos += stride) {
        const int k = (int)pos;

        int i, j;
        co_rank(A, B, k, N, M, i, j);

        // Element k of the merge is the smaller of A[i] and B[j]; when one
        // input is exhausted the other supplies the value. Ties go to A.
        if (i < N && (j >= M || A[i] <= B[j])) {
            C[pos] = A[i];
        } else if (j < M) {
            C[pos] = B[j];
        }
    }
}

// Prints `label` followed by the `count` values of `data`, each followed by a
// space, and then a newline.
static void print_array(const char* label, const int* data, int count) {
    printf("%s", label);
    for (int i = 0; i < count; i++) printf("%d ", data[i]);
    printf("\n");
}

// Returns true when `err` is cudaSuccess; otherwise prints
// "<what>: <CUDA error string>" and returns false.
static bool cuda_ok(cudaError_t err, const char* what) {
    if (err == cudaSuccess) return true;
    printf("%s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Runs the GPU merge of host arrays A (N values) and B (M values) into host
// array C (N + M values). Returns true only if every CUDA step succeeded and
// C holds the result copied back from the device; on the first failure the
// error is printed, the remaining steps are skipped, and false is returned.
// Device buffers are released on every path.
static bool merge_on_device(const int* A, const int* B, int* C, int N, int M) {
    const int kThreadsPerBlock = 256;
    const int total = N + M;
    if (total == 0) return true;  // nothing to merge; a zero-block launch would be invalid

    // Ceil-div so the grid covers every output position regardless of size
    // (a single block of N + M threads fails once it exceeds the block limit).
    const int blocks = (total + kThreadsPerBlock - 1) / kThreadsPerBlock;

    const size_t bytesA = (size_t)N * sizeof(int);
    const size_t bytesB = (size_t)M * sizeof(int);
    const size_t bytesC = (size_t)total * sizeof(int);

    int *d_A = NULL, *d_B = NULL, *d_C = NULL;

    // Each step runs only if all previous steps succeeded (short-circuit &&).
    bool ok = cuda_ok(cudaMalloc(&d_A, bytesA), "Error allocating d_A");
    ok = ok && cuda_ok(cudaMalloc(&d_B, bytesB), "Error allocating d_B");
    ok = ok && cuda_ok(cudaMalloc(&d_C, bytesC), "Error allocating d_C");

    ok = ok && cuda_ok(cudaMemcpy(d_A, A, bytesA, cudaMemcpyHostToDevice), "Error copying to d_A");
    ok = ok && cuda_ok(cudaMemcpy(d_B, B, bytesB, cudaMemcpyHostToDevice), "Error copying to d_B");

    // Clear the output buffer before the kernel writes into it.
    ok = ok && cuda_ok(cudaMemset(d_C, 0, bytesC), "Error in cudaMemset");

    if (ok) {
        parallel_merge_kernel<<<blocks, kThreadsPerBlock>>>(d_A, d_B, d_C, N, M);

        // Launch failures are only visible through cudaGetLastError().
        // NOTE(review): an "unsupported toolchain" PTX error reported here
        // typically means the binary has no native code for this GPU and the
        // installed driver cannot JIT the embedded PTX; rebuilding with an
        // nvcc -arch matching the device is the usual remedy - confirm for
        // the target environment.
        ok = cuda_ok(cudaGetLastError(), "Kernel error");
    }

    // Faults raised while the kernel executes surface at the next sync.
    ok = ok && cuda_ok(cudaDeviceSynchronize(), "Error in cudaDeviceSynchronize");

    ok = ok && cuda_ok(cudaMemcpy(C, d_C, bytesC, cudaMemcpyDeviceToHost), "Error copying from d_C");

    // Release whatever was allocated; buffers that were never allocated are
    // still NULL and are skipped.
    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_C) cudaFree(d_C);

    return ok;
}

// Demo driver: merges two small sorted arrays on the GPU and prints the
// result, falling back to the CPU merge if any CUDA step fails.
int main() {
    const int N = 5;
    const int M = 5;
    int A[N] = {0, 2, 4, 6, 8};
    int B[M] = {1, 3, 5, 7, 9};
    int C[N+M] = {0}; // Initialize with zeros

    print_array("Array A: ", A, N);
    print_array("Array B: ", B, M);

    // Try the CUDA implementation first; use the sequential merge as fallback.
    if (merge_on_device(A, B, C, N, M)) {
        printf("CUDA implementation successful.\n");
    } else {
        printf("CUDA implementation failed. Using CPU implementation.\n");
        sequential_merge(A, B, C, N, M);
    }

    print_array("Merged array: ", C, N + M);

    return 0;
}

Overwriting parallel_merge.cu


In [13]:
!nvcc -o parallel_merge parallel_merge.cu


In [14]:
!./parallel_merge



Array A: 0 2 4 6 8 
Array B: 1 3 5 7 9 
Kernel error: the provided PTX was compiled with an unsupported toolchain.
CUDA implementation failed. Using CPU implementation.
Merged array: 0 1 2 3 4 5 6 7 8 9 
