In [10]:
%%writefile NaiveBayes.cu

// NaiveBayes.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "NaiveBayesKernel.cuh"
#include "NaiveBayesTrain.cuh" // Include the training header

#define SHARED_SIZE 20 // Adjust based on your needs and available shared memory

// CUDA kernel that tallies, over all samples, the number of occurrences of
// each class (prior counts) and of each (class, feature, value) triple
// (likelihood counts). The outputs are raw integer counts, not probabilities.
//
// Parameters:
//   d_Dataset        numSamples rows of (numFeatures + 1) ints, row-major;
//                    the last column of each row is the class label.
//   d_priors         numClasses counters; block totals are ADDED to them.
//   d_likelihoods    numClasses * numFeatures * numFeatureValues counters,
//                    indexed [class][feature][value]; block totals are ADDED.
//   numSamples, numFeatures, numClasses, numFeatureValues
//                    dataset dimensions.
//
// Launch requirements:
//   - 1D grid of 1D blocks, one thread per sample (extra threads idle).
//   - Dynamic shared memory of at least
//     (numClasses + numClasses * numFeatures * numFeatureValues) * sizeof(int).
//
// Samples whose class label lies outside [0, numClasses) are skipped, and
// feature values outside [0, numFeatureValues) are not counted, so malformed
// data cannot write outside the counter arrays.
__global__ void computePriorsAndLikelihood(
    int* d_Dataset, int* d_priors, int* d_likelihoods,
    int numSamples, int numFeatures, int numClasses, int numFeatureValues
) {
    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
    const int numLikelihoods = numClasses * numFeatures * numFeatureValues;

    // Per-block counters: priors first, likelihoods immediately after.
    extern __shared__ int local_d_priors[];
    int* local_d_likelihoods = local_d_priors + numClasses;

    // Zero the shared counters. Block-stride loops cover every entry even
    // when there are more counters than threads in the block.
    for (int i = threadIdx.x; i < numClasses; i += blockDim.x) {
        local_d_priors[i] = 0;
    }
    for (int i = threadIdx.x; i < numLikelihoods; i += blockDim.x) {
        local_d_likelihoods[i] = 0;
    }
    __syncthreads();

    // Each in-range thread processes one data sample.
    if (threadId < numSamples) {
        // size_t arithmetic so the row offset cannot overflow int.
        const int* row = d_Dataset + (size_t)threadId * (numFeatures + 1);
        int classLabel = row[numFeatures]; // Class label is in the last column

        if (classLabel >= 0 && classLabel < numClasses) {
            // Count this sample towards its class.
            atomicAdd(&local_d_priors[classLabel], 1);

            // Count each (class, feature, value) occurrence.
            for (int fIdx = 0; fIdx < numFeatures; ++fIdx) {
                int featureValue = row[fIdx];
                if (featureValue >= 0 && featureValue < numFeatureValues) {
                    int likelihoodIndex = classLabel * numFeatures * numFeatureValues
                                        + (fIdx * numFeatureValues) + featureValue;
                    atomicAdd(&local_d_likelihoods[likelihoodIndex], 1);
                }
            }
        }
    }

    // All shared-memory updates must be complete before they are flushed.
    __syncthreads();

    // Add this block's counts to the global totals; the threads of the block
    // share the work with block-stride loops so no counter is left behind.
    for (int i = threadIdx.x; i < numClasses; i += blockDim.x) {
        atomicAdd(&d_priors[i], local_d_priors[i]);
    }
    for (int i = threadIdx.x; i < numLikelihoods; i += blockDim.x) {
        atomicAdd(&d_likelihoods[i], local_d_likelihoods[i]);
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool naiveBayesCudaOk(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host-side training entry point: copies the dataset and the current counter
// values to the device, launches computePriorsAndLikelihood, and copies the
// updated counters back.
//
// Parameters:
//   h_Dataset        numSamples rows of (numFeatures + 1) ints, row-major,
//                    class label in the last column.
//   h_priors         numClasses counters; used as the initial values and
//                    overwritten with the accumulated counts.
//   h_likelihoods    numClasses * numFeatures * numFeatureValues counters,
//                    indexed [class][feature][value]; same in/out semantics.
//   numSamples, numFeatures, numClasses, numFeatureValues
//                    dataset dimensions.
//
// If the dimensions describe an empty problem, or any CUDA call fails, an
// error is printed (for failures) and the host counters are left as they
// were at the point of failure. Device memory is always released.
void trainNaiveBayes(
    int* h_Dataset, int* h_priors, int* h_likelihoods,
    int numSamples, int numFeatures, int numClasses, int numFeatureValues
) {
    // Nothing to count; also avoids launching a grid with zero blocks.
    if (numSamples <= 0 || numClasses <= 0 || numFeatures < 0 || numFeatureValues < 0) {
        return;
    }

    // Byte sizes are computed in size_t so large datasets cannot overflow int.
    const size_t likelihoodCount = (size_t)numClasses * numFeatures * numFeatureValues;
    const size_t datasetSize = (size_t)numSamples * (numFeatures + 1) * sizeof(int);
    const size_t priorsSize = (size_t)numClasses * sizeof(int);
    const size_t likelihoodsSize = likelihoodCount * sizeof(int);

    // The kernel keeps one shared-memory copy of both counter arrays per block.
    const size_t sharedMemSize = priorsSize + likelihoodsSize;

    int* d_Dataset = nullptr;
    int* d_priors = nullptr;
    int* d_likelihoods = nullptr;

    // Allocate device memory and upload the inputs; stop at the first failure.
    bool ok =
        naiveBayesCudaOk(cudaMalloc(&d_Dataset, datasetSize), "cudaMalloc(d_Dataset)") &&
        naiveBayesCudaOk(cudaMalloc(&d_priors, priorsSize), "cudaMalloc(d_priors)") &&
        naiveBayesCudaOk(cudaMalloc(&d_likelihoods, likelihoodsSize), "cudaMalloc(d_likelihoods)") &&
        naiveBayesCudaOk(cudaMemcpy(d_Dataset, h_Dataset, datasetSize, cudaMemcpyHostToDevice),
                         "copy of dataset to device") &&
        naiveBayesCudaOk(cudaMemcpy(d_priors, h_priors, priorsSize, cudaMemcpyHostToDevice),
                         "copy of priors to device") &&
        naiveBayesCudaOk(cudaMemcpy(d_likelihoods, h_likelihoods, likelihoodsSize, cudaMemcpyHostToDevice),
                         "copy of likelihoods to device");

    if (ok) {
        // One thread per sample, rounded up to whole blocks.
        const int blockSize = 256; // Adjust as needed
        const int numBlocks = (numSamples + blockSize - 1) / blockSize;

        computePriorsAndLikelihood<<<numBlocks, blockSize, sharedMemSize>>>(
            d_Dataset, d_priors, d_likelihoods,
            numSamples, numFeatures, numClasses, numFeatureValues
        );

        // A launch does not return a status itself: configuration errors
        // (e.g. too much shared memory) surface through cudaGetLastError,
        // and in-kernel faults surface at the next synchronizing call.
        ok = naiveBayesCudaOk(cudaGetLastError(), "kernel launch") &&
             naiveBayesCudaOk(cudaDeviceSynchronize(), "kernel execution");
    }

    if (ok) {
        // Copy results back from device to host.
        ok = naiveBayesCudaOk(cudaMemcpy(h_priors, d_priors, priorsSize, cudaMemcpyDeviceToHost),
                              "copy of priors to host") &&
             naiveBayesCudaOk(cudaMemcpy(h_likelihoods, d_likelihoods, likelihoodsSize, cudaMemcpyDeviceToHost),
                              "copy of likelihoods to host");
    }

    // Free device memory on every path; cudaFree accepts null pointers.
    cudaFree(d_Dataset);
    cudaFree(d_priors);
    cudaFree(d_likelihoods);
}

// Demo driver: trains on a small hard-coded dataset and prints the resulting
// counters. The printed numbers are raw occurrence counts as produced by
// trainNaiveBayes, not normalized probabilities.
int main() {
    // Example Usage (replace with your actual data and parameters).
    // The dimensions are compile-time constants so the arrays below are
    // ordinary fixed-size arrays rather than variable-length arrays, which
    // are a compiler extension in C++ and may not portably take initializers.
    const int numSamples = 10;
    const int numFeatures = 2;
    const int numClasses = 2;
    const int numFeatureValues = 3; // Assuming feature values are 0, 1, 2

    // Host data: one row per sample, laid out as
    // (feature 0, feature 1, class label).
    int h_Dataset[numSamples * (numFeatures + 1)] = {
        0, 1, 0,
        1, 0, 1,
        0, 2, 0,
        1, 1, 1,
        0, 0, 0,
        1, 2, 1,
        0, 1, 0,
        1, 0, 1,
        0, 2, 0,
        1, 1, 1
    };

    // Counters start at zero; training accumulates into them.
    int h_priors[numClasses] = {0};
    int h_likelihoods[numClasses * numFeatures * numFeatureValues] = {0};

    // Train the Naive Bayes model
    trainNaiveBayes(
        h_Dataset, h_priors, h_likelihoods,
        numSamples, numFeatures, numClasses, numFeatureValues
    );

    // Print the per-class sample counts.
    printf("Priors:\n");
    for (int i = 0; i < numClasses; ++i) {
        printf("Class %d: %d\n", i, h_priors[i]);
    }

    // Print the per-(class, feature, value) counts, using the same
    // [class][feature][value] flattening as the training code.
    printf("\nLikelihoods:\n");
    for (int c = 0; c < numClasses; ++c) {
        for (int f = 0; f < numFeatures; ++f) {
            for (int v = 0; v < numFeatureValues; ++v) {
                int index = c * numFeatures * numFeatureValues + f * numFeatureValues + v;
                printf("P(Feature %d = %d | Class %d): %d\n", f, v, c, h_likelihoods[index]);
            }
        }
    }

    return 0;
}

Overwriting NaiveBayes.cu


In [7]:
%%writefile NaiveBayesTrain.cuh
// NaiveBayesTrain.cuh
#ifndef NAIVE_BAYES_TRAIN_CUH
#define NAIVE_BAYES_TRAIN_CUH

// Launches the CUDA training kernel.
//
//   h_Dataset        host array of numSamples rows, each holding numFeatures
//                    feature values followed by the class label.
//   h_priors         host array of numClasses counters (input and output).
//   h_likelihoods    host array of numClasses * numFeatures * numFeatureValues
//                    counters (input and output).
//   numSamples, numFeatures, numClasses, numFeatureValues
//                    dataset dimensions.
//
// The prototype matches the definition in NaiveBayes.cu; a shorter parameter
// list here would only declare a second overload that is never defined.
void trainNaiveBayes(
    int* h_Dataset, int* h_priors, int* h_likelihoods,
    int numSamples, int numFeatures, int numClasses, int numFeatureValues
);

#endif

Overwriting NaiveBayesTrain.cuh


In [11]:
# Compile with the specified architecture
!nvcc NaiveBayes.cu -o NaiveBayes -gencode arch=compute_75,code=sm_75

# Run the executable
!./NaiveBayes

Priors:
Class 0: 5
Class 1: 5

Likelihoods:
P(Feature 0 = 0 | Class 0): 5
P(Feature 0 = 1 | Class 0): 0
P(Feature 0 = 2 | Class 0): 0
P(Feature 1 = 0 | Class 0): 1
P(Feature 1 = 1 | Class 0): 2
P(Feature 1 = 2 | Class 0): 2
P(Feature 0 = 0 | Class 1): 0
P(Feature 0 = 1 | Class 1): 5
P(Feature 0 = 2 | Class 1): 0
P(Feature 1 = 0 | Class 1): 2
P(Feature 1 = 1 | Class 1): 2
P(Feature 1 = 2 | Class 1): 1
