<a href="https://colab.research.google.com/github/MichaelGelo/GRP2_CEPARCO_IP/blob/main/CEPARCO_IP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Group 2 - Implementing Hyyrö’s Bit Vector Algorithm Using CUDA SIMT**
## **GROUP 2 - S11**

**MEMBERS:**

- Alfred Bastin S. Agustines
- Allan David C. De Leon
- Michael Angelo Depasucat
- Kai Hiori J. Padilla


## Check if CUDA is present

In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Mar 12 14:11:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## C

In [9]:
%%writefile C_hyyro.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>

/* Longest string (in characters) that fits in one 64-bit bit-vector. */
#define MAX_LENGTH 64

/* One bit per query position; bit i corresponds to query[i]. */
typedef uint64_t bitvector;

/*
 * Bit-parallel edit-distance score of `query` against `reference`.
 *
 * The query is encoded into a per-character match table (Eq); the reference
 * is then consumed one character per iteration while the vertical (Pv/Mv)
 * and horizontal (Ph/Mh) delta vectors are updated with word-wide operations.
 * The score starts at strlen(query) and is adjusted by +1/-1 per reference
 * character according to bit (m - 1) of Ph/Mh.
 *
 * NOTE(review): Ph is shifted without OR-ing in a 1, and Xh mixes in the
 * previous column's Xv (Xp). The returned value is therefore presumably the
 * approximate-matching / transposition-aware score at the last reference
 * position rather than the plain global Levenshtein distance (e.g. "tcg" vs
 * "attattcga" yields 1) -- confirm this is the intended variant.
 *
 * Parameters:
 *   query      NUL-terminated pattern, at most MAX_LENGTH characters.
 *   reference  NUL-terminated text, at most MAX_LENGTH characters.
 *
 * Returns the score after the last reference character, or -1 (after
 * printing an error message to stdout) if either string is longer than
 * MAX_LENGTH. An empty query returns 0.
 */
int bit_vector_levenshtein(const char *query, const char *reference) {
    /* Check the lengths as size_t before narrowing them to int. */
    size_t query_len = strlen(query);
    size_t reference_len = strlen(reference);
    if (query_len > MAX_LENGTH || reference_len > MAX_LENGTH) {
        printf("Error: Strings too long for this implementation!\n");
        return -1;
    }

    /* With an empty query there is no top bit to track and the score keeps
     * its initial value of 0; returning here also avoids the undefined
     * shift 1ULL << (m - 1) with m == 0. */
    if (query_len == 0) {
        return 0;
    }

    int m = (int)query_len;
    int n = (int)reference_len;

    bitvector Pv = ~0ULL;
    bitvector Mv = 0;
    bitvector Eq[256] = {0};
    bitvector Ph = 0, Mh = 0, Xv = 0;
    /* Xh and Xp are read at the top of the first iteration, so they must
     * start from a defined value. */
    bitvector Xh = 0;
    bitvector Xp = 0;

    /* Eq[c] has bit i set iff query[i] == c. */
    for (int i = 0; i < m; i++) {
        Eq[(unsigned char)query[i]] |= (1ULL << i);
    }

    /* Bit of the last query position; loop-invariant. */
    const bitvector top_bit = 1ULL << (m - 1);

    int score = m;

    for (int j = 0; j < n; j++) {
        Xv = Eq[(unsigned char)reference[j]] | Mv;
        /* Uses the previous iteration's Xh together with Xp (previous Xv). */
        Xh = ((~Xh & Xv) << 1) & Xp;

        /* Parenthesised to match C precedence ('^' binds tighter than '|'). */
        Xh = Xh | (((Xv & Pv) + Pv) ^ Pv) | Xv | Mv;
        Ph = Mv | ~(Xh | Pv);
        Mh = Xh & Pv;
        Xp = Xv;

        /* The delta at the last query row drives the running score. */
        if (Ph & top_bit) score++;
        if (Mh & top_bit) score--;

        Xv = (Ph << 1);
        Pv = (Mh << 1) | ~(Xh | Xv);
        Mv = Xh & Xv;
    }

    return score;
}

/*
 * Demo driver: prints the score for one hard-coded query/reference pair,
 * then calls bit_vector_levenshtein `loope` more times, timing each call
 * with clock() and reporting the average in milliseconds.
 *
 * Returns 0.
 */
int main(void) {
    const size_t loope = 10;
    clock_t start, end;
    double elapse = 0.0;

    const char *query = "tcg";
    const char *reference = "attattcga";

    printf("Reference: %s\n", reference);
    printf("Query: %s\n", query);

    /* A negative result signals an over-long input; the function has
     * already printed its own error in that case. */
    int distance = bit_vector_levenshtein(query, reference);
    if (distance >= 0) {
        printf("Edit Distance between \"%s\" and \"%s\" is: %d\n", query, reference, distance);
    }

    /* Accumulate per-call CPU time, converted from clock ticks to ms. */
    for (size_t i = 0; i < loope; i++) {
        start = clock();
        bit_vector_levenshtein(query, reference);
        end = clock();
        elapse += ((double)(end - start)) * 1E3 / CLOCKS_PER_SEC;
    }
    /* %zu is the conversion specifier for size_t. */
    printf("Function (in C) average time for %zu loops is %f milliseconds\n", loope, elapse / loope);

    return 0;
}


Overwriting C_hyyro.c


In [10]:
%%shell
# Compile the C source written by the previous cell into the executable ./C_hyyro.
gcc C_hyyro.c -o C_hyyro



In [11]:
%%shell
# Run the compiled C program; it prints the score and the average call time.
./C_hyyro

Reference: attattcga
Query: tcg
Edit Distance between "tcg" and "attattcga" is: 1
Function (in C) average time for 10 loops is 0.000400 milliseconds




## CUDA

In [5]:
%%writefile CUDA.cu

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <cuda_runtime.h>

// Prefetch + page creation + memadvise
// Longest sequence (in characters) that fits in one 64-bit bit-vector.
#define MAX_LENGTH 64
// One bit per query position; bit i corresponds to query character i.
typedef uint64_t bitvector;
// Per-character match table for the query, held in device constant memory.
__constant__ bitvector d_Eq[256];

// Bit-parallel edit-distance score of a pre-encoded query against one
// reference sequence. Callable from both host and device code.
//
// The score starts at query_length and is adjusted by +1/-1 per reference
// character according to bit (query_length - 1) of the horizontal delta
// vectors Ph/Mh.
//
// Parameters:
//   query_length      number of query characters encoded in Eq (0..MAX_LENGTH)
//   reference         reference characters; only reference_length are read
//   reference_length  number of characters to consume from `reference`
//   Eq                256-entry table; Eq[c] has bit i set iff query[i] == c
//
// Returns the score after the last reference character, 0 for an empty
// query, or -1 if query_length is negative or either length exceeds
// MAX_LENGTH.
__host__ __device__ int bit_vector_levenshtein(int query_length, const char *reference, int reference_length, const bitvector *Eq) {
    if (query_length < 0 || query_length > MAX_LENGTH || reference_length > MAX_LENGTH) {
        return -1;
    }

    // With an empty query there is no top bit to track and the score keeps
    // its initial value of 0; returning here also avoids the undefined
    // shift 1ULL << (query_length - 1) with query_length == 0.
    if (query_length == 0) {
        return 0;
    }

    bitvector Pv = ~0ULL;
    bitvector Mv = 0;
    bitvector Ph = 0;
    bitvector Mh = 0;
    bitvector Xv = 0;
    bitvector Xh = 0;
    bitvector Xp = 0;
    int score = query_length;

    // Bit of the last query position; loop-invariant.
    const bitvector top_bit = 1ULL << (query_length - 1);

    for (int j = 0; j < reference_length; j++) {
        unsigned char c = reference[j];
        Xv = Eq[c] | Mv;

        // Uses the previous iteration's Xh together with Xp (previous Xv).
        Xh = ((~Xh & Xv) << 1) & Xp;

        // Explicit parentheses for clarity
        Xh = Xh | ((((Xv & Pv) + Pv) ^ Pv) | Xv | Mv);

        Ph = Mv | ~(Xh | Pv);
        Mh = Xh & Pv;
        Xp = Xv;

        // The delta at the last query row drives the running score.
        if (Ph & top_bit) score++;
        if (Mh & top_bit) score--;

        Xv = (Ph << 1);
        Pv = (Mh << 1) | ~(Xh | Xv);
        Mv = Xh & Xv;
    }

    return score;
}

// Kernel: scores reference sequences against the query whose match table is
// stored in d_Eq, writing one result per reference into `distances`.
//
// `references` is indexed at a fixed stride of MAX_LENGTH bytes per
// sequence; `reference_lengths[k]` gives the length of sequence k. Each
// thread starts at its global index and advances by the total number of
// launched threads (grid-stride loop), so any launch configuration covers
// all `num_references` entries.
__global__ void levenshtein_kernel(int query_length, const char *references, const int *reference_lengths, int *distances, int num_references) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = blockDim.x * gridDim.x;

    int ref = first;
    while (ref < num_references) {
        // Start of this sequence's MAX_LENGTH-byte slot.
        const char *sequence = references + ref * MAX_LENGTH;
        // The match table comes from device constant memory.
        distances[ref] = bit_vector_levenshtein(query_length, sequence, reference_lengths[ref], d_Eq);
        ref += step;
    }
}

// Abort with a readable message when a required CUDA runtime call fails.
static void cuda_require(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Report, but tolerate, failure of a non-essential call (memory hints,
// prefetches, frees): results stay correct without them.
static void cuda_warn(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA warning: %s failed: %s\n", what, cudaGetErrorString(status));
        // Reset the recorded error so the later launch check is not tripped by it.
        (void)cudaGetLastError();
    }
}

// Demo driver: encodes a hard-coded query, scores it against a small set of
// reference sequences on the GPU (kernel launched `loope` times), validates
// the device results against the host implementation and prints them.
//
// Returns 0 on success and -1 if the query or a reference exceeds
// MAX_LENGTH; exits with EXIT_FAILURE if a required CUDA call fails.
int main(){
    const char *query = "tcg"; // Query DNA sequence
    int query_length = (int)strlen(query);
    if (query_length > MAX_LENGTH) {
        printf("Query sequence too long!\n");
        return -1;
    }

    // Precompute Eq for the query and store it in host memory.
    bitvector h_Eq[256] = {0};
    for (int i = 0; i < query_length; i++) {
        h_Eq[(unsigned char)query[i]] |= (1ULL << i);
    }

    // Copy Eq to constant memory on the device.
    cuda_require(cudaMemcpyToSymbol(d_Eq, h_Eq, sizeof(bitvector) * 256), "cudaMemcpyToSymbol(d_Eq)");

    // Reference DNA sequences to process (dummy data for now; to be replaced with actual data).
    const char *references_input[] = { "attattcga", "atttcatctcgt" };
    // Derived from the array so the count cannot drift from the data.
    const int num_references = (int)(sizeof(references_input) / sizeof(references_input[0]));
    int reference_lengths[num_references];
    for (int i = 0; i < num_references; i++) {
        reference_lengths[i] = (int)strlen(references_input[i]);
        if (reference_lengths[i] > MAX_LENGTH) {
            printf("Reference sequence %d too long!\n", i);
            return -1;
        }
    }

    // Allocate Unified Memory
    char *references = NULL;
    int *d_reference_lengths = NULL, *d_distances = NULL;
    cuda_require(cudaMallocManaged(&references, num_references * MAX_LENGTH * sizeof(char)), "cudaMallocManaged(references)");
    cuda_require(cudaMallocManaged(&d_reference_lengths, num_references * sizeof(int)), "cudaMallocManaged(reference_lengths)");
    cuda_require(cudaMallocManaged(&d_distances, num_references * sizeof(int)), "cudaMallocManaged(distances)");

    // Copy references and lengths to managed memory. Each sequence gets its
    // own MAX_LENGTH-byte slot; strncpy zero-pads the unused tail.
    for (int i = 0; i < num_references; i++) {
        strncpy(&references[i * MAX_LENGTH], references_input[i], MAX_LENGTH);
        d_reference_lengths[i] = reference_lengths[i];
    }

    // Get GPU device ID
    int device = -1;
    cuda_require(cudaGetDevice(&device), "cudaGetDevice");

    // Memory advise (performance hints only, so failures are not fatal)
    cuda_warn(cudaMemAdvise(references, num_references * MAX_LENGTH * sizeof(char), cudaMemAdviseSetReadMostly, device), "cudaMemAdvise(references)");
    cuda_warn(cudaMemAdvise(d_reference_lengths, num_references * sizeof(int), cudaMemAdviseSetReadMostly, device), "cudaMemAdvise(reference_lengths)");
    cuda_warn(cudaMemAdvise(d_distances, num_references * sizeof(int), cudaMemAdviseSetPreferredLocation, device), "cudaMemAdvise(distances)");

    // Prefetch data (performance hints only, so failures are not fatal)
    cuda_warn(cudaMemPrefetchAsync(references, num_references * MAX_LENGTH * sizeof(char), device, NULL), "cudaMemPrefetchAsync(references)");
    cuda_warn(cudaMemPrefetchAsync(d_reference_lengths, num_references * sizeof(int), device, NULL), "cudaMemPrefetchAsync(reference_lengths)");
    cuda_warn(cudaMemPrefetchAsync(d_distances, num_references * sizeof(int), device, NULL), "cudaMemPrefetchAsync(distances)");

    // Kernel parameters: enough blocks to give every reference its own thread.
    int threadsPerBlock = 256;
    int blocksPerGrid = (num_references + threadsPerBlock - 1) / threadsPerBlock;

    // Number of times the program is executed
    const size_t loope = 10;

    printf("*** function = Levenshtein Distance\n");
    printf("numReferences = %d\n", num_references);
    printf("numBlocks = %d, numThreads = %d\n", blocksPerGrid, threadsPerBlock);

    // Execute kernel multiple times
    for (size_t i = 0; i < loope; i++) {
        levenshtein_kernel<<<blocksPerGrid, threadsPerBlock>>>(query_length, references, d_reference_lengths, d_distances, num_references);
        // A launch does not return a status; query it explicitly.
        cuda_require(cudaGetLastError(), "levenshtein_kernel launch");
    }

    // Synchronize device (also surfaces errors raised while the kernels ran)
    cuda_require(cudaDeviceSynchronize(), "cudaDeviceSynchronize after kernel");

    // Prefetch distances back to CPU
    cuda_warn(cudaMemPrefetchAsync(d_distances, num_references * sizeof(int), cudaCpuDeviceId, NULL), "cudaMemPrefetchAsync(distances to host)");
    cuda_require(cudaDeviceSynchronize(), "cudaDeviceSynchronize after prefetch");

    // Error checking: compute expected distance on host using the host copy of Eq (h_Eq).
    size_t err_count = 0;
    for (int i = 0; i < num_references; i++) {
        int expected_distance = bit_vector_levenshtein(query_length, references_input[i], reference_lengths[i], h_Eq);
        if (d_distances[i] != expected_distance) {
            err_count++;
        }
    }
    printf("Error count (CUDA program): %zu\n", err_count);

    // Print results
    for (int i = 0; i < num_references; i++) {
        printf("Edit Distance between query \"%s\" and reference \"%s\" is: %d\n", query, references_input[i], d_distances[i]);
    }

    // Free memory
    cuda_warn(cudaFree(references), "cudaFree(references)");
    cuda_warn(cudaFree(d_reference_lengths), "cudaFree(reference_lengths)");
    cuda_warn(cudaFree(d_distances), "cudaFree(distances)");

    return 0;
}


Overwriting CUDA.cu


In [6]:
%%shell
# Build CUDA.cu into ./CUDA, generating code for the sm_75 architecture.
nvcc -o CUDA CUDA.cu -arch=sm_75



In [7]:
%%shell
# Run the CUDA binary under nvprof to get per-kernel and per-API-call timings.
nvprof ./CUDA

==4230== NVPROF is profiling process 4230, command: ./CUDA
*** function = Levenshtein Distance
numReferences = 2
numBlocks = 1, numThreads = 256
Error count (CUDA program): 0
Edit Distance between query "tcg" and reference "attattcga" is: 1
Edit Distance between query "tcg" and reference "atttcatctcgt" is: 1
==4230== Profiling application: ./CUDA
==4230== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.63%  214.62us        10  21.461us  5.5030us  164.38us  levenshtein_kernel(int, char const *, int const *, int*, int)
                    0.37%     800ns         1     800ns     800ns     800ns  [CUDA memcpy HtoD]
      API calls:   82.26%  98.943ms         1  98.943ms  98.943ms  98.943ms  cudaMemcpyToSymbol
                   16.93%  20.367ms         3  6.7890ms  4.8820us  20.337ms  cudaMallocManaged
                    0.26%  311.13us         4  77.781us  11.355us  156.13us  cudaMemPrefetchAsync
                  

