In [1]:
! nvcc --version

! pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-tmjkqqjg
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-tmjkqqjg
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0d2ab99cccbbc682722e708515fe9c4cfc50185a
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4716 sha256=a04984eab87946fdfb393f3976510b000b97295dfe8d833ad3fd4a731f42ec6a
  Stored in directory: /tmp/pip-ephem-wheel-cache-fx5ib95d/wheels

In [2]:
%%cu
#include <iostream>

__global__ void helloWorld() {
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Hello, World! from thread %d\n", threadId);
}

int main() {
    int threadsPerBlock = 4;
    int numBlocks = 1;

    helloWorld<<<numBlocks, threadsPerBlock>>>();

    cudaDeviceSynchronize();

    return 0;
}

Hello, World! from thread 0
Hello, World! from thread 1
Hello, World! from thread 2
Hello, World! from thread 3



In [3]:
%%cu

#include <iostream>
#include <cuda_runtime.h>
#include <chrono>

const int K = 10000;
const int M = 20;
const int N = 20;
const int P = 20;

__global__ void matrixMultiplicationGPU(const int* A, const int* B, int* C, int M, int N, int P) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < P) {
        int sum = 0;
        for (int k = 0; k < N; ++k) {
            sum += A[row * N + k] * B[k * P + col];
        }
        C[row * P + col] = sum;
    }
}

void matrixMultiplicationCPU(const int* A, const int* B, int* C, int M, int N, int P) {
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < P; ++j) {
            int sum = 0;
            for (int k = 0; k < N; ++k) {
                sum += A[i * N + k] * B[k * P + j];
            }
            C[i * P + j] = sum;
        }
    }
}

int main() {
    const int size_A = K * M * N;
    const int size_B = K * N * P;
    const int size_C = K * M * P;

    // Allocate host memory
    int* h_A = new int[size_A];
    int* h_B = new int[size_B];
    int* h_C_GPU = new int[size_C];
    int* h_C_CPU = new int[size_C];

    // Initialize matrices A and B on the host
    for (int i = 0; i < size_A; ++i) {
        h_A[i] = i % 10; // Example initialization
    }

    for (int i = 0; i < size_B; ++i) {
        h_B[i] = 1; // Example initialization
    }

    // Allocate device memory
    int* d_A, * d_B, * d_C;
    cudaMalloc((void**)&d_A, size_A * sizeof(int));
    cudaMalloc((void**)&d_B, size_B * sizeof(int));
    cudaMalloc((void**)&d_C, size_C * sizeof(int));

    // Copy matrices A and B from host to device
    cudaMemcpy(d_A, h_A, size_A * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size_B * sizeof(int), cudaMemcpyHostToDevice);

    // Set the block and grid dimensions
    dim3 dimBlock(16, 16);
    dim3 dimGrid((P + dimBlock.x - 1) / dimBlock.x, (M + dimBlock.y - 1) / dimBlock.y);

    // Create events for timing GPU multiplication
    // cudaEvent_t startGPU, stopGPU;
    // cudaEventCreate(&startGPU);
    // cudaEventCreate(&stopGPU);

    // Record start event for GPU
    // cudaEventRecord(startGPU);

    auto startGPU = std::chrono::high_resolution_clock::now();

    // Launch the GPU kernel for each matrix multiplication
    matrixMultiplicationGPU<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, M, N, P);

    auto stopGPU = std::chrono::high_resolution_clock::now();
    auto durationGPU = std::chrono::duration_cast<std::chrono::microseconds>(stopGPU - startGPU);

    // Record stop event for GPU
    // cudaEventRecord(stopGPU);
    // cudaEventSynchronize(stopGPU);

    // Calculate elapsed time for GPU
    // float millisecondsGPU = 0;
    // cudaEventElapsedTime(&millisecondsGPU, startGPU, stopGPU);

    // Copy the result matrix C from device to host for GPU
    cudaMemcpy(h_C_GPU, d_C, size_C * sizeof(int), cudaMemcpyDeviceToHost);

    // Record start time for CPU
    auto startCPU = std::chrono::high_resolution_clock::now();

    // Perform CPU matrix multiplication
    matrixMultiplicationCPU(h_A, h_B, h_C_CPU, M, N, P);

    // Record stop time for CPU
    auto stopCPU = std::chrono::high_resolution_clock::now();
    auto durationCPU = std::chrono::duration_cast<std::chrono::microseconds>(stopCPU - startCPU);

    // Print the elapsed time for GPU
    std::cout << "Time taken (GPU): " << durationGPU.count() << " microseconds" << std::endl;

    // Print the elapsed time for CPU
    std::cout << "Time taken (CPU): " << durationCPU.count() << " microseconds" << std::endl;

    // Verify GPU and CPU results
    for (int i = 0; i < size_C; ++i) {
        if (h_C_GPU[i] != h_C_CPU[i]) {
            std::cerr << "Verification failed!" << std::endl;
            break;
        }
    }

    // Free allocated memory
    delete[] h_A;
    delete[] h_B;
    delete[] h_C_GPU;
    delete[] h_C_CPU;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // cudaEventDestroy(startGPU);
    // cudaEventDestroy(stopGPU);

    return 0;
}


Time taken (GPU): 45553 microseconds
Time taken (CPU): 23 microseconds



In [28]:
%%cu

#include <iostream>
#include <fstream>
#include <cstring>
#include <cuda_runtime.h>

using namespace std;

const int MAX_CONTACTS = 8000; // Maximum number of contacts
const int MAX_NAME_LENGTH = 60;
const int MAX_PHONE_LENGTH = 20;
const int BLOCK_SIZE = 256;


// Define a const char array that can be used on both host and device
__constant__ char targetName[MAX_NAME_LENGTH];

// CUDA kernel for searching contacts matching a given name
__global__ void searchContactsKernel(char* names, int numContacts, int* matchingIndices) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < numContacts) {
        // printf("in");
        // Custom implementation of string comparison
        int i = 0;

        while (targetName[i] != '\0' && names[tid * MAX_NAME_LENGTH + i] == targetName[i]) {
            ++i;
        }
        // printf("%c", targetName[0]);
        if (targetName[i] == '\0' && names[tid * MAX_NAME_LENGTH + i] == '\0') {
            matchingIndices[tid] = tid;
           // printf("found");
        } else {
            matchingIndices[tid] = -1;
        }
    }
}


// Function to read phonebook data from files
int readPhonebookData(const string& filename, char* names, char* phoneNumbers) {
    ifstream file(filename);
    int numContacts = 0;

    if (file.is_open()) {
        string line;
        while (getline(file, line) && numContacts < MAX_CONTACTS) {
            size_t spacePos = line.find(' ');
            if (spacePos != string::npos) {
                strncpy(names + numContacts * MAX_NAME_LENGTH, line.substr(0, spacePos).c_str(), MAX_NAME_LENGTH - 1);
                names[(numContacts + 1) * MAX_NAME_LENGTH - 1] = '\0';
                strncpy(phoneNumbers + numContacts * MAX_PHONE_LENGTH, line.substr(spacePos + 1).c_str(), MAX_PHONE_LENGTH - 1);
                phoneNumbers[(numContacts + 1) * MAX_PHONE_LENGTH - 1] = '\0';
                ++numContacts;
            }
        }
        file.close();
    }
    return numContacts;
}

int main(int argc, char* argv[]) {
    //if (argc < 2) {
    //    cout << "Usage: " << argv[0] << " <file1> <file2> <file3> ..." << endl;
    //    return 1;
    //}

   // const int numFiles = argc - 1;

     const string filenames[] = { "input.txt", "input2.txt" };



   // const char targetName[] = "Webster__Daniel";
    const char hostTargetName[MAX_NAME_LENGTH] = "Vela__Filemon";

    char* allNames = new char[MAX_CONTACTS * MAX_NAME_LENGTH];
    char* allPhoneNumbers = new char[MAX_CONTACTS * MAX_PHONE_LENGTH];
    int totalContacts = 0;

    int numFiles = sizeof(filenames) / sizeof(filenames[0]);

    for (int i = 0; i < numFiles; ++i) {
        int contactsInFile = readPhonebookData(filenames[i], allNames + totalContacts * MAX_NAME_LENGTH, allPhoneNumbers + totalContacts * MAX_PHONE_LENGTH);
        totalContacts += contactsInFile;
    }

    cout<<totalContacts<<endl;

    char* d_allNames;
    int* d_matchingIndices;
    cudaMalloc((void**)&d_allNames, totalContacts * MAX_NAME_LENGTH);
    cudaMalloc((void**)&d_matchingIndices, totalContacts * sizeof(int));

    cudaMemcpy(d_allNames, allNames, totalContacts * MAX_NAME_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(targetName, hostTargetName, MAX_NAME_LENGTH * sizeof(char));


    // Launch CUDA kernel
    int numBlocks = (totalContacts + BLOCK_SIZE - 1) / BLOCK_SIZE;
    searchContactsKernel<<<numBlocks, BLOCK_SIZE>>>(d_allNames, totalContacts, d_matchingIndices);

    int* h_matchingIndices = new int[totalContacts];
    cudaMemcpy(h_matchingIndices, d_matchingIndices, totalContacts * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the matching contacts
    cout << "total Contacts: " << totalContacts << endl;
    cout << "Matching Contacts for '" << hostTargetName << "':" << endl;
    for (int i = 0; i < totalContacts; ++i) {
        if (h_matchingIndices[i] != -1) {
            cout << allNames + h_matchingIndices[i] * MAX_NAME_LENGTH << "\t" << allPhoneNumbers + h_matchingIndices[i] * MAX_PHONE_LENGTH << endl;
        }
    }

    // Free allocated memory
    delete[] allNames;
    delete[] allPhoneNumbers;
    delete[] h_matchingIndices;
    cudaFree(d_allNames);
    cudaFree(d_matchingIndices);



    return 0;
}


1719
total Contacts: 1719
Matching Contacts for 'Vela__Filemon':
Vela__Filemon	202-225-9901
Vela__Filemon	202-225-9901
Vela__Filemon	202-225-9901
Vela__Filemon	202-225-9901

