# Prime Numbers between 2 and 100000 on CPU & GPU
## Adding timing code to measure and compare the performance of the CPU and GPU functions

In [20]:
%%writefile 1.cu

#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <chrono>

// Kernel: classifies each element of d_n as prime (1) or not prime (0) by
// trial division and stores the verdict in d_output at the same index.
//
//   d_n      - device array of candidate values, at least `limit` elements
//   d_output - device array receiving 1/0 flags, at least `limit` elements
//   limit    - number of elements to process
//
// Expects a 1D launch with at least `limit` threads in total; each thread
// handles the single element whose index equals its global thread id.
// Every index in [0, limit) is written, so the host never copies back
// uninitialized slots (the previous version left indices 0 and 1 untouched).
__global__ void isPrimeGPU(int *d_n, int *d_output, int limit) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data (tail of the last block) do nothing.
    if (id < 0 || id >= limit) {
        return;
    }

    // Load the candidate once instead of re-reading global memory per iteration.
    const int value = d_n[id];

    // Decide on the value itself, not on the index: anything below 2 is not prime.
    int prime = (value >= 2) ? 1 : 0;

    // Widen the square so i * i cannot overflow int for large candidates.
    for (int i = 2; prime && (long long)i * i <= value; i++) {
        if (value % i == 0) {
            prime = 0;
        }
    }

    // Single global store with the final verdict.
    d_output[id] = prime;
}

// Host-side primality test by trial division.
// Returns 1 when n is prime and 0 otherwise (including every n <= 1).
int isPrimeCPU(int n) {
    if (n <= 1) return 0;
    // The square is computed in 64 bits: with a plain int product, i * i
    // overflows once i exceeds 46340, which for n close to INT_MAX keeps the
    // loop running until i == n and wrongly reports the prime as composite.
    for (int i = 2; (long long)i * i <= n; i++) {
        if (n % i == 0) return 0;
    }
    return 1;
}

// Aborts the program with a diagnostic when a CUDA runtime call fails.
// `what` names the failing operation in the message. Buffers are not released
// on this path; process exit reclaims them.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Times the CPU and GPU primality tests over the integers 0 .. n-1 and prints
// both durations. The number of primes found by each side is compared; a
// mismatch is reported on stderr and turns the exit status into a failure.
int main() {
    int n = 100001; // Exclusive upper limit: candidates are 0 .. n-1
    const size_t bytes = n * sizeof(int);

    int *h_n = (int *)malloc(bytes);
    int *h_output = (int *)malloc(bytes);
    if (!h_n || !h_output) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_n);
        free(h_output);
        return EXIT_FAILURE;
    }
    int *d_n, *d_output;

    for (int i = 0; i < n; i++) {
        h_n[i] = i;
    }

    printf("Prime numbers from CPU up to %d:\n", n);

    // Counting the primes keeps the loop from being dead code and gives a
    // value to validate the GPU result against.
    int cpuPrimeCount = 0;
    auto start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < n; i++) {
        if (isPrimeCPU(h_n[i])) {
            cpuPrimeCount++;
            //printf("%d ", h_n[i]);
        }
    }
    // Stop the clock before printing so console I/O is not part of the timing.
    auto end_time = std::chrono::high_resolution_clock::now();
    printf("\n");

    auto duration_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
    double seconds_cpu = duration_ns / 1000000000.0;
    std::cout << "Time taken by CPU: " << seconds_cpu << " seconds" << std::endl;

    checkCuda(cudaMalloc((void**)&d_n, bytes), "cudaMalloc(d_n)");
    checkCuda(cudaMalloc((void**)&d_output, bytes), "cudaMalloc(d_output)");

    checkCuda(cudaMemcpy(d_n, h_n, bytes, cudaMemcpyHostToDevice), "copy of input to device");

    int threadsPerBlock = 1024;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceil(n / threadsPerBlock)

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    isPrimeGPU<<<blocksPerGrid, threadsPerBlock>>>(d_n, d_output, n);
    // A kernel launch returns no status; a bad launch configuration only
    // shows up through cudaGetLastError().
    checkCuda(cudaGetLastError(), "isPrimeGPU launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // Waiting on the stop event also waits for the kernel recorded before it
    // and reports any error raised while it executed.
    checkCuda(cudaEventSynchronize(stop), "isPrimeGPU execution");

    float milliseconds_device = 0;
    checkCuda(cudaEventElapsedTime(&milliseconds_device, start, stop), "cudaEventElapsedTime");
    double seconds_gpu = milliseconds_device / 1000.0;
    std::cout << "Time taken by device (GPU): " << seconds_gpu << " seconds" << std::endl;

    checkCuda(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost), "copy of results to host");

    int gpuPrimeCount = 0;
    for (int i = 2; i < n; i++) {
        if (h_output[i] == 1) {
            gpuPrimeCount++;
            //printf("%d ", i);
        }
    }
    printf("\n");

    int exitCode = 0;
    if (gpuPrimeCount != cpuPrimeCount) {
        fprintf(stderr, "Result mismatch: CPU found %d primes, GPU found %d\n",
                cpuPrimeCount, gpuPrimeCount);
        exitCode = EXIT_FAILURE;
    }

    // Release the timing events as well as the buffers.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_n), "cudaFree(d_n)");
    checkCuda(cudaFree(d_output), "cudaFree(d_output)");
    free(h_n);
    free(h_output);

    return exitCode;

}


Overwriting 1.cu


In [21]:
!nvcc -o 1 1.cu

In [22]:
!./1

Prime numbers from CPU up to 100001:

Time taken by CPU: 0.00844827 seconds
Time taken by device (GPU): 0.000550208 seconds



## Adding a `__device__` function to the code above

In [4]:
%%writefile 2.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <chrono>
#include <iostream>


// Device helper: classifies d_n[id] as prime (1) or not prime (0) by trial
// division and stores the verdict in d_output[id].
//
//   id       - index of the element this thread handles
//   d_n      - device array of candidate values, at least `limit` elements
//   d_output - device array receiving 1/0 flags, at least `limit` elements
//   limit    - number of elements in both arrays
//
// Out-of-range ids are ignored. Every id in [0, limit) gets a result, so the
// output array holds no uninitialized slots (the previous version skipped
// ids 0 and 1 entirely).
__device__ void isPrime(int id, int *d_n, int *d_output, int limit) {
    if (id < 0 || id >= limit) {
        return;
    }

    // Load the candidate once instead of re-reading global memory per iteration.
    const int value = d_n[id];

    // Decide on the value itself, not on the index: anything below 2 is not prime.
    int prime = (value >= 2) ? 1 : 0;

    // Widen the square so i * i cannot overflow int for large candidates.
    for (int i = 2; prime && (long long)i * i <= value; i++) {
        if (value % i == 0) {
            prime = 0; // Divisor found: not prime
        }
    }

    // Single global store with the final verdict.
    d_output[id] = prime;
}

// Kernel entry point: derives this thread's global 1D index from its block
// and thread coordinates and hands the per-element work to isPrime().
__global__ void isPrimeGPU(int *d_n, int *d_output, int limit) {
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
    isPrime(globalIdx, d_n, d_output, limit);
}

// Host-side primality test by trial division.
// Returns 1 when n is prime and 0 otherwise (including every n <= 1).
int isPrimeCPU(int n) {
    if (n <= 1) return 0;
    // The square is computed in 64 bits: with a plain int product, i * i
    // overflows once i exceeds 46340, which for n close to INT_MAX keeps the
    // loop running until i == n and wrongly reports the prime as composite.
    for (int i = 2; (long long)i * i <= n; i++) {
        if (n % i == 0) return 0;
    }
    return 1;
}

// Aborts the program with a diagnostic when a CUDA runtime call fails.
// `what` names the failing operation in the message. Buffers are not released
// on this path; process exit reclaims them.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Times the CPU and GPU primality tests over the integers 0 .. n-1 and prints
// both durations. The number of primes found by each side is compared; a
// mismatch is reported on stderr and turns the exit status into a failure.
int main() {
    int n = 100001; // Exclusive upper limit: candidates are 0 .. n-1
    const size_t bytes = n * sizeof(int);

    int *h_n = (int *)malloc(bytes);
    int *h_output = (int *)malloc(bytes);
    if (!h_n || !h_output) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_n);
        free(h_output);
        return EXIT_FAILURE;
    }
    int *d_n, *d_output;

    for (int i = 0; i < n; i++) {
        h_n[i] = i;
    }

    printf("Prime numbers from CPU up to %d:\n", n);

    // Counting the primes keeps the loop from being dead code and gives a
    // value to validate the GPU result against.
    int cpuPrimeCount = 0;
    auto start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < n; i++) {
        if (isPrimeCPU(h_n[i])) {
            cpuPrimeCount++;
            // printf("%d ", h_n[i]);
        }
    }
    // Stop the clock before printing so console I/O is not part of the timing.
    auto end_time = std::chrono::high_resolution_clock::now();
    printf("\n");

    auto duration_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
    double seconds_cpu = duration_ns / 1000000000.0;
    std::cout << "Time taken by CPU: " << seconds_cpu << " seconds" << std::endl;

    checkCuda(cudaMalloc((void**)&d_n, bytes), "cudaMalloc(d_n)");
    checkCuda(cudaMalloc((void**)&d_output, bytes), "cudaMalloc(d_output)");

    checkCuda(cudaMemcpy(d_n, h_n, bytes, cudaMemcpyHostToDevice), "copy of input to device");

    int threadsPerBlock = 1024;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceil(n / threadsPerBlock)

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    isPrimeGPU<<<blocksPerGrid, threadsPerBlock>>>(d_n, d_output, n);
    // A kernel launch returns no status; a bad launch configuration only
    // shows up through cudaGetLastError().
    checkCuda(cudaGetLastError(), "isPrimeGPU launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // Waiting on the stop event also waits for the kernel recorded before it
    // and reports any error raised while it executed.
    checkCuda(cudaEventSynchronize(stop), "isPrimeGPU execution");

    float milliseconds_device = 0;
    checkCuda(cudaEventElapsedTime(&milliseconds_device, start, stop), "cudaEventElapsedTime");
    double seconds_gpu = milliseconds_device / 1000.0;
    std::cout << "Time taken by device (GPU): " << seconds_gpu << " seconds" << std::endl;

    checkCuda(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost), "copy of results to host");

    printf("Prime numbers from GPU output:\n");
    int gpuPrimeCount = 0;
    for (int i = 2; i < n; i++) {
        if (h_output[i] == 1) {
            gpuPrimeCount++;
            //printf("%d ", i);
        }
    }
    printf("\n");

    int exitCode = 0;
    if (gpuPrimeCount != cpuPrimeCount) {
        fprintf(stderr, "Result mismatch: CPU found %d primes, GPU found %d\n",
                cpuPrimeCount, gpuPrimeCount);
        exitCode = EXIT_FAILURE;
    }

    // Release the timing events as well as the buffers.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_n), "cudaFree(d_n)");
    checkCuda(cudaFree(d_output), "cudaFree(d_output)");
    free(h_n);
    free(h_output);

    return exitCode;
}


Overwriting 2.cu


In [5]:
!nvcc -o 2 2.cu

In [6]:
!./2

Prime numbers from CPU up to 100001:

Time taken by CPU: 0.00930758 seconds
Time taken by device (GPU): 0.000567872 seconds
Prime numbers from GPU output:



## Operations on CSV file


In [4]:
%%writefile 5.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

#define MAX_LINE_LENGTH 1024

// One parsed CSV row for a single innings of a player.
typedef struct playerInnings {
    int runs;     // second CSV column: runs scored in this innings
    int not_out;  // third CSV column; summed as a count, so presumably 0 or 1 - verify against the data
} playerInnings;

// Kernel: adds up the `runs` and `not_out` fields of innings[0 .. numRecords-1].
//
// Each block stages one record per thread in shared memory, folds the staged
// values with a halving tree reduction, and thread 0 of the block then adds
// the block's partial sums to *totalRuns / *totalNotOuts with atomicAdd.
//
// Launch requirements visible in the code:
//   - blockDim.x must not exceed 256, the size of the shared staging arrays;
//   - the halving loop only reaches every slot when blockDim.x is a power of two;
//   - *totalRuns and *totalNotOuts are accumulated into, so the caller must
//     zero them before the launch.
__global__ void calcStats(playerInnings *innings, int numRecords, int *totalRuns, int *totalNotOuts) {
    __shared__ int stagedRuns[256];
    __shared__ int stagedNotOuts[256];

    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x + tid;

    // Threads beyond the last record contribute zero to both sums.
    const bool hasRecord = gid < numRecords;
    stagedRuns[tid] = hasRecord ? innings[gid].runs : 0;
    stagedNotOuts[tid] = hasRecord ? innings[gid].not_out : 0;

    __syncthreads();

    // Tree reduction: every pass folds the upper half of the live range onto
    // the lower half. The barrier is reached by all threads on every pass.
    for (int half = blockDim.x >> 1; half >= 1; half >>= 1) {
        if (tid < half) {
            stagedRuns[tid] += stagedRuns[tid + half];
            stagedNotOuts[tid] += stagedNotOuts[tid + half];
        }
        __syncthreads();
    }

    // Only the first thread of the block publishes the block's partial sums.
    if (tid != 0) {
        return;
    }
    atomicAdd(totalRuns, stagedRuns[0]);
    atomicAdd(totalNotOuts, stagedNotOuts[0]);
}

// Aborts the program with a diagnostic when a CUDA runtime call fails.
// `what` names the failing operation in the message. Buffers are not released
// on this path; process exit reclaims them.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads "name,runs,not_out" rows from the CSV file, keeps the rows whose name
// equals the one typed by the user, sums runs and not-outs on the GPU and
// prints the totals together with the batting average
// (runs / (innings - not outs), or 0 when that divisor is not positive).
int main() {
    FILE *file;
    char line[MAX_LINE_LENGTH];
    char playerName[100];

    const char *fileName = "odi_players_combined_filtered.csv";
    file = fopen(fileName, "r");
    if (!file) {
        fprintf(stderr, "Error opening file: %s\n", fileName);
        return EXIT_FAILURE;
    }

    printf("Enter the player's name: ");
    // Without this check an EOF on stdin would leave playerName uninitialized.
    if (!fgets(playerName, sizeof(playerName), stdin)) {
        fprintf(stderr, "Error reading the player's name\n");
        fclose(file);
        return EXIT_FAILURE;
    }
    playerName[strcspn(playerName, "\n")] = 0;

    // Growable buffer: the former fixed 1024-entry array overflowed as soon as
    // a player had more matching rows than that.
    int capacity = 1024;
    playerInnings *filteredInnings = (playerInnings*)malloc((size_t)capacity * sizeof(playerInnings));
    if (!filteredInnings) {
        fprintf(stderr, "Out of memory while allocating the innings buffer\n");
        fclose(file);
        return EXIT_FAILURE;
    }
    int numRecords = 0;

    // Read the file and filter innings for the specified player
    while (fgets(line, sizeof(line), file)) {
        char inningsPlayer[100];
        playerInnings innings;

        // Skip rows (e.g. a header) that do not yield all three fields;
        // storing them would copy uninitialized values into the statistics.
        if (sscanf(line, "%99[^,],%d,%d", inningsPlayer, &innings.runs, &innings.not_out) != 3) {
            continue;
        }
        if (strcmp(inningsPlayer, playerName) != 0) {
            continue;
        }

        if (numRecords == capacity) {
            int newCapacity = capacity * 2;
            playerInnings *grown = (playerInnings*)realloc(filteredInnings, (size_t)newCapacity * sizeof(playerInnings));
            if (!grown) {
                fprintf(stderr, "Out of memory while growing the innings buffer\n");
                free(filteredInnings);
                fclose(file);
                return EXIT_FAILURE;
            }
            filteredInnings = grown;
            capacity = newCapacity;
        }
        filteredInnings[numRecords++] = innings;
    }
    fclose(file);

    int h_totalRuns = 0, h_totalNotOuts = 0;

    if (numRecords > 0) {
        playerInnings *d_innings;
        int *d_totalRuns, *d_totalNotOuts;
        const size_t inningsBytes = (size_t)numRecords * sizeof(playerInnings);

        checkCuda(cudaMalloc((void**)&d_innings, inningsBytes), "cudaMalloc(d_innings)");
        checkCuda(cudaMalloc((void**)&d_totalRuns, sizeof(int)), "cudaMalloc(d_totalRuns)");
        checkCuda(cudaMalloc((void**)&d_totalNotOuts, sizeof(int)), "cudaMalloc(d_totalNotOuts)");

        checkCuda(cudaMemcpy(d_innings, filteredInnings, inningsBytes, cudaMemcpyHostToDevice), "copy of innings to device");
        // The kernel accumulates with atomicAdd, so both totals start at zero.
        checkCuda(cudaMemcpy(d_totalRuns, &h_totalRuns, sizeof(int), cudaMemcpyHostToDevice), "initialisation of d_totalRuns");
        checkCuda(cudaMemcpy(d_totalNotOuts, &h_totalNotOuts, sizeof(int), cudaMemcpyHostToDevice), "initialisation of d_totalNotOuts");

        // 256 matches the size of the shared arrays declared inside calcStats.
        int threadsPerBlock = 256;
        int blocksPerGrid = (numRecords + threadsPerBlock - 1) / threadsPerBlock;

        calcStats<<<blocksPerGrid, threadsPerBlock>>>(d_innings, numRecords, d_totalRuns, d_totalNotOuts);
        // A kernel launch returns no status; query it explicitly.
        checkCuda(cudaGetLastError(), "calcStats launch");
        checkCuda(cudaDeviceSynchronize(), "calcStats execution");

        checkCuda(cudaMemcpy(&h_totalRuns, d_totalRuns, sizeof(int), cudaMemcpyDeviceToHost), "copy of total runs to host");
        checkCuda(cudaMemcpy(&h_totalNotOuts, d_totalNotOuts, sizeof(int), cudaMemcpyDeviceToHost), "copy of total not-outs to host");

        checkCuda(cudaFree(d_innings), "cudaFree(d_innings)");
        checkCuda(cudaFree(d_totalRuns), "cudaFree(d_totalRuns)");
        checkCuda(cudaFree(d_totalNotOuts), "cudaFree(d_totalNotOuts)");
    } else {
        // A zero-block grid is an invalid launch configuration, so the GPU is
        // skipped entirely and the totals stay at zero.
        fprintf(stderr, "No innings found for player: %s\n", playerName);
    }

    int totalInningsPlayed = numRecords;
    float battingAverage = (totalInningsPlayed - h_totalNotOuts) > 0 ? (float)h_totalRuns / (totalInningsPlayed - h_totalNotOuts) : 0;

    printf("Total Runs scored: %d\n", h_totalRuns);
    printf("Total Not Outs: %d\n", h_totalNotOuts);
    printf("Batting Average: %.2f\n", battingAverage);

    free(filteredInnings);

    return EXIT_SUCCESS;
}


Overwriting 5.cu


In [5]:
!nvcc -o 5 5.cu

In [8]:
!./5

Enter the player's name: V Kohli
Total Runs scored: 11867
Total Not Outs: 39
Batting Average: 59.33
