## Permutation Test using One-Way ANOVA in CUDA

In [11]:
import os

# Add the directory containing the executable to the PATH
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin"

# Check if the directory is added to the PATH
print(os.environ["PATH"])

/opt/tljh/user/bin:/bin:/usr/bin:/usr/local/cuda/bin:/usr/local/cuda/bin


### Random Dataset Generation

In [13]:
!python -m pip install scikit-learn
!python -m pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [7]:
from sklearn.datasets import make_classification

k = 3
N = 6
X, y = make_classification(
    n_samples = N,             # row number
    n_features = 5,            # feature numbers
    n_informative = 3,         # The number of informative features
    n_redundant = 0,           # The number of redundant features
    n_repeated = 0,            # The number of duplicated features
    n_classes = k,             # The number of classes 
    n_clusters_per_class = 1,  # The number of clusters per class
    random_state = 42,         # random seed 
    scale=100                  # scale of the data
)

In [8]:
import pandas as pd

df = pd.concat([pd.DataFrame(X)[[0]], pd.DataFrame(y).astype(int)], axis=1)
df.columns = [0, 1]
df = df.sort_values(1).reset_index().iloc[:,1:]
df.to_csv("dataset.csv", header=False, index=False)

### Exact Permutation Test

#### C Version (Serial)

In [69]:
%%writefile c_exact_perm.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* EXACT BINOMIAL & MULTINOMIAL FUNCTIONS */
// Compute C(n,k) exactly using 128-bit integers
unsigned long long int binom(unsigned int n, unsigned int k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;

    unsigned long long int result = 1;
    for (unsigned int i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

/* EXACT multinomial coefficient using sequential binomial method */
unsigned long long int getCountPerm(int total_elements, size_t *repeats, int k) {
    unsigned long long int result = 1;
    int remaining = total_elements;

    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        unsigned long long int c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

// helper function for swapping values
void Exchange(size_t* data, size_t a, size_t b) {
    size_t temp = data[a];
    data[a] = data[b];
    data[b] = temp;
}

/* PERMUTATION GENERATOR */
int permute(size_t a[], int n) {
    int l, j;
    for (j = --n; j > 0 && a[j-1] >= a[j]; --j) { ; }
    if (j == 0) return 0;
    for (l = n; a[j-1] >= a[l]; --l) { ; }
    Exchange(a, j-1, l);
    while (j < n) { Exchange(a, j++, n--); }
    return 1;
}

/* ONE WAY ANALYSIS OF VARIANCE */
double OneWayAnova(size_t N, size_t k, size_t *n_i, size_t *group, double *feature){
    double *group_ave = calloc(k, sizeof(double));

    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    free(group_ave);

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}

int main() {
    size_t N;
    size_t k;
    clock_t start, end;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    double *feature = malloc(N * sizeof(double));
    size_t *group = malloc(N * sizeof(size_t));
    size_t *group_copy = malloc(N * sizeof(size_t));
    size_t *perm_array = malloc(N * 10 * sizeof(size_t));
    size_t *n_i = calloc(k, sizeof(size_t));

    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;

    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;
        char *token = strtok(line, ",");
        int j = 0;

        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);

    memcpy(group_copy, group, N * sizeof(size_t));

    /* EXACT PERMUTATION COUNT*/
    unsigned long long int perm_count = getCountPerm(N, n_i, k);

    // Can overflow for large n
    double *F_dist = malloc(perm_count * sizeof(double));

    // Execution time start here: CPU Permutation
    /* CPU PERMUTATION */
    double elapse = 0.0f, 
           time_taken;

    /* PERMUTATION TEST */
    for (int c=0; c<counter; c++){
        start = clock();
        for (i = 0; i < perm_count; i++){
            // compute One Way ANOVA
            F_dist[i] = OneWayAnova(N, k, n_i, group, feature);

            // save the permutation for sanity check
            if (i < 5 || i > perm_count - 6)
                memcpy(&perm_array[i * N], group, N * sizeof(size_t));

            // change grouping assignment
            permute(group, N);
        }
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
        memcpy(group, group_copy, N * sizeof(size_t));
    }

    FILE *fptr;
    fptr = fopen("c_exact_perm.csv", "w");
    // saving N and average execution time
    fprintf(fptr, "%lu, %f\n", N, elapse/counter);

    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to execute an array size of %llu permutations.\n", counter, elapse/counter, perm_count);
    printf("\n");

    // Print first 5 and last 5 permutations
    printf("Printing First 5 and Last 5 Permutations\n");
    for (size_t i = 0; i < 5; i++) {
        printf("CPU Permutation %zu: ", i+1);
        for (size_t j = 0; j < N; j++) {
            printf("%zu ", perm_array[i*N + j]);
            fprintf(fptr, "%zu,", perm_array[i*N + j]);
        }
        printf("\n");
        fprintf(fptr, "\n");
    }
    printf("=================================\n");
    for (size_t i = perm_count-5; i < perm_count; i++) {
        printf("CPU Permutation %zu: ", i+1);
        for (size_t j = 0; j < N; j++) {
            printf("%zu ", perm_array[i*N + j]);
            fprintf(fptr, "%zu,", perm_array[i*N + j]);
        }
        printf("\n");
        fprintf(fptr, "\n");
    }

    printf("\nPrinting First 5 and Last 5 Results\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf ("F_dist %d: %lf\n", i+1, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf ("F_dist %d: %lf\n", i+1, F_dist[i]);
    } 

    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
       }
    }

    // Calculating the p-value for the permutation test
    double p_value = (double)extreme_count/perm_count;
    printf("\nNull F: %lf\n", F_dist[0]);
    printf ("Extreme Count: %lu\n", extreme_count);
    p_value = (double)extreme_count / perm_count;
    printf("p-value: %lf\n", p_value);

    // saving extreme count and p-value
    fprintf(fptr, "%zu,%lf\n", extreme_count, p_value);
    fclose(fptr);

    // free the allocated memory
    free(feature);
    free(group);
    free(n_i);
    free(F_dist);

    return 0;
}

Overwriting c_exact_perm.c


In [70]:
%%bash
gcc c_exact_perm.c -o c_exact_perm -lm

In [71]:
%%bash
./c_exact_perm < input.txt

Number of Rows: Number of Groups: 
Function (in C) average time for 10 loops is 0.013300 milliseconds to execute an array size of 90 permutations.

Printing First 5 and Last 5 Permutations
CPU Permutation 1: 0 0 1 1 2 2 
CPU Permutation 2: 0 0 1 2 1 2 
CPU Permutation 3: 0 0 1 2 2 1 
CPU Permutation 4: 0 0 2 1 1 2 
CPU Permutation 5: 0 0 2 1 2 1 
CPU Permutation 86: 2 2 0 1 0 1 
CPU Permutation 87: 2 2 0 1 1 0 
CPU Permutation 88: 2 2 1 0 0 1 
CPU Permutation 89: 2 2 1 0 1 0 
CPU Permutation 90: 2 2 1 1 0 0 

Printing First 5 and Last 5 Results
F_dist 1: 0.005957
F_dist 2: 6.960385
F_dist 3: 0.020042
F_dist 4: 0.020042
F_dist 5: 6.960385
F_dist 86: 6.960385
F_dist 87: 0.020042
F_dist 88: 0.020042
F_dist 89: 6.960385
F_dist 90: 0.005957

Null F: 0.005957
Extreme Count: 87
p-value: 0.966667


#### CUDA (using lexicographic permutation)

In [72]:
%%writefile cuda_exact_lexico_perm_anova.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* EXACT BINOMIAL & MULTINOMIAL FUNCTIONS */
// Compute C(n,k) exactly using 128-bit integers
unsigned long long binom(unsigned long long n, unsigned long long k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;

    unsigned long long result = 1;
    for (int i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

/* EXACT multinomial coefficient using sequential binomial method */
unsigned long long getCountPerm(int total_elements, size_t *repeats, int k) {
    unsigned long long result = 1;
    int remaining = total_elements;

    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        unsigned long long c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

// helper function for swapping values
void Exchange(size_t* data, size_t a, size_t b) {
    size_t temp = data[a];
    data[a] = data[b];
    data[b] = temp;
}

/* PERMUTATION GENERATOR */
int permute(size_t a[], size_t n) {
    int l, j;
    for (j = --n; j > 0 && a[j-1] >= a[j]; --j) { ; }
    if (j == 0) return 0;
    for (l = n; a[j-1] >= a[l]; --l) { ; }
    Exchange(a, j-1, l);
    while (j < n) { Exchange(a, j++, n--); }
    return 1;
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, size_t k, size_t *n_i, size_t *group, double *feature){
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}

__global__ void gpu_anova(size_t *perm_array, size_t N, int k, size_t perm_count, double *feature, size_t *n_i, double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        size_t *current_group = &perm_array[perm_idx * N];
        F_dist[perm_idx] = OneWayAnova(N, k, n_i, current_group, feature);
    }
}

int main() {
    size_t N;
    size_t k;
    clock_t start, end;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *group_copy = (size_t*)malloc(N * sizeof(size_t));
    size_t *n_i;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));

    // Initialize n_i to zero
    memset(n_i, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);

    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);    

    memcpy(group_copy, group, N * sizeof(size_t));

    /* EXACT PERMUTATION COUNT USING 128-BIT INTEGER */
    unsigned long long perm_count = getCountPerm(N, n_i, k);
    
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));

    /* CPU PERMUTATION */
    double elapse = 0.0f, 
           time_taken;

    // STEP 1: CPU PERMUTATION
    for (int c = 0; c < counter; c++){
        start = clock();
        memcpy(perm_array, group, N * sizeof(size_t)); // Initialize first permutation
        for (i = 0; i < perm_count; i++) {
            permute(group, N);
            memcpy(&perm_array[(i + 1) * N], group, N * sizeof(size_t));
        }
        memcpy(group, group_copy, N * sizeof(size_t)); // Reset group array
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
    }
    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to generate %llu permutations\n", counter, elapse/counter, perm_count);

    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    // STEP 2: GPU ANOVA
    printf("\nSTEP 2: Computing F-statistic\n");
    printf("Launching kernel with %zu blocks and %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        gpu_anova<<<numBlocks, numThreads>>>(perm_array, N, k, perm_count, feature, n_i, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    // PRINT RESULTS
    FILE *fptr;
    fptr = fopen("cuda_exact_lexico_perm_anova.csv", "w");
    // saving N and average execution time
    fprintf(fptr, "%lu, %f\n", N, elapse/counter);

    printf("Printing First 5 and Last 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }

    printf("Printing First 5 and Last 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nOriginal F-statistic: %lf\n", F_dist[0]);
    printf("Extreme count: %zu out of %llu permutations", extreme_count, perm_count);
    printf(" permutations\n");
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu,%lf", extreme_count, p_value);

    // Free memory
    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(perm_array);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_exact_lexico_perm_anova.cu


In [73]:
%%bash
nvcc cuda_exact_lexico_perm_anova.cu -o cuda_exact_lexico_perm_anova -Wno-deprecated-gpu-targets

cuda_exact_lexico_perm_anova.cu(131): error: no suitable constructor exists to convert from "int" to "cudaMemLocation"
      cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, ((int)-1));
                                                                                    ^

cuda_exact_lexico_perm_anova.cu(132): error: no suitable constructor exists to convert from "int" to "cudaMemLocation"
      cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, ((int)-1));
                                                                                  ^

cuda_exact_lexico_perm_anova.cu(133): error: no suitable constructor exists to convert from "int" to "cudaMemLocation"
      cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, ((int)-1));
                                                                                ^

cuda_exact_lexico_perm_anova.cu(136): error: no suitable constructor exists to convert from "int" to "cuda

CalledProcessError: Command 'b'nvcc cuda_exact_lexico_perm_anova.cu -o cuda_exact_lexico_perm_anova -Wno-deprecated-gpu-targets\n'' returned non-zero exit status 2.

In [26]:
%%bash
nvprof ./cuda_exact_lexico_perm_anova < input.txt

==302997== NVPROF is profiling process 302997, command: ./cuda_exact_lexico_perm_anova


Number of Rows: Number of Groups: 
Function (in C) average time for 10 loops is 0.011200 milliseconds to generate 90 permutations

STEP 2: Computing F-statistic
Launching kernel with 1 blocks and 256 threads per block

DEBUG: First 5 permutations
GPU Permutation 1: 0 0 1 1 2 2 
GPU Permutation 2: 0 0 1 2 1 2 
GPU Permutation 3: 0 0 1 2 2 1 
GPU Permutation 4: 0 0 2 1 1 2 
GPU Permutation 5: 0 0 2 1 2 1 
First 5 F-statistics:
F_dist[0]: 0.005957
F_dist[1]: 6.960385
F_dist[2]: 0.020042
F_dist[3]: 0.020042
F_dist[4]: 6.960385

Original F-statistic: 0.005957
Extreme count: 87 out of 90 permutations permutations
p-value: 0.966667


==302997== Profiling application: ./cuda_exact_lexico_perm_anova
==302997== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  44.544us        10  4.4540us  4.0320us  7.1360us  gpu_anova(unsigned long*, unsigned long, int, unsigned long, double*, unsigned long*, double*)
      API calls:   99.45%  1.50241s         5  300.48ms  24.995us  1.50169s  cudaMallocManaged
                    0.18%  2.7121ms        10  271.21us  13.847us  2.5476ms  cudaLaunchKernel
                    0.15%  2.2833ms        10  228.33us  17.078us  928.38us  cudaMemPrefetchAsync
                    0.07%  1.0980ms         5  219.61us  30.727us  815.64us  cudaFree
                    0.05%  721.01us       114  6.3240us     161ns  291.34us  cuDeviceGetAttribute
                    0.04%  662.13us         1  662.13us  662.13us  662.13us  cudaGetDevice
                    0.03%  455.24us         3  151.75us  12.253us  429.54us  cudaMemAdvise

#### CUDA Version (using rank indexing)

In [76]:
%%writefile cuda_exact_rank_perm_anova.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LINE 1024
#define MAX_GROUPS 100

__device__ unsigned long long factorial(size_t n) {
    unsigned long long result = 1;
    for (int i = 2; i <= n; i++)
        result *= i;
    return result;
}

__device__ unsigned long long multinomial(size_t total, size_t *counts, size_t k) {
    unsigned long long result = factorial(total);
    for (int i = 0; i < k; i++) {
        result /= factorial(counts[i]);
    }
    return result;
}

__device__ void rank_to_permutation(size_t *keys, size_t *n_i, size_t k, size_t N, unsigned long long rank, size_t *perm) {
    size_t n_i_copy[MAX_GROUPS];
    
    for (int i = 0; i < k; i++) {
        n_i_copy[i] = n_i[i];
    }
    
    int total = N;
    
    for (int pos = 0; pos < N; pos++) {
        for (int i = 0; i < k; i++) {
            if (n_i_copy[i] == 0)
                continue;
            
            n_i_copy[i]--;
            unsigned long long num = multinomial(total - 1, n_i_copy, k);
            
            if (rank < num) {
                perm[pos] = keys[i];
                total--;
                break;
            } else {
                rank -= num;
                n_i_copy[i]++;
            }
        }
    }
}

__device__ double one_way_anova(size_t N, size_t k, size_t *n_i, size_t *group, double *feature) {
    double group_ave[MAX_GROUPS] = {0.0};
    
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    
    for (int i = 0; i < k; i++) {
        if (n_i[i] > 0)
            group_ave[i] /= n_i[i];
    }
    
    double SSE = 0.0;
    for (int i = 0; i < N; i++) {
        double temp = feature[i] - group_ave[group[i]];
        SSE += temp * temp;
    }
    
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        double temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }
    
    return (SSR / (k - 1)) / (SSE / (N - k));
}

__global__ void permutation_test_gpu(size_t N, size_t k, size_t *keys, size_t *n_i, double *features, unsigned long long perm_count, size_t *perm_array,  double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (unsigned long long perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        unsigned long long rank = perm_idx;
        
        // Each thread gets its own section of the perm_array
        size_t *perm = &perm_array[idx * N];
        if (rank >= perm_count)
            return;
        
        rank_to_permutation(keys, n_i, k, N, rank, perm);
        
        double F_stat = one_way_anova(N, k, n_i, perm, features);
        
        F_dist[rank] = F_stat;
    }
}

unsigned long long binom(size_t n, size_t k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;
    
    unsigned long long result = 1;
    for (int i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

unsigned long long get_perm_count(size_t total_elements, size_t *n_i, size_t k) {
    unsigned long long result = 1;
    int remaining = total_elements;
    
    for (int i = 0; i < k; i++) {
        int ni = n_i[i];
        unsigned long long c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

int main() {
    size_t N, k;
    size_t counter = 10;
    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    int device = -1;
    cudaGetDevice(&device);

    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *keys;
    double *F_dist;
    size_t *perm_array;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&keys, k * sizeof(size_t));
    
    memset(n_i, 0, k * sizeof(size_t));

    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(keys, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(keys, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }
    
    char line[MAX_LINE];
    int i = 0;
    
    while (fgets(line, sizeof(line), fp) && i < N) {
        line[strcspn(line, "\n")] = 0;
        char *token = strtok(line, ",");
        int j = 0;
        
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k) {
                    fprintf(stderr, "Error: group index out of range\n");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]]++;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    for (size_t i = 0; i < k; i++) {
        keys[i] = i;
    }
    
    unsigned long long perm_count = get_perm_count(N, n_i, k);
    printf("Total permutations: %llu\n", perm_count);

    cudaMallocManaged(&F_dist, perm_count * sizeof(double));
    cudaMallocManaged(&perm_array, perm_count * N * sizeof(size_t));

    cudaMemPrefetchAsync(keys, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    
    // Prefetch output array to GPU
    cudaMemPrefetchAsync(perm_array, perm_count * N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);
    
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;
    
    printf("Launching %zu blocks with %zu threads per block\n", numBlocks, numThreads);

    for (size_t c = 0; c < counter; c++){
        permutation_test_gpu<<<numBlocks, numThreads>>>(
            N, k, keys, n_i, feature, perm_count, perm_array, F_dist
        );
    }
    
    cudaDeviceSynchronize();
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);
    
    // PRINT RESULTS
    FILE *fptr;
    fptr = fopen("cuda_exact_rank_perm_anova.csv", "w");

    printf("Printing First 5 and Last 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }

    printf("Printing First 5 and Last 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nOriginal F-statistic: %lf\n", F_dist[0]);
    printf("Extreme count: %zu out of %llu permutations", extreme_count, perm_count);
    printf(" permutations\n");
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu,%lf", extreme_count, p_value);

    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(keys);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_exact_rank_perm_anova.cu


In [75]:
%%bash
nvcc cuda_exact_rank_perm_anova.cu -o cuda_exact_rank_perm_anova -Wno-deprecated-gpu-targets

cc1plus: fatal error: cuda_exact_perm_anova.cu: No such file or directory
compilation terminated.


CalledProcessError: Command 'b'nvcc cuda_exact_perm_anova.cu -o cuda_exact_perm_anova -Wno-deprecated-gpu-targets\n'' returned non-zero exit status 1.

In [37]:
%%bash
nvprof ./cuda_exact_rank_perm_anova < input.txt

==303375== NVPROF is profiling process 303375, command: ./cuda_exact_perm_anova


Number of Rows: Number of Groups: Total permutations: 90
Launching 1 blocks with 256 threads per block

Printing First 5 and Last 5 Results
F_dist 0: 0.005957
F_dist 1: 6.960385
F_dist 2: 0.020042
F_dist 3: 0.020042
F_dist 4: 6.960385
F_dist 85: 6.960385
F_dist 86: 0.020042
F_dist 87: 0.020042
F_dist 88: 6.960385
F_dist 89: 0.005957

Original F-statistic: 0.005957
Extreme count: 87 out of 90 permutations permutations
p-value: 0.966667


==303375== Profiling application: ./cuda_exact_perm_anova
==303375== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  231.26us        10  23.126us  22.815us  25.376us  permutation_test_gpu(unsigned long, unsigned long, unsigned long*, unsigned long*, double*, __int64, unsigned long*, double*)
      API calls:   99.42%  1.44856s         6  241.43ms  10.364us  1.44778s  cudaMallocManaged
                    0.23%  3.3262ms        10  332.62us  11.359us  3.1624ms  cudaLaunchKernel
                    0.16%  2.2740ms        10  227.40us  18.358us  681.10us  cudaMemPrefetchAsync
                    0.04%  631.01us         5  126.20us  44.052us  442.21us  cudaFree
                    0.04%  582.66us       114  5.1110us     145ns  230.21us  cuDeviceGetAttribute
                    0.04%  573.52us         4  143.38us  6.7980us  523.80us  cudaMemAdvise
                    0.04%  565.48us         1  565.48us  565.48us 

#### CUDA Version (using rank indexing w/ shared memory)

In [None]:
%%writefile cuda_shared_exact_rank_perm_anova.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LINE 1024
#define MAX_GROUPS 10

__device__ unsigned long long factorial(size_t n) {
    unsigned long long result = 1;
    for (size_t i = 2; i <= n; i++)
        result *= i;
    return result;
}

__device__ unsigned long long multinomial(size_t total, size_t *counts, size_t k) {
    unsigned long long result = factorial(total);
    for (size_t i = 0; i < k; i++) {
        result /= factorial(counts[i]);
    }
    return result;
}

__device__ void rank_to_permutation(size_t *keys, size_t *n_i, size_t k, size_t N, unsigned long long rank, size_t *perm) {
    size_t n_i_copy[MAX_GROUPS];
    
    for (int i = 0; i < k; i++) {
        n_i_copy[i] = n_i[i];
    }
    
    size_t total = N;
    
    for (int pos = 0; pos < N; pos++) {
        for (int i = 0; i < k; i++) {
            if (n_i_copy[i] == 0)
                continue;
            
            n_i_copy[i]--;
            unsigned long long num = multinomial(total - 1, n_i_copy, k);
            
            if (rank < num) {
                perm[pos] = keys[i];
                total--;
                break;
            } else {
                rank -= num;
                n_i_copy[i]++;
            }
        }
    }
}

__device__ double one_way_anova(size_t N, size_t k, size_t *n_i, size_t *group, double *feature) {
    double group_ave[MAX_GROUPS] = {0.0};
    
    double average = 0.0;
    for (size_t i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    
    for (size_t i = 0; i < k; i++) {
        if (n_i[i] > 0)
            group_ave[i] /= n_i[i];
    }
    
    double SSE = 0.0;
    for (size_t i = 0; i < N; i++) {
        double temp = feature[i] - group_ave[group[i]];
        SSE += temp * temp;
    }
    
    double SSR = 0.0;
    for (size_t i = 0; i < k; i++) {
        double temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }
    
    return (SSR / (k - 1)) / (SSE / (N - k));
}

__global__ void permutation_test_gpu(size_t N, size_t k, size_t *keys, size_t *group_counts, double *features, unsigned long long total_perms, size_t *perm_buffer,  double *F_dist) {
    extern __shared__ char shared_mem[];
    double* shared_feature = (double*)shared_mem;
    size_t* shared_keys = (size_t*)(shared_mem + N * sizeof(double));
    size_t* shared_group_counts = (size_t*)(shared_mem + N * sizeof(double) + k * sizeof(size_t));

    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int lindex = threadIdx.x;

    // Load data into shared memory
    for (int i = lindex; i < N; i += blockDim.x) {
        shared_feature[i] = features[i];
        if (i < k) {
            shared_keys[i] = keys[i];
            shared_group_counts[i] = group_counts[i];
        }
    }

    __syncthreads();
    int stride = blockDim.x * gridDim.x;
    for (unsigned long long perm_idx = thread_id; perm_idx < total_perms; perm_idx += stride) {
        unsigned long long rank = perm_idx;
        
        // Each thread gets its own section of the perm_buffer
        size_t *perm = &perm_buffer[perm_idx * N];
        if (rank >= total_perms)
            return;
        
        rank_to_permutation(shared_keys, shared_group_counts, k, N, rank, perm);
        
        double F_stat = one_way_anova(N, k, shared_group_counts, perm, shared_feature);
        
        F_dist[rank] = F_stat;
    }
}

unsigned long long binom(size_t n, size_t k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;
    
    unsigned long long result = 1;
    for (size_t i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

unsigned long long get_perm_count(size_t total_elements, size_t *repeats, size_t k) {
    unsigned long long result = 1;
    int remaining = total_elements;
    
    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        unsigned long long c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

int main() {
    size_t N, k;
    size_t counter = 10;
    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    int device = -1;
    cudaGetDevice(&device);

    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *keys;
    double *F_dist;
    size_t *perm_array;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&keys, k * sizeof(size_t));

    memset(n_i, 0, k * sizeof(size_t));

    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(keys, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(keys, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }
    
    char line[MAX_LINE];
    int i = 0;
    
    while (fgets(line, sizeof(line), fp) && i < N) {
        line[strcspn(line, "\n")] = 0;
        char *token = strtok(line, ",");
        int j = 0;
        
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k) {
                    fprintf(stderr, "Error: group index out of range\n");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]]++;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    for (int i = 0; i < k; i++) {
        keys[i] = i;
    }
    
    unsigned long long perm_count = get_perm_count(N, n_i, k);
    printf("Total permutations: %llu\n", perm_count);

    cudaMallocManaged(&F_dist, perm_count * sizeof(double));
    cudaMallocManaged(&perm_array, perm_count * N * sizeof(size_t));

    cudaMemPrefetchAsync(keys, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    
    // Prefetch output array to GPU
    cudaMemPrefetchAsync(perm_array, perm_count * N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);
    
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;
    
    printf("Launching %zu blocks with %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        permutation_test_gpu<<<numBlocks, numThreads, N + 2*k>>>(
            N, k, keys, n_i, feature, perm_count, perm_array, F_dist
        );
    }
    
    cudaDeviceSynchronize();
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);
    
    // PRINT RESULTS
    FILE *fptr;
    fptr = fopen("cuda_shared_exact_rank_perm_anova.csv", "w");

    printf("Printing First 5 and Last 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }

    printf("Printing First 5 and Last 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nOriginal F-statistic: %lf\n", F_dist[0]);
    printf("Extreme count: %zu out of %llu permutations", extreme_count, perm_count);
    printf(" permutations\n");
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu,%lf", extreme_count, p_value);

    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(keys);
    cudaFree(F_dist);

    return 0;
}

In [None]:
%%bash
nvcc cuda_shared_exact_rank_perm_anova.cu -o cuda_shared_exact_rank_perm_anova -Wno-deprecated-gpu-targets

In [None]:
%%bash
nvprof ./cuda_shared_exact_rank_perm_anova < input.txt

### Monte Carlo Permutation Test

#### C Version (Serial)

In [38]:
%%writefile c_permutation_anova.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);  // pick random index [0, i]
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}
/* One Way ANOVA */
double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    /* AVERAGE & GROUP AVERAGE */
    double *group_ave = (double *) calloc(k, sizeof(double));
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }
    free(group_ave);
    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}
int main() {
    size_t perm_count;
    size_t N;   // number of rows
    size_t k;   // number of groups
    clock_t start, end;
    size_t counter = 10;

    /* GET THE NUMBER OF ROWS */
    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    double *feature = (double*) malloc(N * sizeof(double));
    size_t *group = (size_t*) malloc(N * sizeof(size_t));
    size_t *temp_group = (size_t*) malloc(N * sizeof(size_t));
    size_t *n_i = (size_t*) calloc(k, sizeof(size_t));
    double *F_dist = (double*) malloc(perm_count * sizeof(double));

    /* READ THE DATA */
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;  // prevent overflow

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token); // convert to float and save
            else {
                group[i] = atoi(token); // convert to int and save
                if (group[i] >= k){
                    perror("Error group count");
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
     // fill-in cache
    permute(group, N, i, temp_group);
    OneWayAnova(N, k, n_i, group, feature);

    // Execution time start here: CPU Permutation
    /* CPU PERMUTATION */
    double elapse, time_taken;
    elapse = 0.0f;
    
    for (int c=0; c<counter; c++){
        start = clock();
        for (size_t i = 0; i < perm_count; i++) {
            if (i == 0)
                F_dist[i] = OneWayAnova(N, k, n_i, group, feature);
            else 
                F_dist[i] = OneWayAnova(N, k, n_i, temp_group, feature);
            
            // Always permute from the ORIGINAL group array
            permute(group, N, i, temp_group);
        }
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
    }
    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to execute an array size %lu \n", counter, elapse/counter, perm_count);



    // Print first 5 permutations
    printf("First 5 permutations\n");
    for (size_t i = 0; i < 5; i++) {
        printf("CPU Permutation %zu: ", i+1);
        for (size_t j = 0; j < N; j++) {
            if (i == 0)
                printf("%zu ", group[j]);
            else
                printf("%zu ", temp_group[j]);
        }
        printf("\n");
    }

    printf("\nSTEP 3: Printing First 5 and Last 5 Results\n");
    for (int i = 0; i < 5; i++) {
        printf ("F_dist %d: %lf\n", i, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf ("F_dist %d: %lf\n", i, F_dist[i]);
    } 

    size_t extreme_count = 0;
    double p_value = 0.0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
       }
    }
    printf ("Extreme Count: %lu\n", extreme_count);
    p_value = (double)extreme_count / perm_count;
    printf("p-value: %lf\n", p_value);

    /* free memory */
    free(feature);
    free(group);
    free(n_i);
    free(F_dist);

    return 0;
}

Overwriting c_permutation_anova.c


In [39]:
%%bash
gcc c_permutation_anova.c -o c_permutation_anova -lm

In [40]:
%%bash
./c_permutation_anova < input.txt

Number of Rows: Number of Groups: Number of Permutations: 
Function (in C) average time for 10 loops is 37.399100 milliseconds to execute an array size 100000 
First 5 permutations
CPU Permutation 1: 0 0 1 1 2 2 
CPU Permutation 2: 1 1 0 2 0 2 
CPU Permutation 3: 1 1 0 2 0 2 
CPU Permutation 4: 1 1 0 2 0 2 
CPU Permutation 5: 1 1 0 2 0 2 

STEP 3: Printing First 5 and Last 5 Results
F_dist 0: 0.005957
F_dist 1: 1.975504
F_dist 2: 2.508775
F_dist 3: 1.975504
F_dist 4: 1.398650
F_dist 99995: 0.020986
F_dist 99996: 0.063378
F_dist 99997: 1.398650
F_dist 99998: 1.975504
F_dist 99999: 0.071876
Extreme Count: 98332
p-value: 0.983320


#### CUDA Version

In [41]:
%%writefile cuda_permutation_anova_no_dependency.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
__device__ unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
__device__ void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}


__device__ void gpu_anova(size_t *perm_array, size_t N, int k, size_t perm_count, double *feature, size_t *n_i, double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        size_t *current_group = &perm_array[perm_idx * N];
        F_dist[perm_idx] = OneWayAnova(N, k, n_i, current_group, feature);
    }
}

__global__ void gpu_permute_and_anova(size_t *array, size_t perm_count, size_t N, int k, 
    double *feature, size_t *n_i, size_t *perm_array, double *F_dist) {
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = thread_id; i < perm_count; i += stride) {
        size_t *current_perm = &perm_array[i * N];
        
        if (i == 0) {
            // Copy original data
            for (int j = 0; j < N; j++) {
                current_perm[j] = array[j];
            }
        } else {
            // Generate permutation
            permute(array, N, i-1, current_perm);
        }
        
        // Compute ANOVA immediately
        F_dist[i] = OneWayAnova(N, k, n_i, current_perm, feature);
    }
}

int main() {
    size_t perm_count;
    size_t N;
    size_t k;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));

    // Initialize n_i to zero
    memset(n_i, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);


    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);


    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    
    printf("\n Generating Permutations and Computing F-statistic\n");
    printf("Launching kernel with %zu blocks and %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        gpu_permute_and_anova<<<numBlocks, numThreads>>>(
            group, perm_count, N, k, feature, n_i, perm_array, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    // PRINT RESULTS
    printf("\nDEBUG: First 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N && j < 20; j++) {  // Limit output for readability
            printf("%zu ", perm_array[i * N + j]);
        }
        if (N > 20) printf("...");
        printf("\n");
    }

    printf("\n Printing Results\n");
    printf("First 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nOriginal F-statistic: %lf\n", F_dist[0]);
    printf("Extreme count: %zu out of %zu permutations\n", extreme_count, perm_count);
    printf("p-value: %lf\n", p_value);
    

    // Free memory
    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(perm_array);
    cudaFree(F_dist);

    return 0;
}

Overwriting cuda_permutation_anova_no_dependency.cu


In [42]:
%%bash
nvcc cuda_permutation_anova_no_dependency.cu -o cuda_permutation_anova_no_dependency -Wno-deprecated-gpu-targets

In [43]:
%%bash
nvprof ./cuda_permutation_anova_no_dependency < input.txt

==303610== NVPROF is profiling process 303610, command: ./cuda_permutation_anova_no_dependency


Number of Rows: Number of Groups: Number of Permutations: 
 Generating Permutations and Computing F-statistic
Launching kernel with 391 blocks and 256 threads per block

DEBUG: First 5 permutations
GPU Permutation 1: 0 0 1 1 2 2 
GPU Permutation 2: 1 2 2 0 0 1 
GPU Permutation 3: 0 1 2 1 2 0 
GPU Permutation 4: 0 2 2 1 1 0 
GPU Permutation 5: 2 1 0 0 1 2 

 Printing Results
First 5 F-statistics:
F_dist[0]: 0.005957
F_dist[1]: 1.975504
F_dist[2]: 2.508775
F_dist[3]: 1.975504
F_dist[4]: 1.398650

Original F-statistic: 0.005957
Extreme count: 98332 out of 100000 permutations
p-value: 0.983320


==303610== Profiling application: ./cuda_permutation_anova_no_dependency
==303610== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  771.17us        10  77.116us  74.432us  83.135us  gpu_permute_and_anova(unsigned long*, unsigned long, unsigned long, int, double*, unsigned long*, unsigned long*, double*)
      API calls:   97.90%  1.33580s         5  267.16ms  115.69us  1.33455s  cudaMallocManaged
                    1.12%  15.330ms        10  1.5330ms  25.357us  10.098ms  cudaMemPrefetchAsync
                    0.40%  5.4400ms        10  544.00us  11.007us  4.9600ms  cudaLaunchKernel
                    0.35%  4.8371ms         5  967.42us  58.660us  2.4165ms  cudaFree
                    0.06%  816.06us         1  816.06us  816.06us  816.06us  cudaGetDevice
                    0.06%  788.75us       114  6.9180us     129ns  445.89us  cuDeviceGetAttribute
                    0.05%  636.30us         2  318.15u

#### Monte Carlo CUDA Version (Parallel w/ shared)

In [44]:
%%writefile cuda_monte_cache.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
__device__ unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
__device__ void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}


__device__ void gpu_anova(size_t *perm_array, size_t N, int k, size_t perm_count, double *feature, size_t *n_i, double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        size_t *current_group = &perm_array[perm_idx * N];
        F_dist[perm_idx] = OneWayAnova(N, k, n_i, current_group, feature);
    }
}

__global__ void gpu_permute_and_anova(size_t *array, size_t perm_count, size_t N, int k, 
    double *feature, size_t *n_i, size_t *perm_array, double *F_dist) {
    extern __shared__ char shared_mem[];
    size_t* shared_group = (size_t*)shared_mem;
    double* shared_feature = (double*)(shared_mem + N * sizeof(size_t));

    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int lindex = threadIdx.x;

    // Load data into shared memory
    for (int i = lindex; i < N; i += blockDim.x) {
        shared_group[i] = array[i];
        shared_feature[i] = feature[i];
    }

    __syncthreads();

    int stride = blockDim.x * gridDim.x;
    for (int i = thread_id; i < perm_count; i += stride) {
        size_t *current_perm = &perm_array[i * N];
        
        if (i == 0) {
            // Copy original data
            for (int j = 0; j < N; j++) {
                current_perm[j] = shared_group[j];
            }
        } else {
            // Generate permutation
            permute(shared_group, N, i-1, current_perm);
        }
        F_dist[i] = OneWayAnova(N, k, n_i, current_perm, feature);
    }
}

int main() {
    size_t perm_count;
    size_t N;
    size_t k;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));

    // Initialize n_i to zero
    memset(n_i, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);


    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    
    printf("\n Generating Permutations and Computing F-statistic\n");
    printf("Launching kernel with %zu blocks and %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        gpu_permute_and_anova<<<numBlocks, numThreads, N*2>>>(
            group, perm_count, N, k, feature, n_i, perm_array, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    // PRINT RESULTS
    printf("\nDEBUG: First 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N && j < 20; j++) {  // Limit output for readability
            printf("%zu ", perm_array[i * N + j]);
        }
        if (N > 20) printf("...");
        printf("\n");
    }

    printf("\n Printing Results\n");
    printf("First 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nOriginal F-statistic: %lf\n", F_dist[0]);
    printf("Extreme count: %zu out of %zu permutations\n", extreme_count, perm_count);
    printf("p-value: %lf\n", p_value);
    
    // Free memory
    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(perm_array);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_monte_cache.cu


In [45]:
%%bash
nvcc cuda_monte_cache.cu -o cuda_monte_cache -Wno-deprecated-gpu-targets

In [46]:
%%bash
nvprof ./cuda_monte_cache < input.txt

==303714== NVPROF is profiling process 303714, command: ./cuda_monte_cache


Number of Rows: Number of Groups: Number of Permutations: 
 Generating Permutations and Computing F-statistic
Launching kernel with 391 blocks and 256 threads per block

DEBUG: First 5 permutations
GPU Permutation 1: 0 0 1 1 2 2 
GPU Permutation 2: 1 2 2 0 0 1 
GPU Permutation 3: 0 1 2 1 2 0 
GPU Permutation 4: 0 2 2 1 1 0 
GPU Permutation 5: 2 1 0 0 1 2 

 Printing Results
First 5 F-statistics:
F_dist[0]: 0.005957
F_dist[1]: 1.975504
F_dist[2]: 2.508775
F_dist[3]: 1.975504
F_dist[4]: 1.398650

Original F-statistic: 0.005957
Extreme count: 98332 out of 100000 permutations
p-value: 0.983320


==303714== Profiling application: ./cuda_monte_cache
==303714== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  758.33us        10  75.833us  74.719us  81.023us  gpu_permute_and_anova(unsigned long*, unsigned long, unsigned long, int, double*, unsigned long*, unsigned long*, double*)
      API calls:   71.85%  3.30464s        10  330.46ms  20.348us  3.30414s  cudaLaunchKernel
                   27.71%  1.27441s         5  254.88ms  43.484us  1.27385s  cudaMallocManaged
                    0.26%  12.044ms        10  1.2044ms  22.884us  8.7157ms  cudaMemPrefetchAsync
                    0.07%  3.2158ms         5  643.15us  48.195us  1.6938ms  cudaFree
                    0.07%  3.0437ms       114  26.698us     201ns  2.4562ms  cuDeviceGetAttribute
                    0.01%  530.43us         1  530.43us  530.43us  530.43us  cudaGetDevice
                    0.01%  499.79us         2  249.89us  151.92us  347.87u