## Permutation Test using One-Way ANOVA in CUDA

In [1]:
import os

# Add the directory containing the executable to the PATH
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin"

# Check if the directory is added to the PATH
print(os.environ["PATH"])

/opt/tljh/user/bin:/bin:/usr/bin:/usr/local/cuda/bin


### Random Dataset Generation

In [2]:
!python -m pip install scikit-learn
!python -m pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Using cached scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.7.2
Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
Installing collected packages: pandas
Successfully installed pandas-2.3.3


In [2]:
from sklearn.datasets import make_classification

k = 3
N = 6
X, y = make_classification(
    n_samples = N,             # row number
    n_features = 5,            # feature numbers
    n_informative = 3,         # The number of informative features
    n_redundant = 0,           # The number of redundant features
    n_repeated = 0,            # The number of duplicated features
    n_classes = k,             # The number of classes 
    n_clusters_per_class = 1,  # The number of clusters per class
    random_state = 42,         # random seed 
    scale=100                  # scale of the data
)

In [3]:
import pandas as pd

df = pd.concat([pd.DataFrame(X)[[0]], pd.DataFrame(y).astype(int)], axis=1)
df.columns = [0, 1]
df = df.sort_values(1).reset_index().iloc[:,1:]
df.to_csv("dataset.csv", header=False, index=False)

### Exact Permutation Test

#### C Version (Serial)

In [None]:
%%writefile c_exact_perm.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* UTILITIES FOR 128-BIT INTEGER EXACT PRINTING */
typedef unsigned __int128 u128;

// Print u128 EXACTLY
void print_u128(u128 x) {
    if (x == 0) { printf("0"); return; }
    char buf[128];
    int p = 0;
    while (x > 0) {
        buf[p++] = '0' + (x % 10);
        x /= 10;
    }
    while (p--) putchar(buf[p]);
}


/* EXACT BINOMIAL & MULTINOMIAL FUNCTIONS */
// Compute C(n,k) exactly using 128-bit integers
u128 binom_u128(u128 n, u128 k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;

    u128 result = 1;
    for (u128 i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

/* EXACT multinomial coefficient using sequential binomial method */
u128 getCountPerm_u128(int total_elements, size_t *repeats, int k) {
    u128 result = 1;
    int remaining = total_elements;

    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        u128 c = binom_u128(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

// helper function for swapping values
void Exchange(size_t* data, size_t a, size_t b) {
    size_t temp = data[a];
    data[a] = data[b];
    data[b] = temp;
}

/* PERMUTATION GENERATOR */
int permute(size_t a[], int n) {
    int l, j;
    for (j = --n; j > 0 && a[j-1] >= a[j]; --j) { ; }
    if (j == 0) return 0;
    for (l = n; a[j-1] >= a[l]; --l) { ; }
    Exchange(a, j-1, l);
    while (j < n) { Exchange(a, j++, n--); }
    return 1;
}

/* ONE WAY ANALYSIS OF VARIANCE */
double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    double *group_ave = calloc(k, sizeof(double));

    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    free(group_ave);

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}

int main() {
    size_t N;
    int k;
    clock_t start, end;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%d", &k);

    double *feature = malloc(N * sizeof(double));
    size_t *group = malloc(N * sizeof(size_t));
    size_t *group_copy = malloc(N * sizeof(size_t));
    size_t *group_duplicates = calloc(k, sizeof(size_t));

    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;

    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;
        char *token = strtok(line, ",");
        int j = 0;

        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    return 1;
                }
                group_duplicates[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);

    memcpy(group_copy, group, N * sizeof(size_t));

    /* EXACT PERMUTATION COUNT USING 128-BIT INTEGER */
    u128 perm_count = getCountPerm_u128(N, group_duplicates, k);

    // Can over flow for large n
    double *F_dist = malloc(perm_count * sizeof(double));

    // Execution time start here: CPU Permutation
    /* CPU PERMUTATION */
    double elapse = 0.0f, 
           time_taken;

    /* PERMUTATION TEST */
    for (int c=0; c<counter; c++){
        start = clock();
        for (i = 0; i < perm_count; i++){
            // compute One Way ANOVA
            F_dist[i] = OneWayAnova(N, k, group_duplicates, group, feature);

            // This block is for debugging only
            // printf("%zu: ", i);
            // for (int j = 0; j < N; j++){
            //     printf("%d ", group[j]);
            // }
            // printf("; F = %lf\n", F_dist[i]);

            // change grouping assignment
            permute(group, N);
        }
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
        memcpy(group, group_copy, N * sizeof(size_t));
    }

    FILE *fptr;
    fptr = fopen("example.csv", "w");
    
    fprintf(fptr, "%lu, %f\n", N, elapse/counter);
    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to execute an array size ", counter, elapse/counter);
    print_u128(perm_count);
    printf("\n");

    printf("\nPrinting First 5 and Last 5 Results\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d, %lf\n", i, F_dist[i]);
        printf ("F_dist %d: %lf\n", i, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d, %lf\n", i, F_dist[i]);
        printf ("F_dist %d: %lf\n", i, F_dist[i]);
    } 

    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
       }
    }

    // Calculating the p-value for the permutation test
    double p_value = (double)extreme_count/perm_count;
    printf("\nNull F: %lf\n", F_dist[0]);
    printf ("Extreme Count: %lu\n", extreme_count);
    p_value = (double)extreme_count / perm_count;
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu, %lf\n", extreme_count, p_value);
    fclose(fptr);

    // free the allocated memory
    free(feature);
    free(group);
    free(group_duplicates);
    free(F_dist);

    return 0;
}

Overwriting c_exact_perm.c


In [3]:
%%bash
gcc c_exact_perm.c -o c_exact_perm -lm

In [4]:
%%bash
./c_exact_perm < input.txt

Number of Rows: Number of Groups: 
Function (in C) average time for 10 loops is 5945.491800 milliseconds to execute an array size 17153136

Printing First 5 and Last 5 Results
F_dist 0: 4.387509
F_dist 1: 2.223316
F_dist 2: 1.945471
F_dist 3: 2.200337
F_dist 4: 4.446505
F_dist 17153131: 4.446505
F_dist 17153132: 2.200337
F_dist 17153133: 1.945471
F_dist 17153134: 2.223316
F_dist 17153135: 4.387509

Null F: 4.387509
Extreme Count: 533831
p-value: 0.031121


#### CUDA (using lexicographic permutation)

In [4]:
%%writefile cuda_exact_lexico_perm_anova.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* UTILITIES FOR 128-BIT INTEGER EXACT PRINTING */
typedef unsigned __int128 u128;

// Print u128 EXACTLY
void print_u128(u128 x) {
    if (x == 0) { printf("0"); return; }
    char buf[128];
    int p = 0;
    while (x > 0) {
        buf[p++] = '0' + (x % 10);
        x /= 10;
    }
    while (p--) putchar(buf[p]);
}


/* EXACT BINOMIAL & MULTINOMIAL FUNCTIONS */
// Compute C(n,k) exactly using 128-bit integers
u128 binom_u128(u128 n, u128 k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;

    u128 result = 1;
    for (u128 i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

/* EXACT multinomial coefficient using sequential binomial method */
u128 getCountPerm_u128(int total_elements, size_t *repeats, int k) {
    u128 result = 1;
    int remaining = total_elements;

    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        u128 c = binom_u128(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

// helper function for swapping values
void Exchange(size_t* data, size_t a, size_t b) {
    size_t temp = data[a];
    data[a] = data[b];
    data[b] = temp;
}

/* PERMUTATION GENERATOR */
int permute(size_t a[], size_t n) {
    int l, j;
    for (j = --n; j > 0 && a[j-1] >= a[j]; --j) { ; }
    if (j == 0) return 0;
    for (l = n; a[j-1] >= a[l]; --l) { ; }
    Exchange(a, j-1, l);
    while (j < n) { Exchange(a, j++, n--); }
    return 1;
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}

__global__ void gpu_anova(size_t *perm_array, size_t N, int k, size_t perm_count, double *feature, size_t *n_i, double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        size_t *current_group = &perm_array[perm_idx * N];
        F_dist[perm_idx] = OneWayAnova(N, k, n_i, current_group, feature);
    }
}

int main() {
    size_t N;
    size_t k;
    clock_t start, end;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *group_copy = (size_t*)malloc(N * sizeof(size_t));
    size_t *group_duplicates;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&group_duplicates, k * sizeof(size_t));

    // Initialize n_i to zero
    memset(group_duplicates, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group_duplicates, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group_duplicates, k * sizeof(size_t), cudaCpuDeviceId, NULL);

    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                group_duplicates[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);    

    memcpy(group_copy, group, N * sizeof(size_t));

    /* EXACT PERMUTATION COUNT USING 128-BIT INTEGER */
    u128 perm_count = getCountPerm_u128(N, group_duplicates, k);
    
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));


    /* CPU PERMUTATION */
    double elapse = 0.0f, 
           time_taken;

    // STEP 1: CPU PERMUTATION
    for (int c = 0; c < counter; c++){
        start = clock();
        memcpy(perm_array, group, N * sizeof(size_t)); // Initialize first permutation
        for (i = 0; i < perm_count; i++) {
            permute(group, N);
            memcpy(&perm_array[(i + 1) * N], group, N * sizeof(size_t));
        }
        memcpy(group, group_copy, N * sizeof(size_t)); // Reset group array
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
    }
    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to generate ", counter, elapse/counter);
    print_u128(perm_count);
    printf(" permutations\n");
    
    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(group_duplicates, k * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    // STEP 2: GPU ANOVA
    printf("\nSTEP 2: Computing F-statistic\n");
    printf("Launching kernel with %zu blocks and %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        gpu_anova<<<numBlocks, numThreads>>>(perm_array, N, k, perm_count, feature, group_duplicates, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    // PRINT RESULTS
    printf("\nDEBUG: First 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N && j < 20; j++) {  // Limit output for readability
            printf("%zu ", perm_array[i * N + j]);
        }
        if (N > 20) printf("...");
        printf("\n");
    }

    // printf("\n Printing Results\n");
    printf("First 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nOriginal F-statistic: %lf\n", F_dist[0]);
    printf("Extreme count: %zu out of ", extreme_count);
    print_u128(perm_count);
    printf(" permutations\n");
    printf("p-value: %lf\n", p_value);

    // Free memory
    cudaFree(feature);
    cudaFree(group);
    cudaFree(group_duplicates);
    cudaFree(perm_array);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_exact_lexico_perm_anova.cu


In [None]:
%%bash
nvcc cuda_exact_lexico_perm_anova.cu -o cuda_exact_lexico_perm_anova -Wno-deprecated-gpu-targets

In [None]:
%%bash
nvprof ./cuda_exact_lexico_perm_anova < input.txt

#### CUDA Version (using rank indexing)

In [None]:
%%writefile cuda_exact_perm_anova.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LINE 1024
#define MAX_GROUPS 10

__device__ unsigned long long factorial(int n) {
    unsigned long long result = 1;
    for (int i = 2; i <= n; i++)
        result *= i;
    return result;
}

__device__ unsigned long long multinomial(int total, int *counts, int k) {
    unsigned long long result = factorial(total);
    for (int i = 0; i < k; i++) {
        result /= factorial(counts[i]);
    }
    return result;
}

__device__ void rank_to_permutation(int *keys, int *n_i, int k, int N, unsigned long long rank, int *perm) {
    int n_i_copy[MAX_GROUPS];
    
    for (int i = 0; i < k; i++) {
        n_i_copy[i] = n_i[i];
    }
    
    int total = N;
    
    for (int pos = 0; pos < N; pos++) {
        for (int i = 0; i < k; i++) {
            if (n_i_copy[i] == 0)
                continue;
            
            n_i_copy[i]--;
            unsigned long long num = multinomial(total - 1, n_i_copy, k);
            
            if (rank < num) {
                perm[pos] = keys[i];
                total--;
                break;
            } else {
                rank -= num;
                n_i_copy[i]++;
            }
        }
    }
}

__device__ double one_way_anova(int N, int k, int *n_i, int *group, double *feature) {
    double group_ave[MAX_GROUPS] = {0.0};
    
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    
    for (int i = 0; i < k; i++) {
        if (n_i[i] > 0)
            group_ave[i] /= n_i[i];
    }
    
    double SSE = 0.0;
    for (int i = 0; i < N; i++) {
        double temp = feature[i] - group_ave[group[i]];
        SSE += temp * temp;
    }
    
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        double temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }
    
    return (SSR / (k - 1)) / (SSE / (N - k));
}

__global__ void permutation_test_gpu(int N, int k, int *keys, int *group_counts, double *features, unsigned long long total_perms, int *perm_buffer,  double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;


    for (unsigned long long perm_idx = idx; perm_idx < total_perms; perm_idx += stride) {
        unsigned long long rank = perm_idx;
        
        // Each thread gets its own section of the perm_buffer
        int *perm = &perm_buffer[idx * N];
        if (rank >= total_perms)
            return;
        
        rank_to_permutation(keys, group_counts, k, N, rank, perm);
        
        double F_stat = one_way_anova(N, k, group_counts, perm, features);
        
        F_dist[rank] = F_stat;
    }
}

unsigned long long binom(int n, int k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;
    
    unsigned long long result = 1;
    for (int i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

unsigned long long get_perm_count(int total_elements, int *repeats, int k) {
    unsigned long long result = 1;
    int remaining = total_elements;
    
    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        unsigned long long c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

int main() {
    int N, k;
    size_t counter = 10;
    printf("Number of Rows: ");
    scanf("%d", &N);
    printf("Number of Groups: ");
    scanf("%d", &k);

    int device = -1;
    cudaGetDevice(&device);

    double *feature;
    int *group;
    int *n_i;
    int *keys;
    double *F_dist;
    int *perm_buffer;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(int));
    cudaMallocManaged(&n_i, k * sizeof(int));
    cudaMallocManaged(&keys, k * sizeof(int));
    

    memset(n_i, 0, k * sizeof(int));

    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(keys, k * sizeof(int), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(keys, k * sizeof(int), cudaCpuDeviceId, NULL);
    
    
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }
    
    char line[MAX_LINE];
    int i = 0;
    
    while (fgets(line, sizeof(line), fp) && i < N) {
        line[strcspn(line, "\n")] = 0;
        char *token = strtok(line, ",");
        int j = 0;
        
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k) {
                    fprintf(stderr, "Error: group index out of range\n");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]]++;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    for (int i = 0; i < k; i++) {
        keys[i] = i;
    }
    
    unsigned long long total_perms = get_perm_count(N, n_i, k);
    printf("Total permutations: %llu\n", total_perms);

    cudaMallocManaged(&F_dist, total_perms * sizeof(double));
    cudaMallocManaged(&perm_buffer, total_perms * N * sizeof(int));

    cudaMemPrefetchAsync(keys, k * sizeof(int), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    
    // Prefetch output array to GPU
    cudaMemPrefetchAsync(perm_buffer, total_perms * N * sizeof(int), device, NULL);
    cudaMemPrefetchAsync(F_dist, total_perms * sizeof(double), device, NULL);
    
    
    size_t numThreads = 256;
    size_t numBlocks = (total_perms + numThreads - 1) / numThreads;
    
    printf("Launching %zu blocks with %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        permutation_test_gpu<<<numBlocks, numThreads>>>(
            N, k, keys, n_i, feature, total_perms, perm_buffer, F_dist
        );
    }
    
    cudaDeviceSynchronize();

    cudaMemPrefetchAsync(F_dist, total_perms * sizeof(double), cudaCpuDeviceId, NULL);
    
    cudaDeviceSynchronize();
    
    printf("\nPrinting First 5 and Last 5 Results\n");
    for (int i = 0; i < 5; i++) {
        printf ("F_dist %d: %lf\n", i, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = total_perms-5; i < total_perms; i++) {
        printf ("F_dist %d: %lf\n", i, F_dist[i]);
    } 

    size_t extreme_count = 0;
    for (size_t i = 1; i < total_perms; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
       }
    }
    
    double p_value = (double)extreme_count/total_perms;
    printf("\nNull F: %lf\n", F_dist[0]);
    printf ("Extreme Count: %lu\n", extreme_count);
    printf("p-value: %lf\n", p_value);

    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(keys);
    cudaFree(F_dist);

    return 0;
}

Overwriting cuda_exact_perm_anova.cu


In [12]:
%%bash
nvcc cuda_exact_perm_anova.cu -o cuda_exact_perm_anova -Wno-deprecated-gpu-targets

In [13]:
%%bash
nvprof ./cuda_exact_perm_anova < input.txt

==191786== NVPROF is profiling process 191786, command: ./cuda_exact_perm_anova


Number of Rows: Number of Groups: Total permutations: 17153136
Launching 67005 blocks with 256 threads per block

Printing First 5 and Last 5 Results
F_dist 0: 4.387509
F_dist 1: 2.223316
F_dist 2: 1.945471
F_dist 3: 2.200337
F_dist 4: 4.446505
F_dist 17153131: 4.446505
F_dist 17153132: 2.200337
F_dist 17153133: 1.945471
F_dist 17153134: 2.223316
F_dist 17153135: 4.387509

Null F: 4.387509
Extreme Count: 533831
p-value: 0.031121


==191786== Profiling application: ./cuda_exact_perm_anova
==191786== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  245.78ms        10  24.578ms  23.210ms  25.448ms  permutation_test_gpu(int, int, int*, int*, double*, __int64, double*)
      API calls:   72.23%  971.81ms         5  194.36ms  8.2110us  971.35ms  cudaMallocManaged
                   18.28%  245.88ms         2  122.94ms  98.921us  245.78ms  cudaDeviceSynchronize
                    8.76%  117.90ms         9  13.100ms  18.454us  115.34ms  cudaMemPrefetchAsync
                    0.51%  6.8653ms         5  1.3731ms  22.526us  5.3490ms  cudaFree
                    0.11%  1.4550ms        10  145.50us  7.7900us  1.3661ms  cudaLaunchKernel
                    0.04%  605.19us       114  5.3080us     128ns  266.71us  cuDeviceGetAttribute
                    0.03%  338.20us         1  338.20us  338.20us  338.20us  cudaGetDevice
                    0.0

### Monte Carlo Permutation Test

#### C Version (Serial)

In [7]:
%%writefile c_permutation_anova.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);  // pick random index [0, i]
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}
/* One Way ANOVA */
double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    /* AVERAGE & GROUP AVERAGE */
    double *group_ave = (double *) calloc(k, sizeof(double));
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }
    free(group_ave);
    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}
int main() {
    size_t perm_count;
    size_t N;   // number of rows
    size_t k;   // number of groups
    clock_t start, end;
    size_t counter = 10;

    /* GET THE NUMBER OF ROWS */
    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    double *feature = (double*) malloc(N * sizeof(double));
    size_t *group = (size_t*) malloc(N * sizeof(size_t));
    size_t *temp_group = (size_t*) malloc(N * sizeof(size_t));
    size_t *n_i = (size_t*) calloc(k, sizeof(size_t));
    double *F_dist = (double*) malloc(perm_count * sizeof(double));

    /* READ THE DATA */
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;  // prevent overflow

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token); // convert to float and save
            else {
                group[i] = atoi(token); // convert to int and save
                if (group[i] >= k){
                    perror("Error group count");
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
     // fill-in cache
    permute(group, N, i, temp_group);
    OneWayAnova(N, k, n_i, group, feature);

    // Execution time start here: CPU Permutation
    /* CPU PERMUTATION */
    double elapse, time_taken;
    elapse = 0.0f;
    
    for (int c=0; c<counter; c++){
        start = clock();
        for (size_t i = 0; i < perm_count; i++) {
            if (i == 0)
                F_dist[i] = OneWayAnova(N, k, n_i, group, feature);
            else 
                F_dist[i] = OneWayAnova(N, k, n_i, temp_group, feature);
            
            // Always permute from the ORIGINAL group array
            permute(group, N, i, temp_group);
        }
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
    }
    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to execute an array size %lu \n", counter, elapse/counter, perm_count);



    // Print first 5 permutations
    printf("First 5 permutations\n");
     for (size_t i = 0; i < 5; i++) {
        printf("CPU Permutation %zu: ", i+1);
        for (size_t j = 0; j < N; j++) {
            if (i == 0)
                printf("%zu ", group[j]);
            else
                printf("%zu ", temp_group[j]);
        }
        printf("\n");
    }

    printf("\nSTEP 3: Printing First 5 and Last 5 Results\n");
    for (int i = 0; i < 5; i++) {
        printf ("F_dist %d: %lf\n", i, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf ("F_dist %d: %lf\n", i, F_dist[i]);
    } 

    size_t extreme_count = 0;
    double p_value = 0.0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
       }
    }
    printf ("Extreme Count: %lu\n", extreme_count);
    p_value = (double)extreme_count / perm_count;
    printf("p-value: %lf\n", p_value);

    /* free memory */
    free(feature);
    free(group);
    free(n_i);
    free(F_dist);

    return 0;
}

Writing c_permutation_anova.c


In [8]:
%%bash
gcc c_permutation_anova.c -o c_permutation_anova -lm

In [9]:
%%bash
./c_permutation_anova < input.txt

Number of Rows: Number of Groups: Number of Permutations: 
Function (in C) average time for 10 loops is 207.039600 milliseconds to execute an array size 1000000 
First 5 permutations
CPU Permutation 1: 0 0 1 1 2 2 
CPU Permutation 2: 2 1 2 0 0 1 
CPU Permutation 3: 2 1 2 0 0 1 
CPU Permutation 4: 2 1 2 0 0 1 
CPU Permutation 5: 2 1 2 0 0 1 

STEP 3: Printing First 5 and Last 5 Results
F_dist 0: 0.005957
F_dist 1: 2.508775
F_dist 2: 1.975504
F_dist 3: 2.508775
F_dist 4: 1.398650
F_dist 999995: 0.071876
F_dist 999996: 1.975504
F_dist 999997: 2.508775
F_dist 999998: 0.071876
F_dist 999999: 0.020986
Extreme Count: 983330
p-value: 0.983330


#### CUDA Version

In [12]:
%%writefile cuda_permutation_anova_no_dependency.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
__device__ unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
__device__ void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}


__device__ void gpu_anova(size_t *perm_array, size_t N, int k, size_t perm_count, double *feature, size_t *n_i, double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        size_t *current_group = &perm_array[perm_idx * N];
        F_dist[perm_idx] = OneWayAnova(N, k, n_i, current_group, feature);
    }
}

__global__ void gpu_permute_and_anova(size_t *array, size_t perm_count, size_t N, int k, 
    double *feature, size_t *n_i, size_t *perm_array, double *F_dist) {
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = thread_id; i < perm_count; i += stride) {
        size_t *current_perm = &perm_array[i * N];
        
        if (i == 0) {
            // Copy original data
            for (int j = 0; j < N; j++) {
                current_perm[j] = array[j];
            }
        } else {
            // Generate permutation
            permute(array, N, i-1, current_perm);
        }
        
        // Compute ANOVA immediately
        F_dist[i] = OneWayAnova(N, k, n_i, current_perm, feature);
    }
}

int main() {
    size_t perm_count;
    size_t N;
    size_t k;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));

    // Initialize n_i to zero
    memset(n_i, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);


    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);


    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    
    printf("\n Generating Permutations and Computing F-statistic\n");
    printf("Launching kernel with %zu blocks and %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        gpu_permute_and_anova<<<numBlocks, numThreads>>>(
            group, perm_count, N, k, feature, n_i, perm_array, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    // PRINT RESULTS
    printf("\nDEBUG: First 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N && j < 20; j++) {  // Limit output for readability
            printf("%zu ", perm_array[i * N + j]);
        }
        if (N > 20) printf("...");
        printf("\n");
    }

    printf("\n Printing Results\n");
    printf("First 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nOriginal F-statistic: %lf\n", F_dist[0]);
    printf("Extreme count: %zu out of %zu permutations\n", extreme_count, perm_count);
    printf("p-value: %lf\n", p_value);
    

    // Free memory
    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(perm_array);
    cudaFree(F_dist);

    return 0;
}

Overwriting cuda_permutation_anova_no_dependency.cu


In [13]:
%%bash
nvcc cuda_permutation_anova_no_dependency.cu -o cuda_permutation_anova_no_dependency -Wno-deprecated-gpu-targets

In [14]:
%%bash
nvprof ./cuda_permutation_anova_no_dependency < input.txt

==189913== NVPROF is profiling process 189913, command: ./cuda_permutation_anova_no_dependency


Number of Rows: Number of Groups: Number of Permutations: 
 Generating Permutations and Computing F-statistic
Launching kernel with 391 blocks and 256 threads per block

DEBUG: First 5 permutations
GPU Permutation 1: 0 0 0 0 0 0 1 1 1 1 1 1 2 2 2 2 2 2 
GPU Permutation 2: 0 2 2 1 2 1 0 0 1 0 2 2 1 1 0 1 0 2 
GPU Permutation 3: 0 0 1 0 0 2 2 1 2 1 2 2 1 0 0 2 1 1 
GPU Permutation 4: 0 1 0 2 2 0 0 2 0 1 1 2 2 1 1 0 1 2 
GPU Permutation 5: 0 1 2 2 1 1 1 2 1 2 0 2 2 0 1 0 0 0 

 Printing Results
First 5 F-statistics:
F_dist[0]: 4.387509
F_dist[1]: 0.098111
F_dist[2]: 10.622456
F_dist[3]: 0.567116
F_dist[4]: 0.628312

Original F-statistic: 4.387509
Extreme count: 5017 out of 100000 permutations
p-value: 0.050170


==189913== Profiling application: ./cuda_permutation_anova_no_dependency
==189913== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  10.491ms        10  1.0491ms  1.0030ms  1.1034ms  gpu_permute_and_anova(unsigned long*, unsigned long, unsigned long, int, double*, unsigned long*, unsigned long*, double*)
      API calls:   57.04%  1.17269s         5  234.54ms  60.641us  1.17193s  cudaMallocManaged
                   41.39%  850.80ms        10  85.080ms  10.100us  850.66ms  cudaLaunchKernel
                    0.86%  17.739ms        10  1.7739ms  32.686us  14.478ms  cudaMemPrefetchAsync
                    0.51%  10.446ms         2  5.2232ms  98.759us  10.348ms  cudaDeviceSynchronize
                    0.11%  2.3058ms         5  461.17us  31.241us  1.3953ms  cudaFree
                    0.03%  559.71us       114  4.9090us     111ns  221.69us  cuDeviceGetAttribute
                    0.03%  556.40us         3 

#### Monte Carlo CUDA Version (Parallel w/ shared)

In [5]:
%%writefile cuda_monte_cache.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
__device__ unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
__device__ void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}


__device__ void gpu_anova(size_t *perm_array, size_t N, int k, size_t perm_count, double *feature, size_t *n_i, double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        size_t *current_group = &perm_array[perm_idx * N];
        F_dist[perm_idx] = OneWayAnova(N, k, n_i, current_group, feature);
    }
}

__global__ void gpu_permute_and_anova(size_t *array, size_t perm_count, size_t N, int k, 
    double *feature, size_t *n_i, size_t *perm_array, double *F_dist) {
    extern __shared__ char shared_mem[];
    size_t* shared_group = (size_t*)shared_mem;
    double* shared_feature = (double*)(shared_mem + N * sizeof(size_t));

    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int lindex = threadIdx.x;

    // Load data into shared memory
    for (int i = lindex; i < N; i += blockDim.x) {
        shared_group[i] = array[i];
        shared_feature[i] = feature[i];
    }

    __syncthreads();

    int stride = blockDim.x * gridDim.x;
    for (int i = thread_id; i < perm_count; i += stride) {
        size_t *current_perm = &perm_array[i * N];
        
        if (i == 0) {
            // Copy original data
            for (int j = 0; j < N; j++) {
                current_perm[j] = shared_group[j];
            }
        } else {
            // Generate permutation
            permute(shared_group, N, i-1, current_perm);
        }
        F_dist[i] = OneWayAnova(N, k, n_i, current_perm, feature);
    }
}

int main() {
    size_t perm_count;
    size_t N;
    size_t k;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));

    // Initialize n_i to zero
    memset(n_i, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);


    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    
    printf("\n Generating Permutations and Computing F-statistic\n");
    printf("Launching kernel with %zu blocks and %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        gpu_permute_and_anova<<<numBlocks, numThreads, N*2>>>(
            group, perm_count, N, k, feature, n_i, perm_array, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    // PRINT RESULTS
    printf("\nDEBUG: First 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N && j < 20; j++) {  // Limit output for readability
            printf("%zu ", perm_array[i * N + j]);
        }
        if (N > 20) printf("...");
        printf("\n");
    }

    printf("\n Printing Results\n");
    printf("First 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nOriginal F-statistic: %lf\n", F_dist[0]);
    printf("Extreme count: %zu out of %zu permutations\n", extreme_count, perm_count);
    printf("p-value: %lf\n", p_value);
    
    // Free memory
    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(perm_array);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_monte_cache.cu


In [6]:
%%bash
nvcc cuda_monte_cache.cu -o cuda_monte_cache -Wno-deprecated-gpu-targets

cuda_monte_cache.cu(142): error: no suitable constructor exists to convert from "int" to "cudaMemLocation"
      cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, ((int)-1));
                                                                                    ^

cuda_monte_cache.cu(143): error: no suitable constructor exists to convert from "int" to "cudaMemLocation"
      cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, ((int)-1));
                                                                                  ^

cuda_monte_cache.cu(144): error: no suitable constructor exists to convert from "int" to "cudaMemLocation"
      cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, ((int)-1));
                                                                                ^

cuda_monte_cache.cu(147): error: no suitable constructor exists to convert from "int" to "cudaMemLocation"
      cudaMemPrefetchAsync(feature,

CalledProcessError: Command 'b'nvcc cuda_monte_cache.cu -o cuda_monte_cache -Wno-deprecated-gpu-targets\n'' returned non-zero exit status 2.

In [17]:
%%bash
nvprof ./cuda_monte_cache < input.txt

==189993== NVPROF is profiling process 189993, command: ./cuda_permutation_anova_1


Number of Rows: Number of Groups: Number of Permutations: 
STEP 1: Generating Permutations
Launching kernel with 391 blocks and 256 threads per block

STEP 2: Computing F-statistic
Launching kernel with 391 blocks and 256 threads per block

DEBUG: First 5 permutations
GPU Permutation 1: 0 0 0 0 0 0 1 1 1 1 1 1 2 2 2 2 2 2 
GPU Permutation 2: 0 2 2 1 2 1 0 0 1 0 2 2 1 1 0 1 0 2 
GPU Permutation 3: 0 0 1 0 0 2 2 1 2 1 2 2 1 0 0 2 1 1 
GPU Permutation 4: 0 1 0 2 2 0 0 2 0 1 1 2 2 1 1 0 1 2 
GPU Permutation 5: 0 1 2 2 1 1 1 2 1 2 0 2 2 0 1 0 0 0 

 Printing Results
First 5 F-statistics:
F_dist[0]: 4.387509
F_dist[1]: 0.098111
F_dist[2]: 10.622456
F_dist[3]: 0.567116
F_dist[4]: 0.628312

Original F-statistic: 4.387509
Extreme count: 5017 out of 100000 permutations
p-value: 0.050170


==189993== Profiling application: ./cuda_permutation_anova_1
==189993== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   84.91%  10.524ms        10  1.0524ms  1.0321ms  1.0851ms  gpu_permute(unsigned long*, unsigned long, unsigned long, unsigned long*)
                   15.09%  1.8696ms        10  186.96us  179.97us  195.77us  gpu_anova(unsigned long*, unsigned long, int, unsigned long, double*, unsigned long*, double*)
      API calls:   96.96%  1.03464s         5  206.93ms  15.776us  1.03437s  cudaMallocManaged
                    1.25%  13.377ms        12  1.1148ms  17.199us  10.489ms  cudaMemPrefetchAsync
                    1.11%  11.842ms         4  2.9605ms  35.453us  9.9376ms  cudaDeviceSynchronize
                    0.27%  2.9208ms         5  584.15us  33.840us  1.4179ms  cudaFree
                    0.27%  2.8935ms        20  144.68us  9.8580us  2.2253ms  cudaLaunchKernel
                    0.04%  410.25