## Permutation Test using One-Way ANOVA in CUDA

In [1]:
import os

# Add the directory containing the executable to the PATH
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin"

# Check if the directory is added to the PATH
print(os.environ["PATH"])

/opt/tljh/user/bin:/bin:/usr/bin:/usr/local/cuda/bin


### Random Dataset Generation

In [2]:
!python -m pip install scikit-learn
!python -m pip install pandas
!python -m pip install scipy

In [2]:
from sklearn.datasets import make_classification

k = 4
N = 10
X, y = make_classification(
    n_samples = N,             # row number
    n_features = 5,            # feature numbers
    n_informative = 3,         # The number of informative features
    n_redundant = 0,           # The number of redundant features
    n_repeated = 0,            # The number of duplicated features
    n_classes = k,             # The number of classes 
    n_clusters_per_class = 1,  # The number of clusters per class
    random_state = 42,         # random seed 
    scale=100                  # scale of the data
)

In [3]:
import pandas as pd

df = pd.concat([pd.DataFrame(X)[[0]], pd.DataFrame(y).astype(int)], axis=1)
df.columns = [0, 1]
df = df.sort_values(1).reset_index().iloc[:,1:]
df.to_csv("dataset.csv", header=False, index=False)

### Exact Permutation Test

#### C Version (Serial)

In [4]:
%%writefile c_exact_perm.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* EXACT BINOMIAL & MULTINOMIAL FUNCTIONS */
// Compute C(n,k) exactly using 128-bit integers
unsigned long long int binom(unsigned int n, unsigned int k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;

    unsigned long long int result = 1;
    for (unsigned int i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

/* EXACT multinomial coefficient using sequential binomial method */
unsigned long long int getCountPerm(int total_elements, size_t *repeats, int k) {
    unsigned long long int result = 1;
    int remaining = total_elements;

    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        unsigned long long int c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

// helper function for swapping values
void Exchange(size_t* data, size_t a, size_t b) {
    size_t temp = data[a];
    data[a] = data[b];
    data[b] = temp;
}

/* PERMUTATION GENERATOR */
int permute(size_t a[], int n) {
    int l, j;
    for (j = --n; j > 0 && a[j-1] >= a[j]; --j) { ; }
    if (j == 0) return 0;
    for (l = n; a[j-1] >= a[l]; --l) { ; }
    Exchange(a, j-1, l);
    while (j < n) { Exchange(a, j++, n--); }
    return 1;
}

/* ONE WAY ANALYSIS OF VARIANCE */
double OneWayAnova(size_t N, size_t k, size_t *n_i, size_t *group, double *feature){
    double *group_ave = calloc(k, sizeof(double));

    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    free(group_ave);

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}

int main() {
    size_t N;
    size_t k;
    clock_t start, end;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    double *feature = malloc(N * sizeof(double));
    size_t *group = malloc(N * sizeof(size_t));
    size_t *group_copy = malloc(N * sizeof(size_t));
    size_t *perm_array = malloc(N * 10 * sizeof(size_t));
    size_t *n_i = calloc(k, sizeof(size_t));

    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;

    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;
        char *token = strtok(line, ",");
        int j = 0;

        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);

    memcpy(group_copy, group, N * sizeof(size_t));

    /* EXACT PERMUTATION COUNT*/
    unsigned long long int perm_count = getCountPerm(N, n_i, k);

    // Can overflow for large n
    double *F_dist = malloc(perm_count * sizeof(double));

    // Execution time start here: CPU Permutation
    /* CPU PERMUTATION */
    double elapse = 0.0f, 
        time_taken;

    /* PERMUTATION TEST */
    for (int c=0; c<counter; c++){
        start = clock();
        size_t p = 0;
        for (i = 0; i < perm_count; i++){
            // compute One Way ANOVA
            F_dist[i] = OneWayAnova(N, k, n_i, group, feature);

            // save the permutation for sanity check
            if (i < 5 || i > perm_count - 6) {
            memcpy(&perm_array[p * N], group, N * sizeof(size_t));
            p++;
            }

            // change grouping assignment
            permute(group, N);
        }
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
        memcpy(group, group_copy, N * sizeof(size_t));
    }

    FILE *fptr;
    fptr = fopen("c_exact_perm.csv", "w");

    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to execute an array size of %llu permutations.\n", counter, elapse/counter, perm_count);
    printf("\n");

    // Print first 5 and last 5 permutations
    printf("Printing First 5 and Last 5 Permutations\n");
    for (size_t i = 0; i < 5; i++) {
        printf("CPU Permutation %zu: ", i+1);
        for (size_t j = 0; j < N; j++) {
            printf("%zu ", perm_array[i*N + j]);
            fprintf(fptr, "%zu,", perm_array[i*N + j]);
        }
        printf("\n");
        fprintf(fptr, "\n");
    }
    printf("=================================\n");
    for (size_t i = 5; i < 10; i++) {
        printf("CPU Permutation %zu: ", i+1);
        for (size_t j = 0; j < N; j++) {
            printf("%zu ", perm_array[i*N + j]);
            fprintf(fptr, "%zu,", perm_array[i*N + j]);
        }
        printf("\n");
        fprintf(fptr, "\n");
    }

    printf("\nPrinting First 5 and Last 5 Results\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf ("F_dist %d: %lf\n", i+1, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf ("F_dist %d: %lf\n", i+1, F_dist[i]);
    } 

    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
    }
    }

    // Calculating the p-value for the permutation test
    double p_value = (double)extreme_count/perm_count;
    printf("\nNull F: %lf\n", F_dist[0]);
    printf ("Extreme Count: %lu\n", extreme_count);
    p_value = (double)extreme_count / perm_count;
    printf("p-value: %lf\n", p_value);

    // saving extreme count and p-value
    fprintf(fptr, "%zu,%lf\n", extreme_count, p_value);
    fclose(fptr);

    // free the allocated memory
    free(feature);
    free(group);
    free(n_i);
    free(F_dist);

    return 0;
}

Writing c_exact_perm.c


In [5]:
%%bash
gcc c_exact_perm.c -o c_exact_perm -lm

In [6]:
%%bash
./c_exact_perm < input.txt

Number of Rows: Number of Groups: 
Function (in C) average time for 10 loops is 4.963100 milliseconds to execute an array size of 25200 permutations.

Printing First 5 and Last 5 Permutations
CPU Permutation 1: 0 0 0 1 1 1 2 2 3 3 
CPU Permutation 2: 0 0 0 1 1 1 2 3 2 3 
CPU Permutation 3: 0 0 0 1 1 1 2 3 3 2 
CPU Permutation 4: 0 0 0 1 1 1 3 2 2 3 
CPU Permutation 5: 0 0 0 1 1 1 3 2 3 2 
CPU Permutation 6: 3 3 2 2 1 0 1 1 0 0 
CPU Permutation 7: 3 3 2 2 1 1 0 0 0 1 
CPU Permutation 8: 3 3 2 2 1 1 0 0 1 0 
CPU Permutation 9: 3 3 2 2 1 1 0 1 0 0 
CPU Permutation 10: 3 3 2 2 1 1 1 0 0 0 

Printing First 5 and Last 5 Results
F_dist 1: 1.042848
F_dist 2: 0.534092
F_dist 3: 0.534315
F_dist 4: 0.534315
F_dist 5: 0.534092
F_dist 25196: 1.580566
F_dist 25197: 1.658859
F_dist 25198: 0.194464
F_dist 25199: 0.273984
F_dist 25200: 0.273915

Null F: 1.042848
Extreme Count: 10351
p-value: 0.410754


#### CUDA (using lexicographic permutation)

In [7]:
%%writefile cuda_exact_lexico_perm_anova.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* EXACT BINOMIAL & MULTINOMIAL FUNCTIONS */
// Compute C(n,k) exactly using 128-bit integers
unsigned long long binom(unsigned long long n, unsigned long long k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;

    unsigned long long result = 1;
    for (int i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

/* EXACT multinomial coefficient using sequential binomial method */
unsigned long long getCountPerm(int total_elements, size_t *repeats, int k) {
    unsigned long long result = 1;
    int remaining = total_elements;

    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        unsigned long long c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

// helper function for swapping values
void Exchange(size_t* data, size_t a, size_t b) {
    size_t temp = data[a];
    data[a] = data[b];
    data[b] = temp;
}

/* PERMUTATION GENERATOR */
int permute(size_t a[], size_t n) {
    int l, j;
    for (j = --n; j > 0 && a[j-1] >= a[j]; --j) { ; }
    if (j == 0) return 0;
    for (l = n; a[j-1] >= a[l]; --l) { ; }
    Exchange(a, j-1, l);
    while (j < n) { Exchange(a, j++, n--); }
    return 1;
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, size_t k, size_t *n_i, size_t *group, double *feature){
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}

__global__ void gpu_anova(size_t *perm_array, size_t N, int k, size_t perm_count, double *feature, size_t *n_i, double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        size_t *current_group = &perm_array[perm_idx * N];
        F_dist[perm_idx] = OneWayAnova(N, k, n_i, current_group, feature);
    }
}

int main() {
    size_t N;
    size_t k;
    clock_t start, end;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *group_copy = (size_t*)malloc(N * sizeof(size_t));
    size_t *n_i;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));

    // Initialize n_i to zero
    memset(n_i, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);

    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);    

    memcpy(group_copy, group, N * sizeof(size_t));

    /* EXACT PERMUTATION COUNT USING 128-BIT INTEGER */
    unsigned long long perm_count = getCountPerm(N, n_i, k);
    
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));

    
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    /* CPU PERMUTATION */
    double elapse = 0.0f, 
           time_taken;
    
    // STEP 1: CPU PERMUTATION
    for (int c = 0; c < counter; c++){
        start = clock();
        memcpy(perm_array, group, N * sizeof(size_t)); // Initialize first permutation
        for (i = 0; i < perm_count; i++) {
            permute(group, N);
            memcpy(&perm_array[(i + 1) * N], group, N * sizeof(size_t));
        }
        memcpy(group, group_copy, N * sizeof(size_t)); // Reset group array
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
    }
    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to generate %llu permutations\n", counter, elapse/counter, perm_count);

    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;
    
    for (size_t c = 0; c < counter; c++){
        gpu_anova<<<numBlocks, numThreads>>>(perm_array, N, k, perm_count, feature, n_i, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    // PRINT RESULTS
    FILE *fptr;
    fptr = fopen("cuda_exact_lexico_perm_anova.csv", "w");

    printf("\nPrinting First 5 and Last 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }

    printf("\nPrinting First 5 and Last 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist[%d]: %lf\n", i, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nNull: %lf\n", F_dist[0]);
    printf("Extreme count: %zu\n", extreme_count);
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu,%lf", extreme_count, p_value);

    // Free memory
    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(perm_array);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_exact_lexico_perm_anova.cu


In [8]:
%%bash
nvcc cuda_exact_lexico_perm_anova.cu -o cuda_exact_lexico_perm_anova -Wno-deprecated-gpu-targets

In [9]:
%%bash
nvprof ./cuda_exact_lexico_perm_anova < input.txt

==460534== NVPROF is profiling process 460534, command: ./cuda_exact_lexico_perm_anova


Number of Rows: Number of Groups: 
Function (in C) average time for 10 loops is 1.978300 milliseconds to generate 25200 permutations

Printing First 5 and Last 5 permutations
GPU Permutation 1: 0 0 0 1 1 1 2 2 3 3 
GPU Permutation 2: 0 0 0 1 1 1 2 3 2 3 
GPU Permutation 3: 0 0 0 1 1 1 2 3 3 2 
GPU Permutation 4: 0 0 0 1 1 1 3 2 2 3 
GPU Permutation 5: 0 0 0 1 1 1 3 2 3 2 
GPU Permutation 25196: 3 3 2 2 1 0 1 1 0 0 
GPU Permutation 25197: 3 3 2 2 1 1 0 0 0 1 
GPU Permutation 25198: 3 3 2 2 1 1 0 0 1 0 
GPU Permutation 25199: 3 3 2 2 1 1 0 1 0 0 
GPU Permutation 25200: 3 3 2 2 1 1 1 0 0 0 

Printing First 5 and Last 5 F-statistics:
F_dist[0]: 1.042848
F_dist[1]: 0.534092
F_dist[2]: 0.534315
F_dist[3]: 0.534315
F_dist[4]: 0.534092
F_dist[25195]: 1.580566
F_dist[25196]: 1.658859
F_dist[25197]: 0.194464
F_dist[25198]: 0.273984
F_dist[25199]: 0.273915

Null: 1.042848
Extreme count: 10351
p-value: 0.410754


==460534== Profiling application: ./cuda_exact_lexico_perm_anova
==460534== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  111.14us        10  11.113us  10.656us  13.536us  gpu_anova(unsigned long*, unsigned long, int, unsigned long, double*, unsigned long*, double*)
      API calls:   98.99%  990.41ms         5  198.08ms  9.8230us  989.84ms  cudaMallocManaged
                    0.56%  5.6154ms        11  510.49us  20.359us  1.6770ms  cudaMemPrefetchAsync
                    0.23%  2.3007ms        10  230.07us  11.669us  2.1487ms  cudaLaunchKernel
                    0.11%  1.1027ms         5  220.54us  38.737us  637.05us  cudaFree
                    0.03%  321.58us       114  2.8200us     110ns  143.65us  cuDeviceGetAttribute
                    0.03%  296.14us         1  296.14us  296.14us  296.14us  cudaGetDevice
                    0.03%  268.62us         3  89.541us  6.7000us  250.99us  cudaMemAdvise

#### CUDA Version (using rank indexing)

In [10]:
%%writefile cuda_exact_rank_perm_anova.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LINE 1024
#define MAX_GROUPS 100

__device__ unsigned long long factorial(size_t n) {
    unsigned long long result = 1;
    for (int i = 2; i <= n; i++)
        result *= i;
    return result;
}

__device__ unsigned long long multinomial(size_t total, size_t *counts, size_t k) {
    unsigned long long result = factorial(total);
    for (int i = 0; i < k; i++) {
        result /= factorial(counts[i]);
    }
    return result;
}

__device__ void rank_to_permutation(size_t *keys, size_t *n_i, size_t k, size_t N, unsigned long long rank, size_t *perm) {
    size_t n_i_copy[MAX_GROUPS];
    
    for (int i = 0; i < k; i++) {
        n_i_copy[i] = n_i[i];
    }
    
    int total = N;
    
    for (int pos = 0; pos < N; pos++) {
        for (int i = 0; i < k; i++) {
            if (n_i_copy[i] == 0)
                continue;
            
            n_i_copy[i]--;
            unsigned long long num = multinomial(total - 1, n_i_copy, k);
            
            if (rank < num) {
                perm[pos] = keys[i];
                total--;
                break;
            } else {
                rank -= num;
                n_i_copy[i]++;
            }
        }
    }
}

__device__ double one_way_anova(size_t N, size_t k, size_t *n_i, size_t *group, double *feature) {
    double group_ave[MAX_GROUPS] = {0.0};
    
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    
    for (int i = 0; i < k; i++) {
        if (n_i[i] > 0)
            group_ave[i] /= n_i[i];
    }
    
    double SSE = 0.0;
    for (int i = 0; i < N; i++) {
        double temp = feature[i] - group_ave[group[i]];
        SSE += temp * temp;
    }
    
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        double temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }
    
    return (SSR / (k - 1)) / (SSE / (N - k));
}

__global__ void permutation_test_gpu(size_t N, size_t k, size_t *keys, size_t *n_i, double *features, unsigned long long perm_count, size_t *perm_array,  double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (unsigned long long perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        unsigned long long rank = perm_idx;
        
        // Each thread gets its own section of the perm_array
        size_t *perm = &perm_array[idx * N];
        if (rank >= perm_count)
            return;
        
        rank_to_permutation(keys, n_i, k, N, rank, perm);
        
        double F_stat = one_way_anova(N, k, n_i, perm, features);
        
        F_dist[rank] = F_stat;
    }
}

unsigned long long binom(size_t n, size_t k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;
    
    unsigned long long result = 1;
    for (int i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

unsigned long long get_perm_count(size_t total_elements, size_t *n_i, size_t k) {
    unsigned long long result = 1;
    int remaining = total_elements;
    
    for (int i = 0; i < k; i++) {
        int ni = n_i[i];
        unsigned long long c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

int main() {
    size_t N, k;
    size_t counter = 10;
    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    int device = -1;
    cudaGetDevice(&device);

    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *keys;
    double *F_dist;
    size_t *perm_array;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&keys, k * sizeof(size_t));
    
    memset(n_i, 0, k * sizeof(size_t));

    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(keys, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(keys, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }
    
    char line[MAX_LINE];
    int i = 0;
    
    while (fgets(line, sizeof(line), fp) && i < N) {
        line[strcspn(line, "\n")] = 0;
        char *token = strtok(line, ",");
        int j = 0;
        
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k) {
                    fprintf(stderr, "Error: group index out of range\n");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]]++;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    for (size_t i = 0; i < k; i++) {
        keys[i] = i;
    }
    
    unsigned long long perm_count = get_perm_count(N, n_i, k);

    cudaMallocManaged(&F_dist, perm_count * sizeof(double));
    cudaMallocManaged(&perm_array, perm_count * N * sizeof(size_t));

    cudaMemPrefetchAsync(keys, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    
    // Prefetch output array to GPU
    cudaMemPrefetchAsync(perm_array, perm_count * N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);
    
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    for (size_t c = 0; c < counter; c++){
        permutation_test_gpu<<<numBlocks, numThreads>>>(
            N, k, keys, n_i, feature, perm_count, perm_array, F_dist
        );
    }
    
    cudaDeviceSynchronize();
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);
    
    // PRINT RESULTS
    FILE *fptr;
    fptr = fopen("cuda_exact_rank_perm_anova.csv", "w");

    printf("\nPrinting First 5 and Last 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }

    printf("\nPrinting First 5 and Last 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist %d: %lf\n", i+1, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist %d: %lf\n", i+1, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nNull: %lf\n", F_dist[0]);
    printf("Extreme count: %zu\n", extreme_count);
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu,%lf", extreme_count, p_value);

    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(keys);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_exact_rank_perm_anova.cu


In [11]:
%%bash
nvcc cuda_exact_rank_perm_anova.cu -o cuda_exact_rank_perm_anova -Wno-deprecated-gpu-targets

In [12]:
%%bash
nvprof ./cuda_exact_rank_perm_anova < input.txt

==460587== NVPROF is profiling process 460587, command: ./cuda_exact_rank_perm_anova


Number of Rows: Number of Groups: 
Printing First 5 and Last 5 permutations
GPU Permutation 1: 0 0 0 1 1 1 2 2 3 3 
GPU Permutation 2: 0 0 0 1 1 1 2 3 2 3 
GPU Permutation 3: 0 0 0 1 1 1 2 3 3 2 
GPU Permutation 4: 0 0 0 1 1 1 3 2 2 3 
GPU Permutation 5: 0 0 0 1 1 1 3 2 3 2 
GPU Permutation 25196: 3 3 2 2 1 0 1 1 0 0 
GPU Permutation 25197: 3 3 2 2 1 1 0 0 0 1 
GPU Permutation 25198: 3 3 2 2 1 1 0 0 1 0 
GPU Permutation 25199: 3 3 2 2 1 1 0 1 0 0 
GPU Permutation 25200: 3 3 2 2 1 1 1 0 0 0 

Printing First 5 and Last 5 F-statistics:
F_dist 1: 1.042848
F_dist 2: 0.534092
F_dist 3: 0.534315
F_dist 4: 0.534315
F_dist 5: 0.534092
F_dist 25196: 1.580566
F_dist 25197: 1.658859
F_dist 25198: 0.194464
F_dist 25199: 0.273984
F_dist 25200: 0.273915

Null: 1.042848
Extreme count: 10351
p-value: 0.410754


==460587== Profiling application: ./cuda_exact_rank_perm_anova
==460587== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  901.72us        10  90.172us  87.968us  92.415us  permutation_test_gpu(unsigned long, unsigned long, unsigned long*, unsigned long*, double*, __int64, unsigned long*, double*)
      API calls:   99.41%  1.19080s         6  198.47ms  8.9290us  1.19016s  cudaMallocManaged
                    0.16%  1.9068ms        10  190.68us  7.0250us  1.7969ms  cudaLaunchKernel
                    0.16%  1.8881ms        10  188.81us  17.944us  640.21us  cudaMemPrefetchAsync
                    0.10%  1.2216ms         5  244.31us  21.962us  874.87us  cudaFree
                    0.06%  771.86us         1  771.86us  771.86us  771.86us  cudaDeviceSynchronize
                    0.04%  478.40us       114  4.1960us     104ns  191.34us  cuDeviceGetAttribute
                    0.03%  330.20us         1  330.20

#### CUDA Version (using rank indexing w/ shared memory)

In [13]:
%%writefile cuda_shared_exact_rank_perm_anova.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LINE 1024
#define MAX_GROUPS 10

__device__ unsigned long long factorial(size_t n) {
    unsigned long long result = 1;
    for (size_t i = 2; i <= n; i++)
        result *= i;
    return result;
}

__device__ unsigned long long multinomial(size_t total, size_t *counts, size_t k) {
    unsigned long long result = factorial(total);
    for (size_t i = 0; i < k; i++) {
        result /= factorial(counts[i]);
    }
    return result;
}

__device__ void rank_to_permutation(size_t *keys, size_t *n_i, size_t k, size_t N, unsigned long long rank, size_t *perm) {
    size_t n_i_copy[MAX_GROUPS];
    
    for (int i = 0; i < k; i++) {
        n_i_copy[i] = n_i[i];
    }
    
    size_t total = N;
    
    for (int pos = 0; pos < N; pos++) {
        for (int i = 0; i < k; i++) {
            if (n_i_copy[i] == 0)
                continue;
            
            n_i_copy[i]--;
            unsigned long long num = multinomial(total - 1, n_i_copy, k);
            
            if (rank < num) {
                perm[pos] = keys[i];
                total--;
                break;
            } else {
                rank -= num;
                n_i_copy[i]++;
            }
        }
    }
}

__device__ double one_way_anova(size_t N, size_t k, size_t *n_i, size_t *group, double *feature) {
    double group_ave[MAX_GROUPS] = {0.0};
    
    double average = 0.0;
    for (size_t i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    
    for (size_t i = 0; i < k; i++) {
        if (n_i[i] > 0)
            group_ave[i] /= n_i[i];
    }
    
    double SSE = 0.0;
    for (size_t i = 0; i < N; i++) {
        double temp = feature[i] - group_ave[group[i]];
        SSE += temp * temp;
    }
    
    double SSR = 0.0;
    for (size_t i = 0; i < k; i++) {
        double temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }
    
    return (SSR / (k - 1)) / (SSE / (N - k));
}

__global__ void permutation_test_gpu(size_t N, size_t k, size_t *keys, size_t *group_counts, double *features, unsigned long long total_perms, size_t *perm_buffer,  double *F_dist) {
    extern __shared__ char shared_mem[];
    double* shared_feature = (double*)shared_mem;
    size_t* shared_keys = (size_t*)(shared_mem + N * sizeof(double));
    size_t* shared_group_counts = (size_t*)(shared_mem + N * sizeof(double) + k * sizeof(size_t));

    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int lindex = threadIdx.x;

    // Load data into shared memory
    for (int i = lindex; i < N; i += blockDim.x) {
        shared_feature[i] = features[i];
        if (i < k) {
            shared_keys[i] = keys[i];
            shared_group_counts[i] = group_counts[i];
        }
    }

    __syncthreads();
    int stride = blockDim.x * gridDim.x;
    for (unsigned long long perm_idx = thread_id; perm_idx < total_perms; perm_idx += stride) {
        unsigned long long rank = perm_idx;
        
        // Each thread gets its own section of the perm_buffer
        size_t *perm = &perm_buffer[perm_idx * N];
        if (rank >= total_perms)
            return;
        
        rank_to_permutation(shared_keys, shared_group_counts, k, N, rank, perm);
        
        double F_stat = one_way_anova(N, k, shared_group_counts, perm, shared_feature);
        
        F_dist[rank] = F_stat;
    }
}

unsigned long long binom(size_t n, size_t k) {
    if (k > n) return 0;
    if (k > n - k) k = n - k;
    
    unsigned long long result = 1;
    for (size_t i = 1; i <= k; i++) {
        result = result * (n - k + i) / i;
    }
    return result;
}

unsigned long long get_perm_count(size_t total_elements, size_t *repeats, size_t k) {
    unsigned long long result = 1;
    int remaining = total_elements;
    
    for (int i = 0; i < k; i++) {
        int ni = repeats[i];
        unsigned long long c = binom(remaining, ni);
        result *= c;
        remaining -= ni;
    }
    return result;
}

int main() {
    size_t N, k;
    size_t counter = 10;
    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);

    int device = -1;
    cudaGetDevice(&device);

    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *keys;
    double *F_dist;
    size_t *perm_array;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&keys, k * sizeof(size_t));

    memset(n_i, 0, k * sizeof(size_t));

    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(keys, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(keys, k * sizeof(size_t), cudaCpuDeviceId, NULL);
    
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL) {
        perror("Error opening file");
        return 1;
    }
    
    char line[MAX_LINE];
    int i = 0;
    
    while (fgets(line, sizeof(line), fp) && i < N) {
        line[strcspn(line, "\n")] = 0;
        char *token = strtok(line, ",");
        int j = 0;
        
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k) {
                    fprintf(stderr, "Error: group index out of range\n");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]]++;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    for (int i = 0; i < k; i++) {
        keys[i] = i;
    }
    
    unsigned long long perm_count = get_perm_count(N, n_i, k);

    cudaMallocManaged(&F_dist, perm_count * sizeof(double));
    cudaMallocManaged(&perm_array, perm_count * N * sizeof(size_t));

    cudaMemPrefetchAsync(keys, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    
    // Prefetch output array to GPU
    cudaMemPrefetchAsync(perm_array, perm_count * N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);
    
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;
    
    for (size_t c = 0; c < counter; c++){
        permutation_test_gpu<<<numBlocks, numThreads, N + 2*k>>>(
            N, k, keys, n_i, feature, perm_count, perm_array, F_dist
        );
    }
    
    cudaDeviceSynchronize();
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);
    
    // PRINT RESULTS
    FILE *fptr;
    fptr = fopen("cuda_shared_exact_rank_perm_anova.csv", "w");

    printf("\nPrinting First 5 and Last 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }

    printf("\nPrinting First 5 and Last 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist %d: %lf\n", i+1, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist %d: %lf\n", i+1, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nNull: %lf\n", F_dist[0]);
    printf("Extreme count: %zu\n", extreme_count);
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu,%lf", extreme_count, p_value);

    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(keys);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_shared_exact_rank_perm_anova.cu


In [14]:
%%bash
nvcc cuda_shared_exact_rank_perm_anova.cu -o cuda_shared_exact_rank_perm_anova -Wno-deprecated-gpu-targets

In [15]:
%%bash
nvprof ./cuda_shared_exact_rank_perm_anova < input.txt

==460643== NVPROF is profiling process 460643, command: ./cuda_shared_exact_rank_perm_anova


Number of Rows: Number of Groups: 
Printing First 5 and Last 5 permutations
GPU Permutation 1: 0 0 0 1 1 1 2 2 3 3 
GPU Permutation 2: 0 0 0 1 1 1 2 3 2 3 
GPU Permutation 3: 0 0 0 1 1 1 2 3 3 2 
GPU Permutation 4: 0 0 0 1 1 1 3 2 2 3 
GPU Permutation 5: 0 0 0 1 1 1 3 2 3 2 
GPU Permutation 25196: 3 3 2 2 1 0 1 1 0 0 
GPU Permutation 25197: 3 3 2 2 1 1 0 0 0 1 
GPU Permutation 25198: 3 3 2 2 1 1 0 0 1 0 
GPU Permutation 25199: 3 3 2 2 1 1 0 1 0 0 
GPU Permutation 25200: 3 3 2 2 1 1 1 0 0 0 

Printing First 5 and Last 5 F-statistics:
F_dist 1: 1.042848
F_dist 2: 0.534092
F_dist 3: 0.534315
F_dist 4: 0.534315
F_dist 5: 0.534092
F_dist 25196: 1.580566
F_dist 25197: 1.658859
F_dist 25198: 0.194464
F_dist 25199: 0.273984
F_dist 25200: 0.273915

Null: 1.042848
Extreme count: 10351
p-value: 0.410754


==460643== Profiling application: ./cuda_shared_exact_rank_perm_anova
==460643== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  675.84us        10  67.583us  66.400us  70.592us  permutation_test_gpu(unsigned long, unsigned long, unsigned long*, unsigned long*, double*, __int64, unsigned long*, double*)
      API calls:   99.36%  1.13390s         6  188.98ms  8.4520us  1.13333s  cudaMallocManaged
                    0.19%  2.1577ms        10  215.77us  13.832us  1.9761ms  cudaLaunchKernel
                    0.19%  2.1135ms        10  211.35us  17.772us  581.63us  cudaMemPrefetchAsync
                    0.12%  1.4008ms         5  280.16us  31.824us  966.83us  cudaFree
                    0.04%  433.26us         4  108.32us  7.2890us  400.93us  cudaMemAdvise
                    0.04%  416.85us         1  416.85us  416.85us  416.85us  cudaDeviceSynchronize
                    0.04%  406.36us       114  3.5640

#### Output Check for Exact Permutation Test

In [17]:
import numpy as np
from scipy.stats import f_oneway

features = pd.read_csv("dataset.csv",header=None)[0].tolist()
filenames = [
    'c_exact_perm.csv', 
    'cuda_exact_lexico_perm_anova.csv', 
    'cuda_exact_rank_perm_anova.csv', 
    'cuda_shared_exact_rank_perm_anova.csv'
]
extreme_counts = []
p_values = []
maes = []

for filename in filenames:
    file_object = open(filename)
    content = file_object.read()
    permutations = []
    f_stats = []
    indices = []
    for i, row in enumerate(content.split('\n')):
        if i < 10:
            permutations.append(row[:-1]) 
        elif i < 20:
            index, f = row.split(',')
            indices.append(int(index))
            f_stats.append(float(f))
        elif row != '':
            extreme_count, p_value = row.split(',')
            extreme_counts.append(int(extreme_count))
            p_values.append(float(p_value))
            
    permutations = np.array(permutations)
    f_stats = np.array(f_stats)
    indices = np.array(indices)
    output_df = pd.DataFrame(np.vstack([indices, permutations, f_stats])).T
    output_df.columns = ['i', 'perm', 'f']
    output_df['i'] = output_df['i'].astype(int)
    output_df['f'] = output_df['f'].astype(float)
    actual_fs = []
    for perm in output_df['perm']:
        permuted_df = pd.DataFrame(np.vstack([np.array(perm.split(',')), features])).T
        permuted_df[0] = permuted_df[0].astype(int)
        permuted_df[1] = permuted_df[1].astype(float)
        keys = permuted_df[0].unique().tolist()
        input_features = []
        for key in keys:
            group_feature = permuted_df[permuted_df[0] == key][1].tolist()
            input_features.append(group_feature)
        actual_f, _ = f_oneway(*input_features)
        actual_fs.append(float(actual_f))
    output_df['actual_f'] = actual_fs
    output_df['abs_error'] = abs(output_df['actual_f'] - output_df['f'])
    maes.append(output_df['abs_error'].sum() / output_df.shape[0])
filenames = np.array(filenames)
extreme_counts = np.array(extreme_counts)
p_values = np.array(p_values)
maes = np.array(maes)

compiled_df = pd.DataFrame(np.vstack([filenames, extreme_counts, p_values, maes])).T
compiled_df.columns = ['method', 'extremes', 'p-values', 'MAE']
compiled_df['method'] = compiled_df['method'].apply(lambda x: x[:-4])
compiled_df

Unnamed: 0,method,extremes,p-values,MAE
0,c_exact_perm,10351,0.410754,3.012558511539254e-07
1,cuda_exact_lexico_perm_anova,10351,0.410754,3.012558511539254e-07
2,cuda_exact_rank_perm_anova,10351,0.410754,3.012558511539254e-07
3,cuda_shared_exact_rank_perm_anova,10351,0.410754,3.012558511539254e-07


### Monte Carlo Permutation Test

#### C Version (Serial)

In [18]:
%%writefile c_monte_perm.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);  // pick random index [0, i]
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}
/* One Way ANOVA */
double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    /* AVERAGE & GROUP AVERAGE */
    double *group_ave = (double *) calloc(k, sizeof(double));
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }
    free(group_ave);
    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}
int main() {
    size_t perm_count;
    size_t N;   // number of rows
    size_t k;   // number of groups
    clock_t start, end;
    size_t counter = 10;

    /* GET THE NUMBER OF ROWS */
    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    double *feature = (double*) malloc(N * sizeof(double));
    size_t *group = (size_t*) malloc(N * sizeof(size_t));
    size_t *temp_group = (size_t*) malloc(N * sizeof(size_t));
    size_t *perm_array = (size_t*) malloc(N * 10 * sizeof(size_t));
    size_t *n_i = (size_t*) calloc(k, sizeof(size_t));
    double *F_dist = (double*) malloc(perm_count * sizeof(double));

    /* READ THE DATA */
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;  // prevent overflow

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token); // convert to float and save
            else {
                group[i] = atoi(token); // convert to int and save
                if (group[i] >= k){
                    perror("Error group count");
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);

    // fill-in cache
    permute(group, N, i, temp_group);
    OneWayAnova(N, k, n_i, group, feature);

    // Execution time start here: CPU Permutation
    /* CPU PERMUTATION */
    double elapse, time_taken;
    elapse = 0.0f;

    for (int c=0; c<counter; c++){
        memcpy(perm_array, group, N * sizeof(size_t));
        size_t p = 1;
        start = clock();
        for (size_t i = 0; i < perm_count; i++) {            
            // Always permute from the ORIGINAL group array
            permute(group, N, i, temp_group);
            if ((i > 0 && i < 5) || (i > perm_count - 6)){
                memcpy(&perm_array[p * N], temp_group, N * sizeof(size_t));
                p++;
            }

            if (i == 0)
                F_dist[i] = OneWayAnova(N, k, n_i, group, feature);
            else 
                F_dist[i] = OneWayAnova(N, k, n_i, temp_group, feature);
        }
        end = clock();
        time_taken = ((double)(end-start))*1E3/CLOCKS_PER_SEC;
        elapse = elapse + time_taken;
    }
    FILE *fptr;
    fptr = fopen("c_monte_perm.csv", "w");

    printf("\nFunction (in C) average time for %lu loops is %f milliseconds to execute an array size %lu \n", counter, elapse/counter, perm_count);

    // Print first 5 and last 5 permutations
    printf("\nPrinting First 5 and Last 5 Permutations\n");
    for (size_t i = 0; i < 5; i++) {
        printf("CPU Permutation %zu: ", i+1);
        for (size_t j = 0; j < N; j++) {
            printf("%zu ", perm_array[i*N + j]);
            fprintf(fptr, "%zu,", perm_array[i*N + j]);
        }
        printf("\n");
        fprintf(fptr, "\n");
    }
    printf("=================================\n");
    for (size_t i = 5; i < 10; i++) {
        printf("CPU Permutation %zu: ", i+1);
        for (size_t j = 0; j < N; j++) {
            printf("%zu ", perm_array[i*N + j]);
            fprintf(fptr, "%zu,", perm_array[i*N + j]);
        }
        printf("\n");
        fprintf(fptr, "\n");
    }

    printf("\nPrinting First 5 and Last 5 Results\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf ("F_dist %d: %lf\n", i+1, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf ("F_dist %d: %lf\n", i+1, F_dist[i]);
    } 

    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
       }
    }

    // Calculating the p-value for the permutation test
    double p_value = (double)extreme_count/perm_count;
    printf("\nNull F: %lf\n", F_dist[0]);
    printf ("Extreme Count: %lu\n", extreme_count);
    p_value = (double)extreme_count / perm_count;
    printf("p-value: %lf\n", p_value);

    // saving extreme count and p-value
    fprintf(fptr, "%zu,%lf\n", extreme_count, p_value);
    fclose(fptr);

    // free the allocated memory
    free(feature);
    free(group);
    free(n_i);
    free(F_dist);

    return 0;
}

Writing c_monte_perm.c


In [19]:
%%bash
gcc c_monte_perm.c -o c_monte_perm -lm

In [20]:
%%bash
./c_monte_perm < input.txt

Number of Rows: Number of Groups: Number of Permutations: 
Function (in C) average time for 10 loops is 523.254100 milliseconds to execute an array size 1000000 

Printing First 5 and Last 5 Permutations
CPU Permutation 1: 0 0 0 1 1 1 2 2 3 3 
CPU Permutation 2: 0 1 2 0 1 3 1 3 2 0 
CPU Permutation 3: 0 3 1 3 2 0 0 1 1 2 
CPU Permutation 4: 1 1 0 2 2 3 3 0 1 0 
CPU Permutation 5: 2 1 3 0 1 2 0 1 0 3 
CPU Permutation 6: 2 1 3 1 3 2 0 0 1 0 
CPU Permutation 7: 0 0 1 2 1 3 2 1 0 3 
CPU Permutation 8: 1 0 3 2 1 3 2 0 0 1 
CPU Permutation 9: 0 2 0 1 3 1 1 2 3 0 
CPU Permutation 10: 3 1 1 0 0 0 2 1 3 2 

Printing First 5 and Last 5 Results
F_dist 1: 1.042848
F_dist 2: 0.499146
F_dist 3: 1.637202
F_dist 4: 0.598572
F_dist 5: 4.404092
F_dist 999996: 0.803456
F_dist 999997: 5.749775
F_dist 999998: 2.709922
F_dist 999999: 1.868969
F_dist 1000000: 0.541557

Null F: 1.042848
Extreme Count: 448792
p-value: 0.448792


#### CUDA Version

In [21]:
%%writefile cuda_monte_perm.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
__device__ unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
__device__ void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}

__global__ void gpu_permute_and_anova(size_t *array, size_t perm_count, size_t N, int k, 
    double *feature, size_t *n_i, size_t *perm_array, double *F_dist) {
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = thread_id; i < perm_count; i += stride) {
        size_t *current_perm = &perm_array[i * N];
        
        if (i == 0) {
            // Copy original data
            for (int j = 0; j < N; j++) {
                current_perm[j] = array[j];
            }
        } else {
            // Generate permutation
            permute(array, N, i, current_perm);
        }
        
        // Compute ANOVA immediately
        F_dist[i] = OneWayAnova(N, k, n_i, current_perm, feature);
    }
}

int main() {
    size_t perm_count;
    size_t N;
    size_t k;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));

    // Initialize n_i to zero
    memset(n_i, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);


    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);

    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    printf("\n Generating Permutations and Computing F-statistic\n");
    printf("Launching kernel with %zu blocks and %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        gpu_permute_and_anova<<<numBlocks, numThreads>>>(
            group, perm_count, N, k, feature, n_i, perm_array, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    FILE *fptr;
    fptr = fopen("cuda_monte_perm.csv", "w");

    printf("\nPrinting First 5 and Last 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }

    printf("\nPrinting First 5 and Last 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist %d: %lf\n", i+1, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist %d: %lf\n", i+1, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nNull: %lf\n", F_dist[0]);
    printf("Extreme count: %zu\n", extreme_count);
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu,%lf", extreme_count, p_value);

    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_monte_perm.cu


In [22]:
%%bash
nvcc cuda_monte_perm.cu -o cuda_monte_perm -Wno-deprecated-gpu-targets

In [23]:
%%bash
nvprof ./cuda_monte_perm < input.txt

==460719== NVPROF is profiling process 460719, command: ./cuda_monte_perm


Number of Rows: Number of Groups: Number of Permutations: 
 Generating Permutations and Computing F-statistic
Launching kernel with 3907 blocks and 256 threads per block

Printing First 5 and Last 5 permutations
GPU Permutation 1: 0 0 0 1 1 1 2 2 3 3 
GPU Permutation 2: 0 1 2 0 1 3 1 3 2 0 
GPU Permutation 3: 0 3 1 3 2 0 0 1 1 2 
GPU Permutation 4: 1 1 0 2 2 3 3 0 1 0 
GPU Permutation 5: 2 1 3 0 1 2 0 1 0 3 
GPU Permutation 999996: 2 1 3 1 3 2 0 0 1 0 
GPU Permutation 999997: 0 0 1 2 1 3 2 1 0 3 
GPU Permutation 999998: 1 0 3 2 1 3 2 0 0 1 
GPU Permutation 999999: 0 2 0 1 3 1 1 2 3 0 
GPU Permutation 1000000: 3 1 1 0 0 0 2 1 3 2 

Printing First 5 and Last 5 F-statistics:
F_dist 1: 1.042848
F_dist 2: 0.499146
F_dist 3: 1.637202
F_dist 4: 0.598572
F_dist 5: 4.404092
F_dist 999996: 0.803456
F_dist 999997: 5.749775
F_dist 999998: 2.709922
F_dist 999999: 1.868969
F_dist 1000000: 0.541557

Null: 1.042848
Extreme count: 448792
p-value: 0.448792


==460719== Profiling application: ./cuda_monte_perm
==460719== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  13.991ms        10  1.3991ms  1.3875ms  1.4125ms  gpu_permute_and_anova(unsigned long*, unsigned long, unsigned long, int, double*, unsigned long*, unsigned long*, double*)
      API calls:   94.57%  1.26822s         5  253.64ms  52.087us  1.26751s  cudaMallocManaged
                    3.82%  51.232ms        10  5.1232ms  28.220us  42.775ms  cudaMemPrefetchAsync
                    1.03%  13.879ms         2  6.9395ms  47.759us  13.831ms  cudaDeviceSynchronize
                    0.23%  3.0264ms        10  302.64us  12.051us  2.8537ms  cudaLaunchKernel
                    0.14%  1.8906ms         4  472.66us  35.732us  700.25us  cudaFree
                    0.14%  1.8111ms       114  15.886us     133ns  1.3005ms  cuDeviceGetAttribute
                    0.03%  452.77us         1  452.77us  452.77us  

#### CUDA Version (Parallel w/ shared)

In [24]:
%%writefile cuda_shared_monte_perm.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_LINE 1024

/* Linear Congruential Generator (LCG) */
__device__ unsigned int lcg_random(unsigned int seed) {
    return (1103515245U * (seed) + 12345U) & 0x7fffffffU;
}

/* Fisher–Yates Shuffling Algorithm */
__device__ void permute(size_t *array, size_t N, unsigned int seed, size_t *result) {
    for (size_t i = 0; i < N; i++) {
        result[i] = array[i];
    }
    for (size_t i = N - 1; i > 0; i--) {
        size_t j = lcg_random(seed) % (i + 1);
        size_t temp = result[i];
        result[i] = result[j];
        result[j] = temp;
    }
}

/* One Way ANOVA */
__device__ double OneWayAnova(size_t N, int k, size_t *n_i, size_t *group, double *feature){
    
    double group_ave[100];
    for (int i = 0; i < k; i++) {
        group_ave[i] = 0.0;
    }
        
    double average = 0.0;
    for (int i = 0; i < N; i++) {
        group_ave[group[i]] += feature[i];
        average += feature[i];
    }
    average /= N;
    for (int i = 0; i < k; i++) {
        group_ave[i] /= n_i[i];
    }

    /* SUM OF SQUARED ERROR (SSE) */
    double SSE = 0.0;
    double temp;
    for (int i = 0; i < N; i++) {
        temp = feature[i] - group_ave[group[i]];
        SSE += temp*temp;
    }

    /* SSR (SUM OF SQUARED RESIDUALS) */
    double SSR = 0.0;
    for (int i = 0; i < k; i++) {
        temp = group_ave[i] - average;
        SSR += n_i[i] * (temp * temp);
    }

    /* F-statistic */
    return (SSR/(k-1))/(SSE/(N-k));
}


__device__ void gpu_anova(size_t *perm_array, size_t N, int k, size_t perm_count, double *feature, size_t *n_i, double *F_dist) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int perm_idx = idx; perm_idx < perm_count; perm_idx += stride) {
        size_t *current_group = &perm_array[perm_idx * N];
        F_dist[perm_idx] = OneWayAnova(N, k, n_i, current_group, feature);
    }
}

__global__ void gpu_permute_and_anova(size_t *array, size_t perm_count, size_t N, int k, 
    double *feature, size_t *n_i, size_t *perm_array, double *F_dist) {
    extern __shared__ char shared_mem[];
    size_t* shared_group = (size_t*)shared_mem;
    double* shared_feature = (double*)(shared_mem + N * sizeof(size_t));

    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int lindex = threadIdx.x;

    // Load data into shared memory
    for (int i = lindex; i < N; i += blockDim.x) {
        shared_group[i] = array[i];
        shared_feature[i] = feature[i];
    }

    __syncthreads();

    int stride = blockDim.x * gridDim.x;
    for (int i = thread_id; i < perm_count; i += stride) {
        size_t *current_perm = &perm_array[i * N];
        
        if (i == 0) {
            // Copy original data
            for (int j = 0; j < N; j++) {
                current_perm[j] = shared_group[j];
            }
        } else {
            // Generate permutation
            permute(shared_group, N, i, current_perm);
        }
        F_dist[i] = OneWayAnova(N, k, n_i, current_perm, feature);
    }
}

int main() {
    size_t perm_count;
    size_t N;
    size_t k;
    size_t counter = 10;

    printf("Number of Rows: ");
    scanf("%zu", &N);
    printf("Number of Groups: ");
    scanf("%zu", &k);
    printf("Number of Permutations: ");
    scanf("%zu", &perm_count);

    // Get GPU device
    int device = -1;
    cudaGetDevice(&device);

    // Memory allocation
    double *feature;
    size_t *group;
    size_t *n_i;
    size_t *perm_array;
    double *F_dist;

    cudaMallocManaged(&feature, N * sizeof(double));
    cudaMallocManaged(&group, N * sizeof(size_t));
    cudaMallocManaged(&n_i, k * sizeof(size_t));
    cudaMallocManaged(&perm_array, N * perm_count * sizeof(size_t));
    cudaMallocManaged(&F_dist, perm_count * sizeof(double));

    // Initialize n_i to zero
    memset(n_i, 0, k * sizeof(size_t));

    // MEMORY ADVISE: Set up for input data (feature, group, n_i)
    cudaMemAdvise(feature, N * sizeof(double), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(group, N * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(n_i, k * sizeof(size_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    // Prefetch data to CPU memory
    cudaMemPrefetchAsync(feature, N * sizeof(double), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), cudaCpuDeviceId, NULL);


    // READ DATA FROM FILE
    FILE *fp = fopen("dataset.csv", "r");
    if (fp == NULL){
        perror("Error opening file");
        return 1;
    }

    char line[MAX_LINE];
    size_t i = 0;
    while (fgets(line, sizeof(line), fp)) {
        if (i >= N) break;

        line[strcspn(line, "\n")] = 0;

        char *token = strtok(line, ",");
        int j = 0;
        while (token != NULL) {
            if (j == 0)
                feature[i] = atof(token);
            else {
                group[i] = atoi(token);
                if (group[i] >= k){
                    perror("Error group count");
                    fclose(fp);
                    return 1;
                }
                n_i[group[i]] += 1;
            }
            token = strtok(NULL, ",");
            j++;
        }
        i++;
    }
    fclose(fp);
    
    // PREFETCH: Move input data to GPU before computation
    cudaMemPrefetchAsync(feature, N * sizeof(double), device, NULL);
    cudaMemPrefetchAsync(group, N * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(n_i, k * sizeof(size_t), device, NULL);
    
    // Prefetch output arrays to GPU
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), device, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), device, NULL);

    // Wait for prefetch to complete
    cudaDeviceSynchronize();

    // Number of Threads and Blocks
    size_t numThreads = 256;
    size_t numBlocks = (perm_count + numThreads - 1) / numThreads;

    printf("\n Generating Permutations and Computing F-statistic\n");
    printf("Launching kernel with %zu blocks and %zu threads per block\n", numBlocks, numThreads);
    
    for (size_t c = 0; c < counter; c++){
        gpu_permute_and_anova<<<numBlocks, numThreads, N*2>>>(
            group, perm_count, N, k, feature, n_i, perm_array, F_dist);
    }
    cudaDeviceSynchronize();

    // PREFETCH: Move results back to CPU for printing
    cudaMemPrefetchAsync(perm_array, N * perm_count * sizeof(size_t), cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(F_dist, perm_count * sizeof(double), cudaCpuDeviceId, NULL);

    FILE *fptr;
    fptr = fopen("cuda_shared_monte_perm.csv", "w");

    printf("\nPrinting First 5 and Last 5 permutations\n");
    for (int i = 0; i < 5; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        printf("GPU Permutation %d: ", i + 1);
        for (int j = 0; j < N; j++) {
            fprintf(fptr, "%zu,", perm_array[i * N + j]);
            printf("%zu ", perm_array[i * N + j]);
        }
        fprintf(fptr, "\n");
        printf("\n");
    }

    printf("\nPrinting First 5 and Last 5 F-statistics:\n");
    for (int i = 0; i < 5; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist %d: %lf\n", i+1, F_dist[i]);
    }
    printf("=================================\n");
    for (int i = perm_count-5; i < perm_count; i++) {
        fprintf(fptr, "%d,%lf\n", i, F_dist[i]);
        printf("F_dist %d: %lf\n", i+1, F_dist[i]);
    }

    // Calculate p-value
    size_t extreme_count = 0;
    for (size_t i = 1; i < perm_count; i++) {
        if (F_dist[i] >= F_dist[0]) {
            extreme_count++;
        }
    }
    double p_value = (double)extreme_count / (double)perm_count;
    printf("\nNull: %lf\n", F_dist[0]);
    printf("Extreme count: %zu\n", extreme_count);
    printf("p-value: %lf\n", p_value);

    fprintf(fptr, "%zu,%lf", extreme_count, p_value);
    
    // Free memory
    cudaFree(feature);
    cudaFree(group);
    cudaFree(n_i);
    cudaFree(perm_array);
    cudaFree(F_dist);

    return 0;
}

Writing cuda_shared_monte_perm.cu


In [25]:
%%bash
nvcc cuda_shared_monte_perm.cu -o cuda_shared_monte_perm -Wno-deprecated-gpu-targets

In [26]:
%%bash
nvprof ./cuda_shared_monte_perm < input.txt

==460780== NVPROF is profiling process 460780, command: ./cuda_shared_monte_perm


Number of Rows: Number of Groups: Number of Permutations: 
 Generating Permutations and Computing F-statistic
Launching kernel with 3907 blocks and 256 threads per block

Printing First 5 and Last 5 permutations
GPU Permutation 1: 0 0 0 1 1 1 2 2 3 3 
GPU Permutation 2: 0 1 2 0 1 3 1 3 2 0 
GPU Permutation 3: 0 3 1 3 2 0 0 1 1 2 
GPU Permutation 4: 1 1 0 2 2 3 3 0 1 0 
GPU Permutation 5: 2 1 3 0 1 2 0 1 0 3 
GPU Permutation 999996: 2 1 3 1 3 2 0 0 1 0 
GPU Permutation 999997: 0 0 1 2 1 3 2 1 0 3 
GPU Permutation 999998: 1 0 3 2 1 3 2 0 0 1 
GPU Permutation 999999: 0 2 0 1 3 1 1 2 3 0 
GPU Permutation 1000000: 3 1 1 0 0 0 2 1 3 2 

Printing First 5 and Last 5 F-statistics:
F_dist 1: 1.042848
F_dist 2: 0.499146
F_dist 3: 1.637202
F_dist 4: 0.598572
F_dist 5: 4.404092
F_dist 999996: 0.803456
F_dist 999997: 5.749775
F_dist 999998: 2.709922
F_dist 999999: 1.868969
F_dist 1000000: 0.541557

Null: 1.042848
Extreme count: 448792
p-value: 0.448792


==460780== Profiling application: ./cuda_shared_monte_perm
==460780== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  14.041ms        10  1.4041ms  1.3923ms  1.4127ms  gpu_permute_and_anova(unsigned long*, unsigned long, unsigned long, int, double*, unsigned long*, unsigned long*, double*)
      API calls:   95.05%  1.54488s         5  308.98ms  47.819us  1.54423s  cudaMallocManaged
                    3.51%  57.035ms        10  5.7035ms  22.949us  48.808ms  cudaMemPrefetchAsync
                    0.86%  14.048ms         2  7.0238ms  71.603us  13.976ms  cudaDeviceSynchronize
                    0.36%  5.7774ms         5  1.1555ms  45.699us  3.5776ms  cudaFree
                    0.14%  2.3464ms        10  234.64us  9.7750us  2.2127ms  cudaLaunchKernel
                    0.03%  436.95us       114  3.8320us     138ns  178.53us  cuDeviceGetAttribute
                    0.02%  364.30us         3  121.43us  10.

#### Output Check for Monte Carlo Permutation Test

In [29]:
import numpy as np
from scipy.stats import f_oneway

features = pd.read_csv("dataset.csv",header=None)[0].tolist()
filenames = [
    'c_monte_perm.csv', 
    'cuda_monte_perm.csv', 
    'cuda_shared_monte_perm.csv'
]
extreme_counts = []
p_values = []
maes = []

for filename in filenames:
    file_object = open(filename)
    content = file_object.read()
    permutations = []
    f_stats = []
    indices = []
    for i, row in enumerate(content.split('\n')):
        if i < 10:
            permutations.append(row[:-1]) 
        elif i < 20:
            index, f = row.split(',')
            indices.append(int(index))
            f_stats.append(float(f))
        elif row != '':
            extreme_count, p_value = row.split(',')
            extreme_counts.append(int(extreme_count))
            p_values.append(float(p_value))
            
    permutations = np.array(permutations)
    f_stats = np.array(f_stats)
    indices = np.array(indices)
    output_df = pd.DataFrame(np.vstack([indices, permutations, f_stats])).T
    output_df.columns = ['i', 'perm', 'f']
    output_df['i'] = output_df['i'].astype(int)
    output_df['f'] = output_df['f'].astype(float)
    actual_fs = []
    for perm in output_df['perm']:
        permuted_df = pd.DataFrame(np.vstack([np.array(perm.split(',')), features])).T
        permuted_df[0] = permuted_df[0].astype(int)
        permuted_df[1] = permuted_df[1].astype(float)
        keys = permuted_df[0].unique().tolist()
        input_features = []
        for key in keys:
            group_feature = permuted_df[permuted_df[0] == key][1].tolist()
            input_features.append(group_feature)
        actual_f, _ = f_oneway(*input_features)
        actual_fs.append(float(actual_f))
    output_df['actual_f'] = actual_fs
    output_df['abs_error'] = abs(output_df['actual_f'] - output_df['f'])
    maes.append(output_df['abs_error'].sum() / output_df.shape[0])
filenames = np.array(filenames)
extreme_counts = np.array(extreme_counts)
p_values = np.array(p_values)
maes = np.array(maes)

compiled_df = pd.DataFrame(np.vstack([filenames, extreme_counts, p_values, maes])).T
compiled_df.columns = ['method', 'extremes', 'p-values', 'MAE']
compiled_df['method'] = compiled_df['method'].apply(lambda x: x[:-4])
compiled_df

Unnamed: 0,method,extremes,p-values,MAE
0,c_monte_perm,448792,0.448792,2.291769371443042e-07
1,cuda_monte_perm,448792,0.448792,2.291769371443042e-07
2,cuda_shared_monte_perm,448792,0.448792,2.291769371443042e-07
