# Parallel Computing with Directive Language

## SAXPY with OpenMP

In [1]:
%%file saxpy_openmp.cc

#include <stdio.h>
#include <stdlib.h>

void saxpy_parallel(int n, float a, float *x, float *y)
{
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        y[i] = a*x[i] + y[i];
}    

int main(int argc, char **argv)
{
    float *x, *y, tmp;
    int n = 1<<16, i;

    x = (float*)malloc(n*sizeof(float));
    y = (float*)malloc(n*sizeof(float));

    #pragma omp parallel for
    for( i = 0; i < n; i++)
    {
        x[i] = 0.5f * i;
        y[i] = 0.2f * i;
    }

    saxpy_parallel(n, 2.0, x, y);

    /*
    for (i = 0; i < n; ++i) {
        printf("%f ", y[i]);
    }
    */
    
    free(x);
    free(y);
    
    return 0;
}

Overwriting saxpy_openmp.cc


In [2]:
%time
! ./saxpy_openmp

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.1 µs


## SAXPY with openACC

In [3]:
%%file saxpy_openacc.c

#include <stdio.h>
#include <stdlib.h>

void saxpy_parallel(int n, float a, float *x, float *y)
{
    #pragma acc kernel
    for (int i = 0; i < n; ++i)
        y[i] = a*x[i] + y[i];
}

int main(int argc, char **argv)
{
    float *x, *y, tmp;
    int n = 1<<16, i;
    
    x = (float*)malloc(n*sizeof(float));
    y = (float*)malloc(n*sizeof(float));

    #pragma acc kernel
    for( i = 0; i < n; i++)
    {
        x[i] = 0.5f * i;
        y[i] = 0.2f * i;
    }

    saxpy_parallel(n, 2.0, x, y);

    /*
    for (i = 0; i < n; ++i) {
        printf("%f ", y[i]);
    }
    */
    
    return 0;
}

Overwriting saxpy_openacc.c


In [4]:
%time
! ./saxpy_openacc

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.11 µs


## Matrix Sum

In [5]:
%%file mat_sum_mp.cc

#include <stdio.h>

/* matrix-sum-acc.c */
#define SIZE 1000
float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];

int main() {
    int i,j,k;

    // Initialize matrices.
    for (i = 0; i < SIZE; ++i) {
      for (j = 0; j < SIZE; ++j) {
          a[i][j] = (float)i + j;
          b[i][j] = (float)i - j;
          c[i][j] = 0.0f;
      }
    }

    // Compute matrix multiply
    #pragma omp parallel for
    for (i = 0; i < SIZE; ++i) {
      for (j = 0; j < SIZE; ++j) {
        //for (k = 0; k < SIZE; ++k) {
        //  c[i][j] = a[i][k] * b[k][j];
        //}
        c[i][j] = a[i][j] + b[i][j];
      }
    }

    // Print the result matrix.
    /*
    for (i = 0; i < SIZE; ++i) {
      for (j = 0; j < SIZE; ++j)
        printf("%f ", c[i][j]);
      printf("\n");
    }
    */
    printf("OpenMP matrix sum test was successful!\n");

    return 0;
}

Overwriting mat_sum_mp.cc


In [6]:
%time
! ./mat_sum_mp

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.86 µs
OpenMP matrix sum test was successful!


In [7]:
%%file mat_sum_acc.c

#include <stdio.h>

/* matrix-sum-acc.c */
#define SIZE 1000
float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];

int main() {
    int i,j,k;

    // Initialize matrices.
    for (i = 0; i < SIZE; ++i) {
      for (j = 0; j < SIZE; ++j) {
          a[i][j] = (float)i + j;
          b[i][j] = (float)i - j;
          c[i][j] = 0.0f;
      }
    }

    // Compute matrix multiply
    #pragma acc kernel
    for (i = 0; i < SIZE; ++i) {
      for (j = 0; j < SIZE; ++j) {
        //for (k = 0; k < SIZE; ++k) {
        //  c[i][j] = a[i][k] * b[k][j];
        //}
        c[i][j] = a[i][j] + b[i][j];
      }
    }

    // Print the result matrix.
    /*
    for (i = 0; i < SIZE; ++i) {
      for (j = 0; j < SIZE; ++j)
        printf("%f ", c[i][j]);
      printf("\n");
    }
    */
    printf("OpenACC matrix sum test was successful!\n");

    return 0;
}

Overwriting mat_sum_acc.c


In [8]:
%time
! ./mat_sum_acc

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.81 µs
OpenACC matrix sum test was successful!


### Makefile

In [9]:
%%file Makefile

CC=gcc

openmp: saxpy_openmp.cc
	$(CC) -fopenmp saxpy_openmp.cc -O3 -o saxpy_openmp

openacc: saxpy_openacc.c
	$(CC) saxpy_openacc.c -fopenacc -foffload=nvptx-none -foffload="-O3" -O3 -o saxpy_openacc 
    
sum_mp: mat_sum_mp.cc
	$(CC) -fopenmp mat_sum_mp.cc -O3 -o mat_sum_mp

sum_acc: mat_sum_acc.c
	$(CC) mat_sum_acc.c -fopenacc -foffload=nvptx-none -foffload="-O3" -O3 -o mat_sum_acc 

Overwriting Makefile
