In [1]:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <performance.hpp>
#include <immintrin.h>

In [2]:
void add_index_vec(double *x, int n) {
    __m512d index, x_vec;
    for (int i = 0; i < n; i+=8) {
        x_vec = _mm512_load_pd(x+i); // load 8 double
        index = _mm512_set_pd(i+7, i+6, i+5, i+4, i+3, i+2, i+1, i);
        x_vec = _mm512_add_pd(x_vec, index); // add the two
        _mm512_store_pd(x+i, x_vec); // store back
    }
} 

In [3]:
void add_index(double *x, int n) {
    for (int i = 0; i < n; i+=1) {
        x[i] = x[i] + i;
    }
} 

In [4]:
void add_index_omp(double *x, int n) {
    #pragma omp simd
    for (int i = 0; i < n; i+=1) {
        x[i] = x[i] + i;
    }
} 

In [5]:
// start_main
double *x;
int n = 1024;
if (x != NULL) 
    free(x);

x = (double*) aligned_alloc(4096, sizeof(double)*n);
for(int i=0; i<n; i++){
  x[i]=3.4*i;
}
add_index(x,n);

// start_timing
for(int k = 0; k<5000; k++)
add_index(x,n);
// end_timing
// start_timing
for(int k = 0; k<5000; k++)
add_index_vec(x,n);
// end_timing
// start_timing
for(int k = 0; k<5000; k++)
add_index_omp(x,n);
// end_timing
// end_main

In [6]:
performance p{"vectorization.ipynb"};
display::lazy_image timingIm;

[NbConvertApp] Converting notebook /home/jovyan/materials/performance/test/notebooks/api/vectorization.ipynb to script
[NbConvertApp] Writing 1328 bytes to /home/jovyan/materials/performance/test/notebooks/api/vectorization_results/vectorization.cpp



In [7]:
p.print_compiler_optimization( "-O3 -fopt-info-vec -fopenmp-simd  -ftree-vectorizer-verbose=6  -march=skylake-avx512");


### Python file command_runner.py started:
g++ /home/jovyan/materials/performance/test/notebooks/api/vectorization_results/vectorization.cpp -o /home/jovyan/materials/performance/test/notebooks/api/vectorization_results/vectorization.exe -fopenmp -lm -O3 -fopt-info-vec -fopenmp-simd  -ftree-vectorizer-verbose=6  -march=skylake-avx512 -fopt-info-vec
/home/jovyan/materials/performance/test/notebooks/api/vectorization_results/vectorization.cpp:18:23: optimized: loop vectorized using 32 byte vectors
/home/jovyan/materials/performance/test/notebooks/api/vectorization_results/vectorization.cpp:18:23: optimized: loop vectorized using 16 byte vectors
/home/jovyan/materials/performance/test/notebooks/api/vectorization_results/vectorization.cpp:26:18: optimized: loop vectorized using 32 byte vectors
/home/jovyan/materials/performance/test/notebooks/api/vectorization_results/vectorization.cpp:26:18: optimized: loop vectorized using 16 byte vectors

