# Install nvcc_plugin - Prepare Enviroment

In [1]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git #from=> https://medium.com/@iphoenix179/running-cuda-c-c-in-jupyter-or-how-to-run-nvcc-in-google-colab-663d33f53772

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-yz1wwwf6
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-yz1wwwf6
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=81d3e2c71424cc22f13338e90c7194ecfd35cd6c7590a338220e4c91119ff770
  Stored in directory: /tmp/pip-ephem-wheel-cache-4dthln1a/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin 

created output directory at /content/src
Out bin /content/result.out


In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [5]:
#create directories for project
!mkdir src/
!mkdir output/
!mkdir benchmarks/
!mkdir validation/
!mkdir inc/
!mkdir lib/

mkdir: cannot create directory ‘src/’: File exists


In [0]:
#copy conf folder with its files to my dir
!cp -r /content/gdrive/My\ Drive/ising/conf /content/

In [0]:
#copy header file
!cp -r /content/gdrive/My\ Drive/ising/inc/ising.h /content/inc/

In [0]:
#copy extra header file
!cp -r /content/gdrive/My\ Drive/ising/inc/cuda.h /content/inc/

# Source files(v0,v1,v2,v3)

In [82]:
%%writefile src/ising-sequential.c
/*
*       V0. Sequential: Simulation of an Ising model in two dimensions of size nxn for k iterations, staring from a uniform random initial state
*       Author:Tsalidis Georgios 27/12/2019
*       gtsalidis@ece.auth.gr
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

// Ising model evolution
/*
  \param G      Spins on the square lattice             [n-by-n]
  \param w      Weight matrix                           [5-by-5]
  \param k      Number of iterations                    [scalar]
  \param n      Number of lattice points per dim        [scalar]
  NOTE: Both matrices G and w are stored in row-major format.
*/
void ising(int *G, double *w, int k, int n)
{
  int *g_tmp = new int[n*n]; //read from one

	// swap G and g_tmp pointeers for both matrixes
	int *swapg;

	// Variable to store the value of each moment
	double influence = 0;
 
  // Indexes of neibghors checked
	int idx_X, idx_Y;

	//Iterate k times
	for(int i = 0; i < k; i++)
	{
		//loop through G
		for(int x=0; x<n; x++)
			for(int y=0; y<n; y++)
			{
				influence = 0;

				// loop through the moment neighbors
				for(int X=0; X<5; X++)
					for(int Y=0; Y<5; Y++)
					{
						// find idx of checked point
						idx_X = (x + (X-2) + n) % n;
						idx_Y = (y + (Y-2) + n) % n;

						influence += *(w + Y*5 + X) * *(G +idx_Y*n + idx_X);
					}

		    //the value of the sign of influence If positive -> 1,If negative -> -1
				if(influence > 0.001)
					*(g_tmp + y*n + x) = 1;
				else if(influence < -0.001)
					*(g_tmp + y*n + x) = -1;
				else
         //remains the same
					*(g_tmp + y*n + x) = *(G + y*n + x);

  }

		// Swap pointers for next iteration
		swapg = G;
		G = g_tmp;
		g_tmp = swapg;
	}

  // Handle situation for odd 
	if(k%2 != 0)
		memcpy(g_tmp, G, n*n*sizeof(int));
 
}

Overwriting src/ising-sequential.c


In [83]:
%%writefile src/ising-v1.cu
/*
*       V1. GPU with one thread per moment 
*       Author:Tsalidis Georgios 2/1/2020
*       gtsalidis@ece.auth.gr
*/

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

#define BLOCK_SIZE 128 // value usually chosen by tuning and hardware constraints

#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)

// See: http://codeyarns.com/2011/03/02/how-to-do-error-checking-in-cuda/
inline void
__cuda_check_errors (const char *filename, const int line_number)
{
  cudaError err = cudaDeviceSynchronize ();
  if (err != cudaSuccess)
    {
      printf ("CUDA error %i at %s:%i: %s\n",
          err, filename, line_number, cudaGetErrorString (err));
      exit (-1);
    }
}


__global__ void ising_kernel(double* gpu_w, int* gpu_G, int* gpu_Gtmp, int n);
bool evaluate(int *G1,int *G2, int n);


//kernel function used to calculate one moment per thread
__global__ void ising_kernel(double* gpu_w, int* gpu_G, int* gpu_Gtmp, int n)
{
	//calculate thread_id
	int thread_id = blockIdx.x*blockDim.x + threadIdx.x;

	// moments x,y coordinates
	int y = thread_id%n;
	int x = thread_id/n;


	// the value of each moment
	double influence;
	
	// Indexes of neibghors checked
	int idx_x, idx_y;

	if( thread_id < n*n )
	{
		// loop through the moment neighbors
	    for(int X=0; X<5; X++)
	        for(int Y=0; Y<5; Y++)
	        {
				// skips the current iteration of the loop and continues with the next iteration.
	            if((X == 2) && (Y == 2))
	                continue;  
				
	            //find idx of checked point
	            idx_x = (x + (X-2) + n) % n;
	            idx_y = (y + (Y-2) + n) % n;

	            influence += *(gpu_w + X*5 + Y) * *(gpu_G +idx_x*n + idx_y);
	        }

	    //the value of the sign of influence If positive -> 1,If negative -> -1
		if(influence > 0.0001)
		{
			*(gpu_Gtmp + x*n + y) = 1;
		}
		else if(influence < -0.0001)
		{
			*(gpu_Gtmp + x*n + y) = -1;
		}
	    else
			//remains the same
	        *(gpu_Gtmp + x*n + y) = *(gpu_G + x*n + y);
	}
}

void ising(int *G, double *w, int k, int n)
{
   
	double *gpu_w;
	int *gpu_G;
	
	// allocate weight array and G array
	cudaMalloc(&gpu_w, 5*5*sizeof(double));
	cudaMalloc(&gpu_G, n*n*sizeof(int));
	
	
	//transfer data to device(GPU)
	cudaMemcpy(gpu_w, w, 5*5*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_G, G, n*n*sizeof(int), cudaMemcpyHostToDevice);

	//GPU array to store the updated values
	int *gpu_Gtmp;
	cudaMalloc(&gpu_Gtmp, n*n*sizeof(int));

	// gpu_G with gpu_Gtmp pointer swap
	int *temp;

	int blocks = (n*n + BLOCK_SIZE - 1)/BLOCK_SIZE;
	
	//run for k iterations
	for(int i = 0; i < k; i++)
	{
		//run kernel function to device
		ising_kernel<<< blocks , BLOCK_SIZE >>>(gpu_w, gpu_G, gpu_Gtmp, n);
		CUDA_CHECK_ERROR ();

		//Synchronize 
		cudaDeviceSynchronize();

		//swap pointers 
		temp = gpu_G;
		gpu_G = gpu_Gtmp;
		gpu_Gtmp = temp;
	}

	cudaMemcpy(G, gpu_G, n*n*sizeof(int), cudaMemcpyDeviceToHost);

	// free GPU memory
	cudaFree(gpu_w);
	cudaFree(gpu_G);
	cudaFree(gpu_Gtmp);
}

Overwriting src/ising-v1.cu


In [84]:
%%writefile src/ising-v2.cu
/*
*       V2. GPU with one thread computing a block of moments
*       Author:Tsalidis Georgios 5/1/2020
*       gtsalidis@ece.auth.gr
*/

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

#define BLOCK_SIZE 128 // value usually chosen by tuning and hardware constraints
#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)

// See: http://codeyarns.com/2011/03/02/how-to-do-error-checking-in-cuda/
inline void
__cuda_check_errors (const char *filename, const int line_number)
{
  cudaError err = cudaDeviceSynchronize ();
  if (err != cudaSuccess)
    {
      printf ("CUDA error %i at %s:%i: %s\n",
          err, filename, line_number, cudaGetErrorString (err));
      exit (-1);
    }
}


__global__ void ising_kernel(double* gpu_w, int* gpu_G, int* gpu_Gtmp, int n);
bool evaluate(int *G1,int *G2, int n);


//kernel function used to calculate one thread with a block of moments
__global__ void ising_kernel(double* gpu_w, int* gpu_G, int* gpu_Gtmp, int n)
{
	//calculate thread_id
	int thread_id = blockIdx.x*blockDim.x + threadIdx.x;

	// the value of each moment
	double influence;

	// moments x,y coordinates
	int x, y;
	 
	// Indexes of neibghors checked
	int idx_x, idx_y;
	int next_thr = gridDim.x*blockDim.x;

	//each thread to compute a block of moments
	for(int thread = thread_id; thread<n*n; thread+= next_thr)
	{
		// moments x,y coordinates
		y = thread%n;
		x = thread/n;
		influence = 0;
		// loop through the moment neighbors
	    for(int X=0; X<5; X++)
	        for(int Y=0; Y<5; Y++)
	        {
				// skips the current iteration of the loop and continues with the next iteration.
	            if((X == 2) && (Y == 2))
	                continue;  
				
	            //find idx of checked point
	            idx_x = (x + (X-2) + n) % n;
	            idx_y = (y + (Y-2) + n) % n;

	            influence += *(gpu_w + X*5 + Y) * *(gpu_G +idx_x*n + idx_y);
	        }

	    //the value of the sign of influence If positive -> 1,If negative -> -1
		if(influence > 0.0001)
		{
			*(gpu_Gtmp + x*n + y) = 1;
		}
		else if(influence < -0.0001)
		{
			*(gpu_Gtmp + x*n + y) = -1;
		}
	    else
			//remains the same
	        *(gpu_Gtmp + x*n + y) = *(gpu_G + x*n + y);
	}
}


void ising(int *G, double *w, int k, int n)
{
   
	double *gpu_w;
	int *gpu_G;
	
	// allocate weight array and G array
	cudaMalloc(&gpu_w, 5*5*sizeof(double));
	cudaMalloc(&gpu_G, n*n*sizeof(int));
	
	
	//transfer data to device(GPU)
	cudaMemcpy(gpu_w, w, 5*5*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_G, G, n*n*sizeof(int), cudaMemcpyHostToDevice);

	//GPU array to store the updated values
	int *gpu_Gtmp;
	cudaMalloc(&gpu_Gtmp, n*n*sizeof(int));

	// gpu_G with gpu_Gtmp pointer swap
	int *temp;

	//dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
	//dim3 dimGrid(GRID_DIM_X,GRID_DIM_Y);
	int block = BLOCK_SIZE;
	int grid = (n + block -1)/block;

	//run for k iterations
	for(int i = 0; i < k; i++)
	{
		//run kernel function to device
		ising_kernel<<< grid , block >>>(gpu_w, gpu_G, gpu_Gtmp, n);
		
		//check for device errors
		CUDA_CHECK_ERROR ();
		
		//Synchronize 
		cudaDeviceSynchronize();

		//swap pointers 
		temp = gpu_G;
		gpu_G = gpu_Gtmp;
		gpu_Gtmp = temp;
	}

	cudaMemcpy(G, gpu_G, n*n*sizeof(int), cudaMemcpyDeviceToHost);

	// free GPU memory
	cudaFree(gpu_w);
	cudaFree(gpu_G);
	cudaFree(gpu_Gtmp);
}


Overwriting src/ising-v2.cu


In [85]:
%%writefile src/ising-v3.cu
/*
*       V3. GPU with multiple thread sharing common input moments
*       Author:Tsalidis Georgios 5/1/2020
*       gtsalidis@ece.auth.gr
*/

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define LEN 2

#define BLOCK_X 128
#define BLOCK_Y 24
#define GRID_X 4
#define GRID_Y 4


// See: http://codeyarns.com/2011/03/02/how-to-do-error-checking-in-cuda/
inline void
__cuda_check_errors (const char *filename, const int line_number)
{
  cudaError err = cudaDeviceSynchronize ();
  if (err != cudaSuccess)
    {
      printf ("CUDA error %i at %s:%i: %s\n",
          err, filename, line_number, cudaGetErrorString (err));
      exit (-1);
    }
}


__global__ void ising_kernel(double* gpu_w, int* gpu_G, int* gpu_Gtmp, int n);
bool evaluate(int *G1,int *G2, int n);


//kernel function used to calculate one thread with a block of moments
__global__ void ising_kernel(double* gpu_w, int* gpu_G, int* gpu_Gtmp, int n)
{
    int s_ncols = blockDim.x + 2*LEN;//num of  rows
 
    __shared__ double s_w[5*5];
    __shared__ int s_G[(BLOCK_X+2*LEN) * (BLOCK_Y+2*LEN)];


    for(int i=0; i<5*5; i++){
        *(s_w + i) = *(gpu_w+i);
    }

	// the value of each moment
	double influence;

	// moments x,y coordinates
	int x = blockDim.x*blockIdx.x + threadIdx.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y;

	int s_x = threadIdx.x + LEN;
    int s_y = threadIdx.y + LEN;
 
    //thread increament
    int next_x = blockDim.x *gridDim.x ;
    int next_y = blockDim.y *gridDim.y ;

    //cordinates for neibghors on shared
    int n_x, n_y;

	// Indexes of neibghors checked
	int idx_x, idx_y;

	//each thread to compute a block of moments
	for(int i = x; i< n + LEN; i+= next_x)
	{
        for(int j = y; j< n + LEN; j+= next_y)
        {

            *(s_G + s_x*s_ncols + s_y) = *(gpu_G + ((i + n)%n)*n +  (j + n)%n);
         
         
			//For right and left
			if(threadIdx.x < LEN)
			{
				n_x = s_x;
				idx_x = (i + n)%n;

				for(int p=0; p<2; p++)
				{
					int count = (p-1)*LEN + p*blockDim.x;
					n_y = s_y + count;
					idx_y = (j + count + n)%n;
					s_G[n_x*s_ncols + n_y] = gpu_G[idx_x*n + idx_y];
				}
			}

			//For bot and top
			if(threadIdx.y < LEN)
			{
				n_y = s_y;
				idx_y = (j + n)%n;

				for(int p=0; p<2; p++)
				{
					int count = (p-1)*LEN + p*blockDim.y;
					n_x = s_x + count;
					idx_x = (i + count + n)%n;
					s_G[n_x*s_ncols + n_y] = gpu_G[idx_x*n + idx_y];
				}
			}

			//For corners
			if( (threadIdx.x < LEN) && (threadIdx.y<LEN) )
			{
				for(int p=0; p<4; p++)
				{
					int count_x = (p%2 - 1)*LEN + (p%2)*blockDim.y;
					n_x = s_x + count_x;
					idx_x = (i + count_x + n)%n;

					int count_y = ((p+3)%(p+1)/2 - 1)*LEN + ((p+3)%(p+1)/2)*blockDim.x;
					n_y = s_y + count_y;
					idx_y = (j + count_y + n)%n;

					s_G[n_x*s_ncols + n_y] = gpu_G[idx_x*n + idx_y];
				}
			}

			// Synchronize threads
			__syncthreads();

            if(i<n && j<n){
                influence = 0;
                // loop through the moment neighbors
                for(int X=0; X<5; X++)
                    for(int Y=0; Y<5; Y++)
                    {
                        // skips the current iteration of the loop and continues with the next iteration.
                        if((X == 2) && (Y == 2))
                            continue;  
                        
                        //find idx of checked point
                        idx_x = (x + (X-2) + n) % n;
                        idx_y = (y + (Y-2) + n) % n;
                     
                        influence += *(s_w + X*5 + Y) * *(s_G + (2+X+s_x)*s_ncols + (Y+s_x));
                    }

                //the value of the sign of influence If positive -> 1,If negative -> -1
                if(influence > 0.0001)
                {
                    *(gpu_Gtmp + i*n + j) = 1;
                }
                else if(influence < -0.0001)
                {
                    *(gpu_Gtmp + i*n + j) = -1;
                }
                else
                    //remains the same
                    *(gpu_Gtmp + i*n + j) = *(s_G + s_x*s_ncols + s_y);
            }
         __syncthreads();
        }
	}
}

void ising(int *G, double *w, int k, int n)
{
   
	double *gpu_w;
	int *gpu_G;
	
	// allocate weight array and G array
	cudaMalloc(&gpu_w, 5*5*sizeof(double));
	cudaMalloc(&gpu_G, n*n*sizeof(int));
	
	
	//transfer data to device(GPU)
	cudaMemcpy(gpu_w, w, 5*5*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_G, G, n*n*sizeof(int), cudaMemcpyHostToDevice);

	//GPU array to store the updated values
	int *gpu_Gtmp;
	cudaMalloc(&gpu_Gtmp, n*n*sizeof(int));

	// gpu_G with gpu_Gtmp pointer swap
	int *temp;

    //initialize blocks and threads => dim3 is an integer vector type based on uint3 
    //that is used to specify dimensions. When defining a variable of type dim3, any component left unspecified is initialized to 1

    dim3 block(BLOCK_X,BLOCK_Y); // blockDim
    dim3 grid(GRID_X,GRID_Y); // gridDim

	//run for k iterations
	for(int i = 0; i < k; i++)
	{

		//run kernel function to device
		ising_kernel<<< grid , block >>>(gpu_w, gpu_G, gpu_Gtmp, n);
  
		//check for device errors
		CUDA_CHECK_ERROR ();
		
		//Synchronize 
		cudaDeviceSynchronize();

		//swap pointers 
		temp = gpu_G;
		gpu_G = gpu_Gtmp;
		gpu_Gtmp = temp;
	}

	cudaMemcpy(G, gpu_G, n*n*sizeof(int), cudaMemcpyDeviceToHost);

	// free GPU memory
	cudaFree(gpu_w);
	cudaFree(gpu_G);
	cudaFree(gpu_Gtmp);
}


Overwriting src/ising-v3.cu


# Validation Source

In [86]:
%%writefile validation/main_validate.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

bool evaluate(int *G1,int *G2, int n ){
  for(int i=0;i<n*n;i++){
    if(G1[i]!=G2[i]){
      return false;
    }
  }
  return true;
}


int main() {
  
    int n = 517;
    //int k = 11;

    // weight matrix 𝑤 
    double weights[] = {0.004, 0.016, 0.026, 0.016, 0.004,
                		0.016, 0.071, 0.117, 0.071, 0.016,
            			0.026, 0.117, 0    , 0.117, 0.026,
            			0.016, 0.071, 0.117, 0.071, 0.016,
            			0.004, 0.016, 0.026, 0.016, 0.004};


    //Getting the initial situation of the lattice
    int *data =(int *)malloc((size_t)sizeof(int)*n*n);
    FILE *f = fopen("conf/conf-init.bin", "rb");
    fread(data,sizeof(int),n*n,f); // read bytes to our buffer

    int *stateInit =(int *)malloc((size_t)sizeof(int)*n*n);
    memcpy (stateInit, data, (size_t)sizeof(int)*n*n);

    //validate for k=1 
    memcpy (stateInit, data, (size_t)sizeof(int)*n*n);
    //Run ising proc
    ising(stateInit, weights,1, n);

    int *stateNxt_1 =(int *)malloc((size_t)sizeof(int)*n*n);
    FILE * fptr_1 = fopen("conf/conf-1.bin", "rb");
    fread(stateNxt_1,sizeof(int),n*n,fptr_1); // read file
    
    bool result_1 = evaluate(stateInit,stateNxt_1,n);
    if(result_1)
      printf("k=1: CORRECT \n");
    else
      printf("k=1: WRONG \n");
    
    free(stateNxt_1);

    //validate for k=4 
    memcpy (stateInit, data, (size_t)sizeof(int)*n*n);
    //Run ising proc
    ising(stateInit, weights,4, n);

    int *stateNxt_4 =(int *)malloc((size_t)sizeof(int)*n*n);
    FILE * fptr_4 = fopen("conf/conf-4.bin", "rb");
    fread(stateNxt_4,sizeof(int),n*n,fptr_4); // read file
   
    bool result_4=evaluate(stateInit,stateNxt_4,n);
    if(result_4)
      printf("k=4: CORRECT \n");
    else
      printf("k=4: WRONG \n");
  
    free(stateNxt_4);

    //validate for k=11 
    memcpy (stateInit, data, (size_t)sizeof(int)*n*n);
    //Run ising proc
    ising(stateInit, weights,11, n);

    int *stateNxt_11 =(int *)malloc((size_t)sizeof(int)*n*n);
    FILE * fptr_11 = fopen("conf/conf-11.bin", "rb");
    fread(stateNxt_11,sizeof(int),n*n,fptr_11); // read file
   
    bool result_11=evaluate(stateInit,stateNxt_11,n);
    if(result_11)
       printf("k=11: CORRECT \n");
    else
       printf("k=11: WRONG \n");

    free(stateNxt_11);

    return 0;
}

Overwriting validation/main_validate.c


In [87]:
%%writefile validation/main_validate_cuda.cu
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

//function for validation 
bool evaluate(int *G1,int *G2, int n ){
  for(int i=0;i<n*n;i++){
    if(G1[i]!=G2[i]){
      return false;
    }
  }
  return true;
}

int main() {
    
    int n = 517;
    //int k = 11;

    // weight matrix 𝑤 
    double weights[] = {0.004, 0.016, 0.026, 0.016, 0.004,
                		0.016, 0.071, 0.117, 0.071, 0.016,
            			0.026, 0.117, 0    , 0.117, 0.026,
            			0.016, 0.071, 0.117, 0.071, 0.016,
            			0.004, 0.016, 0.026, 0.016, 0.004};


    //Getting the initial situation of the lattice
    int *data =(int *)malloc((size_t)sizeof(int)*n*n);
    FILE *f = fopen("conf/conf-init.bin", "rb");
    fread(data,sizeof(int),n*n,f); // read bytes to our buffer

    int *stateInit =(int *)malloc((size_t)sizeof(int)*n*n);
    memcpy (stateInit, data, (size_t)sizeof(int)*n*n);

    //validate for k=1 
    memcpy (stateInit, data, (size_t)sizeof(int)*n*n);
    
	cudaEvent_t start1, stop1;
	cudaEventCreate(&start1);
	cudaEventCreate(&stop1);
	cudaEventRecord(start1);

    //Run ising proc
    ising(stateInit, weights,1, n);

	cudaEventRecord(stop1);
	cudaEventSynchronize(stop1);
	float time1 = 0;
	cudaEventElapsedTime(&time1, start1, stop1);

    
    //Run ising proc
    //ising(stateInit, weights,1, n);

    int *stateNxt_1 =(int *)malloc((size_t)sizeof(int)*n*n);
    FILE * fptr_1 = fopen("conf/conf-1.bin", "rb");
    fread(stateNxt_1,sizeof(int),n*n,fptr_1); // read file
    
    bool result_1 = evaluate(stateInit,stateNxt_1,n);
    if(result_1)
      printf("k=1: CORRECT \n");
    else
      printf("k=1: WRONG \n");
    
    free(stateNxt_1);

    //validate for k=4 
    memcpy (stateInit, data, (size_t)sizeof(int)*n*n);

	cudaEvent_t start4, stop4;
	cudaEventCreate(&start4);
	cudaEventCreate(&stop4);
	cudaEventRecord(start4);

    //Run ising proc
    ising(stateInit, weights,4, n);

	cudaEventRecord(stop4);
	cudaEventSynchronize(stop4);
	float time4 = 0;
	cudaEventElapsedTime(&time4, start4, stop4);


    int *stateNxt_4 =(int *)malloc((size_t)sizeof(int)*n*n);
    FILE * fptr_4 = fopen("conf/conf-4.bin", "rb");
    fread(stateNxt_4,sizeof(int),n*n,fptr_4); // read file
   
    bool result_4=evaluate(stateInit,stateNxt_4,n);
    if(result_4)
      printf("k=4: CORRECT \n");
    else
      printf("k=4: WRONG \n");
  
    free(stateNxt_4);

    //validate for k=11 
    memcpy (stateInit, data, (size_t)sizeof(int)*n*n);

	cudaEvent_t start11, stop11;
	cudaEventCreate(&start11);
	cudaEventCreate(&stop11);
	cudaEventRecord(start11);
    
    //Run ising proc
    ising(stateInit, weights,11, n);

	cudaEventRecord(stop11);
	cudaEventSynchronize(stop11);
	float time11 = 0;
	cudaEventElapsedTime(&time11, start11, stop11);



    int *stateNxt_11 =(int *)malloc((size_t)sizeof(int)*n*n);
    FILE * fptr_11 = fopen("conf/conf-11.bin", "rb");
    fread(stateNxt_11,sizeof(int),n*n,fptr_11); // read file
   
    bool result_11=evaluate(stateInit,stateNxt_11,n);
    if(result_11)
       printf("k=11: CORRECT \n");
    else
       printf("k=11: WRONG \n");

    free(stateNxt_11);

    return 0;
}

Overwriting validation/main_validate_cuda.cu


# Run Validation


In [96]:
!g++ src/ising-sequential.c validation/main_validate.c -o output/validate_v0 && ./output/validate_v0

k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 


In [97]:
!nvcc src/ising-v1.cu validation/main_validate_cuda.cu -o output/validate_v1 && ./output/validate_v1

k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 


In [98]:
!nvcc src/ising-v2.cu validation/main_validate_cuda.cu -o output/validate_v2 && ./output/validate_v2

k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 


In [99]:
!nvcc src/ising-v3.cu validation/main_validate_cuda.cu -o output/validate_v3 && ./output/validate_v3

k=1: WRONG 
k=4: WRONG 
k=11: WRONG 


# NV Profiling and Mem Checking

In [0]:
!nvprof ./output/validate_v1

==2061== NVPROF is profiling process 2061, command: ./output/validate_v1
k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 
==2061== Profiling application: ./output/validate_v1
==2061== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   90.79%  5.3380ms        16  333.63us  332.74us  334.43us  ising_kernel(double*, int*, int*, int)
                    4.81%  282.53us         6  47.088us  1.8560us  92.640us  [CUDA memcpy HtoD]
                    4.40%  258.91us         3  86.304us  85.120us  88.384us  [CUDA memcpy DtoH]
      API calls:   95.47%  197.65ms         6  32.942ms     770ns  197.64ms  cudaEventCreate
                    2.63%  5.4380ms        32  169.94us  1.4330us  344.50us  cudaDeviceSynchronize
                    0.76%  1.5814ms         9  175.71us  14.475us  292.44us  cudaMemcpy
                    0.40%  834.44us         9  92.715us  8.0500us  178.64us  cudaMalloc
                    0.32%  670.57us         9 

In [0]:
!nvprof ./output/validate_v2

==2132== NVPROF is profiling process 2132, command: ./output/validate_v2
k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 
==2132== Profiling application: ./output/validate_v2
==2132== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.00%  53.830ms        16  3.3644ms  3.3268ms  3.3936ms  ising_kernel(double*, int*, int*, int)
                    0.52%  283.23us         6  47.205us  1.8240us  92.704us  [CUDA memcpy HtoD]
                    0.48%  258.56us         3  86.186us  85.888us  86.720us  [CUDA memcpy DtoH]
      API calls:   76.94%  194.95ms         6  32.491ms     798ns  194.92ms  cudaEventCreate
                   21.31%  54.000ms        32  1.6875ms  1.6200us  3.3976ms  cudaDeviceSynchronize
                    0.67%  1.6934ms         9  188.15us  14.374us  320.60us  cudaMemcpy
                    0.35%  884.41us         9  98.267us  7.8890us  167.42us  cudaMalloc
                    0.28%  720.03us         9 

In [0]:
!nvprof ./output/validate_v3

In [92]:
!cuda-memcheck ./output/validate_v1 |more

k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 


In [93]:
!cuda-memcheck ./output/validate_v2 |more

k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 


In [94]:
!cuda-memcheck ./output/validate_v3 |more

nfiguration argument" on CUDA API call to cudaLaunchKernel. 
e7) [0x21b97]
nfiguration argument" on CUDA API call to cudaLaunchKernel. 
[K

# Benchmark source


In [106]:
%%writefile benchmarks/benchmark_v0.c
/*
*       Benchmarking Sequential: Simulation of an Ising model in two dimensions of size nxn for k iterations, staring from a uniform random initial state
*       Author:Tsalidis Georgios 27/12/2019
*       gtsalidis@ece.auth.gr
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include  <time.h>
#include "../inc/ising.h"

struct timespec start, finish;
double elapsed;

static int n[]={50,300,600,900,1400,1800,2500,3800,6000};//9
static int k[]={1, 5, 10 , 30 ,60, 100};//6

int main() {

  printf("=====================BENCHMARKING SEQUENTIAL START=====================\n");

  double weights[]={ 0.004,  0.016,  0.026,  0.016,   0.004,
                       0.016,  0.071,  0.117,  0.071,   0.016,
                       0.026,  0.117,    0  ,  0.117,   0.026,
                       0.016,  0.071,  0.117,  0.071,   0.016,
                       0.004,  0.016,  0.026,  0.016,   0.004};





  for (int i = 0; i < 9; i++) {
    for (int j = 0; j < 6; j++) {
      FILE *pointerToFile;
      int * sample;

      sample=(int *)malloc((size_t)n[i]*n[i]*sizeof(int));
      for (size_t ii = 0; ii < n[i]*n[i]; ii++) {
        sample[ii]=(rand()%2 );
        if(sample[ii]==0){
          sample[ii]=-1;
        }
      }


      clock_gettime(CLOCK_MONOTONIC, &start);
      ising(sample, weights,k[j], n[i]);
      clock_gettime(CLOCK_MONOTONIC, &finish);
      elapsed = (finish.tv_sec - start.tv_sec);
      elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;

      pointerToFile=fopen("benchmarks/results_v0.csv","a");
      fprintf(pointerToFile,"%d,%d,%lf\n",n[i],k[j],elapsed);
      printf("Ising model evolution for n=%d, k=%d ,took %lf seconds! \n",n[i],k[j], elapsed );
      free(sample);
    }

  }
  printf("\n");

  printf("=====================BENCHMARKING SEQUENTIAL END=====================\n");


}

Overwriting benchmarks/benchmark_v0.c


In [107]:
%%writefile benchmarks/benchmark_v1.cu
/*
*       Benchmarking GPU with one thread per moment 
*       Author:Tsalidis Georgios 2/1/2020
*       gtsalidis@ece.auth.gr
*/

#include  <time.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

struct timespec start, finish;
double elapsed;


static int n[]={50,300,600,900,1400,1800,2500,3800,6000};//9
static int k[]={1, 5, 10 , 30 ,60, 100};//6


int main() {

  printf("=====================BENCHMARKING CUDA V1 START=====================\n");

  double weights[]={ 0.004,  0.016,  0.026,  0.016,   0.004,
                       0.016,  0.071,  0.117,  0.071,   0.016,
                       0.026,  0.117,    0  ,  0.117,   0.026,
                       0.016,  0.071,  0.117,  0.071,   0.016,
                       0.004,  0.016,  0.026,  0.016,   0.004};


  for (int i = 0; i < 9; i++) {
    for (int j = 0; j < 6; j++) {
      FILE *pointerToFile;
      int * sample;

      sample=(int *)malloc((size_t)n[i]*n[i]*sizeof(int));
      for (size_t ii = 0; ii < n[i]*n[i]; ii++) {
        sample[ii]=(rand()%2 );
        if(sample[ii]==0){
          sample[ii]=-1;
        }
      }


      clock_gettime(CLOCK_MONOTONIC, &start);
      ising(sample, weights,k[j], n[i]);
      clock_gettime(CLOCK_MONOTONIC, &finish);
      elapsed = (finish.tv_sec - start.tv_sec);
      elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;

      pointerToFile=fopen("benchmarks/results_v1.csv","a");
      fprintf(pointerToFile,"%d,%d,%lf\n",n[i],k[j],elapsed);
      printf("Ising model evolution for n=%d, k=%d ,took %lf seconds! \n",n[i],k[j], elapsed );
      free(sample);
    }

  }
  printf("\n");

  printf("=====================BENCHMARKING CUDA V1 END=====================\n");


}

Overwriting benchmarks/benchmark_v1.cu


In [108]:
%%writefile benchmarks/benchmark_v2.cu
/*
*       Benchmarking GPU with one thread per moment 
*       Author:Tsalidis Georgios 2/1/2020
*       gtsalidis@ece.auth.gr
*/

#include  <time.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

struct timespec start, finish;
double elapsed;

static int n[]={50,300,600,900,1400,1800,2500,3800,6000};//9
static int k[]={1, 5, 10 , 30 ,60, 100};//6

int main() {


  printf("=====================BENCHMARKING CUDA V2 START=====================\n");
  double weights[]={ 0.004,  0.016,  0.026,  0.016,   0.004,
                       0.016,  0.071,  0.117,  0.071,   0.016,
                       0.026,  0.117,    0  ,  0.117,   0.026,
                       0.016,  0.071,  0.117,  0.071,   0.016,
                       0.004,  0.016,  0.026,  0.016,   0.004};





  // clock_t t;
  // double time_taken;

  for (int i = 0; i < 9; i++) {
    for (int j = 0; j < 6; j++) {
      FILE *pointerToFile;
      int * sample;

      sample=(int *)malloc((size_t)n[i]*n[i]*sizeof(int));
      for (size_t ii = 0; ii < n[i]*n[i]; ii++) {
        sample[ii]=(rand()%2 );
        if(sample[ii]==0){
          sample[ii]=-1;
        }
      }


      clock_gettime(CLOCK_MONOTONIC, &start);
      ising(sample, weights,k[j], n[i]);
      clock_gettime(CLOCK_MONOTONIC, &finish);
      elapsed = (finish.tv_sec - start.tv_sec);
      elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;

      pointerToFile=fopen("benchmarks/results_v2.csv","a");
      fprintf(pointerToFile,"%d,%d,%lf\n",n[i],k[j],elapsed);
      printf("Ising model evolution for n=%d, k=%d ,took %lf seconds! \n",n[i],k[j], elapsed );
      free(sample);
    }

  }
  printf("\n");

  printf("=====================BENCHMARKING CUDA V2 END=====================\n");


}

Overwriting benchmarks/benchmark_v2.cu


In [109]:
%%writefile benchmarks/benchmark_v3.cu
/*
*       Benchmarking GPU with one thread per moment 
*       Author:Tsalidis Georgios 2/1/2020
*       gtsalidis@ece.auth.gr
*/

#include  <time.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../inc/ising.h"

struct timespec start, finish;
double elapsed;

static int n[]={50,300,600,900,1400,1800,2500,3800,6000};//9
static int k[]={1, 5, 10 , 30 ,60, 100};//6

int main() {


  printf("=====================BENCHMARKING CUDA 3 START=====================\n");
  double weights[]={ 0.004,  0.016,  0.026,  0.016,   0.004,
                       0.016,  0.071,  0.117,  0.071,   0.016,
                       0.026,  0.117,    0  ,  0.117,   0.026,
                       0.016,  0.071,  0.117,  0.071,   0.016,
                       0.004,  0.016,  0.026,  0.016,   0.004};





  // clock_t t;
  // double time_taken;

  for (int i = 0; i < 9; i++) {
    for (int j = 0; j < 6; j++) {
      FILE *pointerToFile;
      int * sample;

      sample=(int *)malloc((size_t)n[i]*n[i]*sizeof(int));
      for (size_t ii = 0; ii < n[i]*n[i]; ii++) {
        sample[ii]=(rand()%2 );
        if(sample[ii]==0){
          sample[ii]=-1;
        }
      }


      clock_gettime(CLOCK_MONOTONIC, &start);
      ising(sample, weights,k[j], n[i]);
      clock_gettime(CLOCK_MONOTONIC, &finish);
      elapsed = (finish.tv_sec - start.tv_sec);
      elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;

      pointerToFile=fopen("benchmarks/results_v3.csv","a");
      fprintf(pointerToFile,"%d,%d,%lf\n",n[i],k[j],elapsed);
      printf("Ising model evolution for n=%d, k=%d ,took %lf seconds! \n",n[i],k[j], elapsed );
      free(sample);
    }

  }
  printf("\n");

  printf("=====================BENCHMARKING CUDA V3 END=====================\n");


}

Overwriting benchmarks/benchmark_v3.cu


# Run Benchmarks and Store Results

In [113]:
!g++ benchmarks/benchmark_v0.c src/ising-sequential.c -o benchmarks/benchmark_v0 && ./benchmarks/benchmark_v0

Ising model evolution for n=50, k=1 ,took 0.000630 seconds! 
Ising model evolution for n=50, k=5 ,took 0.003312 seconds! 
Ising model evolution for n=50, k=10 ,took 0.006908 seconds! 
Ising model evolution for n=50, k=30 ,took 0.019869 seconds! 
Ising model evolution for n=50, k=60 ,took 0.039095 seconds! 
Ising model evolution for n=50, k=100 ,took 0.071215 seconds! 
Ising model evolution for n=300, k=1 ,took 0.024534 seconds! 
Ising model evolution for n=300, k=5 ,took 0.120082 seconds! 
Ising model evolution for n=300, k=10 ,took 0.242228 seconds! 
Ising model evolution for n=300, k=30 ,took 0.718017 seconds! 
Ising model evolution for n=300, k=60 ,took 1.437095 seconds! 
Ising model evolution for n=300, k=100 ,took 2.398209 seconds! 
Ising model evolution for n=600, k=1 ,took 0.101524 seconds! 
Ising model evolution for n=600, k=5 ,took 0.490754 seconds! 
Ising model evolution for n=600, k=10 ,took 0.972159 seconds! 
Ising model evolution for n=600, k=30 ,took 2.891156 seconds! 
Is

In [110]:
!nvcc benchmarks/benchmark_v1.cu src/ising-v1.cu -o benchmarks/benchmark_v1 && ./benchmarks/benchmark_v1

Ising model evolution for n=50, k=1 ,took 0.142608 seconds! 
Ising model evolution for n=50, k=5 ,took 0.000404 seconds! 
Ising model evolution for n=50, k=10 ,took 0.000478 seconds! 
Ising model evolution for n=50, k=30 ,took 0.000891 seconds! 
Ising model evolution for n=50, k=60 ,took 0.001456 seconds! 
Ising model evolution for n=50, k=100 ,took 0.002257 seconds! 
Ising model evolution for n=300, k=1 ,took 0.000587 seconds! 
Ising model evolution for n=300, k=5 ,took 0.001103 seconds! 
Ising model evolution for n=300, k=10 ,took 0.001741 seconds! 
Ising model evolution for n=300, k=30 ,took 0.004173 seconds! 
Ising model evolution for n=300, k=60 ,took 0.008473 seconds! 
Ising model evolution for n=300, k=100 ,took 0.012973 seconds! 
Ising model evolution for n=600, k=1 ,took 0.001554 seconds! 
Ising model evolution for n=600, k=5 ,took 0.003354 seconds! 
Ising model evolution for n=600, k=10 ,took 0.005655 seconds! 
Ising model evolution for n=600, k=30 ,took 0.014854 seconds! 
Is

In [111]:
!nvcc benchmarks/benchmark_v2.cu src/ising-v2.cu -o benchmarks/benchmark_v2 && ./benchmarks/benchmark_v2

Ising model evolution for n=50, k=1 ,took 0.125639 seconds! 
Ising model evolution for n=50, k=5 ,took 0.001108 seconds! 
Ising model evolution for n=50, k=10 ,took 0.001969 seconds! 
Ising model evolution for n=50, k=30 ,took 0.005241 seconds! 
Ising model evolution for n=50, k=60 ,took 0.010068 seconds! 
Ising model evolution for n=50, k=100 ,took 0.016471 seconds! 
Ising model evolution for n=300, k=1 ,took 0.002389 seconds! 
Ising model evolution for n=300, k=5 ,took 0.009881 seconds! 
Ising model evolution for n=300, k=10 ,took 0.019279 seconds! 
Ising model evolution for n=300, k=30 ,took 0.057024 seconds! 
Ising model evolution for n=300, k=60 ,took 0.076371 seconds! 
Ising model evolution for n=300, k=100 ,took 0.110381 seconds! 
Ising model evolution for n=600, k=1 ,took 0.003714 seconds! 
Ising model evolution for n=600, k=5 ,took 0.014296 seconds! 
Ising model evolution for n=600, k=10 ,took 0.018339 seconds! 
Ising model evolution for n=600, k=30 ,took 0.054174 seconds! 
Is

In [112]:
!nvcc benchmarks/benchmark_v3.cu src/ising-v3.cu -o benchmarks/benchmark_v3 && ./benchmarks/benchmark_v3

Ising model evolution for n=50, k=1 ,took 0.126721 seconds! 
Ising model evolution for n=50, k=5 ,took 0.000280 seconds! 
Ising model evolution for n=50, k=10 ,took 0.000307 seconds! 
Ising model evolution for n=50, k=30 ,took 0.000327 seconds! 
Ising model evolution for n=50, k=60 ,took 0.000380 seconds! 
Ising model evolution for n=50, k=100 ,took 0.000468 seconds! 
Ising model evolution for n=300, k=1 ,took 0.000502 seconds! 
Ising model evolution for n=300, k=5 ,took 0.000569 seconds! 
Ising model evolution for n=300, k=10 ,took 0.000592 seconds! 
Ising model evolution for n=300, k=30 ,took 0.000604 seconds! 
Ising model evolution for n=300, k=60 ,took 0.000579 seconds! 
Ising model evolution for n=300, k=100 ,took 0.000712 seconds! 
Ising model evolution for n=600, k=1 ,took 0.001199 seconds! 
Ising model evolution for n=600, k=5 ,took 0.001280 seconds! 
Ising model evolution for n=600, k=10 ,took 0.001148 seconds! 
Ising model evolution for n=600, k=30 ,took 0.001245 seconds! 
Is

# MakeFile

In [0]:
#VALIDATION MAKEFILE: copy Makefile to create libraries
!cp -r /content/gdrive/My\ Drive/ising/val/Makefile /content/

In [162]:
#validate all versions with all
!make all

g++ src/ising-sequential.c validation/main_validate.c -o validate_v0.o  && ./validate_v0.o
k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 
nvcc src/ising-v1.cu validation/main_validate_cuda.cu -o validate_v1.o && ./validate_v1.o
k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 
nvcc src/ising-v2.cu validation/main_validate_cuda.cu -o validate_v2.o && ./validate_v2.o
k=1: CORRECT 
k=4: CORRECT 
k=11: CORRECT 
nvcc src/ising-v3.cu validation/main_validate_cuda.cu -o validate_v3.o && ./validate_v3.o
k=1: WRONG 
k=4: WRONG 
k=11: WRONG 


In [163]:
!make clean

rm -f  *.o lib/*.a v0 v1 v2 v3 
