In [3]:
# 프로세서 정보 조회. (특히 SIMD Instruction Level Parallelism 정보 -> flags에서 확인)
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0xffffffff
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa mmio_stale_data retbleed bhi
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
ad

In [5]:
# Code 1을 문자열로 만들어 아래 블록에서 code1.c 파일로 저장
code1 = """
// SIMD program to Hello World  벡터 뺄셈
// using C language

// SIMD header
#include <immintrin.h>
#include <stdio.h>

int main() {

  /* Initialize the two argument vectors */
  __m256 evens = _mm256_set_ps(2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0);
  __m256 odds = _mm256_set_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);

  /* Compute the difference between the two vectors */
  __m256 result = _mm256_sub_ps(evens, odds);

  /* Display the elements of the result vector */
  float* f = (float*)&result;
  printf("%f %f %f %f %f %f %f %f \\n", f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);

  return 0;
}
"""

In [4]:
# Code 2를 문자열로 만들어 아래 블록에서 code2.c 파일로 저장
code2 = """
// 벡터 A와 B의 합인 C를 구하는 예제
// 이전 시간의 omp parallel 와 omp parallel for도 각각 포함
// OpenMP, SIMD 로 구현한 결과와 serial 코드의 결과를 비교

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#include <iostream>
#include <stack>
#include <ctime>

#include <immintrin.h>

std::stack<clock_t> tictoc_stack;

void tic() {
	tictoc_stack.push(clock());
}

double toc() {
	double time_elapsed = ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC;
	std::cout << "time elapsed : "
		<< time_elapsed
		<< std::endl;
	tictoc_stack.pop();
	return time_elapsed;
}

int main()
{
	int N = 30000000;
	float* A = new float[N];
	float* B = new float[N];
	float* C_omp = new float[N];
	float* C_serial = new float[N];
	float* C_avx = new float[N];
	int thread_count = 4;
	int n_repeat = 5;
	double serial_time = 0.0f;
	double parallel_time = 0.0f;
	double parallel_for_time = 0.0f;
	double avx_time = 0.0f;
	int diff = 0;

	for (int i = 0; i < N; i++)
	{
		A[i] = (float)i;
		B[i] = (float)1;
	}

	for (int c = 0; c < n_repeat; c++) {

		// serial 버전
		tic();
		for (int i = 0; i < N; i++)
		{
			C_serial[i] = A[i] + B[i];
		}
		std::cout << "Serial version ";
		serial_time+=toc();

		// parallel 버전
		tic();

#pragma omp parallel num_threads(thread_count)
		{
			int my_rank = omp_get_thread_num();
			int my_idx_start = my_rank * (N / thread_count);
			int my_idx_end = my_idx_start + (N / thread_count) - 1;

			for (int i = my_idx_start; i <= my_idx_end; i++)
			{
				C_omp[i] = A[i] + B[i];
			}

			//printf("Thread %d computes from %d to %d\\n", my_rank, my_idx_start, my_idx_end);
		}
		std::cout << "OpenMP parallel version ";
		parallel_time += toc();

		// 결과 검증
		diff = 0;
		for (int i = 0; i < N; i++)
		{
			diff += abs(C_serial[i] - C_omp[i]);
		}
		if (diff == 0)
			printf("OpenMP result is equal to serial result.\\n");
		else
			printf("OpenMP result is different to serial result.\\n");

		// parallel for 버전
		tic();
#pragma omp parallel for num_threads(thread_count)
		for (int i = 0; i < N; i++)
		{
			C_omp[i] = A[i] + B[i];
		}
		std::cout << "OpenMP parallel for version ";
		parallel_for_time += toc();

		// 결과 검증
		diff = 0;
		for (int i = 0; i < N; i++)
		{
			diff += abs(C_serial[i] - C_omp[i]);
		}
		if (diff == 0)
			printf("OpenMP result is equal to serial result.\\n");
		else
			printf("OpenMP result is different to serial result.\\n");

		// avx 버전
		tic();
#pragma omp parallel for num_threads(thread_count) //private(a_temp,b_temp,c_temp)
		// 128bit 로 32bit float 4개 동시에 처리함
		// i 는 4 씩 증가
		for (int i = 0; i < N; i += 4)
		{
			// a_temp, b_temp 에 double 4개 로드함
			// c = a + b SIMD 연산 수행
			// 결과를 C_avx 에 저장
      _mm_stream_ps(C_avx + i, _mm_add_ps(_mm_load_ps(&(A[i])), _mm_load_ps(&(B[i]))));
		}
		std::cout << "AVX version ";
		avx_time += toc();

		/* // 결과 검증
		diff = 0;
		for (int i = 0; i < N; i++)
		{
			diff += abs(C_serial[i] - C_avx[i]);
		}
		if (diff == 0)
			printf("AVX result is equal to serial result. \\n");
		else
			printf("AVX result is different to serial result. \\n"); */
	}

	std::cout << "\\nSerial version : " << serial_time / n_repeat << std::endl;
	std::cout << "OpenMP parallel version : " << parallel_time / n_repeat << std::endl;
	std::cout << "OpenMP parallel for version : " << parallel_for_time / n_repeat << std::endl;
	std::cout << "AVX version : " << avx_time / n_repeat << std::endl;

	return 0;
}
"""

In [6]:
text_file = open("code1.c", "w")
text_file.write(code2)
text_file.close()

In [11]:
text_file = open("code2.cpp", "w")
text_file.write(code2)
text_file.close()

In [14]:
# 구글 클라우드 폴더에 파일 생성했는지 확인
!ls

code1.c  code2.cpp  sample_data


In [15]:
# 컴파일
!gcc -o SIMD_1 -mavx2 code1.c

[01m[Kcode1.c:10:10:[m[K [01;31m[Kfatal error: [m[Kiostream: No such file or directory
   10 | #include [01;31m[K<iostream>[m[K
      |          [01;31m[K^~~~~~~~~~[m[K
compilation terminated.


In [None]:
# 실행파일 run
!./SIMD_1

1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 


In [16]:
!g++ -o SIMD_2 -fopenmp -mavx2 code2.cpp

In [None]:
!./SIMD_2

Serial version time elapsed : 0.237269
OpenMP parallel version time elapsed : 0.242508
OpenMP result is equal to serial result.
OpenMP parallel for version time elapsed : 0.141008
OpenMP result is equal to serial result.
AVX version time elapsed : 0.172062
Serial version time elapsed : 0.134893
OpenMP parallel version time elapsed : 0.14472
OpenMP result is equal to serial result.
OpenMP parallel for version time elapsed : 0.1514
OpenMP result is equal to serial result.
AVX version time elapsed : 0.084303
Serial version time elapsed : 0.109403
OpenMP parallel version time elapsed : 0.15823
OpenMP result is equal to serial result.
OpenMP parallel for version time elapsed : 0.144232
OpenMP result is equal to serial result.
AVX version time elapsed : 0.08246
Serial version time elapsed : 0.108958
OpenMP parallel version time elapsed : 0.159495
OpenMP result is equal to serial result.
OpenMP parallel for version time elapsed : 0.150184
OpenMP result is equal to serial result.
AVX version t

In [17]:
code3 ='''
// 벡터 A와 B의 내적(dot product) C를 구하는 예제입니다.
// reduction과 critical session을 각각 적용시켜봅니다.
// 5번 실행하여 결과가 정확한지 검증합니다.

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#include <iostream>
#include <stack>
#include <ctime>

#include <immintrin.h>

std::stack<clock_t> tictoc_stack;

void tic() {
	tictoc_stack.push(clock());
}

double toc() {
	double time_elapsed = ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC;
	std::cout << "time elapsed : "
		<< time_elapsed
		<< std::endl;
	tictoc_stack.pop();
	return time_elapsed;
}

int main()
{
	int N = 30000000;
	double* A = new double[N];
	double* B = new double[N];
	double C_omp = 0.0f;
	double C_serial = 0.0f;
	double C_avx = 0.0f;
	int thread_count = 4;
	int n_repeat = 5;
	double serial_time = 0.0f;
	double reduction_time = 0.0f;
	double avx_time = 0.0f;
	double temp;
	double diff;

	// AVX variables
	__m256d a_temp;
	__m256d b_temp;

	for (int i = 0; i < N; i++)
	{
		A[i] = (double)i;
		B[i] = (double)1;
	}

	// n_repeat번 반복 함
	for (int c = 0; c < n_repeat; c++) {


		// serial 버전
		C_serial = 0.0f;
		tic();
		for (int i = 0; i < N; i++)
		{
			C_serial += A[i] * B[i];
		}
		std::cout << "Serial version ";
		serial_time+=toc();

		// reduction 버전
		C_omp = 0.0f;
		tic();
#pragma omp parallel for num_threads(thread_count) reduction(+:C_omp)
		for (int i = 0; i < N; i++)
		{
			C_omp += A[i] * B[i];
		}
		std::cout << "OpenMP reduction version ";
		reduction_time += toc();

		// 결과 검증
		diff = abs(C_serial - C_omp);
		if (diff == 0)
			printf("OpenMP result is equal to serial result.\\n");
		else
			printf("OpenMP result is different to serial result.\\n");

		// avx 버전
		C_avx = 0.0f;
		tic();
#pragma omp parallel for num_threads(thread_count) reduction(+:C_avx) //private(a_temp, b_temp, c_temp, c_ptr)
		// 128bit 로 64bit double 2개 동시에 처리함
		// i 는 2 씩 증가
		for (int i = 0; i < N; i+=2)
		{
			__m128d c_temp;
			//double* c_ptr = (double*)&c_temp;
			// a_temp, b_temp 에 double 4개 로드함
			// c = a * b SIMD 연산 수행
			c_temp = _mm_mul_pd(_mm_load_pd((double*)&A[i]), _mm_load_pd((double*)&B[i]));
			// 수평방향으로 add 연산 수행
			// c0, c1, c2, c3
			// c0+c1, c0+c1, c2+c3, c2+c3
 			c_temp = _mm_hadd_pd(c_temp, c_temp);
			// 결과를 C_avx 에 추가
			C_avx += *(double*)&c_temp;
			//C_avx += *((double*)&c_temp+2);
		}
		std::cout << "AVX reduction version ";
		avx_time += toc();

		// 결과 검증
		diff = abs(C_serial - C_avx);
		if (diff == 0)
			printf("AVX result is equal to serial result.\\n");
		else
			printf("AVX result is different to serial result.\\n");

	}
	std::cout << "\\nSerial version : " << serial_time / n_repeat << std::endl;
	//std::cout << "OpenMP critical session version : " << critical_time / n_repeat << std::endl;
	std::cout << "OpenMP reduction version : " << reduction_time / n_repeat << std::endl;
	std::cout << "AVX reduction version : " << avx_time / n_repeat << std::endl;

	delete A;
	delete B;

	return 0;

}
'''

In [18]:
text_file = open("code3.cpp", "w")
text_file.write(code3)
text_file.close()

In [19]:
!g++ -o Dot_SIMD -fopenmp -mavx2 code3.cpp

In [None]:
!./Dot_SIMD

Serial version time elapsed : 0.100838
OpenMP reduction version time elapsed : 0.166824
OpenMP result is equal to serial result.
AVX reduction version time elapsed : 0.21089
AVX result is equal to serial result.
Serial version time elapsed : 0.103447
OpenMP reduction version time elapsed : 0.168853
OpenMP result is equal to serial result.
AVX reduction version time elapsed : 0.211612
AVX result is equal to serial result.
Serial version time elapsed : 0.111037
OpenMP reduction version time elapsed : 0.167898
OpenMP result is equal to serial result.
AVX reduction version time elapsed : 0.207934
AVX result is equal to serial result.
Serial version time elapsed : 0.103231
OpenMP reduction version time elapsed : 0.168023
OpenMP result is equal to serial result.
AVX reduction version time elapsed : 0.198502
AVX result is equal to serial result.
Serial version time elapsed : 0.123676
OpenMP reduction version time elapsed : 0.165112
OpenMP result is equal to serial result.
AVX reduction versio