Queue   队列是向SYCL运行时提交命令组的机制，队列与设备是多对一的关系

In [None]:
// Skeleton of a command-group submission: the lambda passed to q.submit()
// receives the handler `h`, through which the command group's work is defined.
q.submit([&](handler& h){
    // command
});

Kernel  内核类封装了用于在实例化命令组时在设备上执行代码的方法和数据。内核对象不是由用户显式构造的，而是在调用内核调度函数（如parallel_for）时构造的

In [None]:
// Submit a command group whose kernel computes A = B + C element-wise.
q.submit([&](handler& h){
    // parallel_for is invoked over a 1-D range of N indices; `i` is the
    // index handed to each kernel invocation.
    h.parallel_for(range<1>(N), [=](id<1> i){
        A[i] = B[i] + C[i];
    });
});

内核的功能可以通过id，range，item访问，item是内核的一个实例

In [None]:
// Same launch shape as a basic kernel, but the kernel lambda takes an
// item<1>, from which both the index and the range can be queried.
q.submit([&](handler& h){
    h.parallel_for(range<1>(N), [=](item<1> item){
        auto i = item.get_id();     // index of this kernel instance
        auto R = item.get_range();  // range the kernel was launched over
        // command
    });
});

队列运行的位置有三种方式

1. 面向任何设备 queue()
2. 针对预配置的设备 如queue(cpu_selector{})
3. 针对特定设备：自定义设备选择器，例如 class custom_selector : public device_selector { int operator()(const device& dev) const override { /* Any logic you want! */ } };
    然后使用 queue(custom_selector{});

应用程序作用域和命令组作用域：
1. 在主机上执行的代码
2. C++的全部功能在应用程序和命令组范围内可用

内核作用域：
1. 在设备上执行的代码。
2. 在内核范围内可使用的C++功能受到限制（只接受C++功能的一个子集）

ND-range Kernel 基本的并行内核无法在硬件级别上进行性能优化，而ND-range内核可以将任务划分到更小的工作组中，工作组中的每一个工作项会被安排到硬件上执行。其功能通过nd_range和nd_item访问，nd_item表示核函数的一个实例

In [None]:
// ND-range launch: a global range of 1024 with a local (work-group) range of 64.
h.parallel_for(nd_range<1>(range<1>(1024), range<1>(64)), [=](nd_item<1> item){
    auto idx = item.get_global_id();      // id within the global range
    auto local_id = item.get_local_id();  // id within the work-group
    // command
});


缓冲区buffer和访问器accessor

缓冲区将数据封装在跨设备和主机的 SYCL 应用程序中。访问器是访问缓冲区数据的机制。

示例代码：

In [None]:
void dpcpp_code(int* a, int* b, int* c, int N) {
  //Step 1: create a device queue
  //(developer can specify a device type via device selector or use default selector)
  auto R = range<1>(N);
  queue q;
  //Step 2: create buffers (represent both host and device memory)
  buffer buf_a(a, R);
  buffer buf_b(b, R);
  buffer buf_c(c, R);
  //Step 3: submit a command for (asynchronous) execution
  q.submit([&](handler &h){
  //Step 4: create buffer accessors to access buffer data on the device
  accessor A(buf_a,h,read_only);
  accessor B(buf_b,h,read_only);
  accessor C(buf_c,h,write_only);
  
  //Step 5: send a kernel (lambda) for execution
  h.parallel_for(range<1>(N), [=](auto i){
    //Step 6: write a kernel
    //Kernel invocations are executed in parallel
    //Kernel is invoked for each element of the range
    //Kernel invocation has access to the invocation id
    C[i] = A[i] + B[i];
    });
  });
}

若两个内核使用相同缓冲区则第二个内核需要等第一个内核使用完毕之后才能使用缓冲区

In [None]:
#include <CL/sycl.hpp>

constexpr int num=16;
using namespace sycl;

  int main() {
  // Demonstrates implicit kernel ordering through buffer accessors:
  // kernels touching the same buffer are serialized by the runtime.
  auto R = range<1>{ num };
  //Create Buffers A and B
  buffer<int> A{ R }, B{ R };
  //Create a device queue
  queue Q;
  //Submit Kernel 1: initialize A
  Q.submit([&](handler& h) {
    //Accessor for buffer A
    accessor out(A,h,write_only);
    h.parallel_for(R, [=](auto idx) {
      out[idx] = idx[0]; }); });
  //Submit Kernel 2: accumulate into A
  Q.submit([&](handler& h) {
    //This task will wait till kernel 1 is complete.
    //`+=` reads the value written by kernel 1 before writing, so the
    //accessor must be read_write; write_only does not guarantee that the
    //previous contents are visible to the kernel.
    accessor out(A,h,read_write);
    h.parallel_for(R, [=](auto idx) {
      out[idx] += idx[0]; }); });
  //Submit Kernel 3: initialize B (uses a different buffer than kernels 1 and 2)
  Q.submit([&](handler& h) {
    //Accessor for Buffer B
    accessor out(B,h,write_only);
    h.parallel_for(R, [=](auto idx) {
      out[idx] = idx[0]; }); });
  //Submit task 4: B *= A
  Q.submit([&](handler& h) {
    //This task will wait till kernel 2 and 3 are complete
    accessor in (A,h,read_only);
    accessor inout(B,h);
    h.parallel_for(R, [=](auto idx) {
      inout[idx] *= in[idx]; }); });

  // And the following is back to host code: the host accessor gives the
  // host read access to the contents of B.
  host_accessor result(B,read_only);
  for (int i=0; i<num; ++i)
    std::cout << result[i] << "\n";
  return 0;
}

主机访问器  

主机访问器是在命令组作用域之外创建的访问器，它授予访问权限的数据将在主机上可用。通过构造主机访问器对象，可以将数据同步回主机。缓冲区销毁是将数据同步回主机的另一种方法。

In [None]:
#include <CL/sycl.hpp>
using namespace sycl;

int main() {
  constexpr int N = 16;
  auto R = range<1>(N);
  std::vector<int> v(N, 10);
  queue q;
  // Buffer takes ownership of the data stored in vector.  
  buffer buf(v);
  q.submit([&](handler& h) {
    accessor a(buf,h);
    h.parallel_for(R, [=](auto i) { a[i] -= 2; });
  });
  // Creating host accessor is a blocking call and will only return after all
  // enqueued SYCL kernels that modify the same buffer in any queue completes
  // execution and the data is available to the host via this host accessor.
  host_accessor b(buf,read_only);
  for (int i = 0; i < N; i++) std::cout << b[i] << " ";
  return 0;
}

缓冲区销毁示例

缓冲区创建发生在单独的函数范围内。当执行超出此函数范围时，将调用缓冲区析构函数，该析构函数放弃数据的所有权并将数据复制回主机内存。

In [None]:
#include <CL/sycl.hpp>
constexpr int N = 16;
using namespace sycl;

// Buffer creation happens within a separate function scope.
// Subtracts 2 from every element of `v` on the device associated with `q`.
//   v : host vector wrapped by a buffer for the duration of this function
//   q : queue the kernel is submitted to
// The buffer is local to this function, so its destructor runs on return.
void dpcpp_code(std::vector<int> &v, queue &q) {
  // Launch over the vector's actual size rather than the global constant N,
  // so the kernel never indexes past the end of a differently sized vector.
  auto R = range<1>(v.size());
  buffer buf(v);
  q.submit([&](handler &h) {
    accessor a(buf,h);
    h.parallel_for(R, [=](auto i) { a[i] -= 2; });
  });
}
// Buffer destruction demo: the device work happens inside dpcpp_code(), and
// the vector is read directly on the host once that call has returned.
int main() {
  std::vector<int> v(N, 10);
  queue q;
  dpcpp_code(v, q);
  // When execution advances beyond this function scope, buffer destructor is
  // invoked which relinquishes the ownership of data and copies back the data to
  // the host memory.
  for (const int value : v) {
    std::cout << value << " ";
  }
  return 0;
}

自定义设备选择器

In [None]:
#include <CL/sycl.hpp>

#include <iostream>
#include <string>
using namespace sycl;
// Device selector that rates devices so a queue can pick the preferred one.
// Ratings returned by operator():
//   3 - GPU whose device name contains the configured string
//   2 - any other GPU
//   1 - CPU
//   0 - anything else
class my_device_selector : public device_selector {
public:
    // vendorName: substring searched for in the device name.
    my_device_selector(std::string vendorName) : vendorName_(vendorName) {}

    // Returns the rating of `dev` according to the table above.
    int operator()(const device& dev) const override {
        int rating = 0;
        // Logical && (not bitwise &) so the device-name query is only made
        // for GPU devices.
        if (dev.is_gpu() &&
            (dev.get_info<info::device::name>().find(vendorName_) != std::string::npos))
            rating = 3;
        else if (dev.is_gpu()) rating = 2;
        else if (dev.is_cpu()) rating = 1;
        return rating;
    }

private:
    std::string vendorName_;
};
// Builds a queue with the custom selector and prints the name of the device
// the queue ended up with.
int main() {
    // Name of the vendor to look for in the device name.
    const std::string vendor_name = "Intel";
    //const std::string vendor_name = "AMD";
    //const std::string vendor_name = "Nvidia";

    my_device_selector selector(vendor_name);
    queue q(selector);

    const auto device_name = q.get_device().get_info<info::device::name>();
    std::cout << "Device: " << device_name << "\n";
    return 0;
}


复数乘法

In [None]:
// Complex.hpp
#include <iostream>
#include <vector>
using namespace std;
// Complex number with integer real and imaginary parts.
// Supports inequality comparison, multiplication and stream output.
class Complex2 {
 private:
  // Both parts default to zero via in-class initializers.
  int m_real_{0};
  int m_imag_{0};

 public:
  // Constructs 0 + 0i.
  Complex2() = default;

  // Constructs x + yi.
  Complex2(int x, int y) : m_real_(x), m_imag_(y) {}

  // Returns true when the real or the imaginary parts differ.
  friend bool operator!=(const Complex2& a, const Complex2& b) {
    return (a.m_real_ != b.m_real_) || (a.m_imag_ != b.m_imag_);
  }

  // Returns the product (*this) * obj:
  //   (a + bi)(c + di) = (ac - bd) + (ad + bc)i
  Complex2 complex_mul(const Complex2& obj) const {
    return Complex2(((m_real_ * obj.m_real_) - (m_imag_ * obj.m_imag_)),
                    ((m_real_ * obj.m_imag_) + (m_imag_ * obj.m_real_)));
  }

  // Writes the value as "(real : imagi)". std::ostream is fully qualified so
  // the class does not depend on a using-directive being in effect.
  friend std::ostream& operator<<(std::ostream& out, const Complex2& obj) {
    out << "(" << obj.m_real_ << " : " << obj.m_imag_ << "i)";
    return out;
  }
};

In [None]:
#include <CL/sycl.hpp>
#include <iomanip>
#include <vector>
// dpc_common.hpp can be found in the dev-utilities include folder.
// e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
#include "dpc_common.hpp"
#include "Complex.hpp"

using namespace sycl;
using namespace std;

// Number of complex numbers passing to the SYCL code
static const int num_elements = 10000;

// Device selector that rates devices so a queue can pick the preferred one.
// Ratings returned by operator():
//   3 - GPU whose device name contains the configured string
//   2 - any other GPU
//   1 - CPU
//   0 - anything else
class CustomDeviceSelector : public device_selector {
 public:
  // vendorName: substring searched for in the device name.
  CustomDeviceSelector(std::string vendorName) : vendorName_(vendorName) {}

  // Returns the rating of `dev` according to the table above.
  int operator()(const device &dev) const override {
    int device_rating = 0;
    // Logical && (not bitwise &) so the device-name query is only made for
    // GPU devices.
    if (dev.is_gpu() && (dev.get_info<info::device::name>().find(vendorName_) !=
                         std::string::npos))
      device_rating = 3;
    else if (dev.is_gpu())
      device_rating = 2;
    else if (dev.is_cpu())
      device_rating = 1;
    return device_rating;
  }

 private:
  std::string vendorName_;
};

// Multiplies in_vect1[i] by in_vect2[i] on the device behind `q` and stores
// the product in out_vect[i].
//   q         : queue the kernel is submitted to
//   in_vect1  : first operand vector
//   in_vect2  : second operand vector (same size as in_vect1)
//   out_vect  : result vector (same size as in_vect1)
// Prints an error and returns without doing any work when the sizes differ.
void DpcppParallel(queue &q, std::vector<Complex2> &in_vect1,
                   std::vector<Complex2> &in_vect2,
                   std::vector<Complex2> &out_vect) {
  const auto count = in_vect1.size();
  const bool sizes_match = (in_vect2.size() == count) && (out_vect.size() == count);
  if (!sizes_match) {
    std::cout << "ERROR: Vector sizes do not  match"<< "\n";
    return;
  }
  const range<1> extent(count);

  // Input buffers
  buffer lhs_buf(in_vect1);
  buffer rhs_buf(in_vect2);

  // Output buffer
  buffer product_buf(out_vect);

  std::cout << "Target Device: "
            << q.get_device().get_info<info::device::name>() << "\n";

  // Submit the command group function object to the queue
  q.submit([&](handler &cgh) {
    // Operands are only read by the kernel
    accessor lhs(lhs_buf, cgh, read_only);
    accessor rhs(rhs_buf, cgh, read_only);
    // The product is only written by the kernel
    accessor product(product_buf, cgh, write_only);

    cgh.parallel_for(extent, [=](auto idx) {
      product[idx] = lhs[idx].complex_mul(rhs[idx]);
    });
  });
  q.wait_and_throw();
}
void DpcppScalar(std::vector<Complex2> &in_vect1,
                 std::vector<Complex2> &in_vect2,
                 std::vector<Complex2> &out_vect) {
  if ((in_vect2.size() != in_vect1.size()) || (out_vect.size() != in_vect1.size())){
    std::cout<<"ERROR: Vector sizes do not match"<<"\n";
    return;
    }
  for (int i = 0; i < in_vect1.size(); i++) {
    out_vect[i] = in_vect1[i].complex_mul(in_vect2[i]);
  }
}
// Compare the results of the two output vectors from parallel and scalar. They
// should be equal
int Compare(std::vector<Complex2> &v1, std::vector<Complex2> &v2) {
  int ret_code = 1;
  if(v1.size() != v2.size()){
    ret_code = -1;
  }
  for (int i = 0; i < v1.size(); i++) {
    if (v1[i] != v2[i]) {
      ret_code = -1;
      break;
    }
  }
  return ret_code;
}
int main() {
  // Declare your Input and Output vectors of the Complex2 class
  vector<Complex2> input_vect1;
  vector<Complex2> input_vect2;
  vector<Complex2> out_vect_parallel;
  vector<Complex2> out_vect_scalar;

  for (int i = 0; i < num_elements; i++) {
    input_vect1.push_back(Complex2(i + 2, i + 4));
    input_vect2.push_back(Complex2(i + 4, i + 6));
    out_vect_parallel.push_back(Complex2(0, 0));
    out_vect_scalar.push_back(Complex2(0, 0));
  }

  // Initialize your Input and Output Vectors. Inputs are initialized as below.
  // Outputs are initialized with 0
  try {
    // Pass in the name of the vendor for which the device you want to query
    std::string vendor_name = "Intel";
    // std::string vendor_name = "AMD";
    // std::string vendor_name = "Nvidia";
    // queue constructor passed exception handler
    CustomDeviceSelector selector(vendor_name);
    queue q(selector, dpc_common::exception_handler);
    // Call the DpcppParallel with the required inputs and outputs
    DpcppParallel(q, input_vect1, input_vect2, out_vect_parallel);
  } catch (...) {
    // some other exception detected
    std::cout << "Failure" << "\n";
    std::terminate();
  }

  std::cout
      << "****************************************Multiplying Complex numbers "
         "in Parallel********************************************************"
      << "\n";
  // Print the outputs of the Parallel function
  int indices[]{0, 1, 2, 3, 4, (num_elements - 1)};
  constexpr size_t indices_size = sizeof(indices) / sizeof(int);

  for (int i = 0; i < indices_size; i++) {
    int j = indices[i];
    if (i == indices_size - 1) std::cout << "...\n";
    std::cout << "[" << j << "] " << input_vect1[j] << " * " << input_vect2[j]
              << " = " << out_vect_parallel[j] << "\n";
  }
  // Call the DpcppScalar function with the required input and outputs
  DpcppScalar(input_vect1, input_vect2, out_vect_scalar);

  // Compare the outputs from the parallel and the scalar functions. They should
  // be equal

  int ret_code = Compare(out_vect_parallel, out_vect_scalar);
  if (ret_code == 1) {
    std::cout << "Complex multiplication successfully run on the device"
              << "\n";
  } else
    std::cout
        << "*********************************************Verification Failed. Results are "
           "not matched**************************"
        << "\n";

  return 0;
}


使用 SYCL 缓冲区和访问器概念完成下面的编码练习：

代码在主机上初始化了一个向量vector1

核函数将向量1递增

创建一个新的向量2并初始化值为20，并为它分配缓冲区

为第二个缓冲区增加第二个访问器

将递增修改为相加，计算两个向量之和

内核代码将 vector1 递增 1。

创建第二个向量 vector2 并初始化为值 20。

为上述第二个向量创建 sycl 缓冲区

在内核代码中，为第二个矢量缓冲区创建第二个访问器

将向量递增修改为向量相加，方法是将 vector2 加到 vector1 上

In [None]:
#include <CL/sycl.hpp>

using namespace sycl;

int main() {
    const int N = 256;
    
    //# Initialize a vector and print values
    std::vector<int> vector1(N, 10);
    std::cout<<"\nInput Vector1: ";    
    for (int i = 0; i < N; i++) std::cout << vector1[i] << " ";

    //# STEP 1 : Create second vector, initialize to 20 and print values
    std::vector<int> vector2(N, 20);
    std::cout<<"\nInput Vector2: ";
    for (int i = 0; i < N; i++) std::cout << vector2[i] << " "; 
    
    //# Create Buffer
    buffer vector1_buffer(vector1);
    
    //# STEP 2 : Create buffer for second vector 
    buffer vector2_buffer(vector2);

    //# Submit task to add vector
    queue q;
    q.submit([&](handler &h) {
      //# Create accessor for vector1_buffer
      accessor vector1_accessor (vector1_buffer,h);
      
      //# STEP 3 - add second accessor for second buffer
      accessor vector2_accessor (vector2_buffer,h);
      
      h.parallel_for(range<1>(N), [=](id<1> index) {

        //# STEP 4 : Modify the code below to add the second vector to first one
        vector1_accessor[index] += vector2_accessor[index];
      });
   });

 
  //# Create a host accessor to copy data from device to host
  host_accessor h_a(vector1_buffer,read_only);

  //# Print Output values 
  std::cout<<"\nOutput Values: ";
  for (int i = 0; i < N; i++) std::cout<< vector1[i] << " ";
  std::cout<<"\n";

  return 0;
}
