<a href="https://colab.research.google.com/github/Konstantin-Grudzin/-/blob/main/Qcuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
# Pinned to the release this notebook was last run with, so a fresh runtime
# installs the same version. %pip installs into the kernel's own environment.
%pip install nvcc4jupyter==1.2.1

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [3]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpogb7kn_b".


In [4]:
%%writefile cuda.h
// Declarations of the single-bit gate kernels defined in cuda.cu.
//
// All kernels share the same indexing scheme: thread `idx` (guarded by
// `idx < n`) is expanded into an element index `nid` by inserting a zero bit at
// position `ind`; its partner element is `nid + (1 << ind)`. The host wrappers
// in main.cu pass half of the vector length as `n`, so every thread owns one
// disjoint pair of elements.
#ifndef CUDA_H
#define CUDA_H

#include <thrust/complex.h>
// Element type of the state vector: double-precision complex number.
using Complex = thrust::complex<double>;

// Kernel function prototypes
// H_cuda: replaces the pair (a, b) with (s*a + s*b, s*a - s*b), s = sqrt(0.5).
__global__ void H_cuda(Complex* v, int n, int ind);
// X_cuda: swaps the two elements of each pair.
__global__ void X_cuda(Complex* v, int n,int ind);
// Z_cuda: multiplies the partner element (bit `ind` set) by -1.
__global__ void Z_cuda(Complex* v, int n, int ind);
// Ph_cuda: multiplies the partner element by cos(phi) + i*sin(phi).
__global__ void Ph_cuda(Complex* v, int n, int ind,double phi);
#endif // CUDA_H

Writing cuda.h


In [5]:
%%writefile cuda.cu
#include "cuda.h"
#include <cmath> // для sqrt

// Hadamard butterfly on index bit `ind`.
//
// v   - state vector in device memory (read and written in place)
// n   - number of element pairs to process; threads with a larger global
//       index do nothing
// ind - bit position that separates the two elements of a pair
//
// Each thread rewrites one pair (lo, hi = lo + 2^ind) as
// (s*v[lo] + s*v[hi], s*v[lo] - s*v[hi]) with s = sqrt(0.5).
__global__ void H_cuda(Complex* v, int n, int ind) {
    const int pair = blockIdx.x * blockDim.x + threadIdx.x;
    if (pair >= n) {
        return;
    }

    // Insert a zero bit at position `ind` of the pair number to obtain the
    // index of the element whose bit `ind` is clear.
    const int stride = 1 << ind;
    const int below = pair & (stride - 1);
    const int above = (pair & ~(stride - 1)) << 1;
    const int lo = above | below;
    const int hi = lo + stride;

    const Complex scale(sqrt(0.5), 0);
    const Complex a = scale * v[lo];
    const Complex b = scale * v[hi];
    v[lo] = a + b;
    v[hi] = a - b;
}

// Bit-flip (X) on index bit `ind`: swaps the two elements of every pair.
//
// v   - state vector in device memory (modified in place)
// n   - number of element pairs to process; threads with a larger global
//       index do nothing
// ind - bit position that separates the two elements of a pair
__global__ void X_cuda(Complex* v, int n, int ind) {
    const int pair = blockIdx.x * blockDim.x + threadIdx.x;
    if (pair >= n) {
        return;
    }

    // Insert a zero bit at position `ind` of the pair number.
    const int stride = 1 << ind;
    const int below = pair & (stride - 1);
    const int above = (pair & ~(stride - 1)) << 1;
    const int lo = above | below;
    const int hi = lo + stride;

    const Complex saved = v[lo];
    v[lo] = v[hi];
    v[hi] = saved;
}

// Sign-flip (Z) on index bit `ind`: negates every element whose bit `ind` is set.
//
// v   - state vector in device memory (modified in place)
// n   - number of elements to negate; threads with a larger global index do
//       nothing
// ind - bit position that selects the elements to negate
__global__ void Z_cuda(Complex* v, int n, int ind) {
    const int pair = blockIdx.x * blockDim.x + threadIdx.x;
    if (pair >= n) {
        return;
    }

    // Insert a zero bit at position `ind` of the pair number, then step to the
    // partner element that has this bit set.
    const int stride = 1 << ind;
    const int below = pair & (stride - 1);
    const int above = (pair & ~(stride - 1)) << 1;
    const int hi = (above | below) + stride;

    v[hi] *= -1;
}

// Phase shift on index bit `ind`: multiplies every element whose bit `ind` is
// set by cos(phi) + i*sin(phi).
//
// v   - state vector in device memory (modified in place)
// n   - number of elements to scale; threads with a larger global index do
//       nothing
// ind - bit position that selects the elements to scale
// phi - phase angle passed to cos() and sin()
__global__ void Ph_cuda(Complex* v, int n, int ind,double phi)
{
    const int pair = blockIdx.x * blockDim.x + threadIdx.x;
    if (pair >= n) {
        return;
    }

    const Complex phase(cos(phi), sin(phi));

    // Insert a zero bit at position `ind` of the pair number, then step to the
    // partner element that has this bit set.
    const int stride = 1 << ind;
    const int below = pair & (stride - 1);
    const int above = (pair & ~(stride - 1)) << 1;
    const int hi = (above | below) + stride;

    v[hi] *= phase;
}


Writing cuda.cu


In [6]:
%%writefile main.cu
#include <iostream>
#include <vector>
#include <cmath>
#include <cuda_runtime.h>
#include <chrono>
#include "cuda.h"

// Brings every std name into the global namespace for this translation unit.
using namespace std;

// Same alias that cuda.h already declares; repeated here for readability.
using Complex = thrust::complex<double>;

// Host-side copy of the state vector; main() sizes it and copies it to and
// from the device. The gate wrappers read its size to configure launches.
vector<Complex> input;
// Device buffer allocated with cudaMalloc in main() and passed to every kernel.
Complex *cuda_input;

// Host wrapper: launches H_cuda over the device state vector `cuda_input`,
// one thread per pair of elements.
//
// ind - bit position the gate acts on. Requests whose stride 2^ind does not
//       fit inside the vector are reported on stderr and skipped.
void H(int ind) {
    const size_t total = input.size();
    // The kernel accesses elements i and i + 2^ind and computes the stride as
    // an `int`, so the stride must fit in an int and be smaller than the
    // vector length. This also rejects vectors with fewer than two elements,
    // which would otherwise request a launch with zero blocks.
    if (ind < 0 || ind >= 31 || (size_t{1} << ind) >= total) {
        std::cerr << "H: bit index " << ind << " is out of range for "
                  << total << " elements" << std::endl;
        return;
    }

    const int pairs = static_cast<int>(total >> 1);
    const int block = 256;
    const int blocks = (pairs + block - 1) / block;  // ceiling division
    H_cuda<<<blocks, block>>>(cuda_input, pairs, ind);

    // A kernel launch returns no status; configuration errors only show up here.
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        std::cerr << "H_cuda launch failed: " << cudaGetErrorString(status)
                  << std::endl;
    }
}

// Host wrapper: launches X_cuda over the device state vector `cuda_input`,
// one thread per pair of elements.
//
// ind - bit position the gate acts on. Requests whose stride 2^ind does not
//       fit inside the vector are reported on stderr and skipped.
void X(int ind){
    const size_t total = input.size();
    // The kernel accesses elements i and i + 2^ind and computes the stride as
    // an `int`, so the stride must fit in an int and be smaller than the
    // vector length. This also rejects vectors with fewer than two elements,
    // which would otherwise request a launch with zero blocks.
    if (ind < 0 || ind >= 31 || (size_t{1} << ind) >= total) {
        std::cerr << "X: bit index " << ind << " is out of range for "
                  << total << " elements" << std::endl;
        return;
    }

    const int pairs = static_cast<int>(total >> 1);
    const int block = 256;
    const int blocks = (pairs + block - 1) / block;  // ceiling division
    X_cuda<<<blocks, block>>>(cuda_input, pairs, ind);

    // A kernel launch returns no status; configuration errors only show up here.
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        std::cerr << "X_cuda launch failed: " << cudaGetErrorString(status)
                  << std::endl;
    }
}

// Host wrapper: launches Z_cuda over the device state vector `cuda_input`,
// one thread per element that has bit `ind` set.
//
// ind - bit position the gate acts on. Requests whose stride 2^ind does not
//       fit inside the vector are reported on stderr and skipped.
void Z(int ind)
{
    const size_t total = input.size();
    // The kernel accesses element i + 2^ind and computes the stride as an
    // `int`, so the stride must fit in an int and be smaller than the vector
    // length. This also rejects vectors with fewer than two elements, which
    // would otherwise request a launch with zero blocks.
    if (ind < 0 || ind >= 31 || (size_t{1} << ind) >= total) {
        std::cerr << "Z: bit index " << ind << " is out of range for "
                  << total << " elements" << std::endl;
        return;
    }

    const int pairs = static_cast<int>(total >> 1);
    const int block = 256;
    const int blocks = (pairs + block - 1) / block;  // ceiling division
    Z_cuda<<<blocks, block>>>(cuda_input, pairs, ind);

    // A kernel launch returns no status; configuration errors only show up here.
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        std::cerr << "Z_cuda launch failed: " << cudaGetErrorString(status)
                  << std::endl;
    }
}

// Host wrapper: launches Ph_cuda over the device state vector `cuda_input`,
// one thread per element that has bit `ind` set.
//
// ind - bit position the gate acts on. Requests whose stride 2^ind does not
//       fit inside the vector are reported on stderr and skipped.
// phi - phase angle forwarded unchanged to the kernel.
void Ph(int ind,double phi)
{
    const size_t total = input.size();
    // The kernel accesses element i + 2^ind and computes the stride as an
    // `int`, so the stride must fit in an int and be smaller than the vector
    // length. This also rejects vectors with fewer than two elements, which
    // would otherwise request a launch with zero blocks.
    if (ind < 0 || ind >= 31 || (size_t{1} << ind) >= total) {
        std::cerr << "Ph: bit index " << ind << " is out of range for "
                  << total << " elements" << std::endl;
        return;
    }

    const int pairs = static_cast<int>(total >> 1);
    const int block = 256;
    const int blocks = (pairs + block - 1) / block;  // ceiling division
    Ph_cuda<<<blocks, block>>>(cuda_input, pairs, ind, phi);

    // A kernel launch returns no status; configuration errors only show up here.
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        std::cerr << "Ph_cuda launch failed: " << cudaGetErrorString(status)
                  << std::endl;
    }
}

// Workload executed by main(): applies H to bit 0 one hundred times in a row.
inline void WORK()
{
  int remaining = 100;
  while (remaining-- > 0) {
    H(0);
  }
}

// Entry point: builds an 8-element state vector with a 1 in element 0, copies
// it to the device, runs WORK(), copies the result back, prints every element
// and finally prints the wall-clock time of the whole run in seconds.
//
// Returns 0 on success and 1 if any CUDA runtime call reported an error.
int main()
{
    const auto start{std::chrono::steady_clock::now()};

    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << '\n';
            return false;
        }
        return true;
    };

    //init
    const int size = 3;
    const size_t count = size_t{1} << size;
    // Byte count kept in size_t so larger `size` values cannot overflow an int.
    const size_t bytes = count * sizeof(Complex);
    input.assign(count, Complex(0.0, 0.0));
    input[0] = 1;
    if (!check(cudaMalloc(&cuda_input, bytes), "cudaMalloc")) {
        return 1;
    }
    bool ok = check(cudaMemcpy(cuda_input, input.data(), bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)");
    //------------------------------

    if (ok) {
        WORK();
        // Kernel launches are asynchronous: pick up launch errors first, then
        // wait for completion so execution errors are attributed to the kernels.
        ok = check(cudaGetLastError(), "kernel launch") &&
             check(cudaDeviceSynchronize(), "kernel execution");
    }

    //do smth with input vector
    if (ok) {
        ok = check(cudaMemcpy(input.data(), cuda_input, bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy (device to host)");
    }
    if (ok) {
        // Only print results that actually came back from the device.
        for (const auto& el : input)
            cout << el << endl;
    }
    //---------------------------------------------------------------

    //end
    ok = check(cudaFree(cuda_input), "cudaFree") && ok;
    cuda_input = nullptr;
    //-----------------------

    const auto finish{std::chrono::steady_clock::now()};
    const std::chrono::duration<double> elapsed_seconds{finish - start};
    std::cout << elapsed_seconds.count() << '\n';

    return ok ? 0 : 1;
}


Writing main.cu


In [8]:
%%bash
# Stop at the first failing command so a failed build never runs a stale binary.
set -euo pipefail
# -arch=sm_75 builds for compute capability 7.5; adjust it for other GPUs.
nvcc cuda.cu main.cu -o gelu -arch=sm_75
./gelu

(1,0)
(0,0)
(0,0)
(0,0)
(0,0)
(0,0)
(0,0)
(0,0)
0.106276
