In [1]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpman70f5t".


In [4]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>

#define checkCuda(err) { if (err != cudaSuccess) { std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; exit(1); } }

__global__ void yourKernel(float *data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) data[idx] = idx; // Example operation
}

int main() {
    int size = 10485760;
    std::cout << "Array size: " << size << " elements" << std::endl;

    float *hostData, *deviceData;
    checkCuda(cudaMallocHost(&hostData, size * sizeof(float)));
    checkCuda(cudaMalloc(&deviceData, size * sizeof(float)));

    for (int i = 0; i < size; i++) hostData[i] = i; // Initialize data
    checkCuda(cudaMemcpy(deviceData, hostData, size * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time;
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads" << std::endl;

    checkCuda(cudaEventRecord(start, 0));
    yourKernel<<<numBlocks, blockSize>>>(deviceData, size);
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&time, start, stop));

    std::cout << "Time: " << time << " ms" << std::endl;
    std::cout.flush();

    checkCuda(cudaFreeHost(hostData));
    checkCuda(cudaFree(deviceData));
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));

    std::cout << "Program finished" << std::endl;
    std::cout.flush();
    return 0;
}

Array size: 10485760 elements
Launching kernel with 40960 blocks, 256 threads
CUDA error: no error



In [5]:
%%cuda
#include <iostream>
#include <chrono>

int main() {
    auto start = std::chrono::high_resolution_clock::now();
    std::cout << "Hello from CUDA" << std::endl;
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "Time taken: " << duration.count() << " microseconds" << std::endl;
    return 0;
}

Hello from CUDA
Time taken: 25 microseconds



In [6]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>

#define checkCuda(err) { if (err != cudaSuccess) { std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; exit(1); } }

__global__ void yourKernel(float *data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) data[idx] = idx; // Example operation
}

int main() {
    int size = 10485760;
    std::cout << "Array size: " << size << " elements" << std::endl;
    std::cout.flush();

    float *hostData, *deviceData;
    checkCuda(cudaMallocHost(&hostData, size * sizeof(float)));
    checkCuda(cudaMalloc(&deviceData, size * sizeof(float)));

    for (int i = 0; i < size; i++) hostData[i] = i; // Initialize data
    checkCuda(cudaMemcpy(deviceData, hostData, size * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time;
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads" << std::endl;
    std::cout.flush();

    checkCuda(cudaEventRecord(start, 0));
    yourKernel<<<numBlocks, blockSize>>>(deviceData, size);
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&time, start, stop));

    std::cout << "Time: " << time << " ms" << std::endl;
    std::cout.flush();

    checkCuda(cudaFreeHost(hostData));
    checkCuda(cudaFree(deviceData));
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));

    std::cout << "Program finished" << std::endl;
    std::cout.flush();
    return 0;
}

Array size: 10485760 elements
Launching kernel with 40960 blocks, 256 threads
CUDA error: no error



In [7]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>
#include <stdio.h> // For printf

#define checkCuda(err) { if (err != cudaSuccess) { printf("CUDA error: %s\n", cudaGetErrorString(err)); exit(1); } }

__global__ void yourKernel(float *data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) data[idx] = idx; // Example operation
}

int main() {
    int size = 10485760;
    printf("Array size: %d elements\n", size);

    float *hostData, *deviceData;
    checkCuda(cudaMallocHost(&hostData, size * sizeof(float)));
    checkCuda(cudaMalloc(&deviceData, size * sizeof(float)));

    for (int i = 0; i < size; i++) hostData[i] = i; // Initialize data
    checkCuda(cudaMemcpy(deviceData, hostData, size * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time;
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    printf("Launching kernel with %d blocks, %d threads\n", numBlocks, blockSize);

    checkCuda(cudaEventRecord(start, 0));
    yourKernel<<<numBlocks, blockSize>>>(deviceData, size);
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&time, start, stop));

    printf("Time: %f ms\n", time);

    checkCuda(cudaFreeHost(hostData));
    checkCuda(cudaFree(deviceData));
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));

    printf("Program finished\n");
    return 0;
}

Array size: 10485760 elements
Launching kernel with 40960 blocks, 256 threads
CUDA error: no error



In [8]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>

#define checkCuda(err) { if (err != cudaSuccess) { std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl; std::cerr.flush(); exit(1); } }

__global__ void yourKernel(float *data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) data[idx] = idx; // Example operation
}

int main() {
    int size = 10485760;
    std::cout << "Array size: " << size << " elements" << std::endl;
    std::cout.flush();

    float *hostData, *deviceData;
    checkCuda(cudaMallocHost(&hostData, size * sizeof(float)));
    checkCuda(cudaMalloc(&deviceData, size * sizeof(float)));

    for (int i = 0; i < size; i++) hostData[i] = i; // Initialize data
    checkCuda(cudaMemcpy(deviceData, hostData, size * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time;
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads" << std::endl;
    std::cout.flush();

    checkCuda(cudaEventRecord(start, 0));
    yourKernel<<<numBlocks, blockSize>>>(deviceData, size);
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&time, start, stop));

    std::cout << "Time: " << time << " ms" << std::endl;
    std::cout.flush();

    checkCuda(cudaFreeHost(hostData));
    checkCuda(cudaFree(deviceData));
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));

    std::cout << "Program finished" << std::endl;
    std::cout.flush();

    std::cerr << "Reached end of main" << std::endl;
    std::cerr.flush();

    return 0;
}

Array size: 10485760 elements
Launching kernel with 40960 blocks, 256 threads
CUDA error: no error



In [9]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>
#include <iomanip> // For std::setprecision

#define checkCuda(err) { if (err != cudaSuccess) { std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; exit(1); } }

__global__ void yourKernel(float *data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) data[idx] = idx; // Example operation
}

int main() {
    int size = 10485760;
    std::cout << "Array size: " << size << " elements" << std::endl;

    float *hostData, *deviceData;
    checkCuda(cudaMallocHost(&hostData, size * sizeof(float)));
    checkCuda(cudaMalloc(&deviceData, size * sizeof(float)));

    for (int i = 0; i < size; i++) hostData[i] = i; // Initialize data
    checkCuda(cudaMemcpy(deviceData, hostData, size * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time;
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads" << std::endl;

    checkCuda(cudaEventRecord(start, 0));
    yourKernel<<<numBlocks, blockSize>>>(deviceData, size);
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&time, start, stop));

    std::cout << "Time: " << std::fixed << std::setprecision(9) << time << " ms" << std::endl;
    std::cout.flush();

    checkCuda(cudaFreeHost(hostData));
    checkCuda(cudaFree(deviceData));
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));

    std::cout << "Program finished" << std::endl;
    std::cout.flush();
    return 0;
}

Array size: 10485760 elements
Launching kernel with 40960 blocks, 256 threads
CUDA error: no error



In [10]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>

#define checkCuda(err) { if (err != cudaSuccess) { std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; exit(1); } }

__global__ void yourKernel(float *data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) data[idx] = idx; // Example operation
}

int main() {
    int size = 10485760;
    std::cout << "Array size: " << size << " elements" << std::endl;

    float *hostData, *deviceData;
    checkCuda(cudaMallocHost(&hostData, size * sizeof(float)));
    checkCuda(cudaMalloc(&deviceData, size * sizeof(float)));

    for (int i = 0; i < size; i++) hostData[i] = i; // Initialize data
    checkCuda(cudaMemcpy(deviceData, hostData, size * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time;
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads" << std::endl;

    checkCuda(cudaEventRecord(start, 0));
    yourKernel<<<numBlocks, blockSize>>>(deviceData, size);
    checkCuda(cudaGetLastError());
    // checkCuda(cudaDeviceSynchronize()); // Removed this line
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&time, start, stop));

    std::cout << "Time: " << time << " ms" << std::endl;
    std::cout.flush();

    checkCuda(cudaFreeHost(hostData));
    checkCuda(cudaFree(deviceData));
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));

    std::cout << "Program finished" << std::endl;
    std::cout.flush();
    return 0;
}

Array size: 10485760 elements
Launching kernel with 40960 blocks, 256 threads
CUDA error: no error



In [11]:
%%cuda -t
#include <iostream>
#include <cuda_runtime.h>

#define checkCuda(err) { if (err != cudaSuccess) { std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; exit(1); } }

__global__ void yourKernel(float *data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) data[idx] = idx; // Example operation
}

int main() {
    int size = 10485760;
    std::cout << "Array size: " << size << " elements" << std::endl;

    float *hostData, *deviceData;
    checkCuda(cudaMallocHost(&hostData, size * sizeof(float)));
    checkCuda(cudaMalloc(&deviceData, size * sizeof(float)));

    for (int i = 0; i < size; i++) hostData[i] = i; // Initialize data
    checkCuda(cudaMemcpy(deviceData, hostData, size * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time;
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads" << std::endl;

    checkCuda(cudaEventRecord(start, 0));
    yourKernel<<<numBlocks, blockSize>>>(deviceData, size);
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());
    checkCuda(cudaEventRecord(stop, 0));
    checkCuda(cudaEventSynchronize(stop));
    checkCuda(cudaEventElapsedTime(&time, start, stop));

    std::cout << "Time: " << time << " ms" << std::endl;
    std::cout.flush();

    checkCuda(cudaFreeHost(hostData));
    checkCuda(cudaFree(deviceData));
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));

    std::cout << "Program finished" << std::endl;
    std::cout.flush();
    return 0;
}

Array size: 10485760 elements
Launching kernel with 40960 blocks, 256 threads
CUDA error: no error



In [13]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>

#define checkCuda(err) { if (err != cudaSuccess) { std::cout << "CUDA error: " << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << std::endl; exit(1); } }

__global__ void yourKernel(float *data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) data[idx] = idx;
}

int main() {
    int size = 10485760;
    std::cout << "Array size: " << size << " elements" << std::endl;

    float *hostData, *deviceData;
    checkCuda(cudaMallocHost(&hostData, size * sizeof(float)));
    checkCuda(cudaMalloc(&deviceData, size * sizeof(float)));

    for (int i = 0; i < size; i++) hostData[i] = i;
    checkCuda(cudaMemcpy(deviceData, hostData, size * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start));
    checkCuda(cudaEventCreate(&stop));

    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads" << std::endl;

    // Correct event recording sequence
    checkCuda(cudaEventRecord(start));
    yourKernel<<<numBlocks, blockSize>>>(deviceData, size);
    checkCuda(cudaGetLastError());
    checkCuda(cudaEventRecord(stop));  // Record stop event right after kernel launch
    checkCuda(cudaEventSynchronize(stop));

    float time;
    checkCuda(cudaEventElapsedTime(&time, start, stop));

    std::cout << "Time: " << time << " ms" << std::endl;

    checkCuda(cudaFreeHost(hostData));
    checkCuda(cudaFree(deviceData));
    checkCuda(cudaEventDestroy(start));
    checkCuda(cudaEventDestroy(stop));

    std::cout << "Program finished" << std::endl;
    return 0;
}

Array size: 10485760 elements
Launching kernel with 40960 blocks, 256 threads
CUDA error: no error in /tmp/tmpman70f5t/30747c17-d5fb-4412-8595-74b0fe22cf48/single_file.cu at line 33

