In [None]:
%%writefile vecAdd.cu

#include <stdio.h>
#include <stdlib.h>
#include <iostream>

// kernel
__global__ void add(float *a, float *b, float *c, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n)
    c[idx] = a[idx] + b[idx];
}


void vecAdd(float *a, float *b, float *c, int n) {
  int size = n * sizeof(float);
  float *a_d, *b_d, *c_d;

  // Allocate memory
  cudaMalloc((void **) &a, size);
  cudaMalloc((void **) &b, size);
  cudaMalloc((void **) &c, size);

  // Copy data to device
  cudaMemcpy(a, a_d, size, cudaMemcpyHostToDevice);
  cudaMemcpy(b, b_d, size, cudaMemcpyHostToDevice);

  // Launch kernel
  int numThreads = 256;
  int numBlocks = (n + numThreads + 1) / numThreads;  // or ceil(n / 256.0)

  dim3 block(numThreads, 1, 1);
  dim3 grid(numBlocks, 1, 1);

  add<<<block, grid>>>(a, b, c, n);

  // Copy data back to host
  cudaMemcpy(c, c_d, size, cudaMemcpyDeviceToHost);
  cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
}

int main() {

  float *a, *b, *c;
  int n = 1000000;

  return 0;
}

Overwriting vecAdd.cu


In [None]:
%%shell

nvcc vecAdd.cu -o vecAdd
./vecAdd

Hello World!


In [None]:
%%shell

git clone https://github.com/nothings/stb.git

In [None]:
%%writefile colorToGreyscale.cu
#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image/stb_image_write.h"

__global__
void colorToGreyscaleKernel(unsigned char *input, unsigned char *output, int width, int height) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < width && row < height) {
      int greyOffset = row * width + col;
      int rgbOffset = greyOffset * 3; // 3 channels (R, G, B)

      unsigned char r = input[rgbOffset];
      unsigned char g = input[rgbOffset + 1];
      unsigned char b = input[rgbOffset + 2];

      output[greyOffset] = (unsigned char)0.21f * r + 0.71f * g + 0.07f * b;

    }
}

void colorToGreyscale(unsigned char *input, unsigned char *output, int width, int height, int channels) {
    int size = width * height * channels * sizeof(unsigned char);
    int graySize = width * height * sizeof(unsigned char);
    unsigned char *input_d, *output_d;

    cudaMalloc((void **)&input_d, size);
    cudaMalloc((void **)&output_d, graySize);

    cudaMemcpy(input_d, input, size, cudaMemcpyHostToDevice);

    int numThreadsX = 256;
    int numBlocksX = (width + numThreadsX - 1) / numThreadsX;

    colorToGreyscaleKernel<<<numBlocksX, numThreadsX>>>(input_d, output_d, width, height);

    cudaMemcpy(output, output_d, graySize, cudaMemcpyDeviceToHost);

    cudaFree(input_d);
    cudaFree(output_d);
}

int main() {
    std::cout << "It is running!" << std::endl;

    // Load image
    const char* imageSrc = "new-york-city-streets.jpg";
    int width, height, channels;
    unsigned char* input = stbi_load(imageSrc, &width, &height, &channels, STBI_rgb);

    std::cout << "Image dimensions: " << width << " x " << height << std::endl;

    if (input == NULL) {
        std::cerr << "Failed to load image: " << imageSrc << std::endl;
        return -1;
    }

    if (channels != 3) {
        std::cerr << "Image must be RGB" << std::endl;
        stbi_image_free(input);
        return -1;
    }

    // Allocate output for grayscale image
    int outputSize = width * height * sizeof(unsigned char);
    unsigned char* output = (unsigned char*)malloc(outputSize);
    if (output == NULL) {
        std::cerr << "Failed to allocate memory for output image" << std::endl;
        stbi_image_free(input);
        return -1;
    }

    // Convert to grayscale
    colorToGreyscale(input, output, width, height, channels);
    std::cout << "Conversion to grayscale done!" << std::endl;

    for (int i = 0; i < 100; i++) {
        std::cout << (int)output[i] << " ";
    }

    // Save grayscale image
    stbi_write_jpg("output_greyscale.jpg", width, height, 1, output, 100);
    std::cout << "Saved grayscale image as output_greyscale.jpg" << std::endl;

    // Free memory
    stbi_image_free(input);
    stbi_image_free(output);

    return 0;
}

Overwriting colorToGreyscale.cu


In [None]:
%%shell

nvcc colorToGreyscale.cu -o colorToGreyscale
./colorToGreyscale

In [None]:
%%writefile assignments/wb.HTML
#ifndef WB_H
#define WB_H

#include <iostream>
#include <fstream>
#include <sstream>
#include <cuda_runtime.h>
#include <cstdarg> // for variadic functions

// Utility function to read command line arguments
struct wbArg_t {
    int argc;
    char **argv;
};

wbArg_t wbArg_read(int argc, char **argv) {
    wbArg_t args;
    args.argc = argc;
    args.argv = argv;
    return args;
}

// Utility function to import data from a file
const char* wbArg_getInputFile(wbArg_t args, int index) {
    if (index < args.argc - 1) {
        return args.argv[index + 1]; // Assuming input file names are after the program name
    } else {
        std::cerr << "Error: Input file index out of range." << std::endl;
        return nullptr;
    }
}

void* wbImport(const char *file, int *inputLength) {
    std::ifstream inFile(file);
    if (!inFile) {
        std::cerr << "Error: Unable to open file " << file << std::endl;
        return nullptr;
    }

    // Determine file length
    inFile.seekg(0, std::ios::end);
    *inputLength = inFile.tellg() / sizeof(float);
    inFile.seekg(0, std::ios::beg);

    // Allocate memory and read data
    float *data = new float[*inputLength];
    inFile.read(reinterpret_cast<char*>(data), *inputLength * sizeof(float));
    inFile.close();

    return data;
}

// Utility function for logging messages
void wbLog(const char *format, ...) {
    va_list args;
    va_start(args, format);
    std::cout << "[LOG] ";
    vprintf(format, args); // using vprintf to handle variable arguments
    std::cout << std::endl;
    va_end(args);
}

// Utility functions for timing with CUDA events
cudaEvent_t startEvent, stopEvent;

void wbTime_start(const char *label) {
    cudaEventCreate(&startEvent);
    cudaEventCreate(&stopEvent);
    cudaEventRecord(startEvent);
    wbLog(label);
}

void wbTime_stop(const char *label) {
    float elapsedTime;
    cudaEventRecord(stopEvent);
    cudaEventSynchronize(stopEvent);
    cudaEventElapsedTime(&elapsedTime, startEvent, stopEvent);
    wbLog(label);
    std::cout << "Elapsed Time: " << elapsedTime << " ms" << std::endl;
    cudaEventDestroy(startEvent);
    cudaEventDestroy(stopEvent);
}

void wbSolution(wbArg_t args, void *output, int inputLength) {
    // In a real implementation, you might save the output to a file or handle it accordingly
    wbLog("Solution output:");
    for (int i = 0; i < inputLength; ++i) {
        std::cout << ((float*)output)[i] << " ";
    }
    std::cout << std::endl;
}

#endif // WB_H


In [2]:
%%writefile assignments/mp0.cu
#include <wb.h>

//@@ The purpose of this code is to become familiar with the submission
//@@ process. Do not worry if you do not understand all the details of
//@@ the code.

int main(int argc, char **argv) {
  int deviceCount;

  wbArg_read(argc, argv);

  cudaGetDeviceCount(&deviceCount);

  wbTime_start(GPU, "Getting GPU Data."); //@@ start a timer

  for (int dev = 0; dev < deviceCount; dev++) {
    cudaDeviceProp deviceProp;

    cudaGetDeviceProperties(&deviceProp, dev);

    if (dev == 0) {
      if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
        wbLog(TRACE, "No CUDA GPU has been detected");
        return -1;
      } else if (deviceCount == 1) {
        //@@ WbLog is a provided logging API (similar to Log4J).
        //@@ The logging function wbLog takes a level which is either
        //@@ OFF, FATAL, ERROR, WARN, INFO, DEBUG, or TRACE and a
        //@@ message to be printed.
        wbLog(TRACE, "There is 1 device supporting CUDA");
      } else {
        wbLog(TRACE, "There are ", deviceCount,
              " devices supporting CUDA");
      }
    }

    wbLog(TRACE, "Device ", dev, " name: ", deviceProp.name);
    wbLog(TRACE, " Computational Capabilities: ", deviceProp.major, ".",
          deviceProp.minor);
    wbLog(TRACE, " Maximum global memory size: ",
          deviceProp.totalGlobalMem);
    wbLog(TRACE, " Maximum constant memory size: ",
          deviceProp.totalConstMem);
    wbLog(TRACE, " Maximum shared memory size per block: ",
          deviceProp.sharedMemPerBlock);
    wbLog(TRACE, " Maximum block dimensions: ",
          deviceProp.maxThreadsDim[0], " x ", deviceProp.maxThreadsDim[1],
          " x ", deviceProp.maxThreadsDim[2]);
    wbLog(TRACE, " Maximum grid dimensions: ", deviceProp.maxGridSize[0],
          " x ", deviceProp.maxGridSize[1], " x ",
          deviceProp.maxGridSize[2]);
    wbLog(TRACE, " Warp size: ", deviceProp.warpSize);
  }

  wbTime_stop(GPU, "Getting GPU Data."); //@@ stop the timer

  return 0;
}


Writing assignments/mp0.cu


In [None]:
%%writefile assignments/mp1.cu
// MP 1
#include "wb.h"

__global__ void vecAdd(float *in1, float *in2, float *out, int len) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len)
    out[idx] = in1[idx] + in2[idx];
}

int main(int argc, char **argv) {
  wbArg_t args;
  int inputLength;

  // DECLARE
  float *hostInput1;
  float *hostInput2;
  float *hostOutput;
  float *deviceInput1;
  float *deviceInput2;
  float *deviceOutput;

  args = wbArg_read(argc, argv);

  wbTime_start("Importing data and creating memory on host");
  hostInput1 =
      (float *)wbImport(wbArg_getInputFile(args, 0), &inputLength);
  hostInput2 =
      (float *)wbImport(wbArg_getInputFile(args, 1), &inputLength);
  hostOutput = (float *)malloc(inputLength * sizeof(float));
  wbTime_stop("Importing data and creating memory on host");

  wbLog("The input length is ", inputLength);

  wbTime_start("Allocating GPU memory.");
  //@@ Allocate GPU memory here

  wbTime_stop("Allocating GPU memory.");

  wbTime_start("Copying input memory to the GPU.");
  //@@ Copy memory to the GPU here

  wbTime_stop("Copying input memory to the GPU.");

  //@@ Initialize the grid and block dimensions here

  wbTime_start("Performing CUDA computation");
  //@@ Launch the GPU Kernel here

  cudaDeviceSynchronize();
  wbTime_stop("Performing CUDA computation");

  wbTime_start("Copying output memory to the CPU");
  //@@ Copy the GPU memory back to the CPU here

  wbTime_stop("Copying output memory to the CPU");

  wbTime_start("Freeing GPU Memory");
  //@@ Free the GPU memory here

  wbTime_stop("Freeing GPU Memory");

  wbSolution(args, hostOutput, inputLength);

  free(hostInput1);
  free(hostInput2);
  free(hostOutput);

  return 0;
}

In [14]:
%%shell
nvcc assignments/mp1.cu -o assignments/mp1
./assignments/mp1

    float *deviceInput1;
           ^


    float *deviceInput2;
           ^

    float *deviceOutput;
           ^

[LOG] Importing data and creating memory on host
Error: Input file index out of range.
Error: Unable to open file [LOG] Importing data and creating memory on host
Elapsed Time: 0.098848 ms
[LOG] The input length is 
[LOG] Allocating GPU memory.
[LOG] Allocating GPU memory.
Elapsed Time: 0.004064 ms
[LOG] Copying input memory to the GPU.
[LOG] Copying input memory to the GPU.
Elapsed Time: 0.004768 ms
[LOG] Performing CUDA computation
[LOG] Performing CUDA computation
Elapsed Time: 0.026656 ms
[LOG] Copying output memory to the CPU
[LOG] Copying output memory to the CPU
Elapsed Time: 0.002592 ms
[LOG] Freeing GPU Memory
[LOG] Freeing GPU Memory
Elapsed Time: 0.006144 ms
[LOG] Solution output:



