In [8]:
!apt-get update
!pip install nvcc4jupyter

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acq

In [9]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [10]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


Name: Uttkarsh Malviya

Roll No: IIT2022061

Experimental Setup:
Tested on NVIDIA Tesla T4 GPU (Compute 7.5) using nvcc (CUDA)

Summary of Results:
Sobel edge detection worked correctly on the 64×64 image (clear edges around the square).  
Performance varied with block size.

Observations:
- Block sizes tested: 8×8, 16×16, 32×32.
- Kernel times:

    • 8×8:     ~11.15 ms  
    • 16×16:     ~7.27  ms
    
    • 32×32:     ~7.23 ms  
- 16×16 and 32×32 show similar best performance on this GPU
- 8×8 is slower due to higher scheduling/overhead.


In [11]:
%%writefile sobel_naive.cu
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

#define W 64
#define H 64

// Kernel: each thread processes one pixel
__global__ void sobel_kernel(const float *in, unsigned char *out, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Sobel kernels
    const int kx[3][3] = { { -1, 0, 1 }, { -2, 0, 2 }, { -1, 0, 1 } };
    const int ky[3][3] = { {  1, 2, 1 }, {  0, 0, 0 }, { -1,-2,-1 } };

    float Gx = 0.0f, Gy = 0.0f;
    for (int ry = -1; ry <= 1; ++ry) {
        for (int rx = -1; rx <= 1; ++rx) {
            int ix = x + rx, iy = y + ry;
            float v = 0.0f;
            if (ix >= 0 && ix < width && iy >= 0 && iy < height) v = in[iy * width + ix];
            Gx += kx[ry+1][rx+1] * v;
            Gy += ky[ry+1][rx+1] * v;
        }
    }
    float G = sqrtf(Gx*Gx + Gy*Gy);
    if (G > 255.0f) G = 255.0f;
    out[y * width + x] = (unsigned char)(G + 0.5f);
}

void print8f(const float *a, int width, int sx, int sy, const char *title) {
    printf("%s (from %d,%d 8x8):\n", title, sx, sy);
    for (int j=0;j<8;++j) {
        for (int i=0;i<8;++i) printf("%3d ", (int)a[(sy+j)*width + (sx+i)]);
        printf("\n");
    }
    printf("\n");
}
void print8uc(const unsigned char *a, int width, int sx, int sy, const char *title) {
    printf("%s (from %d,%d 8x8):\n", title, sx, sy);
    for (int j=0;j<8;++j) {
        for (int i=0;i<8;++i) printf("%3d ", a[(sy+j)*width + (sx+i)]);
        printf("\n");
    }
    printf("\n");
}

int main(int argc, char **argv) {
    int block = 8;
    if (argc >= 2) block = atoi(argv[1]);
    if (!(block==8 || block==16 || block==32)) {
        printf("block must be 8, 16 or 32. Using 8.\n"); block = 8;
    }
    printf("Sobel Problem 1: W=%d H=%d block=%d\n", W, H, block);

    // host arrays
    float *h_in  = (float*) malloc(W*H * sizeof(float));
    unsigned char *h_out = (unsigned char*) malloc(W*H * sizeof(unsigned char));

    // init image: centered 32x32 square = 255, else 0 (easy to verify)
    for (int y=0; y<H; ++y)
        for (int x=0; x<W; ++x)
            h_in[y*W + x] = (x>=16 && x<48 && y>=16 && y<48) ? 255.0f : 0.0f;

    // print top-left 8x8 original
    print8f(h_in, W, 0, 0, "Original Image (top-left 8x8)");
    // also print center 8x8
    print8f(h_in, W, 16, 16, "Original center 8x8");

    // device memory
    float *d_in = NULL;
    unsigned char *d_out = NULL;
    cudaMalloc(&d_in,  W*H * sizeof(float));
    cudaMalloc(&d_out, W*H * sizeof(unsigned char));
    cudaMemcpy(d_in, h_in, W*H * sizeof(float), cudaMemcpyHostToDevice);

    // launch
    dim3 threads(block, block);
    dim3 grid( (W + block - 1) / block, (H + block - 1) / block );

    // timing: kernel-only via events
    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    sobel_kernel<<<grid, threads>>>(d_in, d_out, W, H);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float ms=0.0f; cudaEventElapsedTime(&ms, start, stop);

    cudaMemcpy(h_out, d_out, W*H * sizeof(unsigned char), cudaMemcpyDeviceToHost);

    // print outputs
    print8uc(h_out, W, 0, 0, "Edge-Detected Output (top-left 8x8)");
    print8uc(h_out, W, 16, 16, "Edge center 8x8 (from 16,16)");

    printf("Kernel-only time: %f ms\n", ms);

    // cleanup
    cudaFree(d_in); cudaFree(d_out);
    free(h_in); free(h_out);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    return 0;
}


Overwriting sobel_naive.cu


In [12]:
%%bash
nvcc sobel_naive.cu -o sobel_naive
./sobel_naive 8

Sobel Problem 1: W=64 H=64 block=8
Original Image (top-left 8x8) (from 0,0 8x8):
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 

Original center 8x8 (from 16,16 8x8):
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 

Edge-Detected Output (top-left 8x8) (from 0,0 8x8):
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 

Edge center 8x8 (from 16,16) (from

In [13]:
%%bash
nvcc sobel_naive.cu -o sobel_naive
./sobel_naive 16

Sobel Problem 1: W=64 H=64 block=16
Original Image (top-left 8x8) (from 0,0 8x8):
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 

Original center 8x8 (from 16,16 8x8):
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 

Edge-Detected Output (top-left 8x8) (from 0,0 8x8):
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 

Edge center 8x8 (from 16,16) (fro

In [14]:
%%bash
nvcc sobel_naive.cu -o sobel_naive
./sobel_naive 32

Sobel Problem 1: W=64 H=64 block=32
Original Image (top-left 8x8) (from 0,0 8x8):
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 

Original center 8x8 (from 16,16 8x8):
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 
255 255 255 255 255 255 255 255 

Edge-Detected Output (top-left 8x8) (from 0,0 8x8):
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 

Edge center 8x8 (from 16,16) (fro

Name: Uttkarsh Malviya

Roll No: IIT2022061

Experimental Setup:
Tested on NVIDIA Tesla T4 GPU (Compute Capability 7.5) using nvcc (CUDA).
Compiled with: nvcc -arch=sm_75 sobel.cu -o sobel

Summary of Results:
The Sobel shared-memory kernel successfully detected edges on a 64×64 test image.
Clear vertical edges were observed at the boundary of the white square.

Observations:

Block configurations tested:
- 8×8 threads per block   → 64 threads/block
- 16×16 threads per block  → 256 threads/block
- 32×32 threads per block  → 1024 threads/block

Grid sizes:
- For 8×8  : 8×8 blocks
- For 16×16: 4×4 blocks
- For 32×32: 2×2 blocks

Shared memory per block:
- (blockDim.x + 2) × (blockDim.y + 2) × 4 bytes

Kernel Execution Times (as measured):

• 8×8   block → 0.109408 ms  
• 16×16 block → 7.545536 ms  
• 32×32 block → 7.342080 ms  

Interpretation:
- The 8×8 configuration gives the fastest runtime (~0.109 ms) for this small image size.
- 16×16 and 32×32 blocks have similar execution times (~7.3–7.5 ms).
- Larger blocks reduce grid size but increase per-block workload, which can reduce performance on small images.
- The edge output correctly shows a strong vertical boundary at x = 16 and zero gradient in uniform regions.

In [58]:
%%writefile sobel.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define W 64
#define H 64

__global__ void sobel_shared(const float *in, unsigned char *out, int width, int height) {
    extern __shared__ float tile[];
    int bx = blockIdx.x * blockDim.x;
    int by = blockIdx.y * blockDim.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    int sW = blockDim.x + 2;
    int sH = blockDim.y + 2;
    int tile_elems = sW * sH;
    int tid = ty * blockDim.x + tx;
    int nthreads = blockDim.x * blockDim.y;

    for (int idx = tid; idx < tile_elems; idx += nthreads) {
        int ry = idx / sW;
        int rx = idx % sW;
        int gx = bx + (rx - 1);
        int gy = by + (ry - 1);
        float v = 0.0f;
        if (gx >= 0 && gx < width && gy >= 0 && gy < height)
            v = in[gy * width + gx];
        tile[ry * sW + rx] = v;
    }
    __syncthreads();

    int x = bx + tx, y = by + ty;
    if (x >= width || y >= height) return;

    int sX = tx + 1, sY = ty + 1;

    float v_m1_m1 = tile[(sY - 1) * sW + (sX - 1)];
    float v_0_m1  = tile[(sY - 1) * sW + (sX    )];
    float v_p1_m1 = tile[(sY - 1) * sW + (sX + 1)];
    float v_m1_0  = tile[(sY    ) * sW + (sX - 1)];
    float v_p1_0  = tile[(sY    ) * sW + (sX + 1)];
    float v_m1_p1 = tile[(sY + 1) * sW + (sX - 1)];
    float v_0_p1  = tile[(sY + 1) * sW + (sX    )];
    float v_p1_p1 = tile[(sY + 1) * sW + (sX + 1)];

    float Gx = -v_m1_m1 + v_p1_m1 - 2.0f*v_m1_0 + 2.0f*v_p1_0 - v_m1_p1 + v_p1_p1;
    float Gy =  v_m1_m1 + 2.0f*v_0_m1 + v_p1_m1 - v_m1_p1 - 2.0f*v_0_p1 - v_p1_p1;
    float G  = sqrtf(Gx*Gx + Gy*Gy);
    if (G > 255.0f) G = 255.0f;
    out[y*width + x] = (unsigned char)(G + 0.5f);
}

void print8(const unsigned char *a, int width, int sx, int sy, const char *t) {
    printf("%s (8x8):\n", t);
    for (int j=0;j<8;++j){
        for (int i=0;i<8;++i) printf("%3d ", a[(sy+j)*width + (sx+i)]);
        printf("\n");
    }
    printf("\n");
}

int main(int argc, char **argv) {
    int block = 8;
    if (argc >= 2) block = atoi(argv[1]);

    float *h_in = (float*)malloc(W*H*sizeof(float));
    unsigned char *h_out = (unsigned char*)malloc(W*H*sizeof(unsigned char));

    for (int y=0;y<H;++y) for (int x=0;x<W;++x)
        h_in[y*W + x] = (x>=16 && x<48 && y>=16 && y<48) ? 255.0f : 0.0f;

    float *d_in;
    unsigned char *d_out;
    cudaMalloc(&d_in, W*H*sizeof(float));
    cudaMalloc(&d_out, W*H*sizeof(unsigned char));
    cudaMemcpy(d_in, h_in, W*H*sizeof(float), cudaMemcpyHostToDevice);

    dim3 threads(block, block);
    dim3 grid((W+block-1)/block, (H+block-1)/block);
    size_t sm = (size_t)(block+2)*(block+2)*sizeof(float);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    sobel_shared<<<grid, threads, sm>>>(d_in, d_out, W, H);
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);

    float ms;
    cudaEventElapsedTime(&ms, start, stop);

    cudaMemcpy(h_out, d_out, W*H*sizeof(unsigned char), cudaMemcpyDeviceToHost);

    print8(h_out, W, 16,16, "Edge center 8x8 (16,16)");
    printf("Kernel execution time: %f ms\n", ms);

    cudaFree(d_in);
    cudaFree(d_out);
    free(h_in);
    free(h_out);
    return 0;
}


Overwriting sobel.cu


In [59]:
%%bash
nvcc -arch=sm_75 sobel.cu -o sobel
./sobel 8

Edge center 8x8 (16,16) (8x8):
255 255 255 255 255 255 255 255 
255   0   0   0   0   0   0   0 
255   0   0   0   0   0   0   0 
255   0   0   0   0   0   0   0 
255   0   0   0   0   0   0   0 
255   0   0   0   0   0   0   0 
255   0   0   0   0   0   0   0 
255   0   0   0   0   0   0   0 

Kernel execution time: 0.109408 ms


In [60]:
%%bash
nvcc sobel.cu -o sobel
./sobel 16

Edge center 8x8 (16,16) (8x8):
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 

Kernel execution time: 7.545536 ms


In [61]:
%%bash
nvcc sobel.cu -o sobel
./sobel 32

Edge center 8x8 (16,16) (8x8):
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 
  0   0   0   0   0   0   0   0 

Kernel execution time: 7.342080 ms
