Q1: Identify !, %, and %% as used in cells in Google Colab.

In [None]:
# ! (shell command): runs a Linux terminal command from a notebook cell.

# List the files in the current working directory.
!ls
# Print the current working directory.
!pwd
# Show the GPU status table (driver/CUDA version, memory usage, utilisation).
!nvidia-smi
# Install a Python package by calling pip from the shell.
!pip install numpy

# % (line magic): affects only the single line it starts; used for notebook configuration or profiling.
# Time a single statement.
%time x = sum(range(1000000))
# Line-magic counterpart of !pwd.
%pwd
# Render matplotlib figures inline in the cell output.
%matplotlib inline

sample_data
/content
Wed Jan 28 16:44:37 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                           

In [None]:
# %% (Cell magic)Affects the entire cell , Often used for scripting or timing
%%bash
echo "Hello from bash"

Hello from bash


In [None]:
%%time
# %%time is a cell magic: it reports CPU and wall-clock time for the whole cell.
x = 0
# Accumulate 0 + 1 + ... + (10**7 - 1) with an explicit Python loop.
for i in range(10**7):
    x += i

CPU times: user 996 ms, sys: 2.23 ms, total: 998 ms
Wall time: 998 ms


Q2: Identify all key nvidia-smi commands with multiple options

In [None]:
# Default summary table: driver/CUDA versions plus per-GPU temperature, power, memory and utilisation.
!nvidia-smi
# -L: list the GPUs present in the system, one line per device.
!nvidia-smi -L

Wed Jan 28 16:47:11 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# -q: full query report for every attached GPU (product name, architecture, modes, serial number, ...).
!nvidia-smi -q



Timestamp                                 : Wed Jan 28 16:47:38 2026
Driver Version                            : 550.54.15
CUDA Version                              : 12.4

Attached GPUs                             : 1
GPU 00000000:00:04.0
    Product Name                          : Tesla T4
    Product Brand                         : NVIDIA
    Product Architecture                  : Turing
    Display Mode                          : Enabled
    Display Active                        : Disabled
    Persistence Mode                      : Disabled
    Addressing Mode                       : None
    MIG Mode
        Current                           : N/A
        Pending                           : N/A
    Accounting Mode                       : Disabled
    Accounting Mode Buffer Size           : 4000
    Driver Model
        Current                           : N/A
        Pending                           : N/A
    Serial Number                         : 1561620040299
    GPU UUID  

In [None]:
!nvidia-smi pmon
!nvidia-smi dmon

# gpu         pid   type     sm    mem    enc    dec    jpg    ofa    command 
# Idx           #    C/G      %      %      %      %      %      %    name 
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0          -     -      -      -      -      -      -      -    -              
    0

Q3: Debug common CUDA errors (zero output, incorrect indexing, PTX errors)

In [None]:
#zero output
%%writefile hello.cu
#include <cuda_runtime.h>
#include <stdio.h>

// Device code: every launched thread prints one greeting line.
__global__ void kernel()
{
    printf("Hello GPU\n");
}

/* Buggy version kept for reference -- it prints nothing:
int main() {
    kernel<<<1, 8>>>();   // kernel launch is asynchronous
    return 0;             // program exits immediately, before the device printf output is flushed
}*/

// Fixed host code: check the launch, then wait for the kernel so its printf output is flushed.
// Returns 0 on success and 1 if the launch or the synchronization reports a CUDA error.
int main() {
    kernel<<<1, 1>>>();   // kernel launched

    // A failed launch (for example, a binary built for the wrong architecture) is reported
    // only through cudaGetLastError(); left unchecked it simply looks like "zero output".
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    err = cudaDeviceSynchronize();  // fix: block until the kernel has finished
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

Overwriting hello.cu


In [None]:
# Compile hello.cu into ./hello; -arch=sm_75 selects the GPU architecture to build for.
!nvcc -arch=sm_75 hello.cu -o hello

In [None]:
# Run the compiled program; the kernel's printf output should appear below.
!./hello

Hello GPU


In [None]:
#incorrect indexing

%%writefile indexing.cpp
#include <iostream>
using namespace std;

// Host-only simulation of a 2-block x 4-thread launch that reproduces the classic
// indexing mistake: the printed ID is the thread index alone, so IDs repeat per block.
int main() {
    const int numBlocks = 2;
    const int threadsPerBlock = 4;

    std::cout << "Incorrect indexing output:\n";

    // Outer loop plays the role of the block index, inner loop the thread index.
    for (int block = 0; block < numBlocks; ++block) {
        for (int thread = 0; thread < threadsPerBlock; ++thread) {
            // Intentional bug for the demo: the block index is ignored.
            const int id = thread;
            std::cout << "Thread " << id << std::endl;
        }
    }

    return 0;
}

Writing indexing.cpp


In [None]:
# Build the host-only C++ simulation (the file contains no kernel) into ./indexing.
!nvcc indexing.cpp -o indexing

In [None]:
# Run the simulation; with the thread-only index, IDs 0-3 are printed once per block.
!./indexing

Incorrect indexing output:
Thread 0
Thread 1
Thread 2
Thread 3
Thread 0
Thread 1
Thread 2
Thread 3


In [None]:
#corrected
%%writefile indexing.cpp
#include <iostream>
using namespace std;

// Host-only simulation of a 2-block x 4-thread launch using the correct global ID,
// so every simulated thread prints a unique value (0..7).
int main() {
    int blocks = 2;
    int threads = 4;

    // Banner fixed: this version demonstrates the corrected formula, not the bug.
    cout << "Corrected indexing output:\n";

    for (int blockIdx = 0; blockIdx < blocks; blockIdx++) {
        for (int threadIdx = 0; threadIdx < threads; threadIdx++) {
            // Global ID = block offset + position inside the block.
            int id = blockIdx * threads + threadIdx;
            cout << "Thread " << id << endl;
        }
    }

    return 0;
}

Overwriting indexing.cpp


In [None]:
# Rebuild ./indexing from the overwritten indexing.cpp.
!nvcc indexing.cpp -o indexing

In [None]:
# Run the rebuilt program; each simulated thread should now print a distinct ID (0-7).
!./indexing

Incorrect indexing output:
Thread 0
Thread 1
Thread 2
Thread 3
Thread 4
Thread 5
Thread 6
Thread 7


In [None]:
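# PTX errors (version mismatch): typically the code nvcc generates by default cannot be
# loaded by the installed driver/GPU, so kernels fail to run.
# Fix: compile for the GPU's own architecture (sm_75 for the Tesla T4 used here), e.g.
# nvcc -arch=sm_75 file.cu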

Q4: Write a CUDA C/C++ program to demonstrate GPU kernel execution and thread indexing.
a. Launch a CUDA kernel using: 1 block and 8 threads
b. Each thread must print: Hello from GPU thread <global_thread_id>
c. Compute the global thread ID using: global_thread_id = blockIdx.x * blockDim.x +
threadIdx.x
d. Clearly separate: Host code (CPU) & Device code (GPU kernel)

In [None]:
%%writefile helloworld.cu
#include <stdio.h>

// ================= DEVICE CODE =================
// Each thread derives its global index from its block and thread coordinates and prints it.
__global__ void helloKernel() {
    // Offset of this thread's block, then the thread's position inside that block.
    const int block_offset = blockIdx.x * blockDim.x;
    const int global_thread_id = block_offset + threadIdx.x;
    printf("Hello from GPU thread %d\n", global_thread_id);
}

// ================= HOST CODE ===================
// Launches helloKernel and waits for it so the device-side printf output is shown.
// Returns 0 on success and 1 if the launch or the synchronization reports a CUDA error.
int main() {
    // Launch kernel: 1 block, 8 threads
    helloKernel<<<1, 8>>>();

    // Launch failures are reported only through cudaGetLastError(); without this check
    // a failed launch just produces no output.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Synchronize device: wait for the kernel to finish before the program exits
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}


Writing helloworld.cu


In [None]:
# Compile helloworld.cu into ./helloworld; -arch=sm_75 selects the GPU architecture to build for.
!nvcc -arch=sm_75 helloworld.cu -o helloworld

In [None]:
# Run the program; each of the 8 launched threads prints its global thread ID.
!./helloworld

Hello from GPU thread 0
Hello from GPU thread 1
Hello from GPU thread 2
Hello from GPU thread 3
Hello from GPU thread 4
Hello from GPU thread 5
Hello from GPU thread 6
Hello from GPU thread 7


Q5: Write a CUDA program to demonstrate host and device memory separation.
a. Create an integer array of size 5 on the host (CPU).
b. Allocate corresponding memory on the device (GPU) using cudaMalloc().
c. Copy data from host to device using cudaMemcpy().
d. Launch a kernel where GPU threads print values from device memory.
e. Copy the data back from device to host and print it on CPU.

In [None]:
%%writefile arr.cu
#include <stdio.h>

// ================= DEVICE CODE =================
// Each thread prints the element of d_arr located at its own thread index.
// NOTE(review): there is no bounds check -- this assumes the launch uses no more
// threads than d_arr has elements; confirm at every call site.
__global__ void printDeviceArray(int *d_arr) {
    const int tid = threadIdx.x;
    const int value = d_arr[tid];
    printf("GPU thread %d value = %d\n", tid, value);
}

// ================= HOST CODE ===================
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Copies a 5-element host array to the device, lets the GPU print it, copies it back
// and prints it on the CPU. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int N = 5;                       // element count, shared by allocation, copies and launch
    const size_t bytes = N * sizeof(int);
    int h_arr[N] = {10, 20, 30, 40, 50};
    int *d_arr = NULL;

    // Allocate device memory
    if (!cudaOk(cudaMalloc((void**)&d_arr, bytes), "cudaMalloc"))
        return 1;

    // Copy host to device
    bool ok = cudaOk(cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");

    // Launch kernel with one thread per element, then wait so the device printf output appears
    if (ok) {
        printDeviceArray<<<1, N>>>(d_arr);
        ok = cudaOk(cudaGetLastError(), "kernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    // Copy device to host
    if (ok)
        ok = cudaOk(cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");

    // Print on CPU
    if (ok) {
        printf("CPU output:\n");
        for (int i = 0; i < N; i++)
            printf("%d ", h_arr[i]);
        printf("\n");   // terminate the line so later output starts on a fresh one
    }

    // Free memory on every path once the allocation has succeeded
    cudaFree(d_arr);

    return ok ? 0 : 1;
}


Writing arr.cu


In [None]:
# Compile arr.cu into ./arr; -arch=sm_75 selects the GPU architecture to build for.
!nvcc -arch=sm_75 arr.cu -o arr

In [None]:
# Run the program: GPU threads print the device copy, then the CPU prints the copied-back array.
!./arr

GPU thread 0 value = 10
GPU thread 1 value = 20
GPU thread 2 value = 30
GPU thread 3 value = 40
GPU thread 4 value = 50
CPU output:
10 20 30 40 50 

Q6: Compare CPU times of list/tuple with NumPy arrays.

In [None]:
import time
import numpy as np

size = 10_000_000


def timed_sum(label, sum_fn, data):
    """Sum `data` with `sum_fn`, print the elapsed seconds as "<label> time:", and return the total.

    Only the summation is timed; building `data` is the caller's job.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for measuring short
    # durations; time.time() follows the wall clock, which can be adjusted mid-measurement.
    start = time.perf_counter()
    total = sum_fn(data)
    print(f"{label} time:", time.perf_counter() - start)
    return total


# List
lst = list(range(size))
lst_sum = timed_sum("List", sum, lst)

# Tuple
tpl = tuple(range(size))
tpl_sum = timed_sum("Tuple", sum, tpl)

# NumPy (explicit 64-bit integers so the total cannot overflow on any platform)
arr = np.arange(size, dtype=np.int64)
arr_sum = timed_sum("NumPy", np.sum, arr)

# The comparison is only meaningful if all three approaches computed the same value.
assert lst_sum == tpl_sum == int(arr_sum), "the three sums disagree"


List time: 0.0789177417755127
Tuple time: 0.07655596733093262
NumPy time: 0.006797075271606445
