## 1


- `!` → Executes shell commands from a notebook cell (e.g., `!ls`, `!nvcc --version`).
- `%` → Line magic command (operates on a single line, e.g., `%time`, `%matplotlib inline`).
- `%%` → Cell magic command (applies to the entire cell, e.g., `%%time`, `%%writefile file.cu`).


## 2


Common `nvidia-smi` commands:

- `nvidia-smi`
- `nvidia-smi -L`
- `nvidia-smi -q`
- `nvidia-smi -q -d MEMORY`
- `nvidia-smi -q -d UTILIZATION`
- `nvidia-smi --help`
- `nvidia-smi topo -m`
- `nvidia-smi pmon -i 0`
- `nvidia-smi dmon`
- `nvidia-smi --gpu-reset -i 0` (resets GPU 0; requires root and no processes using that GPU)


## 3


Common CUDA Errors:

1. Zero Output:
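   - Kernel launch failed silently (check `cudaGetLastError()` right after the launch)
   - Missing `cudaMemcpy` of the results back to the host
   - Host did not wait for the kernel to finish (call `cudaDeviceSynchronize()` before reading results or exiting)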

2. Incorrect Indexing:
   - Wrong global thread ID calculation
   - Out-of-bounds memory access
   - Incorrect grid/block configuration

3. PTX Errors:
   - Unsupported architecture (use correct `-arch=sm_xx`)
   - CUDA toolkit mismatch
   - Syntax errors in kernel code


## 4

In [None]:

%%writefile hello_gpu.cu
#include <stdio.h>

// Device Code (GPU Kernel)
// Kernel: every thread prints its global index along x, computed as
// blockIdx.x * blockDim.x + threadIdx.x. Takes no arguments and writes
// nothing to memory; device-side printf is its only effect.
__global__ void helloKernel() {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    printf("Hello from GPU thread %d\n", tid);
}

// Host Code (CPU)
// Launches helloKernel with 1 block of 8 threads, waits for it to finish, and
// reports any CUDA failure.
//
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    helloKernel<<<1, 8>>>();   // 1 block, 8 threads

    // A kernel launch does not return a status; a bad launch configuration or
    // unsupported architecture only shows up through cudaGetLastError().
    cudaError_t status = cudaGetLastError();

    // The launch is asynchronous: synchronizing both flushes the device-side
    // printf output and surfaces errors raised while the kernel was running.
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}


## 5

In [None]:

%%writefile memory_demo.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Device Code
// Kernel: each thread prints the element of d_array at its own threadIdx.x.
// Only threadIdx.x is used as the index (blockIdx is not consulted) and there
// is no bounds check, so the caller must not launch more threads per block
// than d_array has elements.
__global__ void printKernel(int *d_array) {
    const int tid = threadIdx.x;
    const int value = d_array[tid];
    printf("GPU Thread %d: %d\n", tid, value);
}

// Host Code
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Copies a small host array to the device, has printKernel print it with one
// thread per element, copies it back, and prints it again on the host.
//
// Returns 0 on success, 1 if any CUDA call or the kernel failed.
int main() {
    const int count = 5;                     // single source of truth for the array length
    int h_array[count] = {10, 20, 30, 40, 50};
    const size_t bytes = sizeof(h_array);
    int *d_array = NULL;

    if (!cudaOk(cudaMalloc((void**)&d_array, bytes), "cudaMalloc")) {
        return 1;
    }

    bool ok = cudaOk(cudaMemcpy(d_array, h_array, bytes, cudaMemcpyHostToDevice),
                     "host-to-device copy");

    if (ok) {
        // One thread per element: printKernel indexes by threadIdx.x with no
        // bounds check, so the block size must equal the element count.
        printKernel<<<1, count>>>(d_array);

        // Launch errors are only visible via cudaGetLastError(); errors raised
        // while the kernel runs surface at the synchronizing call.
        ok = cudaOk(cudaGetLastError(), "kernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "kernel execution");
    }

    if (ok) {
        ok = cudaOk(cudaMemcpy(h_array, d_array, bytes, cudaMemcpyDeviceToHost),
                    "device-to-host copy");
    }

    if (ok) {
        printf("Back on CPU:\n");
        for (int i = 0; i < count; i++) {
            printf("%d ", h_array[i]);
        }
    }

    // Release the device buffer on both the success and the failure path.
    cudaFree(d_array);
    return ok ? 0 : 1;
}


## 6

In [None]:

import time
import numpy as np

# Build the same integer sequence three ways (list, tuple, NumPy array) and
# print how long each construction takes, in seconds.
#
# time.perf_counter() is used rather than time.time(): it is monotonic and
# backed by the highest-resolution clock available, so short intervals such as
# the NumPy case are not distorted by clock granularity or system clock changes.
NUM_ITEMS = 1000000  # number of elements built in each measurement

# List timing
start = time.perf_counter()
lst = [i for i in range(NUM_ITEMS)]
end = time.perf_counter()
print("List time:", end - start)

# Tuple timing (tuple() consumes a generator expression)
start = time.perf_counter()
tpl = tuple(i for i in range(NUM_ITEMS))
end = time.perf_counter()
print("Tuple time:", end - start)

# NumPy timing
start = time.perf_counter()
arr = np.arange(NUM_ITEMS)
end = time.perf_counter()
print("NumPy time:", end - start)
