In [None]:
# Print the version of the CUDA compiler (nvcc) found on this runtime's PATH.
!nvcc --version

In [None]:
!pip install git+https://github.com/afnan47/cuda.git

In [None]:
# Load the nvcc_plugin IPython extension installed above.
# NOTE(review): presumably this is what registers the %%cu cell magic used
# later in the notebook — confirm against the plugin's README.
%load_ext nvcc_plugin

In [None]:
%%cu
// Element-wise addition of two small integer vectors on the GPU.
// The %%cu cell magic must be the first line of the SAME cell as this source;
// a cell that contains only the magic hands it nothing to compile.
#include <cstdlib>
#include <iostream>

using namespace std;

// Evaluate a CUDA runtime call and abort with a readable message when it
// fails, instead of carrying on and printing uninitialised results.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t status_ = (call);                                         \
        if (status_ != cudaSuccess) {                                         \
            cerr << "CUDA error: " << cudaGetErrorString(status_)             \
                 << " in " << #call << " (" << __FILE__ << ":" << __LINE__    \
                 << ")" << endl;                                              \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Kernel: each thread computes one element, C[tid] = A[tid] + B[tid].
// The bounds check covers launches that start more threads than `size`.
__global__ void add(int* A, int* B, int* C, int size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        C[tid] = A[tid] + B[tid];
    }
}

// Fill `vector[0..size)` with pseudo-random digits in the range 0-9.
void initialize(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        vector[i] = rand() % 10;
    }
}

// Print `vector[0..size)` on one line, space separated, then a newline.
void print(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        cout << vector[i] << " ";
    }
    cout << endl;
}

int main() {
    int N = 4;
    int vectorSize = N;
    size_t vectorBytes = vectorSize * sizeof(int);

    // Host buffers.
    int* A = new int[vectorSize];
    int* B = new int[vectorSize];
    int* C = new int[vectorSize];

    initialize(A, vectorSize);
    initialize(B, vectorSize);

    cout << "Vector A: ";
    print(A, N);
    cout << "Vector B: ";
    print(B, N);

    // Device buffers mirroring A, B and C.
    int* X = nullptr;
    int* Y = nullptr;
    int* Z = nullptr;
    CUDA_CHECK(cudaMalloc(&X, vectorBytes));
    CUDA_CHECK(cudaMalloc(&Y, vectorBytes));
    CUDA_CHECK(cudaMalloc(&Z, vectorBytes));

    CUDA_CHECK(cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice));

    // Round the block count up so every element gets a thread.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);
    // A kernel launch returns no status itself; launch failures (for example a
    // binary built for the wrong GPU architecture) only show up here.
    CUDA_CHECK(cudaGetLastError());

    // The copy back also reports errors raised while the kernel was running.
    CUDA_CHECK(cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost));

    cout << "Addition: ";
    print(C, N);

    delete[] A;
    delete[] B;
    delete[] C;

    CUDA_CHECK(cudaFree(X));
    CUDA_CHECK(cudaFree(Y));
    CUDA_CHECK(cudaFree(Z));

    return 0;
}

In [None]:
# Vector addition with PyTorch. The computation runs on the GPU when CUDA is
# available and falls back to the CPU otherwise, so this cell works on any
# runtime.

import torch
import numpy as np

# Seed both random sources so Restart & Run All reproduces the same vectors.
SEED = 42
torch.manual_seed(SEED)
rng = np.random.default_rng(SEED)

# Check if CUDA is available (queried once and reused below)
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
device = torch.device("cuda" if cuda_available else "cpu")

# Create two random vectors of integer-valued floats in [0, 10)
N = 4
A = torch.randint(0, 10, (N,), dtype=torch.float32)
B = torch.randint(0, 10, (N,), dtype=torch.float32)

# Print input vectors
print(f"Vector A: {A}")
print(f"Vector B: {B}")

# Move to the selected device (a no-op when it is the CPU)
A = A.to(device)
B = B.to(device)

# Add vectors
C = A + B

# The device result must match the same sum computed on the host.
assert torch.equal(C.cpu(), A.cpu() + B.cpu()), "device sum differs from host sum"

# Print result
print(f"Addition: {C}")

# Alternative approach: build the inputs with NumPy first
print("\nAlternative with NumPy conversion:")
A_np = rng.integers(0, 10, N).astype(np.float32)
B_np = rng.integers(0, 10, N).astype(np.float32)

print(f"Vector A: {A_np}")
print(f"Vector B: {B_np}")

# Convert to PyTorch tensors and move to the selected device
A_torch = torch.from_numpy(A_np).to(device)
B_torch = torch.from_numpy(B_np).to(device)

# Add vectors
C_torch = A_torch + B_torch

# Cross-check against the pure NumPy sum.
assert np.array_equal(C_torch.cpu().numpy(), A_np + B_np), "device sum differs from NumPy sum"

# Print result
print(f"Addition: {C_torch.cpu().numpy()}")