In [4]:
%%writefile vector_add.cu
#include <iostream>
#include <stdio.h>

__global__ void vectorAdd(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    const int N = 10;
    float A[N], B[N], C[N];

    // Initialize input vectors
    for (int i = 0; i < N; ++i) {
        A[i] = i + 1.0f;
        B[i] = i * 2.0f;
    }

    float *d_a, *d_b,*d_c;
    cudaMalloc(&d_a,N*sizeof(float));
    cudaMalloc(&d_b,N*sizeof(float));
    cudaMalloc(&d_c,N*sizeof(float));
    cudaMemcpy(d_a,A,N*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,B,N*sizeof(float),cudaMemcpyHostToDevice);
    int blocksize=256;
    int gridsize=(N + blocksize - 1) / blocksize; // Corrected grid size calculation
    vectorAdd<<<gridsize,blocksize>>>(d_a,d_b,d_c,N);
    cudaMemcpy(C,d_c,N*sizeof(float),cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Print the result
    printf("Vector A:\n");
    for (int i = 0; i < N; ++i) {
        printf("%f ", A[i]);
    }
    printf("\n");

    printf("Vector B:\n");
    for (int i = 0; i < N; ++i) {
        printf("%f ", B[i]);
    }
    printf("\n");

    printf("Vector C (A + B):\n");
    for (int i = 0; i < N; ++i) {
        printf("%f ", C[i]);
    }
    printf("\n");


    return 0;
}

Overwriting vector_add.cu


In [5]:
# Compile with the specified architecture
!nvcc vector_add.cu -o vector_add -gencode arch=compute_75,code=sm_75

# Run the executable
!./vector_add

Vector A:
1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 
Vector B:
0.000000 2.000000 4.000000 6.000000 8.000000 10.000000 12.000000 14.000000 16.000000 18.000000 
Vector C (A + B):
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 
