In [None]:
%%writefile flock.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

// One velocity-alignment ("flocking") step, one thread per agent.
//
// Layout: agent k occupies floats [4k, 4k+3] of the state array as
// x, y, vx, vy, so both arrays hold 4*N floats.
//
// agents      - current state, only read by this kernel.
// agents_next - receives the updated state of every agent with index < N.
// N           - number of agents; threads whose global index is >= N do nothing.
//
// Launch with a 1D grid/block covering at least N threads. No shared memory
// is used. Each thread scans all other agents, so total work is N*(N-1)
// distance tests.
__global__
void flockKernel(const float* agents, float* agents_next, int N)
{
    const float kRadius   = 5.0f;               // interaction radius
    const float kRadiusSq = kRadius * kRadius;  // compared against squared distance
    const float kBlend    = 0.05f;              // fraction of the gap to the neighbour average closed per step

    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    if (self < N)
    {
        const float* me = agents + 4 * self;
        const float posX = me[0];
        const float posY = me[1];
        const float velX = me[2];
        const float velY = me[3];

        // Accumulate the velocities of every other agent strictly inside the radius.
        float accVx = 0.0f;
        float accVy = 0.0f;
        int   nearby = 0;
        for (int other = 0; other < N; ++other)
        {
            if (other != self)
            {
                const float* them = agents + 4 * other;
                const float offX = them[0] - posX;
                const float offY = them[1] - posY;
                const float distSq = offX*offX + offY*offY;
                if (distSq < kRadiusSq)
                {
                    accVx += them[2];
                    accVy += them[3];
                    ++nearby;
                }
            }
        }

        // With no neighbours the velocity is carried over unchanged.
        float outVx = velX;
        float outVy = velY;
        if (nearby > 0)
        {
            const float meanVx = accVx / nearby;
            const float meanVy = accVy / nearby;
            outVx = velX + kBlend * (meanVx - velX);
            outVy = velY + kBlend * (meanVy - velY);
        }

        // Position advances by the *updated* velocity.
        float* result = agents_next + 4 * self;
        result[0] = posX + outVx;
        result[1] = posY + outVy;
        result[2] = outVx;
        result[3] = outVy;
    }
}
// Runs a single flocking step on the GPU.
//
// agents      - host pointer to 4*N input floats ([x, y, vx, vy] per agent).
// agents_next - host pointer to 4*N floats that receives the updated state.
// N           - number of agents; N <= 0 is treated as "nothing to do".
//
// The function allocates two device buffers, uploads the input, launches
// flockKernel with 256-thread blocks, and downloads the result. If any CUDA
// call fails, the failing stage and the CUDA error string are written to
// stderr, the remaining stages are skipped, and the device buffers are still
// released; in that case agents_next may not have been written.
void solve_host(const float* agents, float* agents_next, int N)
{
    // A zero-block launch is an invalid configuration, so bail out early.
    if (N <= 0) return;

    float *d_agents     = nullptr;
    float *d_agentsNext = nullptr;

    // Widen before multiplying so the byte count cannot overflow int.
    const size_t size = (size_t)4 * (size_t)N * sizeof(float);

    const int blockSize = 256;
    // Ceil-div written so that N + blockSize - 1 cannot overflow int.
    const int gridSize  = N / blockSize + (N % blockSize != 0 ? 1 : 0);

    // Each stage runs only if all previous ones succeeded; `stage` names the
    // step that is reported if `err` ends up non-success.
    const char* stage = "cudaMalloc(d_agents)";
    cudaError_t err = cudaMalloc((void**)&d_agents, size);

    if (err == cudaSuccess) {
        stage = "cudaMalloc(d_agentsNext)";
        err = cudaMalloc((void**)&d_agentsNext, size);
    }
    if (err == cudaSuccess) {
        stage = "cudaMemcpy host-to-device";
        err = cudaMemcpy(d_agents, agents, size, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        stage = "flockKernel launch";
        flockKernel<<<gridSize, blockSize>>>(d_agents, d_agentsNext, N);
        // Launches do not return a status; configuration errors show up here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        stage = "flockKernel execution";
        // Faults raised while the kernel runs are reported by the next sync.
        err = cudaDeviceSynchronize();
    }
    if (err == cudaSuccess) {
        stage = "cudaMemcpy device-to-host";
        err = cudaMemcpy(agents_next, d_agentsNext, size, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "solve_host: %s failed: %s\n", stage, cudaGetErrorString(err));
    }

    // cudaFree on a null pointer performs no operation, so this is safe on
    // every path, including a failed first allocation.
    cudaFree(d_agents);
    cudaFree(d_agentsNext);
}

// Demo driver: fills 4*N floats with pseudo-random values in [-5, 5]
// (fixed seed, so runs are repeatable), performs one step through
// solve_host, and prints the first 8 floats of the result.
// Returns 1 if host memory cannot be allocated, 0 otherwise.
int main() {
    const int N = 1024;
    const size_t elems = (size_t)4 * N;

    float* agents = (float*)malloc(elems * sizeof(float));
    // Zero-initialised so the printout below never reads indeterminate memory,
    // even if solve_host did not manage to write a result.
    float* agents_next = (float*)calloc(elems, sizeof(float));
    if (agents == NULL || agents_next == NULL) {
        fprintf(stderr, "flock: failed to allocate host buffers\n");
        free(agents);       // free(NULL) is a no-op
        free(agents_next);
        return 1;
    }

    srand(0);
    for (size_t i = 0; i < elems; ++i) agents[i] = ((float)rand() / RAND_MAX) * 10.0f - 5.0f;

    solve_host(agents, agents_next, N);

    printf("Sample output (first 8 floats): ");
    for (int i = 0; i < 8; ++i) printf("%0.6f ", agents_next[i]);
    printf("\n");

    free(agents); free(agents_next);
    return 0;
}

In [None]:
# Compile and run the flocking demo (small example)
!nvcc -O3 flock.cu -o flock && ./flock

## Output:
```
Sample output (first 8 floats): -1.234567 0.345678 0.123456 ... (example values)
```
Note: The demo uses N=1024 agents by default. Every agent is compared against every other agent, so the work grows roughly with N²; reduce `N` in `main()` if you want a faster run or to conserve GPU memory.