In [None]:
import os
# Make the CUDA toolchain (nvcc) and its runtime libraries discoverable by the
# shell commands run later in this cell. A variable that is not set yet is
# created instead of raising KeyError, and no stray leading ':' is produced
# (an empty entry in a search path means "current directory").
for _env_name, _cuda_dir in (
    ('PATH', '/usr/local/cuda/bin'),
    ('LD_LIBRARY_PATH', '/usr/local/cuda/lib64'),
):
    _existing = os.environ.get(_env_name)
    os.environ[_env_name] = _existing + ':' + _cuda_dir if _existing else _cuda_dir
# NOTE(review): nothing in the visible cell imports pycuda -- the CUDA program
# below is compiled with nvcc and run as a standalone binary -- so this install
# looks unnecessary; confirm before removing.
!pip install pycuda

# Write CUDA C code to a file
cuda_code = """
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <curand_kernel.h>

// Swarm size; one GPU thread is assigned to each particle.
#define N_PARTICLES 1000
// Inertia weight: fraction of the previous velocity kept at each step.
#define W 0.8
// Weight of the pull towards the particle's own best position.
#define C1 0.1
// Weight of the pull towards the swarm-wide best position.
#define C2 0.1
// Number of velocity/position updates every particle performs.
#define N_ITERATIONS 500
// Threads per block used for the kernel launch in main().
#define BLOCK_SIZE 256

// State of a single swarm member. All seven fields are floats, declared in the
// order: position, velocity, personal-best position, personal-best objective.
typedef struct {
    float x_pos, y_pos;      // current position in the search plane
    float x_velo, y_velo;    // current velocity
    float x_best, y_best;    // position at which `best` was recorded
    float best;              // lowest objective value this particle has seen
} particle;

// Spin lock serialising updates of the (gbest, gbest_obj) pair: 0 = free, 1 = held.
__device__ int pso_gbest_lock = 0;

// Objective function being minimised. Operand types mirror the original
// expression (float position, double constants) so results are unchanged.
__device__ __forceinline__ double pso_objective(float x, float y) {
    return pow(x - 3.14, 2) + pow(y - 2.72, 2) + sin(3 * x + 1.41) + sin(4 * y - 1.73);
}

// Offer one particle's personal best as the new swarm-wide best.
// The objective and both coordinates are written together under a lock, so a
// worse candidate can never overwrite a better one and the stored position
// always belongs to the stored objective value.
__device__ void pso_publish_best(float best, float x_best, float y_best,
                                 double *gbest, double *gbest_obj) {
    // volatile: always re-read values that other threads may have just written.
    volatile double *shared_obj = gbest_obj;
    volatile double *shared_pos = gbest;

    // Cheap unlocked pre-check keeps non-improving threads off the lock.
    bool pending = best < *shared_obj;

    // The loop-with-flag form (instead of a bare spin) lets the lane that owns
    // the lock finish and release it even when other lanes of the same warp
    // are still waiting for it.
    while (pending) {
        if (atomicCAS(&pso_gbest_lock, 0, 1) == 0) {
            __threadfence();
            // Re-check under the lock: another thread may have improved the
            // global best since the pre-check.
            if (best < *shared_obj) {
                shared_pos[0] = x_best;
                shared_pos[1] = y_best;
                *shared_obj = best;
            }
            __threadfence();  // make the new triple visible before unlocking
            atomicExch(&pso_gbest_lock, 0);
            pending = false;
        }
    }
}

// Particle swarm optimisation: each thread owns one particle, initialises it
// at a random position and runs N_ITERATIONS velocity/position updates.
//
// Launch layout: 1-D grid of 1-D blocks with at least N_PARTICLES threads in
// total; surplus threads only take part in the block barrier.
//
// particles : device array of N_PARTICLES elements, written with the final
//             state of every particle when the kernel finishes.
// gbest     : device array of two doubles receiving the best (x, y) found.
// gbest_obj : device double receiving the best objective value found.
//             Precondition: the caller must set it before the launch to a
//             value no smaller than any reachable objective (for example
//             +infinity); otherwise valid candidates are rejected.
// seed      : curand seed; the thread index is used as the sequence number.
__global__ void pso_kernel(particle *particles, double *gbest, double *gbest_obj, unsigned long long seed) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    const bool active = index < N_PARTICLES;
    volatile double *shared_pos = gbest;

    curandState state;
    particle p = {};  // per-thread working copy, stored to global memory once at the end

    if (active) {
        curand_init(seed, index, 0, &state);

        // Initialization
        p.x_pos = curand_uniform(&state) * 5;
        p.y_pos = curand_uniform(&state) * 5;
        p.x_velo = curand_uniform(&state) * 0.1;
        p.y_velo = curand_uniform(&state) * 0.1;
        p.x_best = p.x_pos;
        p.y_best = p.y_pos;
        p.best = pso_objective(p.x_pos, p.y_pos);

        // Publish the starting point so gbest already holds a real candidate
        // when this thread first reads it in the velocity update below.
        pso_publish_best(p.best, p.x_best, p.y_best, gbest, gbest_obj);
    }

    // Every thread of the block runs the loop so that the barrier inside it is
    // reached by all of them; only the per-particle work is guarded.
    for (int i = 0; i < N_ITERATIONS; i++) {
        if (active) {
            double r1 = curand_uniform(&state);
            double r2 = curand_uniform(&state);
            double gx = shared_pos[0];
            double gy = shared_pos[1];

            p.x_velo = W * p.x_velo + C1 * r1 * (p.x_best - p.x_pos) + C2 * r2 * (gx - p.x_pos);
            p.y_velo = W * p.y_velo + C1 * r1 * (p.y_best - p.y_pos) + C2 * r2 * (gy - p.y_pos);

            p.x_pos += p.x_velo;
            p.y_pos += p.y_velo;

            double obj = pso_objective(p.x_pos, p.y_pos);
            if (p.best > obj) {
                p.x_best = p.x_pos;
                p.y_best = p.y_pos;
                p.best = obj;
            }
        }

        // Block-wide barrier, deliberately outside the `active` branch.
        __syncthreads();

        if (active) {
            // Publish the personal-best position, i.e. the coordinates that
            // actually produced p.best, not the current position.
            pso_publish_best(p.best, p.x_best, p.y_best, gbest, gbest_obj);
        }
    }

    if (active) {
        particles[index] = p;
    }
}

// Abort with a diagnostic when a CUDA runtime call did not succeed.
// The newline is emitted with fputc(10, ...) because this source is stored in
// a non-raw Python string, where a backslash escape would be expanded before
// nvcc ever sees the file.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s", what, cudaGetErrorString(status));
        fputc(10, stderr);
        exit(EXIT_FAILURE);
    }
}

// Host driver: allocates the swarm and the global-best slots on the device,
// seeds the global best, runs pso_kernel once, and prints the best solution
// together with the kernel execution time in seconds.
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA or allocation error.
int main() {
    particle *d_particles = NULL;
    double *d_gbest = NULL;
    double *d_gbest_obj = NULL;
    double total_time;
    const size_t swarm_bytes = N_PARTICLES * sizeof(particle);

    unsigned long long seed = time(NULL);
    check_cuda(cudaMalloc(&d_particles, swarm_bytes), "cudaMalloc(particles)");
    check_cuda(cudaMalloc(&d_gbest, 2 * sizeof(double)), "cudaMalloc(gbest)");
    check_cuda(cudaMalloc(&d_gbest_obj, sizeof(double)), "cudaMalloc(gbest_obj)");

    // cudaMalloc does not initialise memory. Seed the global best with
    // "nothing found yet": an objective no particle can fail to beat, and the
    // centre of the (0, 5] x (0, 5] start region as a neutral position for
    // any velocity update that runs before a particle has published.
    double h_gbest[2] = {2.5, 2.5};
    double h_gbest_obj = HUGE_VAL;
    check_cuda(cudaMemcpy(d_gbest, h_gbest, 2 * sizeof(double), cudaMemcpyHostToDevice), "upload of gbest");
    check_cuda(cudaMemcpy(d_gbest_obj, &h_gbest_obj, sizeof(double), cudaMemcpyHostToDevice), "upload of gbest_obj");

    // Kernel launches are asynchronous, so host clock() calls around the
    // launch would time only the launch itself. CUDA events recorded on the
    // same stream measure the actual execution.
    cudaEvent_t start, stop;
    float elapsed_ms = 0.0f;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    pso_kernel<<<(N_PARTICLES + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(d_particles, d_gbest, d_gbest_obj, seed);
    check_cuda(cudaGetLastError(), "pso_kernel launch");
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    check_cuda(cudaEventSynchronize(stop), "pso_kernel execution");
    check_cuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");

    // Destination of a device-to-host copy must be host memory.
    particle *h_particles = (particle *)malloc(swarm_bytes);
    if (h_particles == NULL) {
        fprintf(stderr, "Out of host memory for the particle buffer");
        fputc(10, stderr);
        exit(EXIT_FAILURE);
    }

    check_cuda(cudaMemcpy(h_particles, d_particles, swarm_bytes, cudaMemcpyDeviceToHost), "download of particles");
    check_cuda(cudaMemcpy(&h_gbest_obj, d_gbest_obj, sizeof(double), cudaMemcpyDeviceToHost), "download of gbest_obj");
    check_cuda(cudaMemcpy(h_gbest, d_gbest, 2 * sizeof(double), cudaMemcpyDeviceToHost), "download of gbest");

    total_time = elapsed_ms / 1000.0;  // events report milliseconds
    printf("PSO found best solution at f(%lf,%lf)=%lf   ", h_gbest[0], h_gbest[1], h_gbest_obj);
    printf("Total time: %f", total_time);

    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check_cuda(cudaFree(d_particles), "cudaFree(particles)");
    check_cuda(cudaFree(d_gbest), "cudaFree(gbest)");
    check_cuda(cudaFree(d_gbest_obj), "cudaFree(gbest_obj)");
    free(h_particles);

    return 0;
}


"""
with open("cuda1.cu", "w") as f:
    f.write(cuda_code)

# Compile CUDA code
!nvcc -o h cuda1.cu

# Run compiled binary
!./h