In [73]:
%%writefile code4.cu

#include <iostream>
#include <stdio.h>
#include <math.h>
#include <omp.h> // Include the OpenMP library

__global__ void matrixVectorMultiplyWithSigmoid(double *d_X, double *d_vector, double *d_result) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < 5) {
        double sum = 0.0;
        for (int j = 0; j < 3; j++) {
            sum += d_X[idx * 3 + j] * d_vector[j];
        }
        // Apply sigmoid to the sum
        d_result[idx] = 1.0 / (1.0 + exp(-sum));
    }
}

__global__ void subtractArrays(double *d_result, double *d_Y) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < 5) {
        d_result[idx] -= d_Y[idx];
    }
}

__global__ void multiplyTransposedMatrix(double *d_X, double *d_result, double *d_gradient_update) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < 3) {
        double product = 0.0;
        for (int j = 0; j < 5; j++) {
            product += d_result[j] * d_X[j * 3 + idx];
        }
        d_gradient_update[idx] = product;
    }
}

double calculateL2Norm(double *v, int n) {
    double norm = 0.0;

    for (int i = 0; i < n; i++) {
        norm += v[i] * v[i];
    }

    // Take the square root to get the L2 norm
    norm = sqrt(norm);

    return norm;
}

int main() {
    double X[5][3] = {
        {1.0, 2.0, 3.0},
        {4.0, 5.0, 6.0},
        {7.0, 8.0, 9.0},
        {10.0, 11.0, 12.0},
        {13.0, 14.0, 15.0}
    };
    double vector[3] = {0.1, 0.1, 0.1};
    double result[5];
    double Y[5] = {0.0, 1.0, 0.0, 1.0, 1.0};
    double gradient_update[3];
    double gradient[3];
    double w[3] = {0.1, 0.1, 0.1};
    double alpha = 0.05;
    int n = 3;
    int iteration = 0;
    int max_iterations = 10;
    double epsilon = 0.001;
    double epsilon_sq = epsilon * epsilon;

    double *d_X, *d_vector, *d_result, *d_Y, *d_gradient_update;
    cudaMalloc((void **)&d_X, 5 * 3 * sizeof(double));
    cudaMalloc((void **)&d_vector, 3 * sizeof(double));
    cudaMalloc((void **)&d_result, 5 * sizeof(double));
    cudaMalloc((void **)&d_Y, 5 * sizeof(double));
    cudaMalloc((void **)&d_gradient_update, 3 * sizeof(double));


    cudaMemcpy(d_X, X, 5 * 3 * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector, vector, 3 * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Y, Y, 5 * sizeof(double), cudaMemcpyHostToDevice);

    const int maxCores = 8; // Set the maximum number of CPU cores to test

    for (int numCores = 1; numCores <= maxCores; numCores++) {
        // Set the number of threads for parallel processing
        omp_set_num_threads(numCores);

        double start_time = omp_get_wtime(); // Get the start time

        // Calculate the number of threads per block and the number of blocks
        dim3 threadsPerBlock(256);
        dim3 numBlocks((5 + threadsPerBlock.x - 1) / threadsPerBlock.x);

        while (true) {
            matrixVectorMultiplyWithSigmoid<<<numBlocks, threadsPerBlock>>>(d_X, d_vector, d_result);
            cudaDeviceSynchronize();

            subtractArrays<<<numBlocks, threadsPerBlock>>>(d_result, d_Y);
            cudaDeviceSynchronize();

            multiplyTransposedMatrix<<<1, 3>>>(d_X, d_result, d_gradient_update);
            cudaDeviceSynchronize();

            cudaMemcpy(result, d_result, 5 * sizeof(double), cudaMemcpyDeviceToHost);
            cudaMemcpy(gradient_update, d_gradient_update, 3 * sizeof(double), cudaMemcpyDeviceToHost);

            // Copy the content from gradient_update to gradient
            cudaMemcpy(gradient, d_gradient_update, 3 * sizeof(double), cudaMemcpyDeviceToHost);

            // Print the updated result
            printf("Updated Result:\n");
            for (int i = 0; i < 5; i++) {
                printf("%.5f ", result[i]);
            }
            printf("\n");

            // Print the gradient
            printf("Gradient:\n");
            for (int i = 0; i < 3; i++) {
                printf("%.2f ", gradient[i]);
            }
            printf("\n");

            // Update the weight vector
            for (int i = 0; i < 3; i++) {
                w[i] -= alpha * gradient[i];
            }

            // Print the updated weight vector
            printf("Updated Weight Vector:\n");
            for (int i = 0; i < 3; i++) {
                printf("%.2f ", w[i]);
            }
            printf("\n");

            double l2Norm = calculateL2Norm(gradient, n);

            printf("L2 Norm: %.5f\n", l2Norm);

            if (l2Norm * l2Norm <= epsilon_sq) {
                break;
            }

            if (iteration >= max_iterations) {
                break;
            }

            iteration++;
        }

        double end_time = omp_get_wtime(); // Get the end time
        double elapsed_time = end_time - start_time;

        // Print the number of CPU cores and the elapsed time
        std::cout << "Number of CPU cores: " << numCores << std::endl;
        std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl;
    }

    cudaFree(d_X);
    cudaFree(d_vector);
    cudaFree(d_result);
    cudaFree(d_Y);
    cudaFree(d_gradient_update);

    return 0;
}









Overwriting code4.cu


In [74]:
! nvcc code4.cu -o code4.o -Xcompiler -fopenmp

In [75]:
! ./code4.o

Updated Result:
0.64566 -0.18243 0.91683 -0.03557 -0.01477 
Gradient:
5.79 7.12 8.45 
Updated Weight Vector:
-0.19 -0.26 -0.32 
L2 Norm: 12.46736
Updated Result:
0.64566 -0.18243 0.91683 -0.03557 -0.01477 
Gradient:
5.79 7.12 8.45 
Updated Weight Vector:
-0.48 -0.61 -0.74 
L2 Norm: 12.46736
Updated Result:
0.64566 -0.18243 0.91683 -0.03557 -0.01477 
Gradient:
5.79 7.12 8.45 
Updated Weight Vector:
-0.77 -0.97 -1.17 
L2 Norm: 12.46736
Updated Result:
0.64566 -0.18243 0.91683 -0.03557 -0.01477 
Gradient:
5.79 7.12 8.45 
Updated Weight Vector:
-1.06 -1.32 -1.59 
L2 Norm: 12.46736
Updated Result:
0.64566 -0.18243 0.91683 -0.03557 -0.01477 
Gradient:
5.79 7.12 8.45 
Updated Weight Vector:
-1.35 -1.68 -2.01 
L2 Norm: 12.46736
Updated Result:
0.64566 -0.18243 0.91683 -0.03557 -0.01477 
Gradient:
5.79 7.12 8.45 
Updated Weight Vector:
-1.64 -2.03 -2.43 
L2 Norm: 12.46736
Updated Result:
0.64566 -0.18243 0.91683 -0.03557 -0.01477 
Gradient:
5.79 7.12 8.45 
Updated Weight Vector:
-1.93 -2.39 -2.