<a href="https://colab.research.google.com/github/NaomiJSang/high_perfomance_computing/blob/main/softmax_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Student Name: Naomi Sang**

For project 3, you will write a parallel version of the softmax function that will work on a GPU. The suggested environment is numba/cuda.


You can find more information about the softmax function on
 [Wikipedia](https://en.wikipedia.org/wiki/Softmax_function).

In [None]:
!pip install numba




In [None]:
from numba import cuda
import numpy as np
import math

# Element-wise exponentiation kernel
@cuda.jit
def expon_kernel(input_values, exp_results):
    """Store ``math.exp(input_values[i])`` in ``exp_results[i]``.

    Each thread handles the single element whose index equals its
    position in the 1-D launch grid; threads positioned past the end of
    ``input_values`` do nothing.
    """
    i = cuda.grid(1)  # this thread's position in the 1-D grid
    if i >= input_values.size:
        # Surplus thread from rounding the grid up to whole blocks.
        return
    exp_results[i] = math.exp(input_values[i])

# Element-wise normalization kernel
@cuda.jit
def normalize_kernel(exp_values, total_sum, softmax_results):
    """Store ``exp_values[i] / total_sum`` in ``softmax_results[i]``.

    Each thread handles the single element whose index equals its
    position in the 1-D launch grid; threads positioned past the end of
    ``exp_values`` do nothing.
    """
    i = cuda.grid(1)  # this thread's position in the 1-D grid
    if i >= exp_values.size:
        # Surplus thread from rounding the grid up to whole blocks.
        return
    softmax_results[i] = exp_values[i] / total_sum

def softmax_gpu(input_values):
    """Compute the softmax of a 1-D sequence of numbers on the GPU.

    The exponentiation and the final division run as CUDA kernels
    (``expon_kernel`` and ``normalize_kernel``); the sum of the
    exponentials is computed on the host with NumPy.

    For numerical stability the inputs are shifted by their maximum
    before exponentiation. Softmax is invariant under adding a constant
    to every input, so the returned values are unchanged, but large
    inputs no longer overflow to ``inf`` (which would turn the result
    into ``nan``). As a consequence the intermediate values printed
    below are the exponentials of the *shifted* inputs.

    Args:
        input_values: 1-D array-like of real numbers. It is converted to
            a ``float64`` NumPy array before being sent to the device.

    Returns:
        A ``float64`` NumPy array of the same length holding the softmax
        of ``input_values``. An empty input yields an empty array.
    """
    host_values = np.asarray(input_values, dtype=np.float64)
    n = host_values.size

    # Nothing to normalise; this also avoids launching kernels on a
    # zero-block grid and dividing by a zero sum.
    if n == 0:
        return np.empty(0, dtype=np.float64)

    # Shift so the largest input becomes 0: every exponent is then <= 0
    # and exp() stays within (0, 1].
    shifted_values = host_values - np.max(host_values)

    # Allocate device memory and transfer data
    d_input_values = cuda.to_device(shifted_values)
    d_exp_values = cuda.device_array(n, dtype=np.float64)
    d_softmax_results = cuda.device_array(n, dtype=np.float64)

    # Launch the exponentiation kernel; the grid size is rounded up so
    # that every element is covered by a thread.
    threads_per_block = 32
    blocks_per_grid = (n + (threads_per_block - 1)) // threads_per_block
    expon_kernel[blocks_per_grid, threads_per_block](d_input_values, d_exp_values)

    # Copy exponentiated results back to host and print them
    exp_values_host = d_exp_values.copy_to_host()
    print("The array after calling expon_kernel: ", exp_values_host)

    # Calculate the sum of the exponentiated values on the host
    total_sum = np.sum(exp_values_host)
    print("The sum is: ", total_sum)

    # Launch the normalization kernel with the same launch configuration
    normalize_kernel[blocks_per_grid, threads_per_block](d_exp_values, total_sum, d_softmax_results)

    # Copy the softmax results back to host and return them
    return d_softmax_results.copy_to_host()

# Demo / smoke test for the GPU softmax
if __name__ == "__main__":
    # Sample input from the original example: [1.0, 2.0, 3.0, 4.0]
    a = np.arange(1, 5, dtype=np.float64)
    print("The original array: ", a)

    # Run the softmax on the GPU
    result = softmax_gpu(a)

    # Report the result and confirm that it sums to 1
    print("The result: ", result)
    print("The sum of the values in result is: ", result.sum())


The original array:  [1. 2. 3. 4.]




The array after calling expon_kernel:  [ 2.71828183  7.3890561  20.08553692 54.59815003]
The sum is:  84.7910248837216




The result:  [0.0320586  0.08714432 0.23688282 0.64391426]
The sum of the values in result is:  1.0
