In [None]:
# SPDX-License-Identifier: Apache-2.0 AND CC-BY-NC-4.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<img src="./images/nvmath_head_panel@0.5x.png" alt="nvmath-python" />

# Getting Started with nvmath-python: Device APIs

## Exercise: Implement a numba-cuda kernel for Monte Carlo simulation of American call and put option cashflows

In this exercise you will do the following:
1. Familiarize yourself with the definitions of the call and put American options.
2. Take the GBM implementation as a baseline for implementing the call/put cashflow simulation. Additionally, use the CuPy implementation below as a reference implementation of the entire workflow.
3. Compare nvmath-python/numba-cuda vs. CuPy implementations for correctness. 
4. Benchmark the nvmath-python/numba-cuda implementation against CuPy and explain the performance difference.

### American Options: Call and Put Definitions

**American Call Option:**
An American call option gives the holder the **right, but not the obligation**, to **buy** an underlying asset (such as a stock) at a predetermined strike price **at any time before or on the expiration date** `T`. The key feature that distinguishes American options from European options is this ability to exercise early. The holder profits when the asset price exceeds the strike price, with the payoff being `max(S - K, 0) - Premium`, where `S` is the spot price, `K` is the strike price, and `Premium` is the initial cost paid for the option.

**American Put Option:**
An American put option gives the holder the **right, but not the obligation**, to **sell** an underlying asset at a predetermined strike price **at any time before or on the expiration date** `T`. Similar to the call option, it can be exercised at any point during its lifetime. The holder profits when the asset price falls below the strike price, with the payoff being `max(K - S, 0) - Premium`.

**Key Characteristics:**
- **Flexibility**: Can be exercised at any time up to expiration (unlike European options which can only be exercised at expiration)
- **Premium**: The upfront cost paid to acquire the option
- **Strike Price**: The predetermined price at which the transaction can occur
- **Intrinsic Value**: The immediate profit if exercised now (can be zero if out-of-the-money)
- **Time Value**: Additional value due to the possibility of favorable price movements before expiration


The following charts illustrate the call and put option *payoff functions* relative to the spot price `S`. Note the flat negative segments: they reflect the holder's right to walk away without exercising the contract, in which case the loss is limited to a small penalty equal to the `Premium`.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Color palette for the charts (hex color strings passed to matplotlib).
# The payoff chart in this cell uses GREY, PRIMARY_BLUE and PRIMARY_RED.
LIGHT_GREY = "#AAA"
GREY = "#777"
BLACK = "#000"
PRIMARY_GREEN = "#76B900"
DARK_GREEN = "#619900"
PRIMARY_BLUE = '#88f'
DARK_BLUE = '#55a'
PRIMARY_RED = '#eb7734'
DARK_RED = '#ba5012'

def call_payoff(x, strike, premium):
    """Return the call option payoff ``max(x - strike, 0) - premium``.

    Args:
        x: Spot price(s); a scalar or a NumPy array (evaluated element-wise).
        strike: Strike price.
        premium: Premium paid for the option.

    Returns:
        The payoff for every price in ``x``.
    """
    intrinsic_value = np.maximum(x - strike, 0.0)
    return intrinsic_value - premium

def put_payoff(x, strike, premium):
    """Return the put option payoff ``max(strike - x, 0) - premium``.

    Args:
        x: Spot price(s); a scalar or a NumPy array (evaluated element-wise).
        strike: Strike price.
        premium: Premium paid for the option.

    Returns:
        The payoff for every price in ``x``.
    """
    intrinsic_value = np.maximum(strike - x, 0.0)
    return intrinsic_value - premium


# Example contract used for the illustration
strike_price, premium = 100.0, 5.0
stock_price = np.linspace(70.0, 130.0, 100)
cpay = call_payoff(stock_price, strike_price, premium)
ppay = put_payoff(stock_price, strike_price, premium)


def _draw_payoff_panel(ax, payoff, label, color, title, annotations):
    """Draw one payoff curve with its annotations, grid, legend and zero line.

    ``annotations`` is a sequence of ``(text, xy, xytext, arrowstyle)`` tuples.
    """
    ax.plot(stock_price, payoff, label=label, color=color)
    for text, xy, xytext, arrowstyle in annotations:
        ax.annotate(text, xy=xy, xytext=xytext, arrowprops=dict(arrowstyle=arrowstyle, color=GREY), color=GREY)
    ax.set_title(title)
    ax.set_ylabel("Payoff")
    ax.grid(True, linestyle='--')
    ax.legend()
    ax.axhline(y=0, color='k')


fig, axs = plt.subplots(2, 1, figsize=(6, 8))

# Top panel: call option
_draw_payoff_panel(
    axs[0],
    cpay,
    "Call option",
    PRIMARY_BLUE,
    "Call option payoff function",
    [
        ("Strike", (strike_price, -premium), (94, -2), '-|>'),
        ("Premium", (80, -premium), (75.58, 0.8), '<|-|>'),
    ],
)

# Bottom panel: put option
_draw_payoff_panel(
    axs[1],
    ppay,
    "Put option",
    PRIMARY_RED,
    "Put option payoff function",
    [
        ("Strike", (strike_price, -premium), (100, -2), '->'),
        ("Premium", (120, -premium), (115.58, 0.8), '<|-|>'),
    ],
)

# Only the bottom panel carries the shared x-axis label
axs[1].set_xlabel("Stock price")

plt.axis('on')
plt.show()

The following code is a reference implementation of the American call and put option cashflows using CuPy. It is based on the respective GBM CuPy code.

In [None]:
import nvmath # noqa: F401
import cupy as cp

# Simulation parameters used by both the CuPy and the nvmath-python implementations
RNG_SEED = 777777  # Random seed
N_STEPS = 252  # Number of time steps (trading days in a year)
N_PATHS = 800000  # Number of simulated paths (large number to get a reliable estimate)

S0 = 100.0  # Initial stock price
MU = 0.003  # Drift added at every time step (positive, i.e. upward trend)
SIGMA = 0.027  # Volatility: scale of the standard normal increment drawn at every time step
STRIKE = 110.0  # Strike price
PREMIUM = 5.0  # Premium

def brownian_motion(nsteps, npaths, mu, sigma):
    """Simulate Brownian motion paths with drift on the GPU.

    Args:
        nsteps: Number of time steps per path (including the initial step t=0).
        npaths: Number of independent paths.
        mu: Drift added to every increment.
        sigma: Scale applied to every standard normal increment.

    Returns:
        A float32 CuPy array of shape ``(npaths, nsteps)`` holding the cumulative
        sum of the increments along the time axis; column 0 is all zeros.
    """
    # Differential form of the Brownian motion
    dBt = cp.empty((npaths, nsteps), dtype=cp.float32)
    dBt[:, 0] = 0.0
    # Draw the increments directly in float32 (randn defaults to float64), which avoids
    # a full-size float64 temporary that would be down-cast on assignment anyway
    dBt[:, 1:] = cp.random.randn(npaths, nsteps - 1, dtype=cp.float32) * sigma + mu

    # Integral form of the Brownian motion
    Bt = cp.cumsum(dBt, axis=1)

    return Bt


def generate_call_put_payoffs_cupy(nsteps, npaths, mu, sigma, s0, strike, premium):
    """Simulate call and put option cashflows along GBM paths with CuPy.

    Args:
        nsteps: Number of time steps per path.
        npaths: Number of simulated paths.
        mu: Drift of the underlying Brownian motion.
        sigma: Volatility of the underlying Brownian motion.
        s0: Initial stock price.
        strike: Strike price of both options.
        premium: Premium paid for each option.

    Returns:
        A tuple ``(call_paths, put_paths)`` of arrays of shape ``(npaths, nsteps)``
        with the call and put cashflows at every time step of every path.
    """
    b_t = brownian_motion(nsteps, npaths, mu, sigma)
    s_t = s0 * cp.exp(b_t)  # Geometric Brownian motion: stock price paths
    # Use the function arguments, not the module-level STRIKE / PREMIUM constants
    call_paths = cp.maximum(s_t - strike, 0.0) - premium  # Call option cashflows
    put_paths = cp.maximum(strike - s_t, 0.0) - premium  # Put option cashflows
    return call_paths, put_paths

# Seed CuPy's random generator: the paths are drawn with cp.random.randn,
# which is not affected by NumPy's seed
cp.random.seed(RNG_SEED)

# The reference implementation allocates and returns its own arrays,
# so no output buffers need to be pre-allocated here
call_cashflow, put_cashflow = generate_call_put_payoffs_cupy(N_STEPS, N_PATHS, MU, SIGMA, S0, STRIKE, PREMIUM)

print(f"Mean call option cashflow at t=T: {call_cashflow[:, -1].mean():0.2f}")
print(f"Mean put option cashflow at t=T: {put_cashflow[:, -1].mean():0.2f}")


Note that the mean call option cashflow at $t = T$ is positive due to the upward stock trend, $\mu > 0$. The corresponding put option cashflow is negative for the same reason, but the loss is small because it can never exceed the premium. Call and put options are great ways to *hedge* possible losses in a *bull market* and a *bear market*, respectively.

The following implementation leverages **numba-cuda** and **nvmath-python** to implement a much faster version of the American option cashflow simulation. It is based on the GBM implementation we completed earlier in this exercise.

In [None]:
from numba import cuda
from nvmath.device import random
import cupy as cp
import math

# Pre-compile the random number generator into IR to use alongside other device code
compiled_rng = random.Compile(cc=None)

# Set up CUDA kernel launch configuration
threads_per_block = 32
# Round up so that every path gets a thread even when N_PATHS is not a multiple of the block size
blocks = (N_PATHS + threads_per_block - 1) // threads_per_block
# Total number of launched threads; the RNG initialization kernel touches one state per thread
nthreads = blocks * threads_per_block
print(f"blocks: {blocks}, threads_per_block: {threads_per_block}, nthreads: {nthreads}")

# Allocate space for random states
states = random.StatesPhilox4_32_10(nthreads)


# RNG initialization kernel
@cuda.jit(link=compiled_rng.files, extensions=compiled_rng.extension)
def init_rng_gpu(states, seed):
    """Initialize one random state per launched CUDA thread.

    Every thread initializes ``states[idx]``, where ``idx`` is its global 1D thread
    index. There is no bounds check, so the launch grid is expected not to contain
    more threads than ``states`` has entries.
    """
    idx = cuda.grid(1)
    # NOTE(review): the arguments presumably follow cuRAND's
    # curand_init(seed, subsequence, offset, state) convention, which would give every
    # thread its own subsequence of the same seed - confirm against the nvmath-python docs
    random.init(seed, idx, 0, states[idx])


@cuda.jit(link=compiled_rng.files, extensions=compiled_rng.extension)
def generate_call_put_payoffs_nvmath(states, call_paths, put_paths, nsteps, mu, sigma, s0, strike, premium):
    """Simulate call and put option cashflows, one path per CUDA thread.

    Thread ``idx`` evolves one price path starting at ``s0`` and writes the call and
    put cashflows of every time step into ``call_paths[idx, :nsteps]`` and
    ``put_paths[idx, :nsteps]``. Random numbers are drawn from ``states[idx]``.
    Threads whose index is beyond the first dimension of ``call_paths`` do nothing.
    """
    # Make sure the parameters are aligned with the call/put payoff arrays dtype,
    # so that the whole payoff computation stays in that dtype
    mu = call_paths.dtype.type(mu)
    sigma = call_paths.dtype.type(sigma)
    s0 = call_paths.dtype.type(s0)
    strike = call_paths.dtype.type(strike)
    premium = call_paths.dtype.type(premium)
    zero = call_paths.dtype.type(0.0)
    # Thread-local scratch buffer: previous price followed by up to 4 new prices
    path5 = cuda.local.array((5,), dtype=call_paths.dtype)

    # Get the thread index
    idx = cuda.grid(1)

    # If the thread index is out of bounds, return
    if idx >= call_paths.shape[0]:
        return

    # Each thread generates one path in the time domain
    path5[0] = s0
    call_paths[idx, 0] = max(path5[0] - strike, zero) - premium
    put_paths[idx, 0] = max(strike - path5[0], zero) - premium

    # Consume 4 normal variates at a time for better throughput
    for i in range(1, nsteps, 4):
        v = random.normal4(states[idx])  # Returned as float32x4 type
        vals = v.x, v.y, v.z, v.w  # Decompose into a tuple of float32
        # Process a chunk of 4 time steps, use min() to avoid out-of-bounds access
        for j in range(i, min(i + 4, nsteps)):
            k = j - i
            path5[k + 1] = path5[k] * math.exp(mu + sigma * vals[k])
            call_paths[idx, j] = max(path5[k + 1] - strike, zero) - premium
            put_paths[idx, j] = max(strike - path5[k + 1], zero) - premium
        # Shift the path chunk for the next iteration. After a partial final chunk
        # path5[4] is not up to date, but the loop ends there so it is never used.
        path5[0] = path5[4]



# Output buffers for the kernel: one row per path, one column per time step.
# With column-major ("F") order, consecutive path indices are adjacent in memory.
cashflow_shape = (N_PATHS, N_STEPS)
call_cashflow = cp.empty(cashflow_shape, dtype=cp.float32, order="F")
put_cashflow = cp.empty(cashflow_shape, dtype=cp.float32, order="F")

# Prepare one RNG state per thread
init_rng_gpu[blocks, threads_per_block](states, RNG_SEED)

# Simulate the paths on the GPU and fill both cashflow buffers
generate_call_put_payoffs_nvmath[blocks, threads_per_block](
    states, call_cashflow, put_cashflow, N_STEPS, MU, SIGMA, S0, STRIKE, PREMIUM
)

# Report the average cashflow over all paths at the last time step
mean_call_at_expiry = call_cashflow[:, -1].mean()
mean_put_at_expiry = put_cashflow[:, -1].mean()
print(f"Mean call option cashflow at t=T: {mean_call_at_expiry:0.2f}")
print(f"Mean put option cashflow at t=T: {mean_put_at_expiry:0.2f}")

Note that the results between CuPy and nvmath-python match to a high degree. Statistical correctness validation of the results is out of scope of this exercise.

Last but not least, let's benchmark the two implementations like we did before:

In [None]:
import numpy as np
import cupyx as cpx


# Benchmarking helper for an implementation F and, optionally, a second one (F_alternative).
# Raw timings (seconds) are always printed; when F_alternative is given,
# the speedup of F relative to F_alternative is printed as well.
def benchmark(
    F, F_name="Implementation", F_alternative=None, F_alternative_name="Alternative implementation", n_repeat=10, n_warmup=1
):
    """Time ``F`` (and optionally ``F_alternative``) on the GPU and print the results.

    Args:
        F: Zero-argument callable to benchmark.
        F_name: Label printed for ``F``.
        F_alternative: Optional zero-argument callable to compare against.
        F_alternative_name: Label printed for ``F_alternative``.
        n_repeat: Number of timed runs per callable.
        n_warmup: Number of untimed warm-up runs per callable.

    Returns:
        A tuple ``(perf, perf_alt)`` with the best GPU time of each callable;
        ``perf_alt`` is ``None`` when ``F_alternative`` is not given.
    """

    def _best_gpu_time(func):
        # warm-up + repeated runs, then keep the best (smallest) GPU time
        runs = cpx.profiler.benchmark(func, n_repeat=n_repeat, n_warmup=n_warmup)
        return np.min(runs.gpu_times)

    perf = _best_gpu_time(F)
    print(f"{F_name} performance = {perf:0.4f} sec")

    # Nothing to compare against
    if F_alternative is None:
        return perf, None

    perf_alt = _best_gpu_time(F_alternative)
    print(f"{F_alternative_name} performance = {perf_alt:0.4f} sec")
    print(f"Speedup = {perf_alt / perf:0.4f}x")
    return perf, perf_alt

In [None]:
# Wrap each implementation in a zero-argument callable, as expected by benchmark()
def run_nvmath():
    return generate_call_put_payoffs_nvmath[blocks, threads_per_block](
        states, call_cashflow, put_cashflow, N_STEPS, MU, SIGMA, S0, STRIKE, PREMIUM
    )


def run_cupy():
    return generate_call_put_payoffs_cupy(N_STEPS, N_PATHS, MU, SIGMA, S0, STRIKE, PREMIUM)


# Time nvmath-python and report its speedup relative to CuPy
benchmark(F=run_nvmath, F_name="nvmath-python", F_alternative=run_cupy, F_alternative_name="CuPy")