In [10]:
import numpy as np

In [11]:
def initialize_parameters(input_size, output_size):
    """Draw a small random weight matrix.

    Args:
        input_size: Size of the second axis of the returned matrix.
        output_size: Size of the first axis of the returned matrix.

    Returns:
        A NumPy array of shape ``(output_size, input_size)`` whose entries are
        standard-normal samples from NumPy's global RNG, scaled by 0.1.
    """
    shape = (output_size, input_size)
    scale = 0.1  # keep the initial weights small
    return scale * np.random.randn(*shape)

In [12]:
# Task 1: Implement the Softmax Function
# The softmax function converts a vector of values to a probability distribution.
# Each element is transformed using the exponential function, making them positive,
# and then normalized so that the sum of the resulting values is 1.
# Implement the softmax function that takes a numpy array `x` as input and returns
# a numpy array of the same shape, where each element is the softmax of `x`.
def softmax(x, axis=-1):
    """Numerically stable softmax along one axis.

    Args:
        x: Array-like of scores with at least one dimension.
        axis: Axis along which the scores are normalized. Defaults to the
            last axis, so every row of a batch is normalized independently.

    Returns:
        A NumPy array with the same shape as ``x`` whose entries are
        non-negative and sum to 1 along ``axis``.
    """
    x = np.asarray(x)
    # Shift by the maximum taken along the SAME axis that is normalized.
    # Softmax is invariant to a per-slice shift, and this guarantees that each
    # slice contains an exp(0) == 1 term. Shifting by the global maximum
    # instead lets a slice far below that maximum underflow to all zeros,
    # which turns the division below into 0/0 = NaN.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    # keepdims=True keeps the reduced axis so the division broadcasts per slice.
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)

In [13]:
# Task 2: Implement the Scaled Dot-Product Attention Mechanism
# The attention function computes a weighted sum of values V, where the weight assigned
# to each value is computed by a compatibility function of the query Q with the corresponding key K.
# Implement the function `scaled_dot_product_attention(Q, K, V)` that computes the attention
# mechanism's output and the attention weights.
# Hint: Use the softmax function you implemented earlier for computing the attention weights.
def scaled_dot_product_attention(Q, K, V):
    """Compute scaled dot-product attention.

    The logits ``Q @ K^T`` are divided by ``sqrt(d_k)``, where ``d_k`` is the
    size of the last axis of ``K``, passed through ``softmax``, and the
    resulting weights are used to form a weighted sum of ``V``.

    Args:
        Q: Query array; its last axis must match the last axis of ``K``.
        K: Key array (NumPy ndarray); its last axis is the key dimension.
        V: Value array; its second-to-last axis must match that of ``K``.

    Returns:
        A tuple ``(output, attention_weights)`` where ``attention_weights`` is
        the result of ``softmax`` on the scaled logits and ``output`` is
        ``attention_weights @ V``.
    """
    # Swap only the last two axes so any leading (batch) axes are preserved
    # and np.matmul can broadcast over them.
    logits = np.matmul(Q, K.swapaxes(-1, -2))

    # The key dimension is the LAST axis. Indexing shape[1] only works for
    # 2-D keys; for (batch, seq, d_k) input it would pick the sequence length
    # and scale the logits by the wrong factor.
    d_k = K.shape[-1]
    scaled_logits = logits / np.sqrt(d_k)

    # Turn the scaled logits into attention weights.
    attention_weights = softmax(scaled_logits)

    # Weighted sum of the values.
    output = np.matmul(attention_weights, V)

    return output, attention_weights

In [14]:
# Task 3: Implement the Transformer Decoder Layer
# A transformer decoder layer consists of a self-attention mechanism, cross-attention with
# respect to the encoder outputs, and a position-wise feed-forward network.
# Using the `initialize_parameters` function for initializing weights, implement the transformer
# decoder layer function `transformer_decoder_layer(Q, K, V, memory, params, mask=None)`.
# Use the attention mechanism you defined in Task 2 for both self-attention and cross-attention.
# Hint: The decoder outputs should pass through a layer normalization step at the end.
def transformer_decoder_layer(Q, K, V, memory, params, mask=None):
    """Run one simplified transformer decoder layer.

    Pipeline: projected self-attention -> cross-attention against ``memory``
    -> two-layer ReLU feed-forward network -> ``layer_norm``.

    Args:
        Q, K, V: Decoder inputs; each is multiplied by ``params['W_q']``,
            ``params['W_k']`` and ``params['W_v']`` respectively before
            self-attention.
        memory: Encoder output; projected with ``params['W_m_k']`` and
            ``params['W_m_v']`` to form the cross-attention keys and values.
        params: Dict of weights. Keys read here: ``W_q``, ``W_k``, ``W_v``,
            ``W_m_k``, ``W_m_v``, ``W_ff1``, ``b_ff1``, ``W_ff2``, ``b_ff2``.
        mask: Accepted for interface compatibility; NOTE(review): it is not
            used anywhere in this implementation, so no masking is applied.

    Returns:
        The layer-normalized feed-forward output.
    """
    # --- Self-attention over the projected decoder inputs ---
    queries = np.dot(Q, params['W_q'])
    keys = np.dot(K, params['W_k'])
    values = np.dot(V, params['W_v'])
    attended, _ = scaled_dot_product_attention(queries, keys, values)

    # --- Cross-attention: the self-attention result queries the memory ---
    memory_keys = np.dot(memory, params['W_m_k'])
    memory_values = np.dot(memory, params['W_m_v'])
    context, _ = scaled_dot_product_attention(attended, memory_keys, memory_values)

    # --- Position-wise feed-forward network with ReLU in between ---
    hidden = np.maximum(0, np.dot(context, params['W_ff1']) + params['b_ff1'])
    projected = np.dot(hidden, params['W_ff2']) + params['b_ff2']

    # --- Final layer normalization ---
    return layer_norm(projected)

In [15]:
#Layer_norm is given as:
def layer_norm(x):
    """Normalize ``x`` to zero mean and (approximately) unit variance along its last axis.

    Args:
        x: NumPy array.

    Returns:
        ``(x - mean) / sqrt(var + 1e-6)`` with mean and variance taken over
        the last axis; the small constant keeps the division finite when the
        variance is zero.
    """
    eps = 1e-6
    centered = x - x.mean(axis=-1, keepdims=True)
    spread = np.sqrt(x.var(axis=-1, keepdims=True) + eps)
    return centered / spread
# ----------------------
# Parameters Initialization (Provided)
# ----------------------

d_model = 10  # Embedding size
d_ff = 20  # Size of the feed-forward network
vocab_size = 10  # Assuming a vocab size of 10 for simplicity

# Initialize weights (This part is provided to students)
# All attention projections are square (d_model x d_model). They are created
# first, in this fixed order, so the sequence of random draws is stable.
params = {
    name: initialize_parameters(d_model, d_model)
    for name in ('W_q', 'W_k', 'W_v', 'W_o', 'W_m_k', 'W_m_v')
}
# Feed-forward network: the first layer's weight has shape (d_model, d_ff) and
# the second's has shape (d_ff, d_model), because initialize_parameters returns
# an (output_size, input_size) array. Biases start at zero.
params['W_ff1'] = initialize_parameters(d_ff, d_model)
params['b_ff1'] = np.zeros(d_ff)
params['W_ff2'] = initialize_parameters(d_model, d_ff)
params['b_ff2'] = np.zeros(d_model)
params['d_model'] = d_model

# Test Check 1: Softmax Function
def check_softmax():
    """Print whether ``softmax([1, 2, 3])`` matches the known reference values."""
    print("Checking the softmax function...")
    reference = np.array([0.09003057, 0.24472847, 0.66524096])
    result = softmax(np.array([1.0, 2.0, 3.0]))
    if not np.allclose(result, reference):
        print("Softmax function may be incorrect. Please check your implementation.")
        return
    print("Softmax function seems to be implemented correctly.")

# Test Check 2: Scaled Dot-Product Attention
def check_attention():
    """Print whether attention on a tiny fixed example matches the reference output."""
    print("Checking the attention mechanism...")
    queries = np.array([[1, 0, 0], [0, 1, 0]])
    # The same array serves as both keys and values.
    keys_values = np.array([[1, 2, 3], [4, 5, 6]])
    reference_row = [3.54902366, 4.54902366, 5.54902366]
    reference = np.array([reference_row, reference_row])

    result, _ = scaled_dot_product_attention(queries, keys_values, keys_values)

    if not np.allclose(result, reference):
        print("Attention mechanism may be incorrect. Please check your implementation.")
        return
    print("Attention mechanism seems to be implemented correctly.")

# Test Check 3: Transformer Decoder Layer Functionality
def check_decoder_layer():
    """Print whether the decoder layer returns the expected shape with non-zero variance.

    One random ``(1, 10, d_model)`` array is used as Q, K, V and memory.
    """
    print("Checking the transformer decoder layer...")
    sample = np.random.randn(1, 10, d_model)
    result = transformer_decoder_layer(sample, sample, sample, sample, params)

    # A correct shape alone is not enough: non-zero variance shows the layer
    # actually transformed its input rather than returning a constant.
    if result.shape != (1, 10, d_model) or np.var(result) == 0:
        print("There might be an issue with the transformer decoder layer. Please check your implementation.")
        return
    print("Transformer decoder layer output shape is correct and shows variance across outputs.")

In [16]:
# Run the checks (one per cell)
check_softmax()

Checking the softmax function...
Softmax function seems to be implemented correctly.


In [17]:
check_attention()

Checking the attention mechanism...
Attention mechanism seems to be implemented correctly.


In [18]:
check_decoder_layer()

Checking the transformer decoder layer...
Transformer decoder layer output shape is correct and shows variance across outputs.
