4.4 

In [None]:
import numpy as np

def loss_GloVe(W, W_, b, b_, X, symmetric=False):
    """Return the unweighted GloVe squared-error loss.

    The loss is ``sum_ij (w_i . w~_j + b_i + b~_j - log X_ij) ** 2``: the
    linear model score is compared directly with the log co-occurrence
    count, so no logarithm (and no clamping) is applied to the score itself.

    Args:
        W: (V, d) word-embedding matrix.
        W_: (V, d) context-embedding matrix; ignored when ``symmetric``.
        b: word biases, shape (V,) or (V, 1).
        b_: context biases, shape (V,) or (V, 1); ignored when ``symmetric``.
        X: (V, V) matrix of raw co-occurrence counts.
        symmetric: if true, reuse ``W`` and ``b`` for the context side.

    Returns:
        The scalar loss.
    """
    if symmetric:
        W_, b_ = W, b

    # Force an explicit column / row layout so that b_i is added along rows
    # and b~_j along columns regardless of whether the biases arrive as
    # (V,) or (V, 1); plain ``b + b_.T`` mis-broadcasts for 1-D biases.
    row_bias = np.asarray(b, dtype=float).reshape(-1, 1)
    col_bias = np.asarray(b_, dtype=float).reshape(1, -1)
    scores = np.dot(W, np.asarray(W_).T) + row_bias + col_bias

    # The small epsilon keeps the logarithm finite for zero counts.
    log_counts = np.log(np.asarray(X, dtype=float) + 1e-9)
    return np.sum((scores - log_counts) ** 2)


4.5

In [None]:
import numpy as np

def grad_GloVe(W, W_prime, b, b_prime, X, C, V, d, symmetric=True):
    """Return squared-error GloVe gradients over a list of observed pairs.

    Entry ``X[i]`` is read as ``(j, c)``: row word ``i`` is paired with
    context word ``j`` and co-occurrence count ``c``. Each pair contributes
    ``(w_i . w'_j + b_i + b'_j - log c) ** 2`` to the loss, and the returned
    arrays are the derivatives of the sum of those terms.

    Args:
        W: word-embedding matrix, indexed by row word.
        W_prime: context-embedding matrix; ignored when ``symmetric``.
        b: word biases, shape (V,) or (V, 1).
        b_prime: context biases; ignored when ``symmetric``.
        X: sequence whose ``i``-th entry starts with ``(j, c)``.
        C: unused; kept so existing call sites keep working.
        V: number of rows of the returned gradient arrays.
        d: number of columns of the returned embedding gradients.
        symmetric: if true, reuse ``W`` and ``b`` for the context side.

    Returns:
        ``(grad_W, grad_W_prime, grad_b, grad_b_prime)``. In symmetric mode
        the two halves are still reported separately; the gradient with
        respect to the shared parameters is their sum.
    """
    if symmetric:
        # W_prime = W and b_prime = b in the symmetric case
        W_prime, b_prime = W, b

    W = np.asarray(W, dtype=float)
    W_prime = np.asarray(W_prime, dtype=float)
    # Flatten so that (V,) and (V, 1) biases both index to plain scalars.
    b = np.asarray(b, dtype=float).ravel()
    b_prime = np.asarray(b_prime, dtype=float).ravel()

    grad_W = np.zeros((V, d))
    grad_W_prime = np.zeros((V, d))
    grad_b = np.zeros(V)
    grad_b_prime = np.zeros(V)

    for i, pair in enumerate(X):
        j, c = int(pair[0]), pair[1]
        w_i, w_j = W[i], W_prime[j]

        residual = np.dot(w_i, w_j) + b[i] + b_prime[j] - np.log(c)

        # d/dz of z**2 is 2z; the chain rule supplies the remaining factor
        # (the other embedding for the weights, 1 for the biases).
        scale = 2.0 * residual
        grad_W[i] += scale * w_j
        grad_W_prime[j] += scale * w_i
        grad_b[i] += scale
        grad_b_prime[j] += scale

    return grad_W, grad_W_prime, grad_b, grad_b_prime


6.1

In [None]:
def compute_loss(self, input_batch, target_batch, mask_batch):
    """
    Sum the negative log-probabilities of the target entries whose mask is 1.

    Arguments:
    - input_batch: forwarded unchanged to self.compute_activations.
    - target_batch: 2-D array indexed as [i, n]; each value is used as a row
    index into the output probabilities.
    - mask_batch: 2-D array indexed as [i, n]; only positions whose value
    equals 1 contribute to the loss.

    Returns:
    - loss: the accumulated negative log-probability (0.0 when no position
    is selected by the mask).
    """
    # Forward pass: activations -> logits -> probabilities
    hidden = self.compute_activations(input_batch)
    probs = self.softmax(self.hid_to_output_weights @ hidden)

    total = 0.0
    for row in range(target_batch.shape[0]):
        for col in range(target_batch.shape[1]):
            if mask_batch[row, col] != 1:
                continue
            word = target_batch[row, col]
            # NOTE(review): the column stride is self.vocab_size, not the
            # context length -- confirm this matches the layout produced by
            # compute_activations / softmax.
            total -= np.log(probs[word, row * self.vocab_size + col])

    return total


In [None]:
6.2

In [None]:
class Model:
    ...

    def back_propagate(self, input_batch, target_batch, activations, logits):
        """
        Computes the gradient of the loss with respect to model parameters using backpropagation.

        Arguments:
        input_batch -- batch of inputs; its transpose is multiplied with the hidden-layer error
        target_batch -- batch of targets, forwarded to self.compute_loss_derivative
        activations -- sequence of forward-pass activations; [-2] is used as the hidden
                       layer and [-1] as the output layer
        logits -- sequence of forward-pass logits; only the last entry is used

        Returns:
        grad_embed_to_hid_weights -- input_batch.T @ hidden error
        grad_hid_to_output_weights -- hidden.T @ output error, i.e. the same
                                      (hidden, output) layout as hid_to_output_weights
        grad_hid_bias -- hidden error summed over the batch axis
        grad_output_bias -- output error summed over the batch axis
        """
        hidden = activations[-2]
        out_delta = self.compute_loss_derivative(target_batch, activations[-1], logits[-1])

        # The backward step below multiplies by hid_to_output_weights.T, which
        # fixes that matrix as (hidden, output); its gradient must share that
        # layout, hence hidden.T @ out_delta rather than out_delta.T @ hidden.
        grad_hid_to_output_weights = np.matmul(hidden.T, out_delta)
        grad_output_bias = np.sum(out_delta, axis=0)

        # Error is passed back only through units with positive activation
        # (ReLU-style gate -- presumably the hidden layer is a ReLU; confirm).
        hid_delta = np.matmul(out_delta, self.hid_to_output_weights.T) * (hidden > 0)
        grad_hid_bias = np.sum(hid_delta, axis=0)
        grad_embed_to_hid_weights = np.matmul(input_batch.T, hid_delta)

        return grad_embed_to_hid_weights, grad_hid_to_output_weights, grad_hid_bias, grad_output_bias


6.3

In [None]:
def back_propagate(self, x, t, m):
    """Return parameter gradients for one batch.

    Runs self.compute_activations(x), obtains the output-layer error from
    self.compute_loss_derivative(activations[-1], t, m), and propagates it
    through hid_to_output_weights.

    Returns a tuple ordered as
    (embed_to_hid_weights_grad, hid_bias_grad,
     hid_to_output_weights_grad, output_bias_grad).
    """
    acts = self.compute_activations(x)
    out_delta = self.compute_loss_derivative(acts[-1], t, m)
    hidden = acts[-2]

    # The unpacking doubles as a check that both arrays are 2-D.
    _batch, _vocab = out_delta.shape
    _, _hidden_dim = hidden.shape

    # NOTE(review): no activation derivative is applied to the hidden error;
    # that is only right if the hidden layer is linear -- confirm against
    # compute_activations.
    hid_delta = np.matmul(out_delta, self.hid_to_output_weights.T)

    return (
        np.matmul(acts[0].T, hid_delta),
        np.sum(hid_delta, axis=0),
        np.matmul(hidden.T, out_delta),
        np.sum(out_delta, axis=0),
    )


7.1

In [None]:
import numpy as np

def weat_association_score(target_words, attribute_words_A, attribute_words_B, word_vectors):
    """Return the mean WEAT association of the target words with A versus B.

    For one target ``w`` the association is
    ``mean_{a in A} cos(w, a) - mean_{b in B} cos(w, b)``; the value returned
    is that quantity averaged over all target words. Cosines are taken
    against each attribute word individually and then averaged, so an
    attribute vector with a large norm cannot dominate the way it would if
    the attribute vectors were averaged first.

    Args:
        target_words: iterable of words to score.
        attribute_words_A: iterable of words forming attribute set A.
        attribute_words_B: iterable of words forming attribute set B.
        word_vectors: mapping from word to its embedding vector.

    Returns:
        The averaged association score as a float.
    """

    def unit_rows(words):
        # One L2-normalised embedding per row, so dot products are cosines.
        vecs = np.array([word_vectors[word] for word in words], dtype=float)
        return vecs / np.linalg.norm(vecs, axis=1, keepdims=True)

    targets = unit_rows(target_words)
    # Entry [t, k] is the cosine between target t and attribute word k.
    cos_A = targets @ unit_rows(attribute_words_A).T
    cos_B = targets @ unit_rows(attribute_words_B).T

    per_target = cos_A.mean(axis=1) - cos_B.mean(axis=1)
    return np.mean(per_target)
