# In [None]:
import numpy as np
    
class Transformer:
    """Toy single-head encoder/decoder Transformer forward pass in NumPy.

    All weights are drawn from a standard normal distribution and are never
    trained; the class only wires up the data flow (attention -> add & norm ->
    feed-forward -> add & norm -> vocabulary projection).  The encoder
    self-attention, decoder self-attention and cross-attention share one set
    of query/key/value/output projection weights.

    Args:
        context_length: Maximum number of positions (stored, not enforced).
        embedding_dim: Width of every token embedding.
        vocab_size: Number of entries in the output vocabulary.
        head_dim: Width of the query/key/value projections.
    """

    def __init__(self, context_length, embedding_dim, vocab_size, head_dim=64):
        self.vocab_size = vocab_size
        self.context_length = context_length
        self.embedding_dim = embedding_dim
        self.head_dim = head_dim
        self.w_q = np.random.randn(embedding_dim, head_dim)
        self.w_k = np.random.randn(embedding_dim, head_dim)
        self.w_v = np.random.randn(embedding_dim, head_dim)
        # Output projection: maps the (m, head_dim) attention result back to
        # (m, embedding_dim) so it can be added to the residual stream.
        self.w_o = np.random.randn(head_dim, embedding_dim)
        # Stand-in embedding for the decoder's start token; this class has no
        # token-embedding table to look one up from.
        self.start_embedding = np.random.randn(1, embedding_dim)

    def softmax(self, w):
        """Return the softmax of ``w`` along its last axis.

        The per-row maximum is subtracted before exponentiating so that large
        scores cannot overflow ``exp``; this does not change the result.
        """
        w = np.asarray(w, dtype=np.float64)
        exp_w = np.exp(w - np.max(w, axis=-1, keepdims=True))
        return exp_w / np.sum(exp_w, axis=-1, keepdims=True)

    def relu(self, z):
        """Return ``max(0, z)`` element-wise."""
        return np.maximum(0, z)

    def _attend(self, query_source, memory_source, causal=False):
        """Scaled dot-product attention of ``query_source`` over ``memory_source``.

        Queries are projected from ``query_source`` (n, embedding_dim); keys
        and values from ``memory_source`` (m, embedding_dim).  Returns an
        (n, head_dim) array.
        """
        Q = query_source @ self.w_q
        K = memory_source @ self.w_k
        V = memory_source @ self.w_v             # (m, emb) @ (emb, head) = (m, head)

        # Scale by sqrt(d_k) (8 for the default 64-wide head).
        scores = (Q @ K.T) / np.sqrt(K.shape[-1])      # (n, m)

        if causal:
            # Position i may only look at positions <= i: push every score
            # above the diagonal to -inf so its softmax weight is exactly 0.
            future = np.triu(np.ones(scores.shape, dtype=bool), k=1)
            scores = np.where(future, -np.inf, scores)

        return self.softmax(scores) @ V          # (n, m) @ (m, head) = (n, head)

    def attention_output(self, X, causal=False):
        """Single-head self-attention over ``X``.

        Args:
            X: Token embeddings of shape (m, embedding_dim).
            causal: When true, each position attends only to itself and to
                earlier positions (decoder-style masking).

        Returns:
            Array of shape (m, head_dim).
        """
        X = np.asarray(X, dtype=np.float64)
        return self._attend(X, X, causal=causal)

    def cross_attention(self, X1, X2):
        """Attend from ``X2`` (queries) over ``X1`` (keys and values).

        Args:
            X1: Source embeddings of shape (m, embedding_dim), e.g. the
                encoder output.
            X2: Query embeddings of shape (n, embedding_dim).

        Returns:
            Array of shape (n, head_dim).
        """
        X1 = np.asarray(X1, dtype=np.float64)
        X2 = np.asarray(X2, dtype=np.float64)
        return self._attend(X2, X1)

    def feed_forward(self, X, input_dim, hidden_dim, output_dim, learning_rate=0.01):
        """Two-layer position-wise network: ``relu(X @ w1 + b1) @ w2 + b2``.

        The weights are created on the first call and reused afterwards, so
        repeated forward passes agree with each other.  They are re-drawn only
        when the requested layer sizes differ from the stored ones.

        Args:
            X: Input of shape (m, input_dim).
            input_dim: Width of the input.
            hidden_dim: Width of the hidden layer.
            output_dim: Width of the output.
            learning_rate: Stored on the instance; no training happens here.

        Returns:
            Array of shape (m, output_dim).
        """
        self.learning_rate = learning_rate

        w1 = getattr(self, "w1", None)
        w2 = getattr(self, "w2", None)
        if (
            w1 is None
            or w2 is None
            or w1.shape != (input_dim, hidden_dim)
            or w2.shape != (hidden_dim, output_dim)
        ):
            self.w1 = np.random.randn(input_dim, hidden_dim)
            # The bias is added to the hidden activations, so it must be
            # hidden_dim wide (not input_dim).
            self.b1 = np.zeros((1, hidden_dim))
            self.w2 = np.random.randn(hidden_dim, output_dim)
            self.b2 = np.zeros((1, output_dim))

        hidden = self.relu(X @ self.w1 + self.b1)
        return hidden @ self.w2 + self.b2

    def linear(self, X):
        """Project ``X`` (m, d) onto the vocabulary, returning (m, vocab_size) logits.

        The projection matrix is created on first use and then reused; it is
        re-drawn only if the input width changes.
        """
        X = np.asarray(X, dtype=np.float64)
        wanted_shape = (X.shape[1], self.vocab_size)
        w_out = getattr(self, "w_out", None)
        if w_out is None or w_out.shape != wanted_shape:
            self.w_out = np.random.randn(*wanted_shape)
        return X @ self.w_out

    def layer_norm(self, X, eps=1e-5):
        """Normalise each row of ``X`` to zero mean and unit variance.

        Args:
            X: Array whose last axis is the feature axis.
            eps: Small constant added to the variance so constant rows do not
                divide by zero.

        Returns:
            Array with the same shape as ``X``.
        """
        X = np.asarray(X, dtype=np.float64)
        mean = X.mean(axis=-1, keepdims=True)
        var = X.var(axis=-1, keepdims=True)
        return (X - mean) / np.sqrt(var + eps)

    def encoder(self, X):
        """Run one encoder layer over ``X`` (m, embedding_dim) -> (m, embedding_dim)."""
        X = np.asarray(X, dtype=np.float64)

        # Self-attention, projected back to the embedding width.
        attention_out = self.attention_output(X) @ self.w_o       # (m, embedding_dim)

        # Add & norm, then the position-wise feed-forward network.
        attention_normed = self.layer_norm(attention_out + X)
        ffnn_out = self.feed_forward(
            attention_normed,
            self.embedding_dim,
            4 * self.embedding_dim,
            self.embedding_dim,
            learning_rate=0.01,
        )

        return self.layer_norm(ffnn_out + attention_normed)

    def decoder(self, encoder_out, previous_token):
        """Run one decoder layer and return the index of the predicted token.

        Args:
            encoder_out: Encoder output of shape (m, embedding_dim).
            previous_token: Embeddings of the tokens produced so far, shape
                (t, embedding_dim); a single (embedding_dim,) vector is
                accepted and treated as one row.

        Returns:
            The vocabulary index with the highest probability at the last
            decoder position.
        """
        previous_token = np.atleast_2d(np.asarray(previous_token, dtype=np.float64))
        ffn_dims = (self.embedding_dim, 4 * self.embedding_dim, self.embedding_dim)

        # Same weights as the encoder self-attention (ideally they would
        # differ); causal so a position cannot see later ones.
        masked_attention_out = self.attention_output(previous_token, causal=True) @ self.w_o
        self_attention_normed = self.layer_norm(masked_attention_out + previous_token)

        # Cross-attention also reuses the self-attention weights.
        cross_attention_out = self.cross_attention(encoder_out, self_attention_normed) @ self.w_o
        cross_attention_normed = self.layer_norm(cross_attention_out + self_attention_normed)

        ffnn_out = self.feed_forward(cross_attention_normed, *ffn_dims)
        decoder_out = self.layer_norm(ffnn_out + cross_attention_normed)

        probabilities = self.softmax(self.linear(decoder_out))    # (t, vocab_size)

        # Only the last position predicts the next token; an argmax over the
        # whole matrix would return a flattened index.
        return np.argmax(probabilities[-1])

    def predict(self, seq):
        """Encode ``seq`` (m, embedding_dim) and return the first predicted token index.

        Decoding starts from the instance's stand-in start-token embedding.
        """
        encoder_out = self.encoder(seq)
        return self.decoder(encoder_out, self.start_embedding)

# Smoke run on random data: 1000 positions, each a 512-wide stand-in embedding.
# The input is drawn before the model is built, so the RNG draw order is kept.
sequence = np.random.randn(
    1000,  # sequence length (same value as context_length below)
    512,   # embedding width (same value as embedding_dim below)
)
# TODO: positional embeddings are not applied yet; the disabled call was
#   positional_embeddings = self.add_positional_embeddings(sequence)

transformer = Transformer(
    vocab_size=10000,
    embedding_dim=512,
    context_length=1000,
)
predicted_word = transformer.predict(sequence)