In [1]:
import torch
import torch.nn as nn
from models.transformer_pytorch import TransformerPyTorch
from models.transformer import Transformer
from hyperparameters import hyperparameters

vocab_size = 32000
batch_size = 2
seq_len = 5

pytorch_model: nn.Module = TransformerPyTorch(
    vocab_size=vocab_size,
    d_model=hyperparameters.transformer.hidden_size,
    num_heads=hyperparameters.transformer.num_heads,
    d_ff=hyperparameters.transformer.encoder_ffn_embed_dim,
    num_encoder_layers=hyperparameters.transformer.num_hidden_layers,
    num_decoder_layers=hyperparameters.transformer.num_hidden_layers,
    dropout=hyperparameters.transformer.dropout,
    max_len=hyperparameters.transformer.max_len,
)
own_model = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=hyperparameters.transformer.hidden_size,
    num_heads=hyperparameters.transformer.num_heads,
    d_ff=hyperparameters.transformer.encoder_ffn_embed_dim,
    num_encoder_layers=hyperparameters.transformer.num_hidden_layers,
    num_decoder_layers=hyperparameters.transformer.num_hidden_layers,
    dropout=hyperparameters.transformer.dropout,
    max_len=hyperparameters.transformer.max_len,
)
criterion = nn.CrossEntropyLoss(ignore_index=0, reduction="mean")

# Dummy data
src = torch.randint(1, vocab_size, (batch_size, seq_len))
tgt = torch.randint(1, vocab_size, (batch_size, seq_len))

# Ensure no zeros in the middle (just for clarity)
# but you can keep them if you want to test pad
decoder_in = tgt[:, :-1]
labels = tgt[:, 1:]

logits = pytorch_model(src, decoder_in)  # shape [B, T-1, vocab_size]
logits = logits.transpose(1, 2)  # shape [B, vocab_size, T-1]

loss = criterion(logits, labels)  # shape [B, T-1]
print("Dummy test loss =", loss.item())

# Own model
logits = own_model(src, decoder_in)  # shape [B, T-1, vocab_size]
logits = logits.transpose(1, 2)  # shape [B, vocab_size, T-1]

loss = criterion(logits, labels)  # shape [B, T-1]
print("Dummy test loss on own model =", loss.item())

Dummy test loss = 10.64527416229248
Dummy test loss on own model = 10.447006225585938




In [4]:
# Find average sentence length in the dataset
merged_path = "local/data/training/bpe_train.de"
total_len = 0
num_lines = 0

with open(merged_path, "r", encoding="utf-8") as f:
    for line in f:
        total_len += len(line.split())
        num_lines += 1

avg_len = total_len / num_lines
print("Average sentence length in the dataset =", avg_len)


Average sentence length in the dataset = 30.32287386028867


In [5]:

import pickle
from vocab import Vocabulary


vocab = pickle.load(open("local/vocab_shared.pkl", "rb")) # type: ignore
print("Vocab size =", len(vocab))

Vocab size = 32181


In [1]:
import torch

x = torch.tensor(
    [
        [1, 2],
        [3, 4],
    ]
)

import torch
import torch.nn as nn
import torch.nn.functional as F

class DropoutEmbedding(nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int, dropout: float, padding_idx: int) -> None:
        """
        Applies dropout to entire rows of the embedding matrix.
        
        Args:
            num_embeddings (int): number of embeddings (vocabulary size).
            embedding_dim (int): dimension of each embedding vector.
            dropout (float): probability of dropping an entire embedding row.
            padding_idx (int): index of the padding token (never dropped).
        """
        super().__init__()
        self.dropout = dropout
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # When training, apply dropout to the embedding weights.
        if self.training and self.dropout > 0:
            weight = self.embedding.weight  # shape: [num_embeddings, embedding_dim]
            # Create a dropout mask for rows: shape: [num_embeddings, 1]
            mask = weight.new_empty((weight.size(0), 1)).bernoulli_(1 - self.dropout)
            # Scale the surviving rows to maintain expected values
            mask = mask / (1 - self.dropout)
            # Make sure that the padding index is always kept.
            if self.embedding.padding_idx is not None:
                mask[self.embedding.padding_idx] = 1
            # Apply the mask to zero out (drop) entire rows.
            dropped_weight = weight * mask
            # Use the masked weights for the embedding lookup.
            return F.embedding(
                input,
                dropped_weight,
                self.embedding.padding_idx,
                self.embedding.max_norm,
                self.embedding.norm_type,
                self.embedding.scale_grad_by_freq,
                self.embedding.sparse,
            )
        else:
            # In evaluation mode (or if dropout == 0), use the regular embedding.
            return self.embedding(input)


# dropout
dropout_embedding = DropoutEmbedding(10, 2, 0.1, 0)
print(dropout_embedding(x, ))

tensor([[[ 0.5186,  0.5575],
         [-0.5299,  0.1968]],

        [[-0.7712, -1.9582],
         [-1.8723,  0.8392]]], grad_fn=<EmbeddingBackward0>)


In [8]:
from weakref import ref
import sacrebleu

ref_sentence = "Hello, I am a boy"
pred_sentence = "Hi, I am a big boy"

# bleu = sacrebleu.sentence_bleu(pred_sentence, [ref_sentence])
bleu = sacrebleu.sentence_bleu(ref_sentence, [pred_sentence])
bleu.score

45.48019047027906

In [2]:
import torch

torch.log(torch.tensor(0.0))

tensor(-inf)

In [10]:
import torch

logits= torch.tensor([0.05, 0.05, 0.6, 0.05, 0.1, 0.05])
temperature = 2.0
torch.softmax(logits / temperature, dim=-1)

tensor([0.1577, 0.1577, 0.2076, 0.1577, 0.1617, 0.1577])

In [26]:
import numpy as np
from torch import nn


class ConcreteDropout(nn.Module):
    def __init__(
        self,
        weight_regularizer=1e-6,
        dropout_regularizer=1e-5,
        init_min=0.1,
        init_max=0.1,
    ):
        super(ConcreteDropout, self).__init__()

        self.weight_regularizer = weight_regularizer
        self.dropout_regularizer = dropout_regularizer

        init_min = np.log(init_min) - np.log(1.0 - init_min)
        init_max = np.log(init_max) - np.log(1.0 - init_max)

        self.p_logit = nn.Parameter(torch.empty(1).uniform_(init_min, init_max))

    def forward(self, x, layer):
        p = torch.sigmoid(self.p_logit)

        out = layer(self._concrete_dropout(x, p))

        sum_of_square = 0
        for param in layer.parameters():
            sum_of_square += torch.sum(torch.pow(param, 2))

        weights_regularizer = self.weight_regularizer * sum_of_square / (1 - p)

        dropout_regularizer = p * torch.log(p)
        dropout_regularizer += (1.0 - p) * torch.log(1.0 - p)

        input_dimensionality = x[0].numel()  # Number of elements of first item in batch
        dropout_regularizer *= self.dropout_regularizer * input_dimensionality

        regularization = weights_regularizer + dropout_regularizer
        return out, regularization

    def _concrete_dropout(self, x, p):
        eps = 1e-7
        temp = 0.1

        unif_noise = torch.rand_like(x)

        drop_prob = (
            torch.log(p + eps)
            - torch.log(1 - p + eps)
            + torch.log(unif_noise + eps)
            - torch.log(1 - unif_noise + eps)
        )

        drop_prob = torch.sigmoid(drop_prob / temp)
        random_tensor = 1 - drop_prob
        retain_prob = 1 - p

        x = torch.mul(x, random_tensor)
        x /= retain_prob

        return x


x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

cd = ConcreteDropout()
cd(x)

TypeError: ConcreteDropout.forward() missing 1 required positional argument: 'layer'

In [21]:


from pydantic import BaseModel


class ConcreteDropoutHyperparameters(BaseModel):
    l: float = 0.01 # Try also 0.1 and 0.001
    _number_of_training_examples: int = 4_603_578
    weight_regularizer: float = l**2 / _number_of_training_examples
    dropout_regularizer: float = 2 / (_number_of_training_examples)

ConcreteDropoutHyperparameters()

ConcreteDropoutHyperparameters(l=0.01, weight_regularizer=2.172223431426599e-11, dropout_regularizer=4.344446862853198e-07)