In [1]:
import torch
import torch.nn as nn
from models.transformer_pytorch import TransformerPyTorch
from models.transformer import Transformer
from hyperparameters import hyperparameters

vocab_size = 32000
batch_size = 2
seq_len = 5

# Hyperparameters passed identically to both implementations.
transformer_cfg = hyperparameters.transformer
shared_kwargs = dict(
    d_model=transformer_cfg.hidden_size,
    num_heads=transformer_cfg.num_heads,
    d_ff=transformer_cfg.encoder_ffn_embed_dim,
    num_encoder_layers=transformer_cfg.num_hidden_layers,
    num_decoder_layers=transformer_cfg.num_hidden_layers,
    dropout=transformer_cfg.dropout,
    max_len=transformer_cfg.max_len,
)

# Construction order is kept (reference model first) so that random weight
# initialisation consumes the RNG in the same sequence.
pytorch_model: nn.Module = TransformerPyTorch(vocab_size=vocab_size, **shared_kwargs)
own_model = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    **shared_kwargs,
)
criterion = nn.CrossEntropyLoss(ignore_index=0, reduction="mean")

# Dummy data: ids are drawn from [1, vocab_size), so index 0 (the index the
# loss ignores) never appears in the random batch.
src = torch.randint(1, vocab_size, (batch_size, seq_len))
tgt = torch.randint(1, vocab_size, (batch_size, seq_len))

# Shift the target by one position: the decoder sees tokens [0, T-1) and is
# scored against tokens [1, T).
decoder_in = tgt[:, :-1]
labels = tgt[:, 1:]

# Run the same forward + loss computation on both models, reference first.
for message, candidate in (
    ("Dummy test loss =", pytorch_model),
    ("Dummy test loss on own model =", own_model),
):
    logits = candidate(src, decoder_in)  # expected [B, T-1, vocab_size]
    logits = logits.transpose(1, 2)  # class dim second: [B, vocab_size, T-1]

    loss = criterion(logits, labels)  # scalar, because reduction="mean"
    print(message, loss.item())

Dummy test loss = 10.64527416229248
Dummy test loss on own model = 10.447006225585938




In [4]:
# Find average sentence length in the dataset
# (length of a sentence = number of whitespace-separated tokens on its line)
merged_path = "local/data/training/bpe_train.de"
total_len = 0
num_lines = 0

with open(merged_path, mode="r", encoding="utf-8") as corpus:
    # enumerate(start=1) leaves num_lines equal to the number of lines read;
    # for an empty file the loop never runs and num_lines stays 0.
    for num_lines, sentence in enumerate(corpus, start=1):
        total_len += len(sentence.split())

avg_len = total_len / num_lines
print("Average sentence length in the dataset =", avg_len)

Average sentence length in the dataset = 30.32287386028867


In [5]:
import pickle
from vocab import Vocabulary


# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
# NOTE(review): presumably the pickle holds a `Vocabulary` instance, which is
# why that class is imported in this cell -- confirm.
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
with open("local/vocab_shared.pkl", "rb") as vocab_file:
    vocab = pickle.load(vocab_file)  # type: ignore
print("Vocab size =", len(vocab))

Vocab size = 32181


In [1]:
import torch

# 3x3 integer matrix used to illustrate view() and slicing.
x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# view(-1) lets torch infer the single dimension: all 9 elements in one row.
print(x.view(-1))
# view(3, 3) spells out the shape the tensor already has.
print(x.view(3, 3))
# A slice (rather than an integer index) keeps the leading dimension, so the
# displayed value is a 1x3 tensor holding the first row.
x[:1]

tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])
tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])


tensor([[1, 2, 3]])

In [15]:
from models.transformer_model import TransformerModel
from hyperparameters import hyperparameters
from uq.generate_with_uq import _enable_test_time_dropout
from torch import nn

vocab_size = 32000

# Set before the model is constructed.
# NOTE(review): presumably TransformerModel reads this flag to pick its
# backend -- confirm against models/transformer_model.py.
hyperparameters.transformer.transformer_implementation = "pytorch"

transformer_cfg = hyperparameters.transformer
model = TransformerModel(
    vocab_size=vocab_size,
    d_model=transformer_cfg.hidden_size,
    num_heads=transformer_cfg.num_heads,
    d_ff=transformer_cfg.encoder_ffn_embed_dim,
    num_encoder_layers=transformer_cfg.num_hidden_layers,
    num_decoder_layers=transformer_cfg.num_hidden_layers,
    dropout=transformer_cfg.dropout,
    max_len=transformer_cfg.max_len,
)

# Start from a fully "eval" module tree; selected parts are switched back to
# train mode further down in this cell.
model.eval()

def dropout_repr(self):
    """Render a dropout module as ``ClassName(p=<p>, training=<flag>)``."""
    cls_name = type(self).__name__
    return "{}(p={}, training={})".format(cls_name, self.p, self.training)


# Monkey-patch: from here on every nn.Dropout instance in this kernel prints
# its train/eval flag next to its drop probability.
nn.Dropout.__repr__ = dropout_repr


def enable_fast_test_time_dropout(model: TransformerModel) -> None:
    """
    Put the dropout modules of the last decoder layer into training mode.

    Only ``nn.Dropout`` instances found under
    ``model.transformer.decoder.layers[-1]`` are touched; every other module
    keeps whatever train/eval flag it already had.

    NOTE(review): dropout that a layer applies without going through an
    ``nn.Dropout`` submodule is not affected by this function -- verify
    whether that matters for the attention blocks.
    """
    last_layer = model.transformer.decoder.layers[-1]
    dropouts = [sub for sub in last_layer.modules() if isinstance(sub, nn.Dropout)]
    for dropout in dropouts:
        dropout.train()

# `model` was switched to eval mode earlier in this cell; this call flips only
# the nn.Dropout modules of the last decoder layer back to train mode.
enable_fast_test_time_dropout(model)


def print_model_with_mode(module, indent=0):
    """Recursively print the module tree with each module's train/eval flag.

    Every line reads ``<ClassName> (mode=<train|eval>)`` and is preceded by
    ``indent`` spaces; direct children are printed four spaces deeper than
    their parent.
    """
    state = "train" if module.training else "eval"
    print(" " * indent + f"{type(module).__name__} (mode={state})")
    for child in module.children():
        print_model_with_mode(child, indent + 4)

# Print the module tree with each module's train/eval flag, to inspect which
# modules are in train mode after the call above.
print_model_with_mode(model)
# Uncomment to display the module's own repr instead:
# model


TransformerModel (mode=eval)
    Embedding (mode=eval)
    Dropout (mode=eval)
    PositionalEncoding (mode=eval)
    Transformer (mode=eval)
        TransformerEncoder (mode=eval)
            ModuleList (mode=eval)
                TransformerEncoderLayer (mode=eval)
                    MultiheadAttention (mode=eval)
                        NonDynamicallyQuantizableLinear (mode=eval)
                    Linear (mode=eval)
                    Dropout (mode=eval)
                    Linear (mode=eval)
                    LayerNorm (mode=eval)
                    LayerNorm (mode=eval)
                    Dropout (mode=eval)
                    Dropout (mode=eval)
                TransformerEncoderLayer (mode=eval)
                    MultiheadAttention (mode=eval)
                        NonDynamicallyQuantizableLinear (mode=eval)
                    Linear (mode=eval)
                    Dropout (mode=eval)
                    Linear (mode=eval)
                    LayerNorm (mode=eval)


In [1]:
from uq.acquisition_func import BLEU_mean_output_batch,BLEUVariance,VR_mpnet_base_distance
import torch
# Dummy data: three groups with five candidate sentences each.
sentences = [
    [
        "Hello world",
        "Goodbye world",
        "Hi globe",
        "Hi you are cool",
        "Hello world",
    ],
    [
        "dogs are cool",
        "cats are cool",
        "dogs are nice",
        "cats are cool",
        "doggie",
    ],
    [
        "The W514 village association is once again hosting this great exhibition.",
        "Once again, the local club W514 is setting out this large exhibition.",
        "The local district association W514 is once again holding this large exhibition.",
        "The local association W514 is once again setting out this large exhibition.",
        "The W514 local association is once again organizing this large exhibition.",
    ],
]

bv = BLEUVariance()
vr = VR_mpnet_base_distance()

# NOTE(review): both acquisition objects are called with a torch.zeros(2)
# second argument; its meaning is not visible from this cell -- confirm
# against uq/acquisition_func.py.
print("BLEU_mean_output_batch", BLEU_mean_output_batch(sentences))
print("BLEUVariance", bv(sentences, torch.zeros(2)))
print("VR_mpnet_base_distance", vr(sentences, torch.zeros(2)))


  from .autonotebook import tqdm as notebook_tqdm


BLEU_mean_output_batch ['Hi globe', 'dogs are cool', 'The local association W514 is once again setting out this large exhibition.']
BLEUVariance tensor([1.9722e-31, 1.9722e-31, 1.9722e-31])
VR_mpnet_base_distance tensor([1.8255, 1.5468, 0.6451])


In [None]:
import tiktoken
from gpt2project.data_processing.load_commongen import generate_input_text
from gpt2project.gpt2model import GPT
from gpt2project.gpt2_generate import generate_autoregressivly_gpt2
import torch

from gpt2project.utils.decode import decode_token_list

# Tokenizer and prompt do not depend on the model, so they are set up first.
tokenizer = tiktoken.get_encoding("gpt2")
prompt = generate_input_text(["dog", "cat", "mouse"])

# Pretrained GPT-2 weights, moved to the GPU and switched to eval mode.
model = GPT.from_pretrained("gpt2")
model = model.cuda()
model.eval()


using device: cuda
loading weights from pretrained gpt: gpt2


In [73]:

from gpt2project.search_methods_gpt import AutoregressiveInferenceResults, _clean_inference_results
import torch.nn as nn
from hyperparameters import hyperparameters

def topk_sampling_gpt(
    model: nn.Module,
    tgt_tokens: torch.Tensor,
    vocab_size: int,
    max_generated_len: int,
    break_on_newline: bool,
    k: int = 8,
    temperature: float = 0.6,
) -> "AutoregressiveInferenceResults":
    """Generate tokens autoregressively with temperature-scaled top-k sampling.

    At every step the model is run on the whole sequence produced so far, the
    logits of the last position are divided by ``temperature`` and passed
    through a softmax, and the next token is drawn from the ``k`` most
    probable tokens, weighted by their probabilities.

    Args:
        model: Callable returning a tuple whose first element is a logits
            tensor of shape ``(batch, seq, vocab_size)``.
        tgt_tokens: Prompt token ids of shape ``(batch, prompt_len)``.
        vocab_size: Size of the last dimension of the model output.
        max_generated_len: Number of tokens to sample per sequence.
        break_on_newline: Forwarded unchanged to ``_clean_inference_results``.
        k: Number of highest-probability tokens to sample from; values larger
            than ``vocab_size`` are clamped to ``vocab_size``.
        temperature: Positive divisor applied to the logits before softmax.

    Returns:
        ``AutoregressiveInferenceResults`` built from the generated tokens
        (prompt removed) and the per-step softmax distributions over the full
        vocabulary, both after ``_clean_inference_results``.

    Raises:
        ValueError: If ``temperature <= 0``, ``k < 1`` or
            ``max_generated_len < 0``.
    """
    if temperature <= 0:
        # Dividing logits by zero would silently yield inf/NaN probabilities.
        raise ValueError(f"temperature must be > 0, got {temperature}")
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    if max_generated_len < 0:
        raise ValueError(f"max_generated_len must be >= 0, got {max_generated_len}")
    # torch.topk raises when k exceeds the size of the selected dimension.
    k = min(k, vocab_size)

    with torch.no_grad():
        prompt_len = tgt_tokens.size(1)
        total_len = prompt_len + max_generated_len
        batch_size = tgt_tokens.size(0)

        assert tgt_tokens.shape == (batch_size, prompt_len)

        # One up-front buffer on the same device as the tokens: avoids
        # re-copying the whole history with torch.cat at every step and
        # removes the dependency on a global device setting.
        softmax_probs = torch.empty(
            (batch_size, max_generated_len, vocab_size), device=tgt_tokens.device
        )

        for step in range(max_generated_len):
            output, _ = model(tgt_tokens)
            assert output.shape == (batch_size, tgt_tokens.size(1), vocab_size)
            # Only the last position predicts the next token.
            logits = output[:, -1, :]
            assert logits.shape == (batch_size, vocab_size)
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            assert probs.shape == (batch_size, vocab_size)
            topk_probs, topk_tokens = torch.topk(probs, k=k, dim=-1)
            # multinomial treats its input as unnormalised weights, so the
            # top-k probabilities need no renormalisation; `ix` indexes into
            # the top-k list and is mapped back to vocabulary ids by gather.
            ix = torch.multinomial(topk_probs, num_samples=1)
            predicted_tokens = torch.gather(topk_tokens, dim=-1, index=ix)
            assert predicted_tokens.shape == (batch_size, 1)
            tgt_tokens = torch.cat([tgt_tokens, predicted_tokens], dim=1)
            # The stored distribution is the full-vocabulary softmax, not the
            # truncated top-k one.
            softmax_probs[:, step] = probs

    assert tgt_tokens.shape == (batch_size, total_len)
    assert softmax_probs.shape == (batch_size, max_generated_len, vocab_size)

    # Strip the prompt so only newly generated tokens are returned.
    tgt_tokens = tgt_tokens[:, prompt_len:]
    assert tgt_tokens.shape == (batch_size, max_generated_len)
    tgt_tokens, softmax_probs = _clean_inference_results(
        tgt_tokens, softmax_probs, break_on_newline
    )

    return AutoregressiveInferenceResults(tgt_tokens, softmax_probs)


# Encode the prompt and shape it as a batch of one on the GPU.
prompt_ids = tokenizer.encode(prompt)
tgt_tokens = torch.tensor(prompt_ids).unsqueeze(0).cuda()

# Use the top-k sampler defined in this cell as the search method.
result = generate_autoregressivly_gpt2(
    model,
    tokenizer,
    tgt_tokens,
    topk_sampling_gpt,
    break_on_newline=True,
)

# Drop the batch dimension and move to CPU before decoding; the decoded text
# is the cell's displayed value.
generated_tokens = result.token_ids.squeeze(0).cpu()
decode_token_list(generated_tokens, tokenizer)

" The cat ran happily on the dog's lap.\n"