In [1]:


import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import PyramidionsConfig, PyramidionsModel, RobertaTokenizerFast

In [3]:
print("Creating model...")
tokenizer = RobertaTokenizerFast.from_pretrained("uklfr/gottbert-base")
tokenizer.model_max_length = 512
tokenizer.init_kwargs["model_max_length"] = 512


config = PyramidionsConfig()
config.update({"num_hidden_layers": 9, "max_position_embeddings": 514, "type_vocab_size": 1})
print(config)
model = PyramidionsModel(config)

model.resize_token_embeddings(len(tokenizer))


print("Loading pretrained model and copying weights...")
from transformers import RobertaModel

pretrained_model = RobertaModel.from_pretrained("uklfr/gottbert-base")
    


model.load_state_dict(pretrained_model.state_dict(), strict=False)

Creating model...
PyramidionsConfig {
  "alpha": 3.0,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "pyramidions",
  "num_attention_heads": 12,
  "num_hidden_layers": 9,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 30522
}

Loading pretrained model and copying weights...


Some weights of the model checkpoint at uklfr/gottbert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


_IncompatibleKeys(missing_keys=['encoder.layer.0.pooler.scorer.0.weight', 'encoder.layer.0.pooler.scorer.0.bias', 'encoder.layer.1.pooler.scorer.0.weight', 'encoder.layer.1.pooler.scorer.0.bias', 'encoder.layer.2.pooler.scorer.0.weight', 'encoder.layer.2.pooler.scorer.0.bias', 'encoder.layer.3.pooler.scorer.0.weight', 'encoder.layer.3.pooler.scorer.0.bias', 'encoder.layer.4.pooler.scorer.0.weight', 'encoder.layer.4.pooler.scorer.0.bias', 'encoder.layer.5.pooler.scorer.0.weight', 'encoder.layer.5.pooler.scorer.0.bias', 'encoder.layer.6.pooler.scorer.0.weight', 'encoder.layer.6.pooler.scorer.0.bias', 'encoder.layer.7.pooler.scorer.0.weight', 'encoder.layer.7.pooler.scorer.0.bias', 'encoder.layer.8.pooler.scorer.0.weight', 'encoder.layer.8.pooler.scorer.0.bias'], unexpected_keys=['encoder.layer.9.attention.self.query.weight', 'encoder.layer.9.attention.self.query.bias', 'encoder.layer.9.attention.self.key.weight', 'encoder.layer.9.attention.self.key.bias', 'encoder.layer.9.attention.self.

In [4]:
model.save_pretrained("pyramidions-gottbert-base")

In [5]:
from transformers import EncoderDecoderModel

enc_dec = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path="pyramidions-gottbert-base",
    decoder_pretrained_model_name_or_path="pyramidions-gottbert-base",
    decoder_add_cross_attention=True
)
enc_dec.config.decoder_start_token_id = tokenizer.cls_token_id
enc_dec.config.pad_token_id = tokenizer.pad_token_id
enc_dec.config.vocab_size = enc_dec.config.decoder.vocab_size
enc_dec.config.decoder.add_cross_attention = True
enc_dec.config.decoder.add_cross_attention = True
enc_dec.config.decoder.add_cross_attention


Some weights of PyramidionsForCausalLM were not initialized from the model checkpoint at pyramidions-gottbert-base and are newly initialized: ['encoder.layer.2.crossattention.output.dense.bias', 'encoder.layer.1.crossattention.self.query.weight', 'encoder.layer.5.crossattention.self.value.weight', 'lm_head.dense.bias', 'encoder.layer.6.crossattention.self.query.bias', 'encoder.layer.4.crossattention.output.dense.weight', 'encoder.layer.7.crossattention.output.LayerNorm.bias', 'encoder.layer.8.crossattention.self.query.bias', 'encoder.layer.4.crossattention.self.query.weight', 'encoder.layer.2.crossattention.self.value.weight', 'encoder.layer.5.crossattention.self.value.bias', 'encoder.layer.0.crossattention.output.LayerNorm.bias', 'encoder.layer.3.crossattention.self.value.bias', 'encoder.layer.1.crossattention.output.dense.weight', 'encoder.layer.1.crossattention.output.LayerNorm.weight', 'encoder.layer.3.crossattention.self.key.weight', 'lm_head.bias', 'encoder.layer.8.crossattention

True

In [6]:
input_ids = tokenizer(["This is a really long text", "This a text."], padding="max_length", return_tensors="pt").input_ids
labels = tokenizer(["This is the corresponding summary", "To sum up"], padding="max_length", return_tensors="pt").input_ids
outputs = enc_dec(input_ids=input_ids, labels=input_ids)
loss, logits = outputs.loss, outputs.logits
print(loss.item())


12.509509086608887




In [7]:
generated = enc_dec.generate(input_ids)
print(tokenizer.decode(generated[0]))

<s>steilenegradsteilensens STR Agnpässepässepässesensichersteilenbadgrat Probenkantstocksteilenegrad


In [8]:
enc_dec.save_pretrained("pyramidion2pyramidion-9layer")
tokenizer.save_pretrained("pyramidion2pyramidion-9layer")

('pyramidion2pyramidion-9layer/tokenizer_config.json',
 'pyramidion2pyramidion-9layer/special_tokens_map.json',
 'pyramidion2pyramidion-9layer/vocab.json',
 'pyramidion2pyramidion-9layer/merges.txt',
 'pyramidion2pyramidion-9layer/added_tokens.json',
 'pyramidion2pyramidion-9layer/tokenizer.json')