In [1]:
from transformers import RobertaModel, RobertaTokenizerFast
from transformers import LongformerModel, LongformerTokenizerFast

roberta_model = RobertaModel.from_pretrained("uklfr/gottbert-base")
roberta_tokenizer = RobertaTokenizerFast.from_pretrained("uklfr/gottbert-base")
longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")

Some weights of the model checkpoint at uklfr/gottbert-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- T

In [2]:
list(longformer_model.encoder.state_dict().items())[0]

('layer.0.attention.self.query.weight',
 tensor([[ 0.0709,  0.0058, -0.0958,  ...,  0.1009,  0.0916, -0.1061],
         [-0.0454,  0.1986,  0.0717,  ...,  0.0694,  0.0555,  0.1310],
         [ 0.0862,  0.0564, -0.0507,  ..., -0.0383, -0.0048,  0.1035],
         ...,
         [-0.1843,  0.0107, -0.0321,  ..., -0.0515,  0.1050, -0.1197],
         [-0.2524,  0.0441,  0.0672,  ...,  0.0712, -0.1082,  0.0129],
         [-0.0488, -0.0893,  0.1063,  ..., -0.1860,  0.0062, -0.0533]]))

In [3]:
list(roberta_model.encoder.state_dict().items())[0]

('layer.0.attention.self.query.weight',
 tensor([[-0.0269,  0.0167,  0.1127,  ...,  0.0181, -0.0102,  0.0051],
         [ 0.0810,  0.0226,  0.0244,  ..., -0.0358,  0.0325,  0.0302],
         [ 0.0144,  0.0280,  0.0135,  ..., -0.0652,  0.0628, -0.0140],
         ...,
         [-0.0131,  0.0109,  0.0334,  ..., -0.0476,  0.0423,  0.0392],
         [ 0.0170,  0.0442, -0.0579,  ..., -0.0340, -0.0264, -0.0402],
         [ 0.0092, -0.0088, -0.0338,  ...,  0.0180,  0.0569,  0.0584]]))

In [4]:
from collections import OrderedDict
from tempfile import TemporaryDirectory

def convert_roberta_to_longformer(
    roberta_model,
    roberta_tokenizer,
    longformer_model,
    longformer_max_length: int = None):
    
    if longformer_max_length is None:
        longformer_max_length = longformer_model.config.max_position_embeddings + 1
        
    
    
    
    
    ###############################
    # Create longformer tokenizer #
    ###############################
    
    # Longformer tokenizers are Roberta tokenizers.
    # But to follow the conventions 
    # and to avoid confusion we create a 
    # longformer tokenizer class with the state of
    # the original tokenizer.
    with TemporaryDirectory() as temp_dir:
        roberta_tokenizer.save_pretrained(temp_dir)
        longformer_tokenizer = LongformerTokenizerFast.from_pretrained(temp_dir)
    longformer_tokenizer.model_max_length = longformer_max_length
    longformer_tokenizer.init_kwargs['model_max_length'] = longformer_max_length
    
    
    
    ######################
    # Copy model weights #
    ######################
    
    # We only copy the encoder weights and resize the embeddings.
    # Pooler weights are kept untouched.
    
    #---------#
    # Encoder #
    #---------#
    roberta_parameters = roberta_model.encoder.state_dict()
    longformer_parameters = longformer_model.encoder.state_dict()
    
    # Load all compatible keys directly and obtain missing keys to handle later
    errors = longformer_model.encoder.load_state_dict(roberta_parameters, strict=False)
    assert not errors.unexpected_keys, "Found unexpected keys"
    missing_keys = errors.missing_keys
    
    # We expect, the keys to be the weights of the global attention modules and
    # reuse roberta's normal attention weights for those modules.
    for longformer_key in missing_keys:
        # Resolve layer properties
        prefix, layer_idx, layer_class, layer_type, target, params = longformer_key.split(".")
        assert layer_class == "attention" or target.endswith("global"), f"Unexcpected parameters {longformer_key}."
        # Copy the normal weights attention weights to the global attention layers too
        roberta_target_key = ".".join([prefix, layer_idx, layer_class, layer_type, target.removesuffix("_global"), params])
        roberta_weights = roberta_parameters[roberta_target_key]
        orig_weights = longformer_parameters[longformer_key]
        longformer_parameters[longformer_key] = roberta_weights
    
    # Update the state of the longformer model
    longformer_model.encoder.load_state_dict(longformer_parameters, strict=True)
    
    #------------#
    # Embeddings #
    #------------#
    # There are two types of embeddings:
    # 1. Token embeddings
    # 2. Positional embeddings
    # But we only need to copy the token embeddings 
    # while keeping the positional embeddings fixed.

    roberta_embeddings_parameters = roberta_model.embeddings.state_dict()
    embedding_parameters2copy = []
    # We have to resize the token embeddings upfront, to make load_state_dict work.
    longformer_model.resize_token_embeddings(len(roberta_tokenizer))
    for key, item in roberta_embeddings_parameters.items():
        if not "position" in key:
            embedding_parameters2copy.append((key, item))
    embedding_parameters2copy = OrderedDict(embedding_parameters2copy)
    
    longformer_model.embeddings.load_state_dict(embedding_parameters2copy, strict=False)

    
    return longformer_model, longformer_tokenizer

In [5]:
longformer_model, longformer_tokenizer = convert_roberta_to_longformer(
    roberta_model=roberta_model,
    roberta_tokenizer=roberta_tokenizer,
    longformer_model=longformer_model,
    longformer_max_length=12
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'LongformerTokenizerFast'.


In [6]:
inputs = longformer_tokenizer("Er sah eine irdische Zentralregierung, und er erblickte Frieden, Wohlstand und galaktische Anerkennung."
                              "Es war eine Vision, doch er nahm sie mit vollen Sinnen in sich auf."
                              "Im Laderaum der STARDUST begann eine rätselhafte Maschine zu summen."
                              "Die dritte Macht nahm die Arbeit auf."
                              "Da lächelte Perry Rhodan zum blauen Himmel empor."
                              "Langsam löste er die Rangabzeichen von dem Schulterstück seiner Kombination.",
                              return_tensors="pt")
ouputs = longformer_model(**inputs)

Token indices sequence length is longer than the specified maximum sequence length for this model (85 > 12). Running this sequence through the model will result in indexing errors


In [7]:
list(longformer_model.encoder.state_dict().items())[0]

('layer.0.attention.self.query.weight',
 tensor([[-0.0269,  0.0167,  0.1127,  ...,  0.0181, -0.0102,  0.0051],
         [ 0.0810,  0.0226,  0.0244,  ..., -0.0358,  0.0325,  0.0302],
         [ 0.0144,  0.0280,  0.0135,  ..., -0.0652,  0.0628, -0.0140],
         ...,
         [-0.0131,  0.0109,  0.0334,  ..., -0.0476,  0.0423,  0.0392],
         [ 0.0170,  0.0442, -0.0579,  ..., -0.0340, -0.0264, -0.0402],
         [ 0.0092, -0.0088, -0.0338,  ...,  0.0180,  0.0569,  0.0584]]))

In [8]:
list(roberta_model.encoder.state_dict().items())[0]

('layer.0.attention.self.query.weight',
 tensor([[-0.0269,  0.0167,  0.1127,  ...,  0.0181, -0.0102,  0.0051],
         [ 0.0810,  0.0226,  0.0244,  ..., -0.0358,  0.0325,  0.0302],
         [ 0.0144,  0.0280,  0.0135,  ..., -0.0652,  0.0628, -0.0140],
         ...,
         [-0.0131,  0.0109,  0.0334,  ..., -0.0476,  0.0423,  0.0392],
         [ 0.0170,  0.0442, -0.0579,  ..., -0.0340, -0.0264, -0.0402],
         [ 0.0092, -0.0088, -0.0338,  ...,  0.0180,  0.0569,  0.0584]]))

In [9]:
list(longformer_model.embeddings.state_dict().items())[1]

('word_embeddings.weight',
 tensor([[-0.0071, -0.1073,  0.1197,  ..., -0.0265,  0.1016, -0.0173],
         [-0.0232, -0.0226,  0.0150,  ..., -0.0031,  0.0157, -0.0189],
         [ 0.0167,  0.2000,  0.0245,  ...,  0.0293, -0.0021,  0.0197],
         ...,
         [-0.0061, -0.0659,  0.0326,  ...,  0.0021,  0.0040, -0.0293],
         [-0.0052,  0.0323,  0.0685,  ...,  0.0269,  0.0426, -0.0095],
         [ 0.0011, -0.0304, -0.0013,  ...,  0.0030,  0.0210,  0.0010]]))

In [10]:
list(roberta_model.embeddings.state_dict().items())[1]

('word_embeddings.weight',
 tensor([[-0.0071, -0.1073,  0.1197,  ..., -0.0265,  0.1016, -0.0173],
         [-0.0232, -0.0226,  0.0150,  ..., -0.0031,  0.0157, -0.0189],
         [ 0.0167,  0.2000,  0.0245,  ...,  0.0293, -0.0021,  0.0197],
         ...,
         [-0.0061, -0.0659,  0.0326,  ...,  0.0021,  0.0040, -0.0293],
         [-0.0052,  0.0323,  0.0685,  ...,  0.0269,  0.0426, -0.0095],
         [ 0.0011, -0.0304, -0.0013,  ...,  0.0030,  0.0210,  0.0010]]))