In [1]:
from roberta2longformer import convert_roberta_to_longformer

from transformers import RobertaModel, RobertaTokenizerFast
from transformers import LongformerModel, LongformerTokenizerFast

# Checkpoint to convert. The recorded outputs in this notebook were produced
# with "roberta-base".
# NOTE(review): the original comment here named "uklfr/gottbert-base", and the
# output directory used further down is "tmp/longformer-gottbert" — confirm
# which checkpoint is actually intended before publishing the converted model.
ROBERTA_CHECKPOINT = "roberta-base"

# Model and tokenizer are loaded from the same identifier so they cannot drift
# apart when the checkpoint is changed.
roberta_model = RobertaModel.from_pretrained(ROBERTA_CHECKPOINT)
roberta_tokenizer = RobertaTokenizerFast.from_pretrained(ROBERTA_CHECKPOINT)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [2]:
# Maximum sequence length requested for the converted model. With 8192 the
# reloaded model's embeddings report position_ids 0..8193 (see the recorded
# output of the last cell).
LONGFORMER_MAX_LENGTH = 8192

# Convert the loaded RoBERTa model/tokenizer pair into a Longformer pair.
longformer_model, longformer_tokenizer = convert_roberta_to_longformer(
    roberta_model=roberta_model,
    roberta_tokenizer=roberta_tokenizer,
    longformer_max_length=LONGFORMER_MAX_LENGTH,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'LongformerTokenizerFast'.


In [3]:
# Model and tokenizer are written to the same directory so they can be
# reloaded together from a single path.
LONGFORMER_DIR = "tmp/longformer-gottbert"

longformer_model.save_pretrained(LONGFORMER_DIR)
# Kept as the last expression on purpose: the cell displays the tuple of
# tokenizer files that were written.
longformer_tokenizer.save_pretrained(LONGFORMER_DIR)

('tmp/longformer-gottbert/tokenizer_config.json',
 'tmp/longformer-gottbert/special_tokens_map.json',
 'tmp/longformer-gottbert/vocab.json',
 'tmp/longformer-gottbert/merges.txt',
 'tmp/longformer-gottbert/added_tokens.json',
 'tmp/longformer-gottbert/tokenizer.json')

In [4]:
# Round trip: reload both saved artifacts from disk using the Longformer
# classes. This rebinds `longformer_tokenizer` and `longformer_model`, so the
# cells below operate on the reloaded objects, not the in-memory conversion
# result.
longformer_tokenizer = LongformerTokenizerFast.from_pretrained("tmp/longformer-gottbert")
longformer_model = LongformerModel.from_pretrained("tmp/longformer-gottbert")

In [5]:
# Smoke test: tokenize a short German passage and run one forward pass through
# the reloaded model.
# Adjacent string literals are concatenated verbatim, so every sentence except
# the last ends with an explicit space; without it the sentences fuse
# ("...Anerkennung.Es war...") and the text is tokenized differently.
inputs = longformer_tokenizer(
    "Er sah eine irdische Zentralregierung, und er erblickte Frieden, Wohlstand und galaktische Anerkennung. "
    "Es war eine Vision, doch er nahm sie mit vollen Sinnen in sich auf. "
    "Im Laderaum der STARDUST begann eine rätselhafte Maschine zu summen. "
    "Die dritte Macht nahm die Arbeit auf. "
    "Da lächelte Perry Rhodan zum blauen Himmel empor. "
    "Langsam löste er die Rangabzeichen von dem Schulterstück seiner Kombination.",
    return_tensors="pt",
)
outputs = longformer_model(**inputs)
# The result was originally bound to the misspelled name `ouputs`; the alias is
# kept so any cell that still refers to the old name keeps working.
ouputs = outputs

In [6]:
# Display the first (name, tensor) entry of the Longformer encoder's state dict
# so it can be compared by eye with the RoBERTa entry shown in the next cell.
tuple(longformer_model.encoder.state_dict().items())[0]

('layer.0.attention.self.query.weight',
 tensor([[ 0.0729, -0.0029, -0.0902,  ...,  0.1033,  0.0900, -0.1030],
         [-0.0516,  0.2061,  0.0739,  ...,  0.0657,  0.0634,  0.1282],
         [ 0.0878,  0.0698, -0.0515,  ..., -0.0426, -0.0081,  0.1100],
         ...,
         [-0.1871,  0.0172, -0.0315,  ..., -0.0503,  0.1024, -0.1165],
         [-0.2532,  0.0439,  0.0638,  ...,  0.0701, -0.1045,  0.0118],
         [-0.0516, -0.0859,  0.1027,  ..., -0.1895,  0.0033, -0.0541]]))

In [7]:
# Display the first (name, tensor) entry of the original RoBERTa encoder's
# state dict; it is the counterpart of the Longformer entry shown in the
# previous cell.
tuple(roberta_model.encoder.state_dict().items())[0]

('layer.0.attention.self.query.weight',
 tensor([[ 0.0729, -0.0029, -0.0902,  ...,  0.1033,  0.0900, -0.1030],
         [-0.0516,  0.2061,  0.0739,  ...,  0.0657,  0.0634,  0.1282],
         [ 0.0878,  0.0698, -0.0515,  ..., -0.0426, -0.0081,  0.1100],
         ...,
         [-0.1871,  0.0172, -0.0315,  ..., -0.0503,  0.1024, -0.1165],
         [-0.2532,  0.0439,  0.0638,  ...,  0.0701, -0.1045,  0.0118],
         [-0.0516, -0.0859,  0.1027,  ..., -0.1895,  0.0033, -0.0541]]))

In [8]:
# Look the tensor up by name rather than by position: index 1 only points at
# the word embeddings while `position_ids` occupies index 0 of the state dict.
# The cell still displays the same (name, tensor) pair.
embedding_key = "word_embeddings.weight"
embedding_key, longformer_model.embeddings.state_dict()[embedding_key]

('word_embeddings.weight',
 tensor([[ 0.1476, -0.0365,  0.0753,  ..., -0.0023,  0.0172, -0.0016],
         [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
         [-0.0347, -0.0873, -0.0180,  ...,  0.1174, -0.0098, -0.0355],
         ...,
         [ 0.0304,  0.0504, -0.0307,  ...,  0.0377,  0.0096,  0.0084],
         [ 0.0623, -0.0596,  0.0307,  ..., -0.0920,  0.1080, -0.0183],
         [ 0.1259, -0.0145,  0.0332,  ...,  0.0121,  0.0342,  0.0168]]))

In [9]:
# Look the tensor up by name rather than by position so the comparison with
# the Longformer word embeddings does not depend on the ordering of the state
# dict. The cell still displays the same (name, tensor) pair.
embedding_key = "word_embeddings.weight"
embedding_key, roberta_model.embeddings.state_dict()[embedding_key]

('word_embeddings.weight',
 tensor([[ 0.1476, -0.0365,  0.0753,  ..., -0.0023,  0.0172, -0.0016],
         [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
         [-0.0347, -0.0873, -0.0180,  ...,  0.1174, -0.0098, -0.0355],
         ...,
         [ 0.0304,  0.0504, -0.0307,  ...,  0.0377,  0.0096,  0.0084],
         [ 0.0623, -0.0596,  0.0307,  ..., -0.0920,  0.1080, -0.0183],
         [ 0.1259, -0.0145,  0.0332,  ...,  0.0121,  0.0342,  0.0168]]))

In [10]:
# Display the first entry of the Longformer embeddings' state dict. In the
# recorded run this is `position_ids`, which shows the extended position range.
# NOTE(review): relies on `position_ids` being the first state-dict entry —
# confirm this holds for the installed transformers version.
tuple(longformer_model.embeddings.state_dict().items())[0]

('position_ids', tensor([[   0,    1,    2,  ..., 8191, 8192, 8193]]))