In [1]:
import os
import json
import time
from torch import nn
from transformers import pipeline
from transformers import BertTokenizer
from transformers import BertForMaskedLM

  return torch._C._cuda_getDeviceCount() > 0


First, download and extract the [2018 Wikipedia dumps](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2735) in the data folder for the 15 XNLI languages :

```bash
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/en.txt.gz -P data
gunzip data/en.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/fr.txt.gz -P data
gunzip data/fr.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/es.txt.gz -P data
gunzip data/es.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/de.txt.gz -P data
gunzip data/de.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/zh.txt.gz -P data
gunzip data/zh.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/ar.txt.gz -P data
gunzip data/ar.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/ru.txt.gz -P data
gunzip data/ru.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/vi.txt.gz -P data
gunzip data/vi.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/el.txt.gz -P data
gunzip data/el.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/bg.txt.gz -P data
gunzip data/bg.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/th.txt.gz -P data
gunzip data/th.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/tr.txt.gz -P data
gunzip data/tr.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/hi.txt.gz -P data
gunzip data/hi.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/ur.txt.gz -P data
gunzip data/ur.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/sw.txt.gz -P data
gunzip data/sw.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/nl.txt.gz -P data
gunzip data/nl.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/uk.txt.gz -P data
gunzip data/uk.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/ro.txt.gz -P data
gunzip data/ro.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/pt.txt.gz -P data
gunzip data/pt.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/it.txt.gz -P data
gunzip data/it.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/lt.txt.gz -P data
gunzip data/lt.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/no.txt.gz -P data
gunzip data/no.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/pl.txt.gz -P data
gunzip data/pl.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/da.txt.gz -P data
gunzip data/da.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/ja.txt.gz -P data
gunzip data/ja.txt.gz
```

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [3]:
bert_vocab = list(tokenizer.vocab.keys())
len(bert_vocab)

119547

# Select vocabularies

In [4]:
languages = ['en', 'fr', 'es', 'de', 'zh', 'ar', 'ru', 'vi', 'el', 'bg', 'th', 'tr', 'hi', 
              'ur', 'sw', 'nl', 'uk', 'ro', 'pt', 'it', 'lt', 'no', 'pl', 'da', 'ja']
len(languages)

25

In [5]:
for lang in languages:
    num_lines = 0
    num_long_lines = 0
    path = 'data/'+lang+'.txt'
    with open(path) as infile:
        for line in infile:
            num_lines += 1
            if len(line)>5:
                num_long_lines += 1
    # compute frequencies
    lang_tokens = dict()
    lang_tokens_unique = dict()
    t0 = time.time()
    with open(path) as infile:
        for line in infile:
            if len(line)>5:
                tokens = tokenizer.tokenize(line)
                for token in tokens:
                    if token not in lang_tokens:
                        lang_tokens[token] = 1
                    else:
                        lang_tokens[token] += 1
                for token in list(set(tokens)):
                    if token not in lang_tokens_unique:
                        lang_tokens_unique[token] = 1
                    else:
                        lang_tokens_unique[token] += 1
    # save frequencies
    with open('tokens_freqs/'+lang+'_freqs.json', 'w') as outfile:
        json.dump(lang_tokens, outfile)
    seuil = int(num_long_lines*0.005/100)
    num_selected_tokens = 0
    with open('selected_tokens/selected_'+lang+'_tokens.txt', 'w') as output:
        for tok in lang_tokens_unique:
            if lang_tokens_unique[tok] >= seuil:
                output.write(tok+'\n')
                num_selected_tokens += 1

## Load all vocabs

In [6]:
langs = dict()

for l in languages:
    with open('selected_tokens/selected_'+l+'_tokens.txt') as file:
        langs[l] = file.read().splitlines()
len(langs)

25

## Choosing vocabulary

In [7]:
all_selected_tokens = []
for k in langs.keys():
    all_selected_tokens.extend(langs[k])
selected_tokens = list(set(all_selected_tokens))
len(selected_tokens)

84972

## Resize token embeddings

In [8]:
TOKENS_TO_KEEP = ['[PAD]','[UNK]','[CLS]','[SEP]','[MASK]','[unused1]','[unused2]','[unused3]',
                  '[unused4]','[unused5]', '[unused6]','[unused7]','[unused8]','[unused9]']

for tok in TOKENS_TO_KEEP:
    if tok not in selected_tokens:
        selected_tokens.append(tok)

len(selected_tokens)

84985

In [9]:
def select_embeddings(model, old_vocab, new_vocab, model_name='new_model'):
    
    # Get old embeddings from model
    old_embeddings = model.get_input_embeddings()
    old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
    
    if old_num_tokens != len(old_vocab):
        print('len(old_vocab) != len(model.old_embeddings)')
        return old_embeddings
    
    new_num_tokens = len(new_vocab)
    if new_vocab is None:
        print('nothing to copy')
        return old_embeddings
    
    # Build new embeddings
    new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
    new_embeddings.to(old_embeddings.weight.device)
    
    # Copy weights
    i = 0
    j = 0
    vocab = []
    for token in old_vocab:
        if token in new_vocab:
            vocab.append(token)
            new_embeddings.weight.data[i, :] = old_embeddings.weight.data[j, :]
            i += 1
        j += 1
    
    model.set_input_embeddings(new_embeddings)
    
    # Update base model and current model config
    model.config.vocab_size = new_num_tokens
    model.vocab_size = new_num_tokens

    # Tie weights
    model.tie_weights()
    
    # Save new model
    model.save_pretrained(model_name)
    print(model_name, " - ", " num_parameters : ", model.num_parameters())
    print(model_name, " - ", " num_tokens : ", len(vocab))
    
    # Save vocab
    fw = open(os.path.join(model_name, 'vocab.txt'), 'w')
    for token in vocab:
        fw.write(token+'\n')
    fw.close()
    
    # Save tokenizer config
    fw = open(os.path.join(model_name, 'tokenizer_config.json'), 'w')
    json.dump({"do_lower_case": False, "model_max_length": 512}, fw)
    fw.close()
    
    return new_embeddings

In [10]:
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model_cased.num_parameters()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


177974523

# Generating models

## Generating 25langs model

In [11]:
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, selected_tokens, 'new-models/bert-base-25lang-cased')
print(time.time()-t)
new_embs

new-models/bert-base-25lang-cased  -   num_parameters :  151396345
new-models/bert-base-25lang-cased  -   num_tokens :  84985
449.4667663574219


Embedding(84985, 768)

## Generating 5langs models

In [12]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['es']+
                                                               langs['de']+langs['zh']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-es-de-zh-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-es-de-zh-cased  -   num_parameters :  125126536
new-models/bert-base-en-fr-es-de-zh-cased  -   num_tokens :  50824
158.7407102584839


In [13]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['nl']+
                                                               langs['ru']+langs['ar']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-nl-ru-ar-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-nl-ru-ar-cased  -   num_parameters :  123720035
new-models/bert-base-en-fr-nl-ru-ar-cased  -   num_tokens :  48995
143.9530508518219


In [14]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['uk']+
                                                               langs['el']+langs['ro']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-uk-el-ro-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-uk-el-ro-cased  -   num_parameters :  120014224
new-models/bert-base-en-fr-uk-el-ro-cased  -   num_tokens :  44176
120.72881722450256


In [15]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['es']+
                                                               langs['pt']+langs['it']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-es-pt-it-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-es-pt-it-cased  -   num_parameters :  119231382
new-models/bert-base-en-fr-es-pt-it-cased  -   num_tokens :  43158
157.18285059928894


In [16]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['lt']+
                                                               langs['no']+langs['pl']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-lt-no-pl-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-lt-no-pl-cased  -   num_parameters :  118497756
new-models/bert-base-en-fr-lt-no-pl-cased  -   num_tokens :  42204
104.5706377029419


In [17]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['zh']+
                                                               langs['ja']+langs['vi']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-zh-ja-vi-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-zh-ja-vi-cased  -   num_parameters :  119745074
new-models/bert-base-en-fr-zh-ja-vi-cased  -   num_tokens :  43826
131.54477763175964


In [18]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['de']+
                                                               langs['no']+langs['da']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-de-no-da-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-de-no-da-cased  -   num_parameters :  118117870
new-models/bert-base-en-fr-de-no-da-cased  -   num_tokens :  41710
124.39375185966492


In [19]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['da']+
                                                               langs['ja']+langs['vi']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-da-ja-vi-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-da-ja-vi-cased  -   num_parameters :  119753533
new-models/bert-base-en-fr-da-ja-vi-cased  -   num_tokens :  43837
106.04906463623047


## Generating trilingual models

In [10]:
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['es']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-es-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-es-cased  -   num_parameters :  116154613
new-models/bert-base-en-fr-es-cased  -   num_tokens :  39157
109.01875948905945


In [12]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['es']+langs['it']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-es-it-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-es-it-cased  -   num_parameters :  115747812
new-models/bert-base-en-es-it-cased  -   num_tokens :  38628
107.14139437675476


In [13]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['es']+langs['pt']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-es-pt-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-es-pt-cased  -   num_parameters :  114788100
new-models/bert-base-en-es-pt-cased  -   num_tokens :  37380
110.02508020401001


In [14]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['de']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-de-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-de-cased  -   num_parameters :  116043877
new-models/bert-base-en-fr-de-cased  -   num_tokens :  39013
110.83046197891235


In [15]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['it']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-it-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-it-cased  -   num_parameters :  114829626
new-models/bert-base-en-fr-it-cased  -   num_tokens :  37434
102.47859787940979


In [16]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['zh']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-zh-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-zh-cased  -   num_parameters :  116735208
new-models/bert-base-en-fr-zh-cased  -   num_tokens :  39912
102.47170114517212


In [17]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['es']+langs['zh']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-es-zh-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-es-zh-cased  -   num_parameters :  118330114
new-models/bert-base-en-es-zh-cased  -   num_tokens :  41986
121.32015228271484


In [18]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+langs['ar']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-ar-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-ar-cased  -   num_parameters :  114258259
new-models/bert-base-en-fr-ar-cased  -   num_tokens :  36691
118.46728754043579


In [19]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['el']+langs['ru']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-el-ru-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-el-ru-cased  -   num_parameters :  116167686
new-models/bert-base-en-el-ru-cased  -   num_tokens :  39174
111.97321152687073


In [20]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['zh']+langs['hi']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-zh-hi-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-zh-hi-cased  -   num_parameters :  114350539
new-models/bert-base-en-zh-hi-cased  -   num_tokens :  36811
84.94034099578857


## Generating bilingual models

In [10]:
for lang in list(langs.keys())[-10:]:
    model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
    t = time.time()
    new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs[lang]+TOKENS_TO_KEEP)), 
                                 'new-models/bert-base-en-'+lang+'-cased')
    del model_cased
    print(time.time()-t)
    print()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-nl-cased  -   num_parameters :  111575987
new-models/bert-base-en-nl-cased  -   num_tokens :  33203
106.0707049369812



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-uk-cased  -   num_parameters :  113307775
new-models/bert-base-en-uk-cased  -   num_tokens :  35455
99.78673195838928



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-ro-cased  -   num_parameters :  110857741
new-models/bert-base-en-ro-cased  -   num_tokens :  32269
92.63383269309998



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-pt-cased  -   num_parameters :  112438805
new-models/bert-base-en-pt-cased  -   num_tokens :  34325
98.68715047836304



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-it-cased  -   num_parameters :  112105059
new-models/bert-base-en-it-cased  -   num_tokens :  33891
96.94106602668762



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-lt-cased  -   num_parameters :  110510153
new-models/bert-base-en-lt-cased  -   num_tokens :  31817
91.33998203277588



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-no-cased  -   num_parameters :  111474479
new-models/bert-base-en-no-cased  -   num_tokens :  33071
94.8031907081604



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-pl-cased  -   num_parameters :  112090448
new-models/bert-base-en-pl-cased  -   num_tokens :  33872
97.298259973526



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-da-cased  -   num_parameters :  111213019
new-models/bert-base-en-da-cased  -   num_tokens :  32731
94.70722985267639



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-ja-cased  -   num_parameters :  111672112
new-models/bert-base-en-ja-cased  -   num_tokens :  33328
105.59650444984436



## Generating monolingual models

In [11]:
for lang in list(langs.keys())[-10:]:
    model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
    t = time.time()
    new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs[lang]+TOKENS_TO_KEEP)), 
                                 'new-models/bert-base-'+lang+'-cased')
    del model_cased
    print(time.time()-t)
    print()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-nl-cased  -   num_parameters :  104251262
new-models/bert-base-nl-cased  -   num_tokens :  23678
81.82937741279602



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-uk-cased  -   num_parameters :  95125539
new-models/bert-base-uk-cased  -   num_tokens :  11811
36.54637026786804



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-ro-cased  -   num_parameters :  102620982
new-models/bert-base-ro-cased  -   num_tokens :  21558
72.35977983474731



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-pt-cased  -   num_parameters :  105267880
new-models/bert-base-pt-cased  -   num_tokens :  25000
76.86364984512329



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-it-cased  -   num_parameters :  105649304
new-models/bert-base-it-cased  -   num_tokens :  25496
81.77199244499207



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-lt-cased  -   num_parameters :  98382254
new-models/bert-base-lt-cased  -   num_tokens :  16046
48.86039853096008



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-no-cased  -   num_parameters :  104036711
new-models/bert-base-no-cased  -   num_tokens :  23399
80.93481659889221



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-pl-cased  -   num_parameters :  103266173
new-models/bert-base-pl-cased  -   num_tokens :  22397
83.10979652404785



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-da-cased  -   num_parameters :  103845230
new-models/bert-base-da-cased  -   num_tokens :  23150
82.06179618835449



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-ja-cased  -   num_parameters :  93342228
new-models/bert-base-ja-cased  -   num_tokens :  9492
32.165956258773804



# Compare original and new models

In [10]:
# original model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model.num_parameters()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


177974523

In [11]:
# new model
tokenizer_cust = BertTokenizer.from_pretrained('new-models/bert-base-en-fr-ar-cased')
model_cust = BertForMaskedLM.from_pretrained('new-models/bert-base-en-fr-ar-cased')
model_cust.num_parameters()

114258259

In [12]:
len(tokenizer_cust.get_vocab())

36691

In [13]:
model_cust.get_input_embeddings()

Embedding(36691, 768, padding_idx=0)

In [14]:
text = "I love NLP"
encoded_input = tokenizer(text, return_tensors='pt')
output_original = model(**encoded_input)
encoded_input_cust = tokenizer_cust(text, return_tensors='pt')
output_cust = model_cust(**encoded_input_cust)

In [15]:
encoded_input

{'input_ids': tensor([[  101,   146, 16138, 81130, 11127,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [16]:
encoded_input_cust

{'input_ids': tensor([[   11,    54,  3953, 28486,  1043,    12]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [17]:
print(len(output_original[0][0]), len(output_cust[0][0]))

6 6


In [18]:
output_original[0][0]

tensor([[-8.5026, -8.4598, -8.5441,  ..., -8.4676, -8.3309, -8.4011],
        [-7.3152, -7.4170, -7.2602,  ..., -6.7312, -7.1424, -7.0654],
        [-8.5618, -9.1271, -7.8516,  ..., -8.3914, -7.0207, -8.6194],
        [-6.9560, -6.5613, -6.1853,  ..., -6.7822, -6.2483, -6.3508],
        [-8.5762, -7.8520, -6.7847,  ..., -8.3420, -6.3392, -7.7183],
        [-7.3157, -7.2129, -6.7588,  ..., -6.8620, -6.5756, -8.0586]],
       grad_fn=<SelectBackward>)

In [19]:
output_cust[0][0]

tensor([[-8.5026, -8.4598, -8.5441,  ..., -4.2134, -2.0971, -1.1937],
        [-7.3152, -7.4170, -7.2602,  ..., -5.6871, -5.9795, -3.7613],
        [-8.5618, -9.1271, -7.8516,  ..., -6.0970, -6.2893,  0.0856],
        [-6.9560, -6.5613, -6.1853,  ..., -4.5743, -2.0049, -0.7851],
        [-8.5762, -7.8520, -6.7847,  ..., -3.6335, -5.0034, -0.6123],
        [-7.3157, -7.2129, -6.7588,  ..., -5.2001, -5.3873,  1.1282]],
       grad_fn=<SelectBackward>)

In [26]:
i = 0
for input_id in encoded_input['input_ids'][0]:
    print(tokenizer.convert_ids_to_tokens(int(input_id)))
    print(output_original[0][0][i].detach().numpy()[:6])
    print(output_cust[0][0][i].detach().numpy()[:6])
    print()
    i+=1

[CLS]
[-8.502596 -8.459798 -8.544106 -8.420239 -8.55526  -8.383109]
[-8.502596 -8.459798 -8.544106 -8.420239 -8.55526  -8.383109]

I
[-7.3151646 -7.416972  -7.260161  -7.000843  -7.0258822 -6.568579 ]
[-7.3151646 -7.416972  -7.260161  -7.000843  -7.0258822 -6.568579 ]

love
[-8.561801  -9.127073  -7.8515744 -8.405504  -8.349455  -8.195387 ]
[-8.561801  -9.127073  -7.8515744 -8.405504  -8.349455  -8.195387 ]

NL
[-6.9560323 -6.5612655 -6.185295  -5.8626823 -6.8318934 -6.2828846]
[-6.9560323 -6.5612655 -6.185295  -5.8626823 -6.8318934 -6.2828846]

##P
[-8.576168  -7.852015  -6.784669  -7.650504  -7.808926  -7.4190974]
[-8.576168  -7.852015  -6.784669  -7.650504  -7.808926  -7.4190974]

[SEP]
[-7.315665  -7.2128882 -6.75876   -6.995212  -7.240693  -7.080781 ]
[-7.315665  -7.2128882 -6.75876   -6.995212  -7.240693  -7.080781 ]



## Tests on MLM

In [27]:
## declare task ##
pipe = pipeline(task="fill-mask", model=model, tokenizer=tokenizer)

## example ##
input_  = 'Paris is the [MASK] of France.'

output_ = pipe(input_)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

capital 0.6365790963172913
city 0.08376165479421616
City 0.034411922097206116
port 0.02745007537305355
centre 0.012592659331858158


In [28]:
## declare task ##
pipe = pipeline(task="fill-mask", model=model_cust, tokenizer=tokenizer_cust)

## example ##
input_  = 'Paris is the [MASK] of France.'

output_ = pipe(input_)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

capital 0.6015416979789734
city 0.07779796421527863
City 0.035266101360321045
port 0.028833329677581787
centre 0.014866690151393414


# Convert all models to TF

In [4]:
from transformers import TFBertForMaskedLM

In [5]:
for model_name in os.listdir('new-models'):
    tf_model = TFBertForMaskedLM.from_pretrained("new-models/"+model_name, from_pt=True)
    tf_model.save_pretrained("new-models/"+model_name)
    del tf_model