In [1]:
import os
import json
import time
from torch import nn
from transformers import pipeline
from transformers import BertTokenizer
from transformers import BertForMaskedLM

First, download and extract the [2018 Wikipedia dumps](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2735) in the data folder for the 15 XNLI languages :

```bash
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/en.txt.gz -P data
gunzip data/en.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/fr.txt.gz -P data
gunzip data/fr.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/es.txt.gz -P data
gunzip data/es.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/de.txt.gz -P data
gunzip data/de.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/zh.txt.gz -P data
gunzip data/zh.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/ar.txt.gz -P data
gunzip data/ar.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/ru.txt.gz -P data
gunzip data/ru.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/vi.txt.gz -P data
gunzip data/vi.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/el.txt.gz -P data
gunzip data/el.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/bg.txt.gz -P data
gunzip data/bg.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/th.txt.gz -P data
gunzip data/th.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/tr.txt.gz -P data
gunzip data/tr.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/hi.txt.gz -P data
gunzip data/hi.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/ur.txt.gz -P data
gunzip data/ur.txt.gz
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2735/sw.txt.gz -P data
gunzip data/sw.txt.gz
```

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [3]:
bert_vocab = list(tokenizer.vocab.keys())
len(bert_vocab)

119547

# Select vocabularies

## English

In [4]:
num_lines = 0
num_long_lines = 0
path_english = 'data/en.txt'

with open(path_english) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

40331979


85988806

In [5]:
path_english = 'data/en.txt'
english_tokens = dict()
english_tokens_unique = dict()

cpt = 0

t0 = time.time()

with open(path_english) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in english_tokens:
                    english_tokens[token] = 1
                else:
                    english_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in english_tokens_unique:
                    english_tokens_unique[token] = 1
                else:
                    english_tokens_unique[token] += 1
len(english_tokens)

1.0 619.2274067401886
2.0 1110.1034936904907
3.0 1541.703180551529
4.0 1903.5885076522827
5.0 2333.342573404312
6.0 2735.3478829860687
7.0 3114.140008687973
8.0 3427.5012040138245
9.0 3733.960935115814
10.0 4035.6765863895416
11.0 4335.431081771851
12.0 4630.971633672714
13.0 4921.836084842682
14.0 5209.920123338699
15.0 5493.038271903992
16.0 5775.249083280563
17.0 6066.010138750076
18.0 6382.9865119457245
19.0 6653.678861618042
20.0 6921.082093477249
21.0 7185.778695106506
22.0 7446.724161863327
23.0 7717.169724225998
24.0 7989.366755723953
25.0 8292.291065216064
26.0 10190.70014500618
27.0 10436.101276874542
28.0 10673.275538444519
29.0 10905.899139642715
30.0 11140.80565571785
31.0 11374.124992609024
32.0 11606.152292251587
33.0 11839.679318904877
34.0 12076.439000844955
35.0 12306.209696769714
36.0 12536.870078802109
37.0 12760.052688360214
38.0 12981.1307888031
39.0 13195.619985103607
40.0 13406.025039196014
41.0 13623.775834798813
42.0 13849.890589237213
43.0 14055.302285909653


94768

In [23]:
with open('tokens_freqs/english_freqs_lines.json', 'w') as outfile:
    json.dump(english_tokens_unique, outfile)

In [137]:
seuil_en = int(num_long_lines*0.005/100)
seuil_en

2016

In [139]:
print(len(english_tokens_unique))

selected_english_tokens = []

for tok in english_tokens_unique:
    if english_tokens_unique[tok] >= seuil_en:
        selected_english_tokens.append(tok)
len(selected_english_tokens)

94768


28458

## French

In [24]:
num_lines = 0
num_long_lines = 0
path_french = 'data/fr.txt'

with open(path_french) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

14157922


30963413

In [25]:
path_french = 'data/fr.txt'
french_tokens = dict()
french_tokens_unique = dict()
cpt = 0

with open(path_french) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in french_tokens:
                    french_tokens[token] = 1
                else:
                    french_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in french_tokens_unique:
                    french_tokens_unique[token] = 1
                else:
                    french_tokens_unique[token] += 1
len(french_tokens)

1.0 39810.99858593941
2.0 40137.153487443924
3.0 40431.46473360062
4.0 40708.45555782318
5.0 40973.78323554993
6.0 41215.0137860775
7.0 41478.401816129684
8.0 41750.72194433212
9.0 42008.57228732109
10.0 42283.9727435112
11.0 42519.331260204315
12.0 42731.96723771095
13.0 42995.029457330704
14.0 43226.48523283005
15.0 43466.27327799797
16.0 43696.58191180229
17.0 43921.311435222626
18.0 44130.201075553894
19.0 44332.168563365936
20.0 44539.21525597572
21.0 44742.61037325859
22.0 44933.7055413723
23.0 45125.2714009285
24.0 45318.316663980484
25.0 45513.25025177002
26.0 45709.20458507538
27.0 45917.36214637756
28.0 46108.93855166435
29.0 46304.16796207428
30.0 46499.237392902374


79453

In [26]:
with open('tokens_freqs/french_freqs_lines.json', 'w') as outfile:
    json.dump(french_tokens_unique, outfile)

In [140]:
seuil_fr = int(num_long_lines*0.005/100)
seuil_fr

707

In [141]:
print(len(french_tokens_unique))

selected_french_tokens = []

for tok in french_tokens_unique:
    if french_tokens_unique[tok] >= seuil_fr:
        selected_french_tokens.append(tok)
len(selected_french_tokens)

79453


24482

## Spanish

In [27]:
num_lines = 0
num_long_lines = 0
path_spanish = 'data/es.txt'

with open(path_spanish) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

10366564


21981550

In [28]:
path_spanish = 'data/es.txt'
spanish_tokens = dict()
spanish_tokens_unique = dict()
cpt = 0

with open(path_spanish) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in spanish_tokens:
                    spanish_tokens[token] = 1
                else:
                    spanish_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in spanish_tokens_unique:
                    spanish_tokens_unique[token] = 1
                else:
                    spanish_tokens_unique[token] += 1
len(spanish_tokens)

1.0 47031.93489098549
2.0 47351.575922489166
3.0 47646.15384101868
4.0 47929.57264137268
5.0 48202.53243994713
6.0 48464.20136833191
7.0 48718.98474192619
8.0 48971.43407225609
9.0 49217.07389807701
10.0 49461.11621928215
11.0 49698.869629621506
12.0 49923.902220487595
13.0 50120.99433493614
14.0 50328.89301943779
15.0 50556.55177426338
16.0 50828.06156396866
17.0 51066.96748971939
18.0 51323.644045352936
19.0 51575.98098778725
20.0 51824.440475702286
21.0 52069.99340748787


80150

In [29]:
with open('tokens_freqs/spanish_freqs_lines.json', 'w') as outfile:
    json.dump(spanish_tokens_unique, outfile)

In [30]:
len(spanish_tokens)

80150

In [144]:
seuil_es = int(num_long_lines*0.005/100)
seuil_es

518

In [145]:
print(len(spanish_tokens_unique))

selected_spanish_tokens = []

for tok in spanish_tokens_unique:
    if spanish_tokens_unique[tok] >= seuil_es:
        selected_spanish_tokens.append(tok)
len(selected_spanish_tokens)

80150


26346

## German

In [33]:
num_lines = 0
num_long_lines = 0
path_german = 'data/de.txt'

with open(path_german) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

16089720


34715825

In [35]:
path_german = 'data/de.txt'
german_tokens = dict()
german_tokens_unique = dict()
cpt = 0

with open(path_german) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in german_tokens:
                    german_tokens[token] = 1
                else:
                    german_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in german_tokens_unique:
                    german_tokens_unique[token] = 1
                else:
                    german_tokens_unique[token] += 1
len(german_tokens)

1.0 54116.66038155556
2.0 54577.04998540878
3.0 54942.04729270935
4.0 55246.97596168518
5.0 55532.67652797699
6.0 55822.38264584541
7.0 56106.992736816406
8.0 56388.41462779045
9.0 56674.296174287796
10.0 56943.6097369194
11.0 57210.43217921257
12.0 57457.73711562157
13.0 57720.30193734169
14.0 57982.308339595795
15.0 58241.14898991585
16.0 58501.795637369156
17.0 58760.0890789032
18.0 59018.77443575859
19.0 59278.861579179764
20.0 59535.25803351402
21.0 59790.468202352524
22.0 60043.414365291595
23.0 60314.525517463684
24.0 60575.48369884491
25.0 60859.920449495316
26.0 61181.619931936264
27.0 61499.977361917496
28.0 61816.68152093887
29.0 62186.0216858387
30.0 62483.2623193264
31.0 62740.58487343788
32.0 62993.88537359238
33.0 63248.73959302902
34.0 63495.93461251259


78478

In [36]:
with open('tokens_freqs/german_freqs_lines.json', 'w') as outfile:
    json.dump(german_tokens_unique, outfile)

In [146]:
seuil_de = int(num_long_lines*0.005/100)
seuil_de

804

In [147]:
print(len(german_tokens_unique))

selected_german_tokens = []

for tok in german_tokens_unique:
    if german_tokens_unique[tok] >= seuil_de:
        selected_german_tokens.append(tok)
len(selected_german_tokens)

78478


26031

## Chinese

In [38]:
num_lines = 0
num_long_lines = 0
path_chinese = 'data/zh.txt'

with open(path_chinese) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

4487824


10878878

In [39]:
chinese_tokens = dict()
chinese_tokens_unique = dict()
cpt = 0

with open(path_chinese) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in chinese_tokens:
                    chinese_tokens[token] = 1
                else:
                    chinese_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in chinese_tokens_unique:
                    chinese_tokens_unique[token] = 1
                else:
                    chinese_tokens_unique[token] += 1
len(chinese_tokens)

1.0 63904.307149887085
2.0 64103.00419545174
3.0 64302.74151086807
4.0 64489.303250312805
5.0 64671.73628425598
6.0 64847.511660814285
7.0 64970.82591891289
8.0 65122.165700912476
9.0 65302.25393438339
10.0 65474.06551837921


65060

In [40]:
len(chinese_tokens)

65060

In [41]:
with open('tokens_freqs/chinese_freqs_lines.json', 'w') as outfile:
    json.dump(chinese_tokens_unique, outfile)

In [149]:
seuil_zh = int(num_long_lines*0.005/100)
seuil_zh

224

In [150]:
print(len(chinese_tokens_unique))

selected_chinese_tokens = []

for tok in chinese_tokens_unique:
    if chinese_tokens_unique[tok] >= seuil_zh:
        selected_chinese_tokens.append(tok)
len(selected_chinese_tokens)

65060


12928

## Arabic

In [42]:
num_lines = 0
num_long_lines = 0
path_arabic = 'data/ar.txt'

with open(path_arabic) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

2853863


5715225

In [43]:
arabic_tokens = dict()
arabic_tokens_unique = dict()
cpt = 0

with open(path_arabic) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in arabic_tokens:
                    arabic_tokens[token] = 1
                else:
                    arabic_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in arabic_tokens_unique:
                    arabic_tokens_unique[token] = 1
                else:
                    arabic_tokens_unique[token] += 1
len(arabic_tokens)

1.0 65985.41222953796
2.0 66255.21002721786
3.0 66543.65768647194
4.0 66747.40564274788
5.0 66937.59504151344


58052

In [44]:
with open('tokens_freqs/arabic_freqs.json', 'w') as outfile:
    json.dump(arabic_tokens, outfile)

In [45]:
with open('tokens_freqs/arabic_freqs_lines.json', 'w') as outfile:
    json.dump(arabic_tokens_unique, outfile)

In [151]:
seuil_ar = int(num_long_lines*0.005/100)
seuil_ar

142

In [152]:
print(len(arabic_tokens_unique))

selected_arabic_tokens = []

for tok in arabic_tokens_unique:
    if arabic_tokens_unique[tok] >= seuil_ar:
        selected_arabic_tokens.append(tok)
len(selected_arabic_tokens)

58052


7292

## Russe

In [46]:
num_lines = 0
num_long_lines = 0
path_russe = 'data/ru.txt'

with open(path_russe) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

11510609


24412492

In [47]:
russe_tokens = dict()
russe_tokens_unique = dict()
cpt = 0

with open(path_russe) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in russe_tokens:
                    russe_tokens[token] = 1
                else:
                    russe_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in russe_tokens_unique:
                    russe_tokens_unique[token] = 1
                else:
                    russe_tokens_unique[token] += 1
len(russe_tokens)

1.0 67441.4977927208
2.0 67781.51218128204
3.0 68125.64979195595
4.0 68482.1391248703
5.0 68855.19577002525
6.0 69165.49130535126
7.0 69496.68894863129
8.0 69766.70977449417
9.0 70017.82785439491
10.0 70265.03732800484
11.0 70504.1641740799
12.0 70732.79295420647
13.0 70970.14817857742
14.0 71204.29126548767
15.0 71450.25069975853
16.0 71685.4098341465
17.0 71919.06532096863
18.0 72156.71354842186
19.0 72406.77476096153
20.0 72658.3042345047
21.0 72908.02834582329
22.0 73165.59919810295
23.0 73424.83898615837
24.0 73684.35419297218


79241

In [48]:
with open('tokens_freqs/russe_freqs.json', 'w') as outfile:
    json.dump(russe_tokens, outfile)

In [49]:
with open('tokens_freqs/russe_freqs_lines.json', 'w') as outfile:
    json.dump(russe_tokens_unique, outfile)

In [160]:
seuil_ru = int(num_long_lines*0.005/100)
seuil_ru

575

In [161]:
print(len(russe_tokens_unique))

selected_russe_tokens = []

for tok in russe_tokens_unique:
    if russe_tokens_unique[tok] >= seuil_ru:
        selected_russe_tokens.append(tok)
len(selected_russe_tokens)

79241


14270

## Vietnamese

In [99]:
num_lines = 0
num_long_lines = 0
path_vi = 'data/vi.txt'

with open(path_vi) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

3538941


7520180

In [100]:
vi_tokens = dict()
vi_tokens_unique = dict()
cpt = 0

with open(path_vi) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in vi_tokens:
                    vi_tokens[token] = 1
                else:
                    vi_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in vi_tokens_unique:
                    vi_tokens_unique[token] = 1
                else:
                    vi_tokens_unique[token] += 1
len(vi_tokens)

1.0 98122.46129226685
2.0 98327.61943459511
3.0 98463.24028873444
4.0 98585.39857244492
5.0 98707.9568901062
6.0 98833.27023386955
7.0 98938.17175722122


65103

In [101]:
with open('tokens_freqs/vi_freqs.json', 'w') as outfile:
    json.dump(vi_tokens, outfile)

In [102]:
with open('tokens_freqs/vi_freqs_lines.json', 'w') as outfile:
    json.dump(vi_tokens_unique, outfile)

In [162]:
seuil_vi = int(num_long_lines*0.005/100)
seuil_vi

176

In [163]:
print(len(vi_tokens_unique))

selected_vi_tokens = []

for tok in vi_tokens_unique:
    if vi_tokens_unique[tok] >= seuil_vi:
        selected_vi_tokens.append(tok)
len(selected_vi_tokens)

65103


17512

## Greek

In [103]:
num_lines = 0
num_long_lines = 0
path_el = 'data/el.txt'

with open(path_el) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

1097507


2278048

In [104]:
el_tokens = dict()
el_tokens_unique = dict()
cpt = 0

with open(path_el) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in el_tokens:
                    el_tokens[token] = 1
                else:
                    el_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in el_tokens_unique:
                    el_tokens_unique[token] = 1
                else:
                    el_tokens_unique[token] += 1
len(el_tokens)

1.0 99542.60071659088
2.0 100057.30239129066


54387

In [105]:
with open('tokens_freqs/el_freqs.json', 'w') as outfile:
    json.dump(el_tokens, outfile)

In [106]:
with open('tokens_freqs/el_freqs_lines.json', 'w') as outfile:
    json.dump(el_tokens_unique, outfile)

In [166]:
seuil_el = int(num_long_lines*0.005/100)
seuil_el

54

In [167]:
print(len(el_tokens_unique))

selected_el_tokens = []

for tok in el_tokens_unique:
    if el_tokens_unique[tok] >= seuil_el:
        selected_el_tokens.append(tok)
len(selected_el_tokens)

54387


11616

## Bulgarian

In [170]:
num_lines = 0
num_long_lines = 0
path_bg = 'data/bg.txt'

with open(path_bg) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

1394130


2976901

In [171]:
bg_tokens = dict()
bg_tokens_unique = dict()
cpt = 0

with open(path_bg) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in bg_tokens:
                    bg_tokens[token] = 1
                else:
                    bg_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in bg_tokens_unique:
                    bg_tokens_unique[token] = 1
                else:
                    bg_tokens_unique[token] += 1
len(bg_tokens)

1.0 140929.85722017288
2.0 141206.18313527107


54431

In [172]:
with open('tokens_freqs/bg_freqs.json', 'w') as outfile:
    json.dump(bg_tokens, outfile)

In [173]:
with open('tokens_freqs/bg_freqs_lines.json', 'w') as outfile:
    json.dump(bg_tokens_unique, outfile)

In [174]:
seuil_bg = int(num_long_lines*0.005/100)
seuil_bg

69

In [175]:
print(len(bg_tokens_unique))

selected_bg_tokens = []

for tok in bg_tokens_unique:
    if bg_tokens_unique[tok] >= seuil_bg:
        selected_bg_tokens.append(tok)
len(selected_bg_tokens)

54431


12121

## Thai

In [115]:
num_lines = 0
num_long_lines = 0
path_th = 'data/th.txt'

with open(path_th) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

745765


1594166

In [116]:
th_tokens = dict()
th_tokens_unique = dict()
cpt = 0

with open(path_th) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in th_tokens:
                    th_tokens[token] = 1
                else:
                    th_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in th_tokens_unique:
                    th_tokens_unique[token] = 1
                else:
                    th_tokens_unique[token] += 1
len(th_tokens)

1.0 105579.57854032516


45346

In [117]:
with open('tokens_freqs/th_freqs.json', 'w') as outfile:
    json.dump(th_tokens, outfile)

In [118]:
with open('tokens_freqs/th_freqs_lines.json', 'w') as outfile:
    json.dump(th_tokens_unique, outfile)

In [176]:
seuil_th = int(num_long_lines*0.005/100)

In [177]:
print(len(th_tokens_unique))

selected_th_tokens = []

for tok in th_tokens_unique:
    if th_tokens_unique[tok] >= seuil_th:
        selected_th_tokens.append(tok)
len(selected_th_tokens)

45346


8493

## Turkish

In [119]:
num_lines = 0
num_long_lines = 0
path_tr = 'data/tr.txt'

with open(path_tr) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

1708074


3576847

In [120]:
tr_tokens = dict()
tr_tokens_unique = dict()
cpt = 0

with open(path_tr) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in tr_tokens:
                    tr_tokens[token] = 1
                else:
                    tr_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in tr_tokens_unique:
                    tr_tokens_unique[token] = 1
                else:
                    tr_tokens_unique[token] += 1
len(th_tokens)

1.0 106623.1616370678
2.0 106868.95719408989
3.0 107085.14948058128


45346

In [121]:
with open('tokens_freqs/tr_freqs.json', 'w') as outfile:
    json.dump(tr_tokens, outfile)

In [122]:
with open('tokens_freqs/tr_freqs_lines.json', 'w') as outfile:
    json.dump(tr_tokens_unique, outfile)

In [180]:
seuil_tr = int(num_long_lines*0.005/100)
seuil_tr

85

In [181]:
print(len(tr_tokens_unique))

selected_tr_tokens = []

for tok in tr_tokens_unique:
    if tr_tokens_unique[tok] >= seuil_tr:
        selected_tr_tokens.append(tok)
len(selected_tr_tokens)

60808


19086

## Hindi

In [123]:
num_lines = 0
num_long_lines = 0
path_hi = 'data/hi.txt'

with open(path_hi) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

586128


1192644

In [124]:
hi_tokens = dict()
hi_tokens_unique = dict()
cpt = 0

with open(path_hi) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in hi_tokens:
                    hi_tokens[token] = 1
                else:
                    hi_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in hi_tokens_unique:
                    hi_tokens_unique[token] = 1
                else:
                    hi_tokens_unique[token] += 1
len(th_tokens)

1.0 107493.82563185692


45346

In [125]:
with open('tokens_freqs/hi_freqs.json', 'w') as outfile:
    json.dump(hi_tokens, outfile)

In [126]:
with open('tokens_freqs/hi_freqs_lines.json', 'w') as outfile:
    json.dump(hi_tokens_unique, outfile)

In [182]:
seuil_hi = int(num_long_lines*0.005/100)
seuil_hi

29

In [183]:
print(len(hi_tokens_unique))

selected_hi_tokens = []

for tok in hi_tokens_unique:
    if hi_tokens_unique[tok] >= seuil_hi:
        selected_hi_tokens.append(tok)
len(selected_hi_tokens)

37780


5664

## Urdu

In [127]:
num_lines = 0
num_long_lines = 0
path_ur = 'data/ur.txt'

with open(path_ur) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

434866


897328

In [128]:
ur_tokens = dict()
ur_tokens_unique = dict()
cpt = 0

with open(path_ur) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in ur_tokens:
                    ur_tokens[token] = 1
                else:
                    ur_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in ur_tokens_unique:
                    ur_tokens_unique[token] = 1
                else:
                    ur_tokens_unique[token] += 1
len(ur_tokens)

35702

In [129]:
with open('tokens_freqs/ur_freqs.json', 'w') as outfile:
    json.dump(ur_tokens, outfile)

In [130]:
with open('tokens_freqs/ur_freqs_lines.json', 'w') as outfile:
    json.dump(ur_tokens_unique, outfile)

In [184]:
seuil_ur = int(num_long_lines*0.005/100)
seuil_ur

21

In [185]:
print(len(ur_tokens_unique))

selected_ur_tokens = []

for tok in ur_tokens_unique:
    if ur_tokens_unique[tok] >= seuil_ur:
        selected_ur_tokens.append(tok)
len(selected_ur_tokens)

35702


8656

## Swahili

In [131]:
num_lines = 0
num_long_lines = 0
path_sw = 'data/sw.txt'

with open(path_sw) as infile:
    for line in infile:
        num_lines += 1
        if len(line)>5:
            num_long_lines += 1
print(num_long_lines)
num_lines

159632


350193

In [132]:
sw_tokens = dict()
sw_tokens_unique = dict()
cpt = 0

with open(path_sw) as infile:
    for line in infile:
        cpt += 1
        if cpt%1000000==0:
            print(cpt/1000000, time.time()-t0)
        if len(line)>5:
            tokens = tokenizer.tokenize(line)
            for token in tokens:
                if token not in sw_tokens:
                    sw_tokens[token] = 1
                else:
                    sw_tokens[token] += 1
            for token in list(set(tokens)):
                if token not in sw_tokens_unique:
                    sw_tokens_unique[token] = 1
                else:
                    sw_tokens_unique[token] += 1
len(sw_tokens)

34536

In [133]:
with open('tokens_freqs/sw_freqs.json', 'w') as outfile:
    json.dump(sw_tokens, outfile)

In [134]:
with open('tokens_freqs/sw_freqs_lines.json', 'w') as outfile:
    json.dump(sw_tokens_unique, outfile)

In [186]:
seuil_sw = int(num_long_lines*0.005/100)
seuil_sw

7

In [187]:
print(len(sw_tokens_unique))

selected_sw_tokens = []

for tok in sw_tokens_unique:
    if sw_tokens_unique[tok] >= seuil_sw:
        selected_sw_tokens.append(tok)
len(selected_sw_tokens)

34536


16619

## Save all vocabs

In [238]:
langs = {'en': selected_english_tokens, 'fr': selected_french_tokens, 'es': selected_spanish_tokens,
         'de': selected_german_tokens, 'zh': selected_chinese_tokens, 'ar': selected_arabic_tokens,
         'ru': selected_russe_tokens, 'vi': selected_vi_tokens, 'el': selected_el_tokens,
         'bg': selected_bg_tokens, 'th': selected_th_tokens, 'tr': selected_tr_tokens,
         'hi': selected_hi_tokens, 'ur': selected_ur_tokens, 'sw': selected_sw_tokens}
len(langs)

15

In [241]:
for lang in langs.keys():
    with open('selected_tokens/selected_'+lang+'_tokens.txt', 'w') as output:
        for tok in langs[lang]:
            output.write(tok+'\n')

## Load all vocabs

In [4]:
langs = dict()

for l in ['en', 'fr', 'es', 'de', 'zh', 'ar', 'ru', 'vi', 'el', 'bg', 'th', 'tr', 'hi', 'ur', 'sw']:
    with open('selected_tokens/selected_'+l+'_tokens.txt') as file:
        langs[l] = file.read().splitlines()
len(langs)

15

## Choosing vocabulary

In [5]:
all_selected_tokens = []
for k in langs.keys():
    all_selected_tokens.extend(langs[k])
selected_tokens = list(set(all_selected_tokens))
len(selected_tokens)

71564

## Resize token embeddings

In [6]:
TOKENS_TO_KEEP = ['[PAD]','[UNK]','[CLS]','[SEP]','[MASK]','[unused1]','[unused2]','[unused3]',
                  '[unused4]','[unused5]', '[unused6]','[unused7]','[unused8]','[unused9]']

for tok in TOKENS_TO_KEEP:
    if tok not in selected_tokens:
        selected_tokens.append(tok)

len(selected_tokens)

71577

In [7]:
def select_embeddings(model, old_vocab, new_vocab, model_name='new_model'):
    
    # Get old embeddings from model
    old_embeddings = model.get_input_embeddings()
    old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
    
    if old_num_tokens != len(old_vocab):
        print('len(old_vocab) != len(model.old_embeddings)')
        return old_embeddings
    
    new_num_tokens = len(new_vocab)
    if new_vocab is None:
        print('nothing to copy')
        return old_embeddings
    
    # Build new embeddings
    new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
    new_embeddings.to(old_embeddings.weight.device)
    
    # Copy weights
    i = 0
    j = 0
    vocab = []
    for token in old_vocab:
        if token in new_vocab:
            vocab.append(token)
            new_embeddings.weight.data[i, :] = old_embeddings.weight.data[j, :]
            i += 1
        j += 1
    
    model.set_input_embeddings(new_embeddings)
    
    # Update base model and current model config
    model.config.vocab_size = new_num_tokens
    model.vocab_size = new_num_tokens

    # Tie weights
    model.tie_weights()
    
    # Save new model
    model.save_pretrained(model_name)
    print(model_name, " - ", " num_parameters : ", model.num_parameters())
    print(model_name, " - ", " num_tokens : ", len(vocab))
    
    # Save vocab
    fw = open(os.path.join(model_name, 'vocab.txt'), 'w')
    for token in vocab:
        fw.write(token+'\n')
    fw.close()
    
    # Save tokenizer config
    fw = open(os.path.join(model_name, 'tokenizer_config.json'), 'w')
    json.dump({"do_lower_case": False, "model_max_length": 512}, fw)
    fw.close()
    
    return new_embeddings

In [8]:
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model_cased.num_parameters()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


177974523

# Generating models

## Generating 15langs model

In [9]:
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, selected_tokens, 'new-models/bert-base-15lang-cased')
print(time.time()-t)
new_embs

new-models/bert-base-15lang-cased  -   num_parameters :  141085593
new-models/bert-base-15lang-cased  -   num_tokens :  71577
307.41669940948486


Embedding(71577, 768)

## Generating monolingual models

In [10]:
for lang in list(langs.keys()):
    del model_cased
    model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
    t = time.time()
    new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs[lang]+TOKENS_TO_KEEP)), 
                                 'new-models/bert-base-'+lang+'-cased')
    print(time.time()-t)
    print()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-cased  -   num_parameters :  107937079
new-models/bert-base-en-cased  -   num_tokens :  28471
73.92174339294434



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-fr-cased  -   num_parameters :  104879535
new-models/bert-base-fr-cased  -   num_tokens :  24495
65.44030809402466



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-es-cased  -   num_parameters :  106312951
new-models/bert-base-es-cased  -   num_tokens :  26359
69.25197982788086



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-de-cased  -   num_parameters :  106070716
new-models/bert-base-de-cased  -   num_tokens :  26044
70.63011884689331



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-zh-cased  -   num_parameters :  95994509
new-models/bert-base-zh-cased  -   num_tokens :  12941
32.777645111083984



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-ar-cased  -   num_parameters :  91660425
new-models/bert-base-ar-cased  -   num_tokens :  7305
19.091326236724854



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-ru-cased  -   num_parameters :  97026507
new-models/bert-base-ru-cased  -   num_tokens :  14283
41.2095103263855



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-vi-cased  -   num_parameters :  99519605
new-models/bert-base-vi-cased  -   num_tokens :  17525
46.40220260620117



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-el-cased  -   num_parameters :  94985581
new-models/bert-base-el-cased  -   num_tokens :  11629
27.513412714004517



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-bg-cased  -   num_parameters :  95373926
new-models/bert-base-bg-cased  -   num_tokens :  12134
36.462172985076904



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-th-cased  -   num_parameters :  92583994
new-models/bert-base-th-cased  -   num_tokens :  8506
23.689405918121338



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-tr-cased  -   num_parameters :  100730011
new-models/bert-base-tr-cased  -   num_tokens :  19099
48.08704328536987



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-hi-cased  -   num_parameters :  90408493
new-models/bert-base-hi-cased  -   num_tokens :  5677
17.46791982650757



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-ur-cased  -   num_parameters :  92709341
new-models/bert-base-ur-cased  -   num_tokens :  8669
23.566728591918945



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-sw-cased  -   num_parameters :  98832888
new-models/bert-base-sw-cased  -   num_tokens :  16632
44.70777344703674



## Generating bilingual models

In [11]:
len(bert_vocab)

119547

### en-fr

In [12]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['fr']+TOKENS_TO_KEEP)),
                             'new-models/bert-base-en-fr-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


new-models/bert-base-en-fr-cased  -   num_parameters :  111732863
new-models/bert-base-en-fr-cased  -   num_tokens :  33407
65.13403367996216


In [14]:
tokenizer_cust = BertTokenizer.from_pretrained('new-models/bert-base-en-fr-cased')

In [15]:
print(tokenizer_cust.unk_token)
tokenizer_cust.unk_token_id

[UNK]


10

In [16]:
print(tokenizer_cust.sep_token)
tokenizer_cust.sep_token_id

[SEP]


12

In [17]:
print(tokenizer_cust.mask_token)
tokenizer_cust.mask_token_id

[MASK]


13

### en-es

In [18]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['es']+TOKENS_TO_KEEP)), 
                             'bert-base-en-es-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-es-cased  -   num_parameters :  113245486
bert-base-en-es-cased  -   num_tokens :  35374
70.03286385536194


### en-de

In [19]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['de']+TOKENS_TO_KEEP)), 
                             'bert-base-en-de-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-de-cased  -   num_parameters :  113145516
bert-base-en-de-cased  -   num_tokens :  35244
64.38058757781982


### en-el

In [20]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['el']+TOKENS_TO_KEEP)), 
                             'bert-base-en-el-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-el-cased  -   num_parameters :  109559669
bert-base-en-el-cased  -   num_tokens :  30581
61.141303062438965


### en-bg

In [21]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['bg']+TOKENS_TO_KEEP)), 
                             'bert-base-en-bg-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-bg-cased  -   num_parameters :  112664891
bert-base-en-bg-cased  -   num_tokens :  34619
55.813469648361206


### en-ru

In [22]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['ru']+TOKENS_TO_KEEP)), 
                             'bert-base-en-ru-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-ru-cased  -   num_parameters :  114853465
bert-base-en-ru-cased  -   num_tokens :  37465
51.599135398864746


### en-tr

In [23]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['tr']+TOKENS_TO_KEEP)), 
                             'bert-base-en-tr-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-tr-cased  -   num_parameters :  110167948
bert-base-en-tr-cased  -   num_tokens :  31372
43.49697542190552


### en-ar

In [24]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['ar']+TOKENS_TO_KEEP)), 
                             'bert-base-en-ar-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-ar-cased  -   num_parameters :  110630117
bert-base-en-ar-cased  -   num_tokens :  31973
47.8929979801178


### en-vi

In [25]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['vi']+TOKENS_TO_KEEP)), 
                             'bert-base-en-vi-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-vi-cased  -   num_parameters :  110210243
bert-base-en-vi-cased  -   num_tokens :  31427
43.392895460128784


### en-th

In [26]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['th']+TOKENS_TO_KEEP)), 
                             'bert-base-en-th-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-th-cased  -   num_parameters :  108459999
bert-base-en-th-cased  -   num_tokens :  29151
40.50568199157715


### en-zh

In [27]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['zh']+TOKENS_TO_KEEP)), 
                             'bert-base-en-zh-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-zh-cased  -   num_parameters :  113130905
bert-base-en-zh-cased  -   num_tokens :  35225
55.543163537979126


### en-hi

In [28]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['hi']+TOKENS_TO_KEEP)), 
                             'bert-base-en-hi-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-hi-cased  -   num_parameters :  109165172
bert-base-en-hi-cased  -   num_tokens :  30068
59.588555097579956


### en-sw

In [29]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['sw']+TOKENS_TO_KEEP)), 
                             'bert-base-en-sw-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-sw-cased  -   num_parameters :  109641952
bert-base-en-sw-cased  -   num_tokens :  30688
50.077721118927


### en-ur

In [30]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
t = time.time()
new_embs = select_embeddings(model_cased, bert_vocab, list(set(langs['en']+langs['ur']+TOKENS_TO_KEEP)), 
                             'bert-base-en-ur-cased')
print(time.time()-t)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-en-ur-cased  -   num_parameters :  110167179
bert-base-en-ur-cased  -   num_tokens :  31371
58.7756142616272


# Compare original and new models

In [31]:
del model_cased
model_cased = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# original model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model.num_parameters()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


177974523

In [33]:
# new model
tokenizer_cust = BertTokenizer.from_pretrained('new-models/bert-base-en-cased')
model_cust = BertForMaskedLM.from_pretrained('new-models/bert-base-en-cased')
model_cust.num_parameters()

107937079

In [34]:
len(tokenizer_cust.get_vocab())

28471

In [35]:
model_cust.get_input_embeddings()

Embedding(28471, 768, padding_idx=0)

In [36]:
text = "I love NLP"
encoded_input = tokenizer(text, return_tensors='pt')
output_original = model(**encoded_input)
encoded_input_cust = tokenizer_cust(text, return_tensors='pt')
output_cust = model_cust(**encoded_input_cust)

In [37]:
encoded_input

{'input_ids': tensor([[  101,   146, 16138, 81130, 11127,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [38]:
encoded_input_cust

{'input_ids': tensor([[   11,    54,  3477, 23039,   892,    12]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [39]:
print(len(output_original[0][0]), len(output_cust[0][0]))

6 6


In [40]:
output_original[0][0]

tensor([[-8.5026, -8.4598, -8.5441,  ..., -8.4676, -8.3309, -8.4011],
        [-7.3152, -7.4170, -7.2602,  ..., -6.7312, -7.1424, -7.0654],
        [-8.5618, -9.1271, -7.8516,  ..., -8.3914, -7.0207, -8.6194],
        [-6.9560, -6.5613, -6.1853,  ..., -6.7822, -6.2483, -6.3508],
        [-8.5762, -7.8520, -6.7847,  ..., -8.3420, -6.3392, -7.7183],
        [-7.3157, -7.2129, -6.7588,  ..., -6.8620, -6.5756, -8.0586]],
       grad_fn=<SelectBackward>)

In [41]:
output_cust[0][0]

tensor([[-8.5026, -8.4598, -8.5441,  ..., -2.4711, -4.2386, -1.1125],
        [-7.3152, -7.4170, -7.2602,  ..., -2.6856, -5.7124, -3.6802],
        [-8.5618, -9.1271, -7.8516,  ...,  0.5088, -6.1223,  0.1668],
        [-6.9560, -6.5613, -6.1853,  ..., -2.0895, -4.5995, -0.7040],
        [-8.5762, -7.8520, -6.7847,  ..., -1.4740, -3.6588, -0.5311],
        [-7.3157, -7.2129, -6.7588,  ..., -2.2422, -5.2254,  1.2093]],
       grad_fn=<SelectBackward>)

In [42]:
i = 0
for input_id in encoded_input['input_ids'][0]:
    print(tokenizer.convert_ids_to_tokens(int(input_id)))
    print(output_original[0][0][i].detach().numpy()[:5])
    print(output_cust[0][0][i].detach().numpy()[:5])
    print()
    i+=1

[CLS]
[-8.502596 -8.459798 -8.544106 -8.420239 -8.55526 ]
[-8.502596 -8.459798 -8.544106 -8.420239 -8.55526 ]

I
[-7.3151646 -7.416972  -7.260161  -7.000843  -7.0258822]
[-7.3151646 -7.416972  -7.260161  -7.000843  -7.0258822]

love
[-8.561801  -9.127073  -7.8515744 -8.405504  -8.349455 ]
[-8.561801  -9.127073  -7.8515744 -8.405504  -8.349455 ]

NL
[-6.9560323 -6.5612655 -6.185295  -5.8626823 -6.8318934]
[-6.9560323 -6.5612655 -6.185295  -5.8626823 -6.8318934]

##P
[-8.576168 -7.852015 -6.784669 -7.650504 -7.808926]
[-8.576168 -7.852015 -6.784669 -7.650504 -7.808926]

[SEP]
[-7.315665  -7.2128882 -6.75876   -6.995212  -7.240693 ]
[-7.315665  -7.2128882 -6.75876   -6.995212  -7.240693 ]



## Tests on MLM

In [45]:
## declare task ##
pipe = pipeline(task="fill-mask", model=model, tokenizer=tokenizer)

## example ##
input_  = 'Paris is the [MASK] of France.'

output_ = pipe(input_)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

capital 0.6365790963172913
city 0.08376165479421616
City 0.034411922097206116
port 0.02745007537305355
centre 0.012592659331858158


In [46]:
## declare task ##
pipe = pipeline(task="fill-mask", model=model_cust, tokenizer=tokenizer_cust)

## example ##
input_  = 'Paris is the [MASK] of France.'

output_ = pipe(input_)
for i in range(len(output_)):
    print(output_[i]['token_str'], output_[i]['score'])

capital 0.6138291358947754
city 0.07793192565441132
City 0.03626968339085579
port 0.02652185596525669
centre 0.014177550561726093


# Convert all models to TF

In [4]:
from transformers import TFBertForMaskedLM

In [5]:
for model_name in os.listdir('new-models'):
    tf_model = TFBertForMaskedLM.from_pretrained("new-models/"+model_name, from_pt=True)
    tf_model.save_pretrained("new-models/"+model_name)
    del tf_model