In [30]:
import pandas as pd
trans_df = pd.read_csv('./data/eng-kir.csv', sep="\t")
trans_df.columns = ['ru', 'tyv']
print(trans_df.shape) # (50000, 5)
print(trans_df.columns)
trans_df.sample(10)

(29505, 2)
Index(['ru', 'tyv'], dtype='object')


Unnamed: 0,ru,tyv
19157,"among whom was Ahijah, who was wearing an epho...","harimwo na Ahiya mwene Ahitubu, Ahitubu mukuru..."
29218,So Peter and the other disciple started for th...,"Nuko Petero arasohoka, na wa mwigishwa wundi, ..."
23580,"Nevertheless, I have a few things against you:...","Ariko mfise i vyo nkugayako bike, kukw i wanyu..."
1319,"Cross over to the coasts of Cyprus and look, s...","Enda ni mujabuke ku bizinga vy’i Kitimu, murāb..."
14133,"So David did as the Lord commanded him, and he...","Nuko Dawidi abigenza artyo, nk’uk’ Uhoraho yam..."
25261,"He looked around at them all, and then said to...","Abēraguzamw’ amaso bose, aramubarira, ati Ramv..."
17290,“So I will consecrate the tent of meeting and ...,Nzokweza iryo hema ry’ibonaniro bo n’igicaniro...
7628,"But Jacob said to him, “My lord knows that the...","Yakobo aramwishura, ati Databuja, urazi kw aba..."
16248,I will expose your righteousness and your work...,Nzoserura ukugororoka kwawe n’ibikorwa vyawe; ...
27850,"After removing Saul, he made David their king....","Imukūye, ibahagurukiriza Dawidi kuba umwami wa..."


In [31]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(trans_df, test_size=0.2, random_state=42)

In [128]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="run_Latn", tgt_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [127]:
tokenizer.src_lang = "lug_Latn"
# Input text in Luganda
inputs = tokenizer(text="Enjuba eziba esiriira ku luuyi olw'ebugwanjuba.", return_tensors="pt")
translated_tokens = model.generate(
    **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"]
)
print(tokenizer.decode(translated_tokens[0], skip_special_tokens=True))

The sun sets in the west.


In [32]:
import re

def word_tokenize(text):
    """
    Split a text into words, numbers, and punctuation marks
    (for languages where words are separated by spaces)
    """
    return re.findall('(\w+|[^\w\s])', text)

smpl = df_train.sample(10000, random_state=1)
smpl['rus_toks'] = smpl.ru.apply(tokenizer.tokenize)
smpl['tyv_toks'] = smpl.tyv.apply(tokenizer.tokenize)
smpl['rus_words'] = smpl.ru.apply(word_tokenize)
smpl['tyv_words'] = smpl.tyv.apply(word_tokenize)

In [33]:
from tqdm.auto import tqdm, trange
texts_with_unk = [
    text for text in tqdm(trans_df.tyv) 
    if tokenizer.unk_token_id in tokenizer(text).input_ids
]
print(len(texts_with_unk))
# 163
s = random.sample(texts_with_unk, 5)
print(s)

100%|██████████| 29505/29505 [00:02<00:00, 12324.38it/s]

24587
['ngo dushōre ifeza twigurire abarushwa, dushōre n’inkweto twigurire abakene, ngo dushōre n’ibihumbga vy’ ingano?', 'Maze Mose asezera sebukwe, sebukwe na we asubira mu gihugu c’i wabo.', 'Kandi n’iyo novyutsa umutwe, woca umpīga nk’intambge; Maz’ ukongera kunyiyereka ur’ akūmiza.', 'Ni yicare wenyene, acereze, kukw ar’ Uhoraho yayimwambitse.', 'Nukw abakuru bacu n’abo mu gihugu cacu bose baratubarira, bati Ni mukore impamba, mugende kubasanganira, mubabarire, muti N’abasavyi banyu; ngo None ni mubahe isezerano ryo gufatanya na bo.']





In [34]:
import re
import sys
import unicodedata
from sacremoses import MosesPunctNormalizer

mpn = MosesPunctNormalizer(lang="en")
mpn.substitutions = [
    (re.compile(r), sub) for r, sub in mpn.substitutions
]

def get_non_printing_char_replacer(replace_by: str = " "):
    non_printable_map = {
        ord(c): replace_by
        for c in (chr(i) for i in range(sys.maxunicode + 1))
        # same as \p{C} in perl
        # see https://www.unicode.org/reports/tr44/#General_Category_Values
        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    }

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char

replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    clean = mpn.normalize(text)
    clean = replace_nonprint(clean)
    # replace 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 by Francesca
    clean = unicodedata.normalize("NFKC", clean)
    return clean

In [35]:
texts_with_unk_normed = [
    text for text in tqdm(texts_with_unk) 
    if tokenizer.unk_token_id in tokenizer(preproc(text)).input_ids
]
print(len(texts_with_unk_normed))  # 0

100%|██████████| 24587/24587 [00:03<00:00, 6786.24it/s]

0





In [36]:
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup
model.cuda();
optimizer = Adafactor(
    [p for p in model.parameters() if p.requires_grad],
    scale_parameter=False,
    relative_step=False,
    lr=1e-4,
    clip_threshold=1.0,
    weight_decay=1e-3,
)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=1000)

In [37]:
import random
LANGS = [('ru', 'rus_Cyrl'), ('tyv', 'tyv_Cyrl')]

def get_batch_pairs(batch_size, data=df_train):
    (l1, long1), (l2, long2) = random.sample(LANGS, 2)
    xx, yy = [], []
    for _ in range(batch_size):
        item = data.iloc[random.randint(0, len(data)-1)]
        xx.append(preproc(item[l1]))
        yy.append(preproc(item[l2]))
    return xx, yy, long1, long2

print(get_batch_pairs(1))

(["Ni ho wompamagara, nkakwitaba, Cank' unyemerere mvuge, abe ari wewe unyishura."], ['Then summon me and I will answer, or let me speak, and you reply to me.'], 'tyv_Cyrl', 'rus_Cyrl')


In [38]:
import gc
import torch

def cleanup():
    """Try to free GPU memory"""
    gc.collect()
    torch.cuda.empty_cache()

In [40]:
batch_size = 16  # 32 already doesn't fit well to 15GB of GPU memory
max_length = 128  # token sequences will be truncated
training_steps = 60000  # Usually, I set a large number of steps,
# and then just interrupt the training manually
losses = []  # with this list, I do very simple tracking of average loss
MODEL_SAVE_PATH = './NLLB/nllb-eng-kir-v1'  # on my Google drive

In [42]:
import numpy as np

model.train()
x, y, loss = None, None, None
cleanup()

tq = trange(len(losses), training_steps)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # -100 is a magic value ignored in the loss function
        # because we don't want the model to learn to predict padding ids
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:  # usually, it is out-of-memory
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % 1000 == 0:
        # each 1000 steps, I report average loss at these steps
        print(i, np.mean(losses[-1000:]))

    if i % 1000 == 0 and i > 0:
        model.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)

  2%|▏         | 999/59999 [02:57<2:56:06,  5.58it/s]

1000 1.9577168974876404


  3%|▎         | 1999/59999 [05:57<2:57:03,  5.46it/s] 

2000 1.4184650593400001


  5%|▍         | 2999/59999 [09:17<3:07:43,  5.06it/s] 

3000 1.2917408001422883


  7%|▋         | 3999/59999 [12:38<3:00:39,  5.17it/s] 

4000 1.1559770315885545


  8%|▊         | 4999/59999 [15:58<2:58:34,  5.13it/s] 

5000 1.0913198860287667


 10%|▉         | 5999/59999 [19:19<2:50:22,  5.28it/s] 

6000 1.0222452116012573


 12%|█▏        | 6999/59999 [22:39<2:45:28,  5.34it/s] 

7000 0.9603768056333065


 13%|█▎        | 7999/59999 [25:57<2:34:28,  5.61it/s] 

8000 0.8894408876001835


 15%|█▍        | 8999/59999 [29:21<2:45:05,  5.15it/s] 

9000 0.8386934683024884


 17%|█▋        | 9999/59999 [32:45<2:42:16,  5.14it/s] 

10000 0.815350840061903


 18%|█▊        | 10999/59999 [36:06<2:30:42,  5.42it/s] 

11000 0.7406091915667057


 20%|█▉        | 11999/59999 [39:25<2:23:30,  5.57it/s] 

12000 0.7038045809715986


 22%|██▏       | 12999/59999 [42:44<2:31:23,  5.17it/s] 

13000 0.6763691198676824


 23%|██▎       | 13999/59999 [46:02<2:32:12,  5.04it/s] 

14000 0.6428956026732922


 25%|██▍       | 14999/59999 [49:23<2:20:31,  5.34it/s] 

15000 0.5987155078202486


 27%|██▋       | 15999/59999 [52:41<2:16:07,  5.39it/s] 

16000 0.5689880764931441


 28%|██▊       | 16999/59999 [56:00<2:14:06,  5.34it/s] 

17000 0.5327341787666082


 30%|██▉       | 17999/59999 [59:21<2:09:29,  5.41it/s] 

18000 0.5139888966083527


 32%|███▏      | 18999/59999 [1:02:41<2:14:04,  5.10it/s]

19000 0.4755984714627266


 33%|███▎      | 19999/59999 [1:05:58<1:55:19,  5.78it/s] 

20000 0.45027476413547995


 35%|███▍      | 20999/59999 [1:09:17<1:56:49,  5.56it/s] 

21000 0.42195676627755163


 37%|███▋      | 21999/59999 [1:12:38<2:08:58,  4.91it/s] 

22000 0.4116184184253216


 38%|███▊      | 22999/59999 [1:15:58<1:56:51,  5.28it/s] 

23000 0.3826705976724625


 40%|███▉      | 23999/59999 [1:19:18<1:52:05,  5.35it/s] 

24000 0.3640548131912947


 42%|████▏     | 24999/59999 [1:22:37<1:45:01,  5.55it/s] 

25000 0.35234981010854244


 43%|████▎     | 25999/59999 [1:25:57<1:47:52,  5.25it/s] 

26000 0.31896859578043224


 45%|████▍     | 26999/59999 [1:29:12<1:38:02,  5.61it/s] 

27000 0.3024690904691815


 47%|████▋     | 27999/59999 [1:32:28<1:40:51,  5.29it/s] 

28000 0.28957539814710614


 48%|████▊     | 28999/59999 [1:35:42<1:32:14,  5.60it/s] 

29000 0.26816681211441756


 50%|████▉     | 29999/59999 [1:39:01<1:39:14,  5.04it/s] 

30000 0.25745178647339345


 52%|█████▏    | 30999/59999 [1:42:23<1:24:40,  5.71it/s] 

31000 0.2470282033458352


 53%|█████▎    | 31999/59999 [1:45:43<1:28:32,  5.27it/s] 

32000 0.2360104439407587


 55%|█████▍    | 32999/59999 [1:49:04<1:24:11,  5.34it/s] 

33000 0.2252184659615159


 57%|█████▋    | 33999/59999 [1:52:24<1:22:03,  5.28it/s] 

34000 0.2127457870580256


 58%|█████▊    | 34999/59999 [1:55:42<1:25:28,  4.88it/s] 

35000 0.19915992631763219


 60%|█████▉    | 35999/59999 [1:59:02<1:14:37,  5.36it/s] 

36000 0.18833892212435605


 61%|██████    | 36318/59999 [2:00:12<1:18:23,  5.04it/s] 


KeyboardInterrupt: 

In [44]:
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
model_load_name = './NLLB/nllb-eng-kir-v1'
model = AutoModelForSeq2SeqLM.from_pretrained(model_load_name).cuda()
tokenizer = NllbTokenizer.from_pretrained(model_load_name)
# fix_tokenizer(tokenizer)

In [129]:
def translate(
    text, src_lang='run_Latn', tgt_lang='eng_Latn', 
    a=32, b=3, max_input_length=1024, num_beams=4, **kwargs
):
    """Turn a text or a list of texts into a list of translations"""
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(
        text, return_tensors='pt', padding=True, truncation=True, 
        max_length=max_input_length
    )
    model.eval() # turn off training mode
    result = model.generate(
        **inputs.to(model.device),
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        num_beams=num_beams, **kwargs
    )
    return tokenizer.batch_decode(result, skip_special_tokens=True)

In [163]:
df_test_small = df_test.sample(1000, random_state=42)

In [164]:
df_test_small['translated'] = batched_translate(df_test_small['tyv'].tolist())

100%|██████████| 63/63 [20:08<00:00, 19.18s/it] 


In [77]:
type(df_test_small.iloc[0]["translated"])

list

In [78]:
df_test_small['translated'] = df_test_small['translated'].apply(lambda x: x[0])

In [None]:
translations_df = batched_translate(flores['run'].tolist()[500:])

In [165]:
import sacrebleu
bleu_calc = sacrebleu.BLEU()
chrf_calc = sacrebleu.CHRF(word_order=2)  # this metric is called ChrF++

print(bleu_calc.corpus_score(df_test_small['translated'].tolist(), [df_test_small['ru'].tolist()]))
print(chrf_calc.corpus_score(df_test_small['translated'].tolist(), [df_test_small['ru'].tolist()]))

# BLEU = 24.14 52.5/30.4/18.9/12.1 (BP = 0.981 ratio = 0.981 hyp_len = 2281 ref_len = 2324)
# chrF2++ = 49.49
# BLEU = 23.41 52.1/31.0/18.9/11.3 (BP = 0.966 ratio = 0.967 hyp_len = 2292 ref_len = 2371)
# chrF2++ = 50.89

BLEU = 23.85 52.9/28.4/17.9/12.0 (BP = 1.000 ratio = 1.070 hyp_len = 29121 ref_len = 27228)
chrF2++ = 46.11


In [152]:
def batched_translate(texts, batch_size=16, **kwargs):
    """Translate texts in batches of similar length"""
    idxs, texts2 = zip(*sorted(enumerate(texts), key=lambda p: len(p[1]), reverse=True))
    results = []
    for i in trange(0, len(texts2), batch_size):
        results.extend(translate(texts2[i: i+batch_size], **kwargs))
    return [p for i, p in sorted(zip(idxs, results))]

In [173]:
from datasets import load_dataset
dataset = load_dataset("Muennighoff/flores200", 'eng_Latn-run_Latn')

In [174]:
dataset = dataset['dev']

In [175]:
dataset

Dataset({
    features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence_eng_Latn', 'sentence_run_Latn'],
    num_rows: 997
})

In [135]:
flores = pd.DataFrame(([dataset['sentence_eng_Latn'], dataset['sentence_run_Latn']]))

In [176]:
flores_train = pd.DataFrame(([dataset['sentence_eng_Latn'], dataset['sentence_run_Latn']]))

In [177]:
flores_train = flores_train.T

In [178]:
flores_train.columns = ['eng', 'run']

In [138]:
flores

Unnamed: 0,lat,run
0,"""We now have 4-month-old mice that are non-dia...","Yongeyeko ati: ""Ubu turafise imbeba y'amezi 4 ..."
1,"Dr. Ehud Ur, professor of medicine at Dalhousi...","Umuhinga Ehud Ur, umwigisha w'ivy'ubuganga kur..."
2,"Like some other experts, he is skeptical about...","Cokimwe n'abandi bahinga, arafise amakenga ku ..."
3,"On Monday, Sara Danius, permanent secretary of...","Ku wa mbere, Sara Danius, umunyamabanga ntayeg..."
4,"Danius said, ""Right now we are doing nothing. ...","Danius yavuze ati: ""Ubu nta co turiko turakora..."
...,...,...
1007,"As the areas are sparsely populated, and light...","Kuko ivyo bice bibamwo abantu inkehwa, kandi n..."
1008,Japanese work culture is more hierarchical and...,Akaranga mu kazi k'Abayapani karasumbasumbana ...
1009,"Suits are standard business attire, and cowork...","Ikositimu niwo mwambaro w'akazi umenyerewe, ka..."
1010,"Workplace harmony is crucial, emphasizing grou...",Itunganywa ryiza ry'ikibanza c'akazi ni ngombw...


In [151]:
from tqdm import tqdm
translations = []
for i, _ in enumerate(tqdm(flores['run'].tolist())):
    translations.append(translate(flores.iloc[i]['run']))

  0%|          | 4/1012 [00:16<1:07:24,  4.01s/it]


KeyboardInterrupt: 

In [155]:
translations = batched_translate(flores['run'].tolist()[:500])

100%|██████████| 32/32 [07:08<00:00, 13.40s/it]


In [158]:
translations2 = batched_translate(flores['run'].tolist()[500:])

100%|██████████| 32/32 [06:51<00:00, 12.85s/it]


In [150]:
for i in tqdm(range(10)):
    print(i)

TypeError: 'module' object is not callable

In [159]:
translations = translations + translations2

In [160]:
len(translations)

1012

In [161]:
flores['translated'] = translations

In [180]:
flores['lat'].tolist()

['"We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.',
 'Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.',
 'Like some other experts, he is skeptical about whether diabetes can be cured, noting that these findings have no relevance to people who already have Type 1 diabetes.',
 'On Monday, Sara Danius, permanent secretary of the Nobel Committee for Literature at the Swedish Academy, publicly announced during a radio program on Sveriges Radio in Sweden the committee, unable to reach Bob Dylan directly about winning the 2016 Nobel Prize in Literature, had abandoned its efforts to reach him.',
 'Danius said, "Right now we are doing nothing. I have called and sent emails to his closest collaborator and received very friendly replies. For now, that is certainly enough."',
 "Prev

In [162]:
import sacrebleu
bleu_calc = sacrebleu.BLEU()
chrf_calc = sacrebleu.CHRF(word_order=2)  # this metric is called ChrF++

print(bleu_calc.corpus_score(flores['translated'].tolist(), [flores['lat'].tolist()]))
print(chrf_calc.corpus_score(flores['translated'].tolist(), [flores['lat'].tolist()]))

BLEU = 23.46 55.9/29.3/18.0/11.4 (BP = 0.975 ratio = 0.975 hyp_len = 24110 ref_len = 24721)
chrF2++ = 45.71


In [179]:
flores_train[['eng', 'run']].to_csv('flores-eng-kir.csv', sep='\t', header=False, index=False)

In [170]:
flores

Unnamed: 0,lat,run,translated
0,"""We now have 4-month-old mice that are non-dia...","Yongeyeko ati: ""Ubu turafise imbeba y'amezi 4 ...","He added: ""We now have four-month-old mice who..."
1,"Dr. Ehud Ur, professor of medicine at Dalhousi...","Umuhinga Ehud Ur, umwigisha w'ivy'ubuganga kur...","Professor Ehud Ur, a professor of medicine at ..."
2,"Like some other experts, he is skeptical about...","Cokimwe n'abandi bahinga, arafise amakenga ku ...","Like other scientists, he is skeptical of the ..."
3,"On Monday, Sara Danius, permanent secretary of...","Ku wa mbere, Sara Danius, umunyamabanga ntayeg...","On Monday, Sara Danius, permanent secretary of..."
4,"Danius said, ""Right now we are doing nothing. ...","Danius yavuze ati: ""Ubu nta co turiko turakora...","Danius says: ""Now that we're doing nothing, I'..."
...,...,...,...
1007,"As the areas are sparsely populated, and light...","Kuko ivyo bice bibamwo abantu inkehwa, kandi n...","For this is a small part of the human family, ..."
1008,Japanese work culture is more hierarchical and...,Akaranga mu kazi k'Abayapani karasumbasumbana ...,Japanese craftsmanship is more sophisticated a...
1009,"Suits are standard business attire, and cowork...","Ikositimu niwo mwambaro w'akazi umenyerewe, ka...","Costumes are the most common work clothes, and..."
1010,"Workplace harmony is crucial, emphasizing grou...",Itunganywa ryiza ry'ikibanza c'akazi ni ngombw...,"Good workplace planning is essential, celebrat..."
