In [1]:
import transformers
import pandas as pd
import os, numpy as np
from transformers import MarianMTModel, MarianTokenizer
from time import time
from tqdm import tqdm

In [2]:
def chunks(lst, n):
    """Generate consecutive slices of ``lst``, each holding at most ``n`` items.

    Every slice has exactly ``n`` items except possibly the last one, which
    holds whatever remains. An empty ``lst`` produces no slices.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

In [3]:
def save_to_file(sents, dump_fname):
    """Write ``sents`` to ``dump_fname``, replacing any existing file.

    The object is written exactly as ``print`` renders it: for a list of
    sentences that is the list's ``repr`` on one line followed by a newline.

    Args:
        sents: Object to dump (in this notebook, a list of translated strings).
        dump_fname: Path of the output file.
    """
    # Translations contain non-ASCII text (e.g. Turkish, Cyrillic). Pin the
    # encoding so the dump does not depend on the platform's locale default.
    with open(dump_fname, 'w', encoding='utf-8') as fh:
        print(sents, file=fh)

In [4]:
# Number of sentences grouped into each batch handed to the tokenizer/model
# in the translation loops below.
BATCH_SIZE = 30

In [5]:
# Load the CSV holding the comments to translate; the sentences are read
# from its `comment_text` column further down.
ds_fname = 'data/sampled_for_trans.csv'
sdf = pd.read_csv(ds_fname)

In [6]:
# Sanity check: the loaded frame must contain exactly 24000 rows.
assert len(sdf) == 24000

In [7]:
# Plain Python list of the comments, in row order, for easy batching.
text = list(sdf['comment_text'])

In [8]:
# Sanity check: one list entry per row of the loaded frame.
assert len(text) == 24000

In [12]:
# Translate the English sentences into each target language with the matching
# single-target 'Helsinki-NLP/opus-mt-en-<lang>' checkpoint and dump the result
# of each language to 'en-to-<lang>.txt'.
tgt_langs = ['es','it']
for t in tgt_langs:
    print('*'*80)
    print(f'processing translation to: {t}')
    trans_sents = []

    # Load the tokenizer/model pair for this target language onto the GPU.
    model_name = 'Helsinki-NLP/opus-mt-en-'+t
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    model = model.cuda()
    print(f'using tokenizer: {tokenizer} and \nmodel: {model_name}')

    epoch_t0 = time()
    # NOTE(review): only the first 40 sentences are translated (text[:40]),
    # although the log below says "all" -- looks like a smoke-test limit;
    # confirm before relying on the dumped files.
    for batch_num, batch_text in enumerate(chunks(text[:40],BATCH_SIZE), start=1):
        print('processing batch: ', batch_num)
        t0 = time()
        # Tokenize the batch (inputs capped at 256 tokens) and move every
        # tensor to the GPU next to the model.
        batch_text_inputs = tokenizer.prepare_translation_batch(batch_text, max_length=256)
        batch_text_inputs = {k:v.cuda() for k,v in batch_text_inputs.items()}
        translated = model.generate(**batch_text_inputs)
        # Use a distinct comprehension variable so it does not shadow the
        # target-language loop variable `t`.
        translated = [tokenizer.decode(token_ids, skip_special_tokens=True) for token_ids in translated]
        trans_sents.extend(translated)
        batch_time = time() - t0
        print(f'Completed batch: {batch_num} in {round(batch_time,2)} sec')
        print(f'Translated so far: {len(trans_sents)}')
    full_tras_time = round(time()-epoch_t0)
    print(f'Translated all en sentences into {t}, using {model_name} in {full_tras_time} sec')


    dump_fname = f'en-to-{t}.txt'
    save_to_file(trans_sents, dump_fname)
    # Report the actual target language (this message used to hard-code "es").
    print(f'Dumped all {t} sentences into: {dump_fname}')
    print('*'*80)

********************************************************************************
processing translation to: es
using tokenizer: <transformers.tokenization_marian.MarianTokenizer object at 0x7fb85eafae48> and 
model: Helsinki-NLP/opus-mt-en-es
processing batch:  1
Completed batch: 1 in 11.2 sec
Translated so far: 30
processing batch:  2
Completed batch: 2 in 3.42 sec
Translated so far: 40
Translated all en sentences into es, using Helsinki-NLP/opus-mt-en-es in 15 sec
Dumped all es sentences into: en-to-es.txt
********************************************************************************
********************************************************************************
processing translation to: it
using tokenizer: <transformers.tokenization_marian.MarianTokenizer object at 0x7fb85e358cf8> and 
model: Helsinki-NLP/opus-mt-en-it
processing batch:  1
Completed batch: 1 in 38.61 sec
Translated so far: 30
processing batch:  2
Completed batch: 2 in 3.88 sec
Translated so far: 40
Translated al

In [14]:
# Baseline run on the en-trk checkpoint: the sentences are fed in as-is,
# without the '>>tur<< ' prefix used for `text_ext`.
trans_sents = []

model_name = 'Helsinki-NLP/opus-mt-en-trk'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).cuda()
print(f'using tokenizer: {tokenizer} and \nmodel: {model_name}')

epoch_t0 = time()
sample_batches = chunks(text[:40], BATCH_SIZE)
for batch_num, batch_text in enumerate(sample_batches, start=1):
    print('processing batch: ', batch_num)
    t0 = time()
    # Tokenize (inputs capped at 256 tokens), then move each tensor to the GPU.
    encoded = tokenizer.prepare_translation_batch(batch_text, max_length=256)
    gpu_inputs = {name: tensor.cuda() for name, tensor in encoded.items()}
    generated = model.generate(**gpu_inputs)
    # Decode each generated id sequence back to text and collect it.
    trans_sents.extend(
        [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]
    )
    batch_time = time() - t0
    print(f'Completed batch: {batch_num} in {round(batch_time,2)} sec')
    print(f'Translated so far: {len(trans_sents)}')
full_tras_time = round(time() - epoch_t0)
print(f'Translated all en sentences into turkish, using {model_name} in {full_tras_time} sec')

using tokenizer: <transformers.tokenization_marian.MarianTokenizer object at 0x7f3c0deecda0> and 
model: Helsinki-NLP/opus-mt-en-trk
processing batch:  1
Completed batch: 1 in 12.97 sec
Translated so far: 30
processing batch:  2
Completed batch: 2 in 4.61 sec
Translated so far: 40
Translated all en sentences into turkish, using Helsinki-NLP/opus-mt-en-trk in 18 sec


In [13]:
# Prepend the '>>tur<< ' marker to every sentence, then show how many
# prefixed sentences were produced.
text_ext = ['>>tur<< ' + sentence for sentence in text]
len(text_ext)

24000

In [15]:
# Second en-trk run: same pipeline as the baseline, but over `text_ext`,
# i.e. the sentences carrying the '>>tur<< ' prefix.
trans_sents_ext = []

model_name = 'Helsinki-NLP/opus-mt-en-trk'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).cuda()
print(f'using tokenizer: {tokenizer} and \nmodel: {model_name}')

epoch_t0 = time()
prefixed_batches = chunks(text_ext[:40], BATCH_SIZE)
for batch_num, batch_text in enumerate(prefixed_batches, start=1):
    print('processing batch: ', batch_num)
    t0 = time()
    # Tokenize (inputs capped at 256 tokens), then move each tensor to the GPU.
    encoded = tokenizer.prepare_translation_batch(batch_text, max_length=256)
    gpu_inputs = {name: tensor.cuda() for name, tensor in encoded.items()}
    generated = model.generate(**gpu_inputs)
    # Decode each generated id sequence back to text and collect it.
    trans_sents_ext.extend(
        [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]
    )
    batch_time = time() - t0
    print(f'Completed batch: {batch_num} in {round(batch_time,2)} sec')
    print(f'Translated so far: {len(trans_sents_ext)}')
full_tras_time = round(time() - epoch_t0)
print(f'Translated all en sentences into turkish, using {model_name} in {full_tras_time} sec')

using tokenizer: <transformers.tokenization_marian.MarianTokenizer object at 0x7f3c0ccf0ef0> and 
model: Helsinki-NLP/opus-mt-en-trk
processing batch:  1
Completed batch: 1 in 11.11 sec
Translated so far: 30
processing batch:  2
Completed batch: 2 in 2.55 sec
Translated so far: 40
Translated all en sentences into turkish, using Helsinki-NLP/opus-mt-en-trk in 14 sec


In [None]:
# Side-by-side preview: each English source sentence followed by its two
# en-trk translations -- `trans_sents` (input without a prefix) and
# `trans_sents_ext` (input prefixed with '>>tur<< ').
#
# This cell used to wait on input() after every triple. That blocks
# "Restart & Run All" and fails when the kernel has no stdin, so a bounded,
# non-interactive preview is printed instead.
N_PREVIEW = 5  # number of triples to show; raise it to inspect more
rule = '-'*80
banner = '*'*80

# zip() stops at the shortest input, so only sentences that have both
# translations are paired up.
triples = list(zip(text, trans_sents, trans_sents_ext))
for eng, trk, tur in triples[:N_PREVIEW]:
    print(eng)
    print(rule)
    print(trk)
    print(rule)
    print(tur)
    print(rule)
    print(banner)
    print(banner)
    print()  # blank line between consecutive triples
    
    

:Dear god this site is horrible.
--------------------------------------------------------------------------------
; watin bu sayt çox yaramaz.
--------------------------------------------------------------------------------
"Sevgili tanrım, bu site korkunç.
--------------------------------------------------------------------------------
********************************************************************************
********************************************************************************

"*::::::::I believe that you're confusing ""precision"" with ""accuracy"". I'm using the word precison in its mathematical sense, the number of digits following the decimal place in this case.   
"
--------------------------------------------------------------------------------
*:: "Эмне: "Эгер "спедика" диагнозлық", "babby" сөзі математикалық olarak, рационалда жататын дистогнозлық сандардың саны.
--------------------------------------------------------------------------------
"*::: "Ben inan


" 
 :That is ridiculous. Unless there's a good and non-disingenuous response, I would absolutely agree with you blocking indef outright. Falsifying sources should simply never be tolerated. //  "
--------------------------------------------------------------------------------
"Kesinlikle, bu saçma, ya da nädogru жауап bolmasa, men sizi acil kabul ederdim. Galiba sertifika etmek hiç haçan kabul edilmez. / "/"
--------------------------------------------------------------------------------
"Güzel, bu saçma bir tepkisi yoksa, seni açıkça engellemenizle kesinlikle aynı fikirde olurum. Galsing kaynakların hiçbir zaman kabul edilmemesi gerekmez. / "
--------------------------------------------------------------------------------
********************************************************************************
********************************************************************************

How dare you vandalize that page about the HMS Beagle! Don't vandalize again, demon!
------------------


Transliteration of Russian place names
In writing about Moscow Metro for the Malayalam Wikipedia, we are finding it difficult to correctly transliterate the Russian place names. For example, do we pronounce Park Kultury as PAARK KALTTARI or PAARK KALCHCHARI (or perhaps something completely different)? Can somebody please help by transliterating the list given in https://ml.wikipedia.org/wiki/സംവാദം:മോസ്കോ_മെട്രോ. (I am not putting the list here as I don't want to clutter up this page.) Thanks
--------------------------------------------------------------------------------
Мәскәү Malayalam Wikipedia үшін rus урыннарының исемдерін жаза бастап, Biz Мәскәү Метроның исемен дөрөҫ литерациялау җиңел түгел. Мысалы, Park Kulty KARKETI немесе ПARKALI (яғни ВОСИКИ) литеребіне (яның қандай да булһа бір түрлерге) yardım бере алады. Иде-бин лимония (1.org.org. com/wikimo_BAR_ wikim/IKDEBHI) Бұл rshown. Лиценттттердің тізімінің исемдерін алып тастап, алардан пайда алуына шамамыз.
-------------------