In [1]:
from datasets import load_dataset

# Load only the "train" split of the source corpus.
dataset = load_dataset(
    path="LazarusNLP/wikipedia_id_20230520",
    split="train",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Keep only rows whose text is strictly between 150 and 500 characters long.
dataset = dataset.filter(
    lambda row: 150 < len(row['text']) < 500,
    num_proc=30,
)
# Drop the id/url/title columns; they are not used by the rest of the notebook.
dataset = dataset.remove_columns(["id", "url", "title"])
# Shuffle with a fixed seed, then keep the first one million rows.
dataset = dataset.shuffle(seed=42).select(range(1_000_000))

In [3]:
import ctranslate2
import transformers

# Indonesian -> English stage: a Hugging Face tokenizer plus a CTranslate2
# translator running on the GPU with bfloat16 compute.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-id-en"
)
# NOTE(review): "opus-mt-id-en" is presumably a local directory holding the
# CTranslate2 conversion of the model above -- confirm it exists before running.
translator = ctranslate2.Translator(
    "opus-mt-id-en",
    device="cuda",
    compute_type="bfloat16",
)

In [4]:
def _to_source_tokens(example):
    """Return the tokenizer's token strings for the row's "text" field."""
    token_ids = tokenizer.encode(example["text"])
    return {"text_tokenized": tokenizer.convert_ids_to_tokens(token_ids)}


# Adds a "text_tokenized" column, using 30 worker processes.
dataset = dataset.map(_to_source_tokens, num_proc=30)

In [5]:
def translate_batch(examples):
    """Translate a batch of pre-tokenized rows and store the result in "text_en".

    Args:
        examples: Batch mapping; ``examples["text_tokenized"]`` holds one
            token list per row.

    Returns:
        The same ``examples`` object, with a "text_en" entry added that
        contains the decoded first hypothesis for every row.
    """
    outputs = translator.translate_batch(examples['text_tokenized'])
    decoded = []
    for output in outputs:
        # Only the first hypothesis of each result is kept.
        best_tokens = output.hypotheses[0]
        token_ids = tokenizer.convert_tokens_to_ids(best_tokens)
        decoded.append(tokenizer.decode(token_ids))
    examples["text_en"] = decoded
    return examples

In [6]:
# Apply translate_batch to the dataset in batches of 4096 rows.
dataset = dataset.map(
    function=translate_batch,
    batched=True,
    batch_size=4096,
)

Map: 100%|██████████| 1000000/1000000 [20:43<00:00, 804.08 examples/s]


In [9]:
# Remove the current `translator` binding before a new translator is created
# (presumably so the old model's GPU memory can be released first -- confirm).
del translator

In [10]:
# English -> Indonesian stage: rebind `tokenizer` and `translator` to the
# reverse-direction model (GPU, bfloat16 compute).
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-en-id"
)
# NOTE(review): "opus-mt-en-id" is presumably a local directory holding the
# CTranslate2 conversion of the model above -- confirm it exists before running.
translator = ctranslate2.Translator(
    "opus-mt-en-id",
    device="cuda",
    compute_type="bfloat16",
)

In [11]:
def _to_english_tokens(example):
    """Return the tokenizer's token strings for the row's "text_en" field."""
    token_ids = tokenizer.encode(example["text_en"])
    return {"text_en_tokenized": tokenizer.convert_ids_to_tokens(token_ids)}


# Adds a "text_en_tokenized" column, using 30 worker processes.
dataset = dataset.map(_to_english_tokens, num_proc=30)

Map (num_proc=30): 100%|██████████| 1000000/1000000 [00:09<00:00, 104776.92 examples/s]


In [13]:
def translate_batch(examples):
    """Translate a batch of tokenized English rows and store the result in "text_bt".

    Args:
        examples: Batch mapping; ``examples["text_en_tokenized"]`` holds one
            token list per row.

    Returns:
        The same ``examples`` object, with a "text_bt" entry added that
        contains the decoded first hypothesis for every row.
    """
    outputs = translator.translate_batch(examples['text_en_tokenized'])
    back_translations = []
    for output in outputs:
        # Only the first hypothesis of each result is kept.
        best_tokens = output.hypotheses[0]
        token_ids = tokenizer.convert_tokens_to_ids(best_tokens)
        back_translations.append(tokenizer.decode(token_ids))
    examples["text_bt"] = back_translations
    return examples

In [14]:
# Apply translate_batch to the dataset in batches of 4096 rows.
dataset = dataset.map(
    function=translate_batch,
    batched=True,
    batch_size=4096,
)

Map: 100%|██████████| 1000000/1000000 [18:39<00:00, 892.92 examples/s]


In [16]:
# Drop the two token-list columns.
dataset = dataset.remove_columns(
    column_names=["text_tokenized", "text_en_tokenized"],
)

In [20]:
# Spot-check the first row; left as a bare expression so the notebook displays it.
dataset[0]

{'text': 'Setiap tahunnya, Orange Grove Boulevard menjadi panggung Turnamen Parade Mawar. Pagi-pagi buta sebelum fajar, kendaraan-kendaraan peserta dari segala ukuran dan bentuk dapat dilihat diparkir memanjang sepanjang boulevard ini sementara para awak relawannya bergegas memberikan sentuhannya yang terakhir.',
 'text_en': "Every year, Orange Grove Boulevard becomes a Rose Parade tournament stage early morning before dawn, the participants' vehicles of all sizes and shapes can be seen parked long-long through this boulevard while the volunteers rush to give their last touch.",
 'text_bt': 'Setiap tahun, Orange Grove Boulevard menjadi panggung turnamen Parade Rose pagi-pagi sebelum fajar, kendaraan peserta dari semua ukuran dan bentuk dapat dilihat diparkir panjang melalui boulevard ini sementara relawan bergegas untuk memberikan sentuhan terakhir mereka.'}

In [22]:
# Upload the dataset to the Hub under the given repository id.
dataset.push_to_hub(
    repo_id="LazarusNLP/wikipedia_id_backtranslated",
)

Creating parquet from Arrow format: 100%|██████████| 500/500 [00:01<00:00, 427.19ba/s]
100%|██████████| 1/1 [00:10<00:00, 10.45s/it]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:01<00:00, 431.51ba/s]
100%|██████████| 1/1 [00:06<00:00,  6.68s/it]
Uploading the dataset shards: 100%|██████████| 2/2 [00:20<00:00, 10.18s/it]
