In [16]:
from tqdm.auto import tqdm
from sacremoses.tokenize import MosesTokenizer
from sacrebleu import BLEU

from dotless_arabic.processing import undot
from dotless_arabic.experiments.dots_retrieval.src.utils import add_dots_to_undotted_text
from dotless_arabic.datasets.iwslt2017.collect import collect_parallel_test_dataset_for_translation,collect_parallel_train_dataset_for_translation,collect_parallel_val_dataset_for_translation
from dotless_arabic.experiments.translation.src.models import TranslationTransformer
from dotless_arabic.experiments.translation.src.utils import get_best_checkpoint,get_source_tokenizer,get_target_tokenizer, get_blue_score
from dotless_arabic.experiments.translation.src.processing import process_ar,process_en
from dotless_arabic.tokenizers import SentencePieceTokenizer,WordTokenizer

In [2]:
# Register tqdm's `progress_*` methods on pandas objects; `Series.progress_map`
# is used further down to show a progress bar while tokenizing each column.
tqdm.pandas()

In [3]:
# Load the train / validation / test splits of the parallel translation data
# (collectors are imported from `dotless_arabic.datasets.iwslt2017.collect`).
train_dataset = collect_parallel_train_dataset_for_translation()
val_dataset = collect_parallel_val_dataset_for_translation()
test_dataset = collect_parallel_test_dataset_for_translation()

def processing_map(example):
    """Clean both sides of one parallel example.

    The "ar" entry is passed through `process_ar` and the "en" entry through
    `process_en`. The mapping is updated in place and the same object is
    returned, which is the shape `Dataset.map` expects from its callback.
    """
    for column, clean in (("ar", process_ar), ("en", process_en)):
        example[column] = clean(example[column])
    return example

# Run the cleaning callback over every split (bypassing the datasets cache so
# the processing is always recomputed) and convert each result to a DataFrame.
# Splits are handled in train -> val -> test order.
train_dataset, val_dataset, test_dataset = [
    split.map(processing_map, load_from_cache_file=False).to_pandas()
    for split in (train_dataset, val_dataset, test_dataset)
]

# A single Moses tokenizer, built with its defaults, is applied to both the
# English and the Arabic column. `return_str=True` makes `tokenize` return one
# string rather than a list of tokens.
moses_tokenizer = MosesTokenizer()

# English first, then Arabic; within each column train -> val -> test.
for column in ("en", "ar"):
    for frame in (train_dataset, val_dataset, test_dataset):
        frame[column] = frame[column].progress_map(
            lambda text: moses_tokenizer.tokenize(text, return_str=True)
        )

# Keep a handle on the dotted Arabic test sentences before the "ar" columns
# are overwritten below; they serve as the BLEU references later on.
dotted_ar_test_dataset = test_dataset["ar"]

# Replace every split's Arabic column with its undotted form
# (train -> val -> test, one progress bar per split).
for frame in (train_dataset, val_dataset, test_dataset):
    frame["ar"] = [undot(text) for text in tqdm(frame["ar"])]


# Both tokenizers are built from the same training frame with the same
# tokenizer class; only the language code and the `undot_text` flag differ.
shared_tokenizer_kwargs = dict(
    train_dataset=train_dataset,
    tokenizer_class=SentencePieceTokenizer,
)

# English source side: text is left as-is.
source_tokenizer = get_source_tokenizer(
    source_language_code='en',
    undot_text=False,
    **shared_tokenizer_kwargs,
)

# Arabic target side: `undot_text=True` is requested here as well
# (the "ar" column has already been undotted above).
target_tokenizer = get_target_tokenizer(
    target_language_code='ar',
    undot_text=True,
    **shared_tokenizer_kwargs,
)

Map:   0%|          | 0/231713 [00:00<?, ? examples/s]

Map:   0%|          | 0/888 [00:00<?, ? examples/s]

Map:   0%|          | 0/1205 [00:00<?, ? examples/s]

  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/888 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/888 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/888 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/231713 [00:00<?, ?it/s]

Training SentencePiece ...


0it [00:00, ?it/s]

Training SentencePiece ...


In [6]:
# Locate the checkpoint chosen by `get_best_checkpoint` for the undotted
# (is_dotted=False) en -> ar run that used SentencePiece on both sides.
best_checkpoint = get_best_checkpoint(
    is_dotted=False,
    source_language_code='en',
    target_language_code='ar',
    source_tokenizer_class=SentencePieceTokenizer,
    target_tokenizer_class=SentencePieceTokenizer,
)
model = TranslationTransformer.load_from_checkpoint(best_checkpoint)

# This notebook only runs inference. A freshly built nn.Module starts in
# training mode, so switch to eval mode explicitly to make sure the dropout
# layers are disabled while translating (harmless if `translate` already
# does this internally).
model.eval()

# Display the architecture as the cell output.
model

checkpiont epoch=22-val_loss=3.171-step=83282.ckpt found.


TranslationTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerDecoderLayer(
          (self_attn): Multihead

### Using the `get_blue_score` helper

In [18]:
# Score the test split with the project helper. The references are the dotted
# Arabic test sentences captured before undotting.
# NOTE(review): `add_dots_to_predictions=True` presumably restores dots on the
# model output before scoring, and `decode_with_beam_search=False` presumably
# selects greedy decoding — confirm both in `get_blue_score`.
get_blue_score(
    model=model,
    max_sequence_length=80,
    source_language_code='en',
    target_language_code='ar',
    add_dots_to_predictions=True,
    decode_with_beam_search=False,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    source_sentences=test_dataset['en'],
    target_sentences=dotted_ar_test_dataset,
)

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

Source: Today I'm going to speak to you about the last 30 years of architectural history.
Prediction: اليوم سوف اتجذب اليكم عن ال30 سنة الاخيرة من التاريخ المعماري.
Target: سأحدثكم اليوم عن 30 عاما من تاريخ الهندسة.
********************************************************************************
Source: That's a lot to pack into 18 minutes.
Prediction: وهذا كبير جدا للحزمة في 18 دقيقة.
Target: هذا أمر كبير جدا لألخصه في 18 دقيقة.
********************************************************************************
Source: It's a complex topic, so we're just going to dive right in at a complex place: New Jersey.
Prediction: انه موضوع معقد ، لذلك سيقوم بالغوص مباشرة في مكان معقد: نيو جيرسي.
Target: إنه موضوع معقد ، لذلك فإننا سنتوجه مباشرة إلى مكان معقد: إلى نيو جيرسي ،
********************************************************************************
Source: Because 30 years ago, I'm from Jersey, and I was six, and I lived there in my parents' house in a town called Livingston, and this was my

7.655

### BLEU score from scratch (calling `sacrebleu` directly)

In [7]:
# Translate the English test sentences one at a time, capping each
# translation call at max_sequence_length=80.
preds = [
    model.translate(
        input_sentence=english_sentence,
        source_tokenizer=source_tokenizer,
        target_tokenizer=target_tokenizer,
        max_sequence_length=80,
    )
    for english_sentence in tqdm(test_dataset['en'])
]

  0%|          | 0/1205 [00:00<?, ?it/s]



In [9]:
# Drop the sequence boundary markers from every prediction and trim the
# surrounding whitespace. Re-running this cell leaves `preds` unchanged.
preds = [
    prediction.replace("<eos>", "").replace("<bos>", "").strip()
    for prediction in preds
]

In [51]:
# Case-sensitive corpus BLEU computed with sacreBLEU.
bleu = BLEU(lowercase=False)

# Hypotheses: model predictions passed through `add_dots_to_undotted_text`.
dotted_preds = [add_dots_to_undotted_text(prediction) for prediction in preds]
# sacreBLEU takes a list of reference streams; there is a single stream here,
# the dotted Arabic test sentences.
reference_streams = [list(dotted_ar_test_dataset)]

bleu.corpus_score(dotted_preds, reference_streams)

That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


BLEU = 8.07 33.8/11.7/5.0/2.2 (BP = 1.000 ratio = 1.072 hyp_len = 20191 ref_len = 18833)