In [4]:
import os
import nltk
import torch
import random

In [5]:
from pytorch_lightning.utilities.model_summary import ModelSummary
from pytorch_lightning import seed_everything
from tqdm.auto import tqdm

In [6]:
from sacremoses import MosesTokenizer

In [7]:
from dotless_arabic.experiments.translation.src.settings import (
    configure_environment,
)
from dotless_arabic.tokenizers import SentencePieceTokenizer



In [8]:
from dotless_arabic.experiments.translation.src.processing import (
    process_en,
    process_ar,
)
from dotless_arabic.experiments.translation.src.utils import get_source_tokenizer, get_target_tokenizer
from dotless_arabic.callbacks import EpochTimerCallback
from dotless_arabic.experiments.translation.src.datasets import get_dataloader
from dotless_arabic.experiments.translation.src.models import TranslationTransformer
from dotless_arabic.experiments.translation.src.utils import get_best_checkpoint, get_sequence_length, train_translator
from dotless_arabic.experiments.translation.src.utils import get_blue_score
from dotless_arabic.experiments.translation.src import constants

In [9]:
# Single seed value passed to both random.seed and seed_everything in the setup cell.
seed = 42

In [11]:
# Seed Python's built-in RNG with the notebook-wide seed.
random.seed(seed)
torch.cuda.empty_cache()  # to free gpu memory
# Fetch the NLTK "stopwords" package (the log reports when it is already up to date).
nltk.download("stopwords")
# Lightning's global seeding helper; it logs "Global seed set to <seed>".
# NOTE(review): workers=True presumably extends the seeding to dataloader workers — confirm in the Lightning docs.
seed_everything(seed, workers=True)
# Put Weights & Biases in "disabled" mode for this session.
os.environ["WANDB_MODE"] = "disabled"
# Debugging aid: turn on autograd anomaly detection.
torch.autograd.set_detect_anomaly(True)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # to see CUDA errors

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/majed_alshaibani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Global seed set to 42


In [12]:
from dotless_arabic.datasets.iwslt2017.collect import (
    collect_parallel_train_dataset_for_translation,
    collect_parallel_val_dataset_for_translation,
    collect_parallel_test_dataset_for_translation,
)

In [13]:
# Collect the train / validation / test parallel splits in one statement.
train_dataset, val_dataset, test_dataset = (
    collect_parallel_train_dataset_for_translation(),
    collect_parallel_val_dataset_for_translation(),
    collect_parallel_test_dataset_for_translation(),
)

In [14]:
# Rebind each split name to the result of its own .to_pandas() call.
train_dataset, val_dataset, test_dataset = (
    split.to_pandas() for split in (train_dataset, val_dataset, test_dataset)
)

In [15]:
# Hook tqdm into pandas; the `progress_map` calls in the following cells rely on this.
tqdm.pandas()

In [16]:
# Run the language-specific cleaning function over every split, Arabic column
# first and then English, keeping the train -> val -> test order within each.
for column, clean_text in (("ar", process_ar), ("en", process_en)):
    for split in (train_dataset, val_dataset, test_dataset):
        split[column] = split[column].progress_map(clean_text)

  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/888 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/888 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

In [17]:
moses_tokenizer = MosesTokenizer()


def _moses_tokenize(text):
    """Tokenize ``text`` with the shared Moses tokenizer and return it as one string."""
    return moses_tokenizer.tokenize(
        text,
        return_str=True,
    )


# Same tokenizer instance for both columns; English is processed first, then
# Arabic, each in train -> val -> test order.
for language_code in ("en", "ar"):
    for split in (train_dataset, val_dataset, test_dataset):
        split[language_code] = split[language_code].progress_map(_moses_tokenize)

  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/888 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/888 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

In [18]:
# Both sides use the same tokenizer class; the source column is "ar" and the target is "en".
source_tokenizer_class = target_tokenizer_class = SentencePieceTokenizer
source_language_code, target_language_code = "ar", "en"

In [19]:
# Arguments shared by both tokenizer builders: the training frame, with undotting off.
shared_tokenizer_options = {"train_dataset": train_dataset, "undot_text": False}

source_tokenizer = get_source_tokenizer(
    tokenizer_class=source_tokenizer_class,
    source_language_code=source_language_code,
    **shared_tokenizer_options,
)
target_tokenizer = get_target_tokenizer(
    tokenizer_class=target_tokenizer_class,
    target_language_code=target_language_code,
    **shared_tokenizer_options,
)

  0%|          | 0/231713 [00:00<?, ?it/s]

Training SentencePiece ...


  0%|          | 0/231713 [00:00<?, ?it/s]

Training SentencePiece ...


In [20]:
def _split_training_sentences(tokenizer, language_code):
    """Apply ``tokenizer.split_text`` to every training sentence of ``language_code``."""
    return [
        tokenizer.split_text(sentence)
        for sentence in tqdm(train_dataset[language_code])
    ]


source_max_sequence_length = get_sequence_length(
    dataset=_split_training_sentences(source_tokenizer, source_language_code),
)

target_max_sequence_length = get_sequence_length(
    dataset=_split_training_sentences(target_tokenizer, target_language_code),
)

# Use the larger of the two lengths as the single shared sequence length.
sequence_length = max(source_max_sequence_length, target_max_sequence_length)
sequence_length

  0%|          | 0/231713 [00:00<?, ?it/s]



  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/231713 [00:00<?, ?it/s]

  0%|          | 0/231713 [00:00<?, ?it/s]

91

In [21]:
# Resolve the best checkpoint for the dotted ar -> en run, then restore it.
best_checkpoint_path = get_best_checkpoint(
    is_dotted=True,
    source_language_code=source_language_code,
    target_language_code=target_language_code,
    source_tokenizer_class=source_tokenizer_class,
    target_tokenizer_class=target_tokenizer_class,
)
model = TranslationTransformer.load_from_checkpoint(best_checkpoint_path)
model = model.to(constants.DEVICE)
model

best ckpt: NMT/ar_to_en/SentencePieceTokenizer_to_SentencePieceTokenizer/dotted/checkpoints/epoch=22-val_loss=2.599-step=123947.ckpt


TranslationTransformer(
  (train_ppl): Perplexity()
  (val_ppl): Perplexity()
  (test_ppl): Perplexity()
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleL

In [22]:
# Best single checkpoint, beam search off, printing five sample translations.
get_blue_score(
    # model and tokenizers
    model=model,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    # evaluation data
    source_language_code=source_language_code,
    target_language_code=target_language_code,
    source_sentences=test_dataset[source_language_code],
    target_sentences=test_dataset[target_language_code],
    # decoding
    is_dotted=True,
    decode_with_beam_search=False,
    max_sequence_length=sequence_length,
    # reporting
    show_translations_for=5,
    save_predictions_and_targets=False,
)

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

Source: سأحدثكم اليوم عن 30 عاما من تاريخ الهندسة.
Prediction: i'm going to talk to you today about 30 years of engineering history.
Target: today i'm going to speak to you about the last 30 years of architectural history.
********************************************************************************
Source: هذا أمر كبير جدا لألخصه في 18 دقيقة.
Prediction: this is a very big thing to sum up in 18 minutes.
Target: that's a lot to pack into 18 minutes.
********************************************************************************
Source: إنه موضوع معقد ، لذلك فإننا سنتوجه مباشرة إلى مكان معقد: إلى نيو جيرسي ،
Prediction: it's a complicated topic, so we're going to go straight to a complicated place: to new jersey.
Target: it's a complex topic, so we're just going to dive right in at a complex place: new jersey.
********************************************************************************
Source: لأنه منذ 30 سنة ، أنا من نيوجيرسي ، كنت في السادسة من عمري ، وكنت أعيش هناك مع والدي ف

29.261

In [19]:
# Best single checkpoint, beam search on, no sample translations printed.
get_blue_score(
    # model and tokenizers
    model=model,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    # evaluation data
    source_language_code=source_language_code,
    target_language_code=target_language_code,
    source_sentences=test_dataset[source_language_code],
    target_sentences=test_dataset[target_language_code],
    # decoding
    is_dotted=True,
    decode_with_beam_search=True,
    max_sequence_length=sequence_length,
    # reporting
    show_translations_for=0,
    save_predictions_and_targets=False,
)

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

sacre bleu 29.285
sacre bleu signature: nrefs:1|case:lc|eff:no|tok:13a|smooth:exp|version:2.3.1


29.285

In [20]:
# for _ in range(5):
#     get_blue_score(
#             model=model,
#             is_dotted=True,
#             show_translations_for=0,
#             decode_with_beam_search=False,
#             source_tokenizer=source_tokenizer,
#             target_tokenizer=target_tokenizer,
#             max_sequence_length=sequence_length,
#             source_language_code=source_language_code,
#             target_language_code=target_language_code,
#             source_sentences=test_dataset[source_language_code][:100],
#             target_sentences=test_dataset[target_language_code][:100],
#         )

In [21]:
# for _ in range(5):
#     get_blue_score(
#             model=model,
#             is_dotted=True,
#             show_translations_for=0,
#             decode_with_beam_search=True,
#             source_tokenizer=source_tokenizer,
#             target_tokenizer=target_tokenizer,
#             max_sequence_length=sequence_length,
#             source_language_code=source_language_code,
#             target_language_code=target_language_code,
#             source_sentences=test_dataset[source_language_code][:100],
#             target_sentences=test_dataset[target_language_code][:100],
#         )

In [None]:
def average_checkpoints(checkpoint_paths):
    """Build a ``TranslationTransformer`` whose weights are the uniform mean of several checkpoints.

    The function relies on the notebook-level names ``source_tokenizer``,
    ``target_tokenizer``, ``TranslationTransformer`` and ``constants``.

    Args:
        checkpoint_paths: iterable of checkpoint file paths. Each file must
            deserialize through ``torch.load`` to a mapping that has a
            ``'state_dict'`` entry.

    Returns:
        A new ``TranslationTransformer`` loaded with the averaged weights and
        moved to ``constants.DEVICE``.

    Raises:
        ValueError: if ``checkpoint_paths`` is empty.
        KeyError: if a checkpoint lacks a key that the first checkpoint has.
    """
    # Materialize so generators are accepted and the count is known up front.
    checkpoint_paths = list(checkpoint_paths)
    if not checkpoint_paths:
        raise ValueError("average_checkpoints needs at least one checkpoint path")
    num_checkpoints = len(checkpoint_paths)

    # Stream the checkpoints one by one so only the running sum and the current
    # checkpoint are alive at any time, instead of all checkpoints at once.
    summed_weights = None
    for path in checkpoint_paths:
        # map_location="cpu" keeps the tensors off the accelerator while summing;
        # without it they are restored to the device they were saved from.
        # torch.load unpickles the file, so only load checkpoints you trust.
        state_dict = torch.load(path, map_location="cpu")["state_dict"]
        if summed_weights is None:
            summed_weights = dict(state_dict)
        else:
            # Out-of-place add: never mutates a loaded tensor and keeps the
            # usual dtype promotion rules.
            summed_weights = {
                key: total + state_dict[key] for key, total in summed_weights.items()
            }

    # Average weights
    avg_weights = {key: total / num_checkpoints for key, total in summed_weights.items()}

    # Create a new model with averaged weights; the pad id comes from the source tokenizer.
    averaged_model = TranslationTransformer(
        src_vocab_size=source_tokenizer.vocab_size,
        tgt_vocab_size=target_tokenizer.vocab_size,
        pad_token_id=source_tokenizer.token_to_id(source_tokenizer.pad_token),
    )
    averaged_model.load_state_dict(avg_weights)
    averaged_model = averaged_model.to(constants.DEVICE)

    return averaged_model

In [23]:
# ckpt_dir = 'NMT/ar_to_en/SentencePieceTokenizer_to_SentencePieceTokenizer/dotted/checkpoints'
# sorted_models_paths = []
# for filename in os.listdir(ckpt_dir):
#     if filename.startswith('epoch'):
#         sorted_models_paths.append(filename)
# sorted_models_paths = sorted(
#     sorted_models_paths,
#     key=lambda filename:''.join(c for c in filename.split('=')[2] if c.isdigit() or c=='.')
# )
# sorted_models_paths = list(map(lambda filename:f'{ckpt_dir}/{filename}',sorted_models_paths))
# sorted_models_paths

In [24]:

# averaged_model = TranslationTransformer(
#     src_vocab_size=source_tokenizer.vocab_size,
#     tgt_vocab_size=target_tokenizer.vocab_size,
#     pad_token_id=source_tokenizer.token_to_id(source_tokenizer.pad_token),
# ).to(constants.DEVICE)
# models = []
# for model_path in sorted_models_paths:
#     models.append(
#         TranslationTransformer(
#     src_vocab_size=source_tokenizer.vocab_size,
#     tgt_vocab_size=target_tokenizer.vocab_size,
#     pad_token_id=source_tokenizer.token_to_id(source_tokenizer.pad_token),
# ).load_from_checkpoint(model_path).to(constants.DEVICE)
#     )
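# NOTE(review): as written this loop would not produce an element-wise mean: torch.sum does
# not take several tensors as positional arguments (torch.stack(ps[1:]).sum(0) would), and
# the in-place copy_ on parameters has to run under torch.no_grad().
# for ps in zip(*[m.parameters() for m in [averaged_model] + models]):
#     ps[0].copy_(torch.sum(*ps[1:]) / len(ps[1:]))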

In [25]:
# averaged_model = TranslationTransformer(
#         src_vocab_size=source_tokenizer.vocab_size,
#         tgt_vocab_size=target_tokenizer.vocab_size,
#         pad_token_id=source_tokenizer.token_to_id(source_tokenizer.pad_token),
#     ).to(constants.DEVICE)
# averaged_model_state_dict = averaged_model.state_dict()

In [26]:
# for filepath in sorted_models_paths:
#     print(f'averaging {filepath}')
#     tmp_model = TranslationTransformer.load_from_checkpoint(
#         filepath
#     ).to(constants.DEVICE)
#     tmp_model_state_dict = tmp_model.state_dict()
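#     # NOTE(review): repeating (a + b) / 2 is not a uniform mean: every later checkpoint
#     # outweighs the earlier ones, and the initial state_dict of the freshly constructed
#     # averaged_model is mixed into the result as well.
#     for key in averaged_model_state_dict:
#         averaged_model_state_dict[key] = (tmp_model_state_dict[key]+averaged_model_state_dict[key])/2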

# averaged_model.load_state_dict(averaged_model_state_dict)

In [None]:
ckpt_dir = 'NMT/ar_to_en/SentencePieceTokenizer_to_SentencePieceTokenizer/dotted/checkpoints'
# os.listdir yields every directory entry in arbitrary order. Keep only the
# checkpoint files and sort them, so a stray file cannot break loading and the
# floating-point summation order is the same on every run.
checkpoint_paths = sorted(
    os.path.join(ckpt_dir, filename)
    for filename in os.listdir(ckpt_dir)
    if filename.endswith('.ckpt')
)
averaged_model = average_checkpoints(checkpoint_paths=checkpoint_paths)
averaged_model

TranslationTransformer(
  (train_ppl): Perplexity()
  (val_ppl): Perplexity()
  (test_ppl): Perplexity()
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleL

In [None]:
# Checkpoint-averaged model, beam search off, no sample translations printed.
get_blue_score(
    # model and tokenizers
    model=averaged_model,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    # evaluation data
    source_language_code=source_language_code,
    target_language_code=target_language_code,
    source_sentences=test_dataset[source_language_code],
    target_sentences=test_dataset[target_language_code],
    # decoding
    is_dotted=True,
    decode_with_beam_search=False,
    max_sequence_length=sequence_length,
    # reporting
    show_translations_for=0,
    save_predictions_and_targets=False,
)

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]

sacre bleu 29.605
sacre bleu signature: nrefs:1|case:lc|eff:no|tok:13a|smooth:exp|version:2.3.1


29.605

In [29]:
# Checkpoint-averaged model, beam search on, no sample translations printed.
get_blue_score(
    # model and tokenizers
    model=averaged_model,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    # evaluation data
    source_language_code=source_language_code,
    target_language_code=target_language_code,
    source_sentences=test_dataset[source_language_code],
    target_sentences=test_dataset[target_language_code],
    # decoding
    is_dotted=True,
    decode_with_beam_search=True,
    max_sequence_length=sequence_length,
    # reporting
    show_translations_for=0,
    save_predictions_and_targets=False,
)

  0%|          | 0/1205 [00:00<?, ?it/s]

  0%|          | 0/1205 [00:00<?, ?it/s]