## Load required libraries

In [1]:
from dotenv import load_dotenv
from datasets import load_from_disk
from transformers import DataCollatorForSeq2Seq
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from transformers import pipeline
from tf_keras.callbacks import EarlyStopping
from tf_keras.optimizers import Adam
import neptune
from neptune.integrations.tensorflow_keras import NeptuneCallback
from custom_bleu import CustomBleu

# Read a local .env file (if present) into the process environment.
# NOTE(review): presumably this supplies the Neptune project/API token used by
# neptune.init_run() later in the notebook — confirm.
load_dotenv()




True

## Check if model is working properly

In [5]:
# Load the seq2seq model and its tokenizer from local directories so the
# checkpoint can be sanity-checked before any fine-tuning.
model = TFAutoModelForSeq2SeqLM.from_pretrained("model/model")
tokenizer = AutoTokenizer.from_pretrained("model/tokenizer/")




All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at model/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [27]:
# Smoke test: wrap the loaded model/tokenizer in a translation pipeline and
# translate one sample sentence; the trailing call is displayed as cell output.
translation = pipeline("translation", model=model, tokenizer=tokenizer)
translation("Lubie jesc jablka!")

[{'translation_text': 'I like eating apple!'}]

## Data preparation

### Load data

In [2]:
# Load the dataset previously saved to disk; the bare `data` expression shows
# the available splits and their sizes.
data = load_from_disk('data/final_data')
data

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 968
    })
    valid: Dataset({
        features: ['translation'],
        num_rows: 121
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 122
    })
})

In [4]:
# Peek at the first three training examples to inspect the translation pairs.
data['train'][0:3]['translation']

[{'mig': 'Okno otworzyć można', 'pl': 'Czy można otworzyć okno?'},
 {'mig': 'Ja dowód paszport zgubić',
  'pl': 'Ja zgubiłam dowód osobisty i paszport.'},
 {'mig': 'Niemieć dowód1', 'pl': 'Nie mam dowodu.'}]

### Create preprocessing function for our data

In [3]:
# Maximum number of tokens kept for source sentences and for target sentences.
max_input_length = 128
max_target_length = 128
# Keys of each entry in the dataset's "translation" column.
source_lang = "pl"
target_lang = "mig"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``.
            Its ``"translation"`` entry is a list of dicts that contain the
            ``source_lang`` and ``target_lang`` keys.

    Returns:
        The tokenizer output for the source sentences, with the tokenized
        target sentences stored under ``"labels"``.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    # The length limits above were previously declared but never applied;
    # truncate explicitly so over-long sentences are capped at the limits.
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True
    )
    labels = tokenizer(
        text_target=targets, max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Map the preprocessing function over our dataset

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of
# examples per call rather than a single example.
tokenized_dataset = data.map(preprocess_function, batched=True)

In [17]:
batch_size = 16

# Collator that batches tokenized examples and returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

# Settings shared by both splits; only the shuffle flag differs between them.
shared_dataset_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)

# Training batches are shuffled, validation batches keep dataset order.
train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"], shuffle=True, **shared_dataset_kwargs
)
valid_dataset = model.prepare_tf_dataset(
    tokenized_dataset["valid"], shuffle=False, **shared_dataset_kwargs
)

## Model preparation

In [18]:
# Parameter counts before freezing anything (baseline for comparison).
model.summary()

Model: "tf_marian_mt_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLay  multiple                  63430     
 er)                                                             
                                                                 
Total params: 77202374 (294.50 MB)
Trainable params: 77138944 (294.26 MB)
Non-trainable params: 63430 (247.77 KB)
_________________________________________________________________


In [19]:
# Freeze the encoder so its weights are excluded from training, then display
# its config to confirm that 'trainable' is now False.
model.model.encoder.trainable = False
model.model.encoder.get_config()

{'name': 'encoder',
 'trainable': False,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'chunk_size_feed_forward': 0,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size':

In [36]:
# Display the decoder config to check that it is still trainable.
model.model.decoder.get_config()

{'name': 'decoder',
 'trainable': True,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'chunk_size_feed_forward': 0,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': 

In [21]:
# Parameter counts after freezing the encoder; the trainable total should drop.
model.summary()

Model: "tf_marian_mt_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLay  multiple                  63430     
 er)                                                             
                                                                 
Total params: 77202374 (294.50 MB)
Trainable params: 25486336 (97.22 MB)
Non-trainable params: 51716038 (197.28 MB)
_________________________________________________________________


In [20]:
# Compile with Adam at a learning rate of 5e-5 and track accuracy.
# run_eagerly=True is passed, so the training step is executed eagerly rather
# than compiled into a graph.
# NOTE(review): eager mode is typically slower — confirm it is still needed.
# NOTE(review): no `loss` is passed — presumably the model's built-in loss is
# used; confirm.
model.compile(optimizer=Adam(0.00005), metrics=["accuracy"], run_eagerly=True)

## Model training

### Initialize Neptune

In [2]:
# Start a Neptune run tagged 'test_env'. No project or API token is passed here.
# NOTE(review): presumably both are read from environment variables populated
# by load_dotenv() — confirm.
run = neptune.init_run(
    tags='test_env',
)

# Keras callback bound to the run above.
neptune_callback = NeptuneCallback(run=run)



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/kacperurban/pl-mig-translation/e/PLMIG-20


### Training

In [4]:
# Watch validation loss rather than training loss, since validation data is
# supplied. With patience=5 and epochs=5 the previous setting could never
# trigger; patience=2 lets it fire within the 5-epoch budget, and
# restore_best_weights keeps the best epoch's weights.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

try:
    model.fit(
        train_dataset,
        epochs=5,
        validation_data=valid_dataset,
        # neptune_callback was created earlier but never passed to fit(),
        # so no training metrics were being logged to the Neptune run.
        callbacks=[early_stopping_callback, neptune_callback],
    )
finally:
    # Always close the Neptune run, even if training raises.
    run.stop()

## Translation after training

In [57]:
# NOTE(review): `pipeline` is already imported in the first cell, so this
# re-import is redundant; imports are best kept together at the top.
from transformers import pipeline

# Wrap the fine-tuned in-memory model in a translation pipeline.
translator = pipeline('translation', model=model, tokenizer=tokenizer)

In [58]:
# Qualitative check of the fine-tuned model on a sample Polish sentence.
translator('Chcę złożyć wniosek o wydanie dowodu osobistego.')

[{'translation_text': 'Nowy dowód1 chcieć'}]

In [59]:
# Second qualitative check, this time with a question.
translator('Czy mogę odebrać dowód?')

[{'translation_text': 'Być mój nowy dowód1'}]

## Model testing

In [3]:
# Reload the fine-tuned model and tokenizer from disk for evaluation.
# NOTE(review): these directories are only written by the "Save final model"
# cell at the end of the notebook, so on a fresh Restart & Run All this cell
# either fails or loads a stale checkpoint — consider saving before this point.
model = TFAutoModelForSeq2SeqLM.from_pretrained("final_model/model")
tokenizer = AutoTokenizer.from_pretrained("final_model/tokenizer/")




All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at final_model/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [4]:
# Build a translation pipeline around the reloaded model and hand it, together
# with the dataset, to the project's CustomBleu helper.
# NOTE(review): CustomBleu is defined in custom_bleu.py, not shown here —
# confirm its expected arguments.
translator = pipeline('translation', model=model, tokenizer=tokenizer)
bleu = CustomBleu(data, translator)

In [5]:
# Show the dataset held by the CustomBleu instance.
bleu.data

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 968
    })
    valid: Dataset({
        features: ['translation'],
        num_rows: 121
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 122
    })
})

In [6]:
# Score every split with identical settings, in train -> valid -> test order.
# NOTE(review): the meaning of the second positional argument (True) is defined
# by CustomBleu.score, which is not shown here — confirm.
bleu_results = {
    split: bleu.score(split, True) for split in ("train", "valid", "test")
}

# Unpack into the per-split names used by the rest of the notebook.
score_train, df_ref_train = bleu_results["train"]
score_valid, df_ref_valid = bleu_results["valid"]
score_test, df_ref_test = bleu_results["test"]

100%|██████████| 4/4 [00:24<00:00,  6.12s/it]
100%|██████████| 4/4 [00:36<00:00,  9.10s/it]
100%|██████████| 4/4 [00:37<00:00,  9.30s/it]


In [7]:
# Report the BLEU score of each split, one line per split.
for split_name, split_score in (
    ("train", score_train),
    ("valid", score_valid),
    ("test", score_test),
):
    print(f"BLEU score on {split_name} data: {split_score}")

BLEU score on train data: 0.09
BLEU score on valid data: 0.93
BLEU score on test data: 1.28


## Save final model

In [61]:
# Persist the tokenizer and model to the final_model/ directories, the same
# paths the "Model testing" section loads from.
tokenizer.save_pretrained("final_model/tokenizer/")
model.save_pretrained("final_model/model")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[63429]], 'forced_eos_token_id': 0}
