### Import necessary libraries

In [132]:
import pandas as pd
from odf.opendocument import load
from odf import text, teletype
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq
from tensorflow import keras
import neptune
from neptune.integrations.tensorflow_keras import NeptuneCallback
from transformers import pipeline
from nltk.translate.bleu_score import corpus_bleu
import numpy as np

### Load data from .odt format

In [133]:
def load_data(filepath):
    raw_data = []
    text_doc = load(filepath)
    all_params = text_doc.getElementsByType(text.P)
    for line in all_params:
        raw_data.append(teletype.extractText(line))
    return raw_data

In [134]:
raw_data = load_data('dataset/data.odt')
while '' in raw_data:
    raw_data.remove('')
raw_data

['Wypowiedzi OG',
 'Przywitanie',
 '- Dzień dobry.',
 '[',
 'dzień dobry',
 'witać',
 ']',
 'Wyjaśnienie powodu wizyty',
 '- Chcę złożyć wniosek o wydanie dowodu osobistego.',
 '[',
 'ja wniosek dowód1 mieć',
 'ja wniosek dowód2 mieć',
 ']',
 '- Czy mogę odebrać dowód?',
 '[',
 'czy już nowy dowód1',
 'czy już nowy dowód2',
 'być mój nowy dowód1',
 'być mój nowy dowód2',
 ']',
 '- Chcę zgłosić utratę dowodu osobistego.',
 '[',
 'ja dowód1 zgubić',
 'ja dowód2 zgubić',
 ']',
 '- Chcę złożyć wniosek o wydanie dowodu osobistego dla mojego dziecka.',
 '[',
 'ja dowód1 moje dziecko chcieć',
 'ja dowód2 moje dziecko chcieć',
 ']',
 '- Chcę złożyć wniosek o wydanie dowodu osobistego dla mojej żony.',
 '[',
 'ja dowód1 moja żona chcieć',
 'ja dowód2 moja żona chcieć',
 ']',
 'Obsługa',
 '- Zmieniłem adres.',
 '[',
 'dom przenieść się',
 'mieszkanie zmiana1',
 'mieszkanie zmiana2',
 'adres1 zmiana1',
 'adres2 zmiana1',
 'adres1 zmiana2',
 'adres2 zmiana2',
 ']',
 '- Upłynął termin ważności dowo

 ### Load model and tokenizer

In [135]:
tokenizer = AutoTokenizer.from_pretrained("./model/tokenizer")
model = TFAutoModelForSeq2SeqLM.from_pretrained("./model/model.h5")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at ./model/model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


### Translation before training

In [136]:
translator = pipeline('translation', model=model, tokenizer=tokenizer)
translator('Czesc jestem Kacper')

[{'translation_text': "Hi, I'm Kacper."}]

### Adjust data form to our model

In [137]:
def split_data_from_list(raw_data):
    pl_sentence = []
    sentence = []
    i=0
    while i < len(raw_data):
        if raw_data[i+1] == '[':
            value = raw_data[i]
            i += 2
            while i < len(raw_data):
                if raw_data[i] == ']':
                    break
                pl_sentence.append(value[1:])
                sentence.append(raw_data[i])
                i += 1
        i += 1
    return pl_sentence, sentence

In [138]:
s1, s2 = split_data_from_list(raw_data)

In [139]:
data = pd.DataFrame({'pl':s1, 'mig':s2})
print(data[0:10])

                                                  pl                     mig
0                                       Dzień dobry.             dzień dobry
1                                       Dzień dobry.                   witać
2   Chcę złożyć wniosek o wydanie dowodu osobistego.  ja wniosek dowód1 mieć
3   Chcę złożyć wniosek o wydanie dowodu osobistego.  ja wniosek dowód2 mieć
4                            Czy mogę odebrać dowód?     czy już nowy dowód1
5                            Czy mogę odebrać dowód?     czy już nowy dowód2
6                            Czy mogę odebrać dowód?     być mój nowy dowód1
7                            Czy mogę odebrać dowód?     być mój nowy dowód2
8             Chcę zgłosić utratę dowodu osobistego.        ja dowód1 zgubić
9             Chcę zgłosić utratę dowodu osobistego.        ja dowód2 zgubić


### Creating a dataset

In [140]:
raw_dataset_list = []
for i in range(0, len(data)):
    raw_dataset_list.append({'translation' : {'pl' : data['pl'][i], 'mig' : data['mig'][i]}})
    
raw_dataset = Dataset.from_list(raw_dataset_list)
raw_dataset

Dataset({
    features: ['translation'],
    num_rows: 123
})

### Split data into train and test dataset

In [141]:
train_test = raw_dataset.train_test_split(test_size=0.2, seed=42)

train_test_dataset = DatasetDict({
    'train': train_test['train'],
    'test': train_test['test']})

### Create preprocessing function for our data

In [142]:
max_input_length = 32
max_target_length = 32
source_lang = "pl"
target_lang = "mig"


def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets)
    return model_inputs

### Map preprocess function on our dataset

In [143]:
tokenized_dataset = train_test_dataset.map(preprocess_function, batched=True)

                                                  

In [144]:
batch_size = 8

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

test_dataset = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


### Model preparation

In [145]:
model.summary()

Model: "tf_marian_mt_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLaye  multiple                 63430     
 r)                                                              
                                                                 
Total params: 77,202,374
Trainable params: 77,138,944
Non-trainable params: 63,430
_________________________________________________________________


In [146]:
model.model.encoder.trainable = False
model.model.encoder.get_config()

{'name': 'encoder',
 'trainable': False,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': 

In [147]:
model.model.decoder.get_config()

{'name': 'decoder',
 'trainable': True,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': F

In [148]:
model.summary()

Model: "tf_marian_mt_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLaye  multiple                 63430     
 r)                                                              
                                                                 
Total params: 77,202,374
Trainable params: 25,486,336
Non-trainable params: 51,716,038
_________________________________________________________________


In [149]:
optimizer = keras.optimizers.Adam(learning_rate=0.00005)
model.compile(optimizer=optimizer, metrics=["accuracy"])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


### Initialize neptune

In [79]:
run = neptune.init_run(
    project="kacperurban/pl-mig-translation",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI3MTRhNjcwNy1iMzc2LTQwNTUtOGRjYy03ODI4OGQzNjkxNTEifQ==",
    tags='test',
)

neptune_callback = NeptuneCallback(run=run)

  run = neptune.init_run(


CannotResolveHostname: 
[95m
----CannotResolveHostname-----------------------------------------------------------------------
[0m
The Neptune client library was not able to resolve hostname [4mapp.neptune.ai[0m.

What should I do?
    - Check if your computer is connected to the internet.
    - Check if your computer is supposed to be using a proxy to access the internet.
      If so, you may want to use the [96mproxies[0m parameter of the [96minit_run()[0m function.
      See https://docs.neptune.ai/api/universal/#proxies
      and https://requests.readthedocs.io/en/latest/user/advanced/#proxies
    - Check the status of Neptune services: https://status.neptune.ai/

[92mNeed help?[0m-> https://docs.neptune.ai/getting_help


### Model training

In [150]:
early_stopping_callback = keras.callbacks.EarlyStopping(monitor='loss', patience=5)
model.fit(train_dataset, epochs=100, callbacks=[early_stopping_callback])
run.stop()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


<keras.callbacks.History at 0x7f8c4e182c90>

### Translation after training

In [151]:
from transformers import pipeline

translator = pipeline('translation', model=model, tokenizer=tokenizer)

In [152]:
translator('Dzien dobry.')

[{'translation_text': 'witać'}]

In [153]:
translator('Chcę złożyć wniosek o wydanie dowodu osobistego.')

[{'translation_text': 'tu zmiana dowód2'}]

In [154]:
translator('Czy mogę odebrać dowód?')

[{'translation_text': 'być mój nowy dowód2'}]

In [155]:
translator('Upłynął termin ważności dowodu.')

[{'translation_text': 'dowód1 koniec ważny'}]

## Model testing

In [156]:
def prepare_label_to_bleu(raw_data):
    hashmap = {}
    i=0
    while i < len(raw_data):
        if raw_data[i+1] == '[':
            value = raw_data[i]
            temp_list = []
            i += 2
            while i < len(raw_data):
                if raw_data[i] == ']':
                    hashmap[value[1:]] = temp_list
                    break
                temp_list.append(raw_data[i])
                i += 1
        i += 1
    return hashmap

In [166]:
def bleu_score():
    ref_sent = prepare_label_to_bleu(raw_data)
    translation_corpus = []
    reference_corpus = []
    for i in range(0, len(train_test_dataset["test"]["translation"])):
        value = ref_sent[train_test_dataset["test"]["translation"][i]["pl"]]
        translation = translator(train_test_dataset["test"]["translation"][i]["pl"])
        translation_corpus.append(translation[0]["translation_text"].split())
        if len(value) > 1:
            reference_corpus.append([j.split() for j in value])
        else:
            reference_corpus.append(value[0].split())
    b_score = corpus_bleu(reference_corpus, translation_corpus)
    compare_ref_trans  = pd.DataFrame({"reference" : reference_corpus, "translation" : translation_corpus})
    return b_score, compare_ref_trans

In [170]:
score, df_ref_trans = bleu_score()
print(f"BLEU score: {np.around(score, 2)}")

BLEU score: 0.33


In [171]:
df_ref_trans

Unnamed: 0,referance,translation
0,"[[co, pisać, mieć], [jak, pisać, mieć]]","[jak, pisać, mieć]"
1,"[przyjść, papier, wniosek, mieć]","[tu, zmiana, dowód2]"
2,"[[ile, płacić, dowód1], [ile, płacić, dowód2]]","[nowy, dowód2, płacić, ile]"
3,"[[mój, dowód1, pękąć], [mój, dowód2, pękąć], [...","[mój, dowód2, psuć]"
4,"[[ile, płacić, dowód1], [ile, płacić, dowód2]]","[nowy, dowód2, płacić, ile]"
5,"[[dowód1, gotowy], [dowód2, gotowy]]","[być, mój, nowy, dowód2]"
6,"[przyjść, papier, wniosek, mieć]","[tu, zmiana, dowód2]"
7,"[[ile, płacić, dowód1], [ile, płacić, dowód2]]","[nowy, dowód2, płacić, ile]"
8,"[[ja, pierwszy, raz, dowód1, robić], [ja, pier...","[ja, pierwszy, raz, dowód2, robić]"
9,"[[dzień, dobry], [witać]]",[witać]
