### Import necessary libraries

In [2]:
import pandas as pd
from odf.opendocument import load
from odf import text, teletype
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq
from tensorflow import keras
import neptune
from neptune.integrations.tensorflow_keras import NeptuneCallback

### Load data from .odt format

In [3]:
def load_data(filepath):
    """Return the text of every paragraph in an OpenDocument text file.

    Args:
        filepath: Path of the .odt document to open.

    Returns:
        A list of strings, one per ``text.P`` element, in the order the
        document yields them.
    """
    document = load(filepath)
    paragraphs = document.getElementsByType(text.P)
    return [teletype.extractText(paragraph) for paragraph in paragraphs]

In [4]:
# Load every paragraph of the source document and drop the empty ones in a
# single pass (the previous `while '' in ...: remove('')` loop rescanned the
# list once per removed element).
raw_data = [line for line in load_data('dataset/data.odt') if line != '']
raw_data

['Wypowiedzi OG',
 'Przywitanie',
 '- Dzień dobry.',
 '[',
 'dzień dobry',
 'witać',
 ']',
 'Wyjaśnienie powodu wizyty',
 '- Chcę złożyć wniosek o wydanie dowodu osobistego.',
 '[',
 'ja wniosek dowód1 mieć',
 'ja wniosek dowód2 mieć',
 ']',
 '- Czy mogę odebrać dowód?',
 '[',
 'czy już nowy dowód1',
 'czy już nowy dowód2',
 'być mój nowy dowód1',
 'być mój nowy dowód2',
 ']',
 '- Chcę zgłosić utratę dowodu osobistego.',
 '[',
 'ja dowód1 zgubić',
 'ja dowód2 zgubić',
 ']',
 '- Chcę złożyć wniosek o wydanie dowodu osobistego dla mojego dziecka.',
 '[',
 'ja dowód1 moje dziecko chcieć',
 'ja dowód2 moje dziecko chcieć',
 ']',
 '- Chcę złożyć wniosek o wydanie dowodu osobistego dla mojej żony.',
 '[',
 'ja dowód1 moja żona chcieć',
 'ja dowód2 moja żona chcieć',
 ']',
 'Obsługa',
 '- Zmieniłem adres.',
 '[',
 'dom przenieść się',
 'mieszkanie zmiana1',
 'mieszkanie zmiana2',
 'adres1 zmiana1',
 'adres2 zmiana1',
 'adres1 zmiana2',
 'adres2 zmiana2',
 ']',
 '- Upłynął termin ważności dowo

### Load model and tokenizer

In [5]:
# Load the tokenizer and the seq2seq model from local paths under ./model.
tokenizer = AutoTokenizer.from_pretrained("./model/tokenizer")
model = TFAutoModelForSeq2SeqLM.from_pretrained("./model/model.h5")

2023-10-09 13:56:17.230601: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 129904640 exceeds 10% of free system memory.
2023-10-09 13:56:17.456863: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 129904640 exceeds 10% of free system memory.
2023-10-09 13:56:17.480317: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 129904640 exceeds 10% of free system memory.
2023-10-09 13:56:19.686132: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 129904640 exceeds 10% of free system memory.
2023-10-09 13:56:20.211797: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 129904640 exceeds 10% of free system memory.
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at ./model/model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predicti

### Adjust the data format for our model

In [6]:
def split_data_from_list(raw_data):
    """Pair each source sentence with the variants listed in its bracket block.

    The input is a flat list of strings in which a sentence is immediately
    followed by ``'['``, one or more variant lines, and a closing ``']'``.
    Entries that are not followed by ``'['`` (e.g. section headers) are
    skipped.

    Args:
        raw_data: Flat list of strings as described above.

    Returns:
        A tuple ``(pl_sentence, sentence)`` of two equally long lists:
        ``pl_sentence[k]`` is the source sentence with its first character
        removed, and ``sentence[k]`` is one of its variants. The source
        sentence is repeated once per variant.
    """
    pl_sentence = []
    sentence = []
    total = len(raw_data)
    i = 0
    while i < total:
        # Guard the look-ahead: without it, a final entry that is not followed
        # by '[' (or a one-element list) raised IndexError.
        if i + 1 < total and raw_data[i + 1] == '[':
            value = raw_data[i]
            i += 2  # skip the sentence itself and the opening '['
            while i < total and raw_data[i] != ']':
                # NOTE(review): value[1:] drops only the first character, so a
                # "- text" line keeps its leading space — confirm that is fine
                # for the tokenizer.
                pl_sentence.append(value[1:])
                sentence.append(raw_data[i])
                i += 1
        i += 1
    return pl_sentence, sentence

In [7]:
# s1 holds the source sentences and s2 the matching variants; the two lists
# are aligned element by element.
s1, s2 = split_data_from_list(raw_data)

In [8]:
# Put the aligned sentence pairs into a two-column frame and preview the
# first ten rows.
data = pd.DataFrame({'pl': s1, 'mig': s2})
print(data.head(10))

                                                  pl                     mig
0                                       Dzień dobry.             dzień dobry
1                                       Dzień dobry.                   witać
2   Chcę złożyć wniosek o wydanie dowodu osobistego.  ja wniosek dowód1 mieć
3   Chcę złożyć wniosek o wydanie dowodu osobistego.  ja wniosek dowód2 mieć
4                            Czy mogę odebrać dowód?     czy już nowy dowód1
5                            Czy mogę odebrać dowód?     czy już nowy dowód2
6                            Czy mogę odebrać dowód?     być mój nowy dowód1
7                            Czy mogę odebrać dowód?     być mój nowy dowód2
8             Chcę zgłosić utratę dowodu osobistego.        ja dowód1 zgubić
9             Chcę zgłosić utratę dowodu osobistego.        ja dowód2 zgubić


### Creating a dataset

In [9]:
# Build one {'translation': {'pl': ..., 'mig': ...}} record per row of `data`,
# then wrap the records in a datasets.Dataset.
raw_dataset_list = [
    {'translation': {'pl': pl_text, 'mig': mig_text}}
    for pl_text, mig_text in zip(data['pl'], data['mig'])
]

raw_dataset = Dataset.from_list(raw_dataset_list)
raw_dataset

Dataset({
    features: ['translation'],
    num_rows: 123
})

### Split data into train and test datasets

In [10]:
# Hold out 25% of the pairs for testing. A fixed seed keeps the split
# identical across kernel restarts so results are reproducible.
train_test = raw_dataset.train_test_split(test_size=0.25, seed=42)

# Only 'train' and 'test' splits are created here; the variable name is kept
# because later cells refer to it.
train_test_valid_dataset = DatasetDict({
    'train': train_test['train'],
    'test': train_test['test']})
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 92
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 31
    })
})

### Create preprocessing function for our data

In [12]:
max_input_length = 32
max_target_length = 32
source_lang = "pl"
target_lang = "mig"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: A batch with a ``"translation"`` entry holding a list of
            dicts keyed by language code (``source_lang`` / ``target_lang``).

    Returns:
        The tokenizer output for the source sentences, with an added
        ``"labels"`` entry containing the token ids of the target sentences.
        Sources are truncated to ``max_input_length`` tokens and targets to
        ``max_target_length`` tokens.
    """
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets via `text_target`, which replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. A separate call keeps
    # the target length limit independent of the input limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Map the preprocessing function over our dataset

In [13]:
# Tokenize every split; with batched=True, preprocess_function receives a
# batch of examples per call rather than a single example.
tokenized_dataset = train_test_valid_dataset.map(preprocess_function, batched=True)

                                                  

In [14]:
batch_size = 8

# The collator builds TensorFlow batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

# Both splits share the batch size and collator; only shuffling differs.
shared_dataset_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)

train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"], shuffle=True, **shared_dataset_kwargs
)
test_dataset = model.prepare_tf_dataset(
    tokenized_dataset["test"], shuffle=False, **shared_dataset_kwargs
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


### Model preparation

In [15]:
# Print the layer and parameter summary of the loaded model.
model.summary()

Model: "tf_marian_mt_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLaye  multiple                 63430     
 r)                                                              
                                                                 
Total params: 77,202,374
Trainable params: 77,138,944
Non-trainable params: 63,430
_________________________________________________________________


In [16]:
# Freeze the encoder so its weights are not updated during training, then
# display its config to confirm that 'trainable' is now False.
model.model.encoder.trainable = False
model.model.encoder.get_config()

{'name': 'encoder',
 'trainable': False,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': 

In [17]:
# Display the decoder's config; unlike the encoder, it is left trainable.
model.model.decoder.get_config()

{'name': 'decoder',
 'trainable': True,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': F

In [18]:
# Print the summary again to see the trainable/non-trainable parameter counts
# now that the encoder has been frozen.
model.summary()

Model: "tf_marian_mt_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLaye  multiple                 63430     
 r)                                                              
                                                                 
Total params: 77,202,374
Trainable params: 25,486,336
Non-trainable params: 51,716,038
_________________________________________________________________


In [19]:
# Compile with Adam. No `loss` is passed, so the model's internal loss
# computation is used (see the message printed by compile()).
# NOTE(review): learning_rate=0.01 looks high for fine-tuning a pretrained
# model — confirm this value is intentional.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, metrics=['accuracy'])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


### Initialize Neptune

In [20]:
# Start a Neptune run for experiment tracking.
# The API token is read from the NEPTUNE_API_TOKEN environment variable and
# must never be committed to the notebook. The token that was previously
# hardcoded here should be treated as leaked and revoked in the Neptune UI.
run = neptune.init_run(
    project="kacperurban/pl-mig-translation",
    api_token=os.environ.get("NEPTUNE_API_TOKEN"),
    tags=["first-training"],
)

# Keras callback that logs training metrics to the run created above.
neptune_callback = NeptuneCallback(run=run)

  run = neptune.init_run(


https://app.neptune.ai/kacperurban/pl-mig-translation/e/PLMIG-1


### Model training

In [22]:
# Stop training early when the training loss has not improved for 3 epochs.
early_stopping_callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
try:
    model.fit(train_dataset, epochs=10, callbacks=[neptune_callback, early_stopping_callback])
finally:
    # Always close the Neptune run, even if training raises or is interrupted,
    # so the run is not left open.
    run.stop()

Epoch 1/10


2023-10-09 14:07:48.289529: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [92]
	 [[{{node Placeholder/_0}}]]
2023-10-09 14:07:48.290403: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [92]
	 [[{{node Placeholder/_0}}]]
2023-10-09 14:07:53.961721: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'tf_marian_mt_model/model/decoder/cond/ones/packed/tf_ma

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 3 operations to synchronize with Neptune. Do not kill this process.
All 3 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/kacperurban/pl-mig-translation/e/PLMIG-1/metadata
