## Load require libraries

In [12]:
import pandas as pd
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["NEPTUNE_API_TOKEN"] = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI3MTRhNjcwNy1iMzc2LTQwNTUtOGRjYy03ODI4OGQzNjkxNTEifQ=="
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from transformers import pipeline
from tf_keras.callbacks import EarlyStopping
from tf_keras.optimizers import Adam
import neptune
from neptune.integrations.tensorflow_keras import NeptuneCallback
from custom_bleu import CustomBleu

## Check if model is working properly

In [2]:
model = TFAutoModelForSeq2SeqLM.from_pretrained("model/model")
tokenizer = AutoTokenizer.from_pretrained("model/tokenizer/")




All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at model/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [27]:
translation = pipeline("translation", model=model, tokenizer=tokenizer)
translation("Lubie jesc jablka!")

[{'translation_text': 'I like eating apple!'}]

## Data preparation

### Load data

In [3]:
data = pd.read_csv('data/All_data.csv')
print(data[0:10])

                                                  pl  \
0  Mój dziadek jest inwalidą wojennym i otrzymuję...   
1                Czy ma Pan legitymację kombatancką?   
2                  Mam wezwanie na komisje lekarską.   
3                    Moja mama otrzymuje rentę krus.   
4               Czy ma Pani ukończony kurs rolniczy?   
5                Czy ma Pan ukończony kurs rolniczy?   
6                Ja pracuję na gospodarstwie rolnym.   
7                           Mój brat jest kawalerem.   
8     Ja płacę duży rachunek za gaz i elektryczność.   
9         W jakim sklepie można zrobić tanie zakupy?   

                                          mig  
0   Mój dziadek inwalida wojna renta zus mieć  
1               Ty legitymacja kombatant mieć  
2           Ja komisja lekarska wezwanie mieć  
3                   Moja mama renta krus mieć  
4                       Ty kurs rolniczy mieć  
5                       Ty kurs rolniczy mieć  
6            Ja gospodarstwo rolne praca mieć  

### Create a dataset

In [4]:
raw_dataset_list = []
for i in range(0, len(data)):
    raw_dataset_list.append({'translation' : {'pl' : data['pl'][i], 'mig' : data['mig'][i]}})

raw_dataset = Dataset.from_list(raw_dataset_list)
raw_dataset

Dataset({
    features: ['translation'],
    num_rows: 1211
})

### Split data into train and test dataset

In [5]:
train_test = raw_dataset.train_test_split(test_size=0.2, seed=42)
valid_test = train_test['test'].train_test_split(test_size=0.5, seed=42)
train_test_dataset = DatasetDict({
    'train': train_test['train'],
    'valid': valid_test['train'],
    'test': valid_test['test']})
train_test_dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 847
    })
    valid: Dataset({
        features: ['translation'],
        num_rows: 182
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 182
    })
})

### Create preprocessing function for our data

In [7]:
max_input_length = 32
max_target_length = 32
source_lang = "pl"
target_lang = "mig"


def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets)
    return model_inputs

### Map preprocess function on our dataset

In [32]:
tokenized_dataset = train_test_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 847/847 [00:00<00:00, 7177.89 examples/s]
Map: 100%|██████████| 182/182 [00:00<00:00, 6748.80 examples/s]
Map: 100%|██████████| 182/182 [00:00<00:00, 6268.17 examples/s]


In [33]:
batch_size = 8

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

valid_dataset = model.prepare_tf_dataset(
    tokenized_dataset["valid"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

## Model preparation

In [65]:
model.summary()

Model: "tf_marian_mt_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLay  multiple                  63430     
 er)                                                             
                                                                 
Total params: 77202374 (294.50 MB)
Trainable params: 77138944 (294.26 MB)
Non-trainable params: 63430 (247.77 KB)
_________________________________________________________________


In [66]:
model.model.encoder.trainable = False
model.model.encoder.get_config()

{'name': 'encoder',
 'trainable': False,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'chunk_size_feed_forward': 0,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size':

In [36]:
model.model.decoder.get_config()

{'name': 'decoder',
 'trainable': True,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'chunk_size_feed_forward': 0,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': 

In [67]:
model.summary()

Model: "tf_marian_mt_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLay  multiple                  63430     
 er)                                                             
                                                                 
Total params: 77202374 (294.50 MB)
Trainable params: 25486336 (97.22 MB)
Non-trainable params: 51716038 (197.28 MB)
_________________________________________________________________


In [68]:
model.compile(optimizer=Adam(0.00005), metrics=["accuracy"], run_eagerly=True)

## Model training

### Initialize Neptune

In [None]:
run = neptune.init_run(
    project="kacperurban/pl-mig-translation",
    tags='test',
)

neptune_callback = NeptuneCallback(run=run)

### Training

In [69]:
early_stopping_callback = EarlyStopping(monitor='loss', patience=5)
model.fit(train_dataset, epochs=100, validation_data=valid_dataset, callbacks=[early_stopping_callback])

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100

## Translation after training

In [57]:
from transformers import pipeline

translator = pipeline('translation', model=model, tokenizer=tokenizer)

In [58]:
translator('Chcę złożyć wniosek o wydanie dowodu osobistego.')

[{'translation_text': 'Nowy dowód1 chcieć'}]

In [59]:
translator('Czy mogę odebrać dowód?')

[{'translation_text': 'Być mój nowy dowód1'}]

## Model testing

In [6]:
model = TFAutoModelForSeq2SeqLM.from_pretrained("final_model/model")
tokenizer = AutoTokenizer.from_pretrained("final_model/tokenizer/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at final_model/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [7]:
translator = pipeline('translation', model=model, tokenizer=tokenizer)
bleu = CustomBleu(data, translator)

In [8]:
bleu.data

Unnamed: 0,pl,mig
0,Mój dziadek jest inwalidą wojennym i otrzymuję...,Mój dziadek inwalida wojna renta zus mieć
1,Czy ma Pan legitymację kombatancką?,Ty legitymacja kombatant mieć
2,Mam wezwanie na komisje lekarską.,Ja komisja lekarska wezwanie mieć
3,Moja mama otrzymuje rentę krus.,Moja mama renta krus mieć
4,Czy ma Pani ukończony kurs rolniczy?,Ty kurs rolniczy mieć
...,...,...
1206,Zmieniłem(am) nazwisko.,Nazwisko zmiana2
1207,Zmieniłem(am) nazwisko.,Przeniesc inne spać
1208,Zmieniłem(am) nazwisko.,Już ślub
1209,Mam 18 lat.,Już 18 lat


In [10]:
score, df_ref = bleu.score(train_test_dataset)

In [11]:
print(f"BLEU score: {score}")
df_ref.head(10)

BLEU score: 0.30199641394191035


Unnamed: 0,reference,translation
0,"[[Ty, ranna], [Pan, ranna]]","[Ty, ranna]"
1,"[Laryngolog, przyjmować, kiedy]","[Laryngolog, przyjmować, kiedy]"
2,"[[Ja, internet, mam], [Ja, internet, mieć]]","[Ja, internet, mieć]"
3,"[[Podatek, rolny, leśny, płacić, trzeba], [Pod...","[Płacja, rzeszno, zrobna, płacić, musieć]"
4,"[Ty, choroba, poważnie]","[Ty, choroba, słabo]"
5,"[[Pani, krewni, masz], [Ty, krewni, masz]]","[Ty, krewni, masz]"
6,"[Ty, emerytura, renta, mieć]","[Ty, emerytura, renta, mieć]"
7,"[[Pani, dowód, mieć], [Pani, dowód, masz], [Ty...","[Pani, dowód, mieć]"
8,"[[Ja, zmiana1, nazwisko], [Ja, zmiana2, nazwis...","[Przeniesc, inne, spać]"
9,"[[Ty, gaśnica, mieć], [Ty, Pani, gaśnica, mieć]]","[Pani, gaśnica, mieć]"


## Save final model

In [61]:
tokenizer.save_pretrained("final_model/tokenizer/")
model.save_pretrained("final_model/model")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[63429]], 'forced_eos_token_id': 0}
