## Load pretrained model locally

In [1]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Both artifacts are read from paths under the local ./model directory.
TOKENIZER_PATH = "./model/tokenizer"
MODEL_PATH = "./model/model.h5"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

  from .autonotebook import tqdm as notebook_tqdm
2023-05-06 14:12:05.699982: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-06 14:12:05.996197: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-06 14:12:05.997698: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at ./model/model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [2]:
# Print the layer table and parameter counts of the loaded model.
model.summary()

Model: "tf_marian_mt_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLaye  multiple                 63430     
 r)                                                              
                                                                 
Total params: 77,202,374
Trainable params: 77,138,944
Non-trainable params: 63,430
_________________________________________________________________


In [3]:
# Inspect the configuration reported by the model's main layer.
model.model.get_config()

{'name': 'model',
 'trainable': True,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': Fal

In [4]:
# Inspect the configuration reported by the encoder sub-layer.
model.model.encoder.get_config()

{'name': 'encoder',
 'trainable': True,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': F

In [5]:
# Report the translation direction and the tokenizer's maximum sequence length.
language_pair = f"{tokenizer.source_lang} - {tokenizer.target_lang}"
print(language_pair)
print("Max lenght: {}".format(tokenizer.model_max_length))

pl - en
Max lenght: 512


## Attempting transfer learning on the model

### Load data from file

In [48]:
# Load the data files: one sentence per line, where line i of the Polish file
# is paired with line i of the English file.
with open("dataset/polish.txt", "r", encoding="utf-8") as f:
    polish_data = f.read().splitlines()

with open("dataset/english.txt", "r", encoding="utf-8") as f:
    english_data = f.read().splitlines()

# Fail fast on misaligned files: pairing by position is only meaningful when
# both files have the same number of lines. Without this check a shorter
# English file raises IndexError and a longer one is silently truncated.
if len(polish_data) != len(english_data):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(polish_data)} Polish lines "
        f"vs {len(english_data)} English lines"
    )

# One record per sentence pair, keyed by language code.
raw_dataset_list = [
    {'translation': {'pl': pl_sentence, 'en': en_sentence}}
    for pl_sentence, en_sentence in zip(polish_data, english_data)
]

In [49]:
# Preview the first ten translation pairs.
raw_dataset_list[0:10]

[{'translation': {'pl': 'Polska', 'en': 'Poland'}},
 {'translation': {'pl': 'Europejski Fundusz Społeczny w Polsce, 2007-2013',
   'en': 'The European Social Fund in Poland, 2007-2013'}},
 {'translation': {'pl': 'Finansowany z EFS Program Operacyjny Kapitał Ludzki 2007-2013 świadczy o zaangażowaniu się Polski w inwestowanie w ludzi i ich umiejętności oraz o gotowości do podejmowania wyzwań, przed którymi obecnie staje.',
   'en': 'Poland’s ESF Operational Programme for 2007-2013 – Human Capital – is evidence of the country’s commitment to investing in people and their skills and determination to tackle the challenges Poland faces.'}},
 {'translation': {'pl': 'Położenie większego nacisku na edukację i szkolenia wydaje się być najlepszym sposobem na zwalczanie bezrobocia, wykluczenia społecznego oraz wyrównywania różnic między regionami.',
   'en': 'More education and training is the key instrument for combating unemployment, social exclusion and strong regional disparities.'}},
 {'trans

### Create preprocessing function for our data

In [11]:
max_input_length = 128
max_target_length = 128


def preprocess_fun(inputs, targets):
    """Tokenize source texts and target texts into fixed-length model inputs.

    Args:
        inputs: Source-side texts handed to the tokenizer
            (presumably a list of Polish sentences; confirm against callers).
        targets: Target-side texts, tokenized in target mode
            (presumably the matching English sentences; confirm against callers).

    Returns:
        The tokenizer output for ``inputs`` with an extra ``"labels"`` entry
        holding the ``input_ids`` of the tokenized ``targets``.
    """
    # truncation=True is required for max_length to actually cap the sequence;
    # without it, longer sentences keep their full length and the rows end up
    # with different sizes despite padding='max_length'.
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding='max_length',
        truncation=True,
    )

    # text_target= replaces the deprecated `with tokenizer.as_target_tokenizer():`
    # context manager for tokenizing the target side.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        padding='max_length',
        truncation=True,
    )

    # NOTE(review): label padding positions keep the pad token id here; if the
    # training loss should ignore padding they may need masking — confirm.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Split by position: the first 95% of sentence pairs are used for training,
# the remaining 5% for validation.
idx = round(len(polish_data) * 0.95)
train_data = preprocess_fun(polish_data[:idx], english_data[:idx])
# Slice the same lists as the training split; the previous names
# (polish_list / english_list) are not defined anywhere in the notebook.
val_data = preprocess_fun(polish_data[idx:], english_data[idx:])

