## Load pretrained model locally

In [1]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("./model/tokenizer")
model = TFAutoModelForSeq2SeqLM.from_pretrained("./model/model.h5")

  from .autonotebook import tqdm as notebook_tqdm
2023-05-19 11:09:55.632264: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-19 11:09:55.944913: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-19 11:09:55.946245: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at ./model/model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [234]:
model.summary()

Model: "tf_marian_mt_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLaye  multiple                 63430     
 r)                                                              
                                                                 
Total params: 77,202,374
Trainable params: 77,138,944
Non-trainable params: 63,430
_________________________________________________________________


In [235]:
model.model.get_config()

{'name': 'model',
 'trainable': True,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': Fal

In [236]:
model.model.encoder.get_config()

{'name': 'encoder',
 'trainable': True,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': F

In [237]:
print(tokenizer.source_lang, "-", tokenizer.target_lang)
print(f"Max lenght: {tokenizer.model_max_length}")

pl - en
Max lenght: 512


## The attempt of implementation a transfer learning on a model

### Load data from file

In [238]:
# Wczytanie plików z danymi
with open("dataset/polish.txt", "r", encoding="utf-8") as f:
    polish_data = f.read().splitlines()

with open("dataset/english.txt", "r", encoding="utf-8") as f:
    english_data = f.read().splitlines()

raw_dataset_list = []
for i in range(0, 30000):
    raw_dataset_list.append({'translation' : {'pl' : polish_data[i], 'en' : english_data[i]}})

### Creating a dataset

In [239]:
from datasets import Dataset, DatasetDict
raw_dataset = Dataset.from_list(raw_dataset_list)
raw_dataset

Dataset({
    features: ['translation'],
    num_rows: 30000
})

### Split data into train, validation and test dataset

In [240]:
train_testvalid = raw_dataset.train_test_split(test_size=0.1)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 27000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1500
    })
})

### Create preprocessing function for our data

In [241]:
max_input_length = 512
max_target_length = 512
source_lang = "pl"
target_lang = "en"


def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [242]:
preprocess_function(train_test_valid_dataset["train"][:2])

{'input_ids': [[8327, 26830, 11601, 10, 8821, 124, 2, 51, 8246, 90, 3429, 8107, 4166, 61, 22710, 3, 1011, 3492, 45, 23967, 1213, 22710, 14439, 17, 15224, 42047, 1657, 23315, 7256, 3565, 22710, 16405, 2, 0], [3061, 655, 511, 981, 162, 7375, 356, 37105, 29, 22187, 10117, 126, 23554, 10753, 2, 2744, 3646, 25, 25688, 19, 278, 6428, 95, 43, 816, 11237, 143, 7868, 1087, 17, 20685, 27769, 103, 41821, 15, 5010, 1805, 5954, 21321, 28, 24459, 8332, 8102, 3, 30157, 3, 32656, 17, 38267, 38, 17, 887, 550, 33664, 34782, 39857, 2228, 17, 20475, 18, 466, 2171, 7255, 25, 23333, 134, 7868, 579, 46354, 19396, 8852, 98, 3, 10262, 2668, 3, 7098, 48330, 3, 20497, 29, 3, 15, 10407, 10822, 17, 13734, 3, 5621, 3, 33533, 19, 5352, 2, 234, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

### Map preprocess function on our dataset

In [243]:
tokenized_dataset = train_test_valid_dataset.map(preprocess_function, batched=True)

                                                                   

In [244]:
from transformers import DataCollatorForSeq2Seq

batch_size = 16

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

validation_dataset = model.prepare_tf_dataset(
    tokenized_dataset["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

test_dataset = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

### Model preparation

In [245]:
model.model.encoder.trainable = False
model.model.encoder.get_config()

{'name': 'encoder',
 'trainable': False,
 'dtype': 'float32',
 'config': {'vocab_size': 63430,
  'decoder_vocab_size': 63430,
  'max_position_embeddings': 512,
  'd_model': 512,
  'encoder_ffn_dim': 2048,
  'encoder_layers': 6,
  'encoder_attention_heads': 8,
  'decoder_ffn_dim': 2048,
  'decoder_layers': 6,
  'decoder_attention_heads': 8,
  'dropout': 0.1,
  'attention_dropout': 0.0,
  'activation_dropout': 0.0,
  'activation_function': 'swish',
  'init_std': 0.02,
  'encoder_layerdrop': 0.0,
  'decoder_layerdrop': 0.0,
  'use_cache': True,
  'num_hidden_layers': 6,
  'scale_embedding': True,
  'share_encoder_decoder_embeddings': True,
  'return_dict': True,
  'output_hidden_states': False,
  'output_attentions': False,
  'torchscript': False,
  'torch_dtype': None,
  'use_bfloat16': False,
  'tf_legacy_loss': False,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'is_encoder_decoder': True,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': 

In [246]:
for layer in model.model.decoder.layers[:5]:
    layer.trainable = False

In [247]:
model.summary()

Model: "tf_marian_mt_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMarianMainLayer)   multiple                  77138944  
                                                                 
 final_logits_bias (BiasLaye  multiple                 63430     
 r)                                                              
                                                                 
Total params: 77,202,374
Trainable params: 4,466,176
Non-trainable params: 72,736,198
_________________________________________________________________


In [248]:
from tensorflow import keras

optimizer = keras.optimizers.Adam(learning_rate=0.01)
model.compile( optimizer=optimizer, metrics=['accuracy'])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [249]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=1)

2023-05-08 17:37:23.905907: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [27000]
	 [[{{node Placeholder/_0}}]]
2023-05-08 17:37:23.906159: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [27000]
	 [[{{node Placeholder/_0}}]]
2023-05-08 17:37:25.292947: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'tf_marian_mt_model_2/model/decoder/cond/ones/pack

 352/1687 [=====>........................] - ETA: 1:11:54 - loss: 8.6114 - accuracy: 0.0174

2023-05-08 17:56:40.545663: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 994582400 exceeds 10% of free system memory.
2023-05-08 17:56:42.689358: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 994582400 exceeds 10% of free system memory.


 355/1687 [=====>........................] - ETA: 1:12:17 - loss: 8.6017 - accuracy: 0.0173

2023-05-08 17:56:59.685891: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1027058560 exceeds 10% of free system memory.
2023-05-08 17:57:01.814853: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1027058560 exceeds 10% of free system memory.




2023-05-08 18:14:21.091464: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1108248960 exceeds 10% of free system memory.




2023-05-08 19:16:12.597482: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [1500]
	 [[{{node Placeholder/_0}}]]
2023-05-08 19:16:14.264392: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'tf_marian_mt_model_2/model/decoder/cond/ones/packed/tf_marian_mt_model_2/model/decoder/strided_slice_1' with dtype int32
	 [[{{node tf_marian_mt_model_2/model/decoder/cond/ones/packed/tf_marian_mt_model_2/model/decoder/strided_slice_1}}]]
2023-05-08 19:16:14.264509: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate



<keras.callbacks.History at 0x7f764e46b490>