## Load pretrained model locally

In [None]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Locations of the locally stored tokenizer and model artifacts.
TOKENIZER_PATH = "./model/tokenizer"
MODEL_PATH = "./model/model.h5"

# Restore both objects from local storage via the transformers Auto* classes.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

In [None]:
# Display a summary of the loaded model.
model.summary()

In [None]:
# Inspect the configuration returned by the inner `model.model` object.
model.model.get_config()

In [None]:
# Inspect the configuration returned by the encoder of the inner model.
model.model.encoder.get_config()

In [None]:
# Show the tokenizer's source/target language pair and its maximum length.
print(tokenizer.source_lang, "-", tokenizer.target_lang)
print(f"Max length: {tokenizer.model_max_length}")

## An attempt at applying transfer learning to the model

### Load data from file

In [None]:
def _read_stripped_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace.

    Blank lines are kept (as empty strings) so that line N of one corpus file
    still corresponds to line N of the other.
    """
    # Read-only mode: the file is never written here, and "r+" would fail on
    # a read-only dataset. The encoding is pinned so Polish diacritics do not
    # depend on the platform default.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(path, "r", encoding="utf-8") as corpus_file:
        return [line.strip() for line in corpus_file]


polish_list = _read_stripped_lines("./dataset/polish.txt")
english_list = _read_stripped_lines("./dataset/english.txt")

In [22]:
# Peek at the first five entries of each corpus, Polish first, then English.
for corpus in (polish_list, english_list):
    print(corpus[:5])

['Polska', 'Europejski Fundusz Społeczny w Polsce, 2007-2013', 'Finansowany z EFS Program Operacyjny Kapitał Ludzki 2007-2013 świadczy o zaangażowaniu się Polski w inwestowanie w ludzi i ich umiejętności oraz o gotowości do podejmowania wyzwań, przed którymi obecnie staje.', 'Położenie większego nacisku na edukację i szkolenia wydaje się być najlepszym sposobem na zwalczanie bezrobocia, wykluczenia społecznego oraz wyrównywania różnic między regionami.', 'Udoskonalenie systemów edukacyjnych i szkoleniowych, reformy rynku pracy oraz wzmocnienie sektora ekonomii społecznej pozwolą osiągnąć długotrwały sukces.']
['Poland', 'The European Social Fund in Poland, 2007-2013', 'Poland’s ESF Operational Programme for 2007-2013 – Human Capital – is evidence of the country’s commitment to investing in people and their skills and determination to tackle the challenges Poland faces.', 'More education and training is the key instrument for combating unemployment, social exclusion and strong regional 

In [28]:
# Largest `len()` among the Polish entries (for strings this counts characters).
max(map(len, polish_list))

TypeError: Scalar tensor has no `len()`

### Create a preprocessing function for our data

In [None]:
# Fixed lengths that source and target sequences are padded/truncated to.
max_input_length = 128
max_target_length = 128


def preprocess_fun(inputs, targets):
    """Tokenize source texts and their target translations.

    Args:
        inputs: Source-language text(s) handed to the tokenizer.
        targets: Target-language text(s), tokenized inside the tokenizer's
            target-tokenizer context.

    Returns:
        The tokenizer output for `inputs`, extended with a ``"labels"`` entry
        holding the ``input_ids`` of the tokenized `targets`.
    """
    # truncation=True is needed in addition to max_length: with
    # padding="max_length" alone, longer texts are NOT cut, which yields
    # sequences of differing lengths beyond the limit.
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_target_length,
            padding="max_length",
            truncation=True,
        )

    # NOTE(review): padded label positions keep the pad token id; if the
    # training loss is meant to ignore padding they may need replacing with
    # -100 - confirm against the training code.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Split point: roughly the first 95% of the pairs go to training, the rest
# to validation.
idx = round(0.95 * len(polish_list))

train_polish, val_polish = polish_list[:idx], polish_list[idx:]
train_english, val_english = english_list[:idx], english_list[idx:]

train_data = preprocess_fun(train_polish, train_english)
val_data = preprocess_fun(val_polish, val_english)



In [None]:
# Show the encoded form of the first training pair next to its source text.
sample_ids = train_data['input_ids'][0]
sample_mask = train_data['attention_mask'][0]
sample_label = train_data['labels'][0]
print(f"Input_ids:{sample_ids}, attention mask:{sample_mask}, label:{sample_label} dla slowa {polish_list[0]}")

Input_ids:[3285, 0, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429, 63429], attention mask:[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 