In [8]:
# imports
import torch
import evaluate
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead, create_reference_model

In [9]:
pl_texts = ["Stół z powyłamywanymi nogami.",
            "Król Karol kupił królowej Karolinie korale koloru koralowego.",
            "Lojalna Jola i nielojalna Jola.",
            "W czasie suszy szosa sucha.",
            "I cóż że ze Szwecji?",
            "Czy rak trzyma w szczypcach strzęp szczawiu czy trzy części trzciny?",
            "W Szczebrzeszynie chrząszcz brzmi w trzcinie.",
            "Przeleciały trzy pstre przepiórzyce przez trzy pstre kamienice.",
            "Pójdźże, kiń tę chmurność w głąb flaszy.",
            "Drabina z powyłamywanymi szczeblami.",
            ]

en_texts = ["A table with broken legs.",
            "King Carol bought coral-coloured beads for Queen Caroline.",
            "Loyal Jola and disloyal Jola.",
            "In dry weather, the street is dry.",
            "So what that it is from Sweden?",
            "Does the crab hold in its claws a piece of dock or three pieces of reed?",
            "In Szczebrzeszyn, a beetle buzzes in the reed.",
            "Three speckled quails flew past three speckled tenement houses.",
            "Come on, stick this cloud into the bottom of the flask.",
            "A ladder with broken rungs.",
            ]

In [10]:
# get models
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained('Helsinki-NLP/opus-mt-pl-en').to('cuda')
model_ref = create_reference_model(model).to('cuda')

tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-pl-en')



In [11]:
# initialize trainer
ppo_config = PPOConfig(
    batch_size=1,
)

# create a ppo trainer
ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)

# reward model
bleu = evaluate.load("bleu")



In [17]:
for text, translation in zip(pl_texts, en_texts):
    print("\n----------------------------")
    print(f'Source sentence: {text}')
    print(f'Target sentence: {translation}')
    # encode a query
    query_tensor = tokenizer.encode(text, return_tensors="pt").to('cuda')

    # get model response
    response_tensor = model.generate(input_ids=query_tensor)
    result_txt = [tokenizer.decode(response_tensor[0], skip_special_tokens=True)]
    print(f'Response sentence: {result_txt[0]}')

    # define a reward for response
    results = bleu.compute(predictions=result_txt, references=[translation])
    reward = [torch.tensor(results['bleu'], device=model.pretrained_model.device)]
    print(f'BLEU reward: {results["bleu"]}')

    # train model for one step with ppo
    train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)


----------------------------
Source sentence: Stół z powyłamywanymi nogami.
Target sentence: A table with broken legs.
Response sentence: The broken-leg table.
BLEU reward: 0.0

----------------------------
Source sentence: Król Karol kupił królowej Karolinie korale koloru koralowego.
Target sentence: King Carol bought coral-coloured beads for Queen Caroline.
Response sentence: King Carol had bought Queen Carolina corals of coral colors.
BLEU reward: 0.0

----------------------------
Source sentence: Lojalna Jola i nielojalna Jola.
Target sentence: Loyal Jola and disloyal Jola.
Response sentence: Loyalty Yola and Loyalty Yola.
BLEU reward: 0.0

----------------------------
Source sentence: W czasie suszy szosa sucha.
Target sentence: In dry weather, the street is dry.
Response sentence: During droughts dry road.
BLEU reward: 0.0

----------------------------
Source sentence: I cóż że ze Szwecji?
Target sentence: So what that it is from Sweden?
Response sentence: And what is that from 

In [16]:
results

{'bleu': 0.0,
 'precisions': [0.4, 0.0, 0.0, 0.0],
 'brevity_penalty': 0.8187307530779819,
 'length_ratio': 0.8333333333333334,
 'translation_length': 5,
 'reference_length': 6}