In [13]:
# imports
import torch
import evaluate
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead, create_reference_model

from src.constants import MT_MODEL, MT_DATA_FILE, MT_SEED
from src.metrics import translation_reward

In [14]:
# Select the compute device: CUDA when torch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Bare expression so the notebook displays the chosen device.
device

0

In [15]:
# Read the parallel corpus from the CSV at MT_DATA_FILE, then split its rows
# into two partitions with a fixed random_state so the split is repeatable.
# NOTE(review): test_size=0.8 sends 80% of the rows to `test_dataset` and only
# 20% to `train_dataset` -- confirm this small training share is intentional.
translations = pd.read_csv(MT_DATA_FILE)
train_dataset, test_dataset = train_test_split(
    translations,
    test_size=0.8,
    random_state=MT_SEED,
)

In [16]:
# --- models ---
# Policy model: the pretrained seq2seq checkpoint named by MT_MODEL, loaded
# through TRL's value-head wrapper and moved to the selected device.
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(MT_MODEL).to(device)
# Reference model derived from the policy; handed to the trainer below as the
# reference against which the policy is compared.
model_ref = create_reference_model(model).to(device)

# Tokenizer matching the same checkpoint as the policy.
tokenizer = AutoTokenizer.from_pretrained(MT_MODEL)

# --- PPO configuration ---
# The training loop feeds one (query, response, reward) triple per step, so
# batch_size is 1. mini_batch_size is pinned to 1 as well: TRL validates that
# batch_size is a multiple of mini_batch_size * gradient_accumulation_steps,
# and relying on the library default breaks on releases where that default
# is larger than 1.
ppo_config = PPOConfig(
    batch_size=1,
    mini_batch_size=1,
    learning_rate=1.41e-5,
    #log_with="wandb",
)

# --- PPO trainer ---
ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)

# --- reward metric ---
# BLEU metric object; passed to translation_reward() in the training loop.
bleu = evaluate.load("bleu")



In [17]:
# Number of full passes over the training split.
NUM_EPOCHS = 1

# PPO fine-tuning loop: one optimisation step per (Polish, English) pair.
# For each pair the current policy translates the source sentence, the
# translation is scored against the reference via translation_reward(), and
# that score is used as the PPO reward for the step.
for epoch in range(NUM_EPOCHS):
    for text, translation in zip(train_dataset['Polish'], train_dataset['English']):
        print("\n----------------------------")
        print(f'Source sentence: {text}')
        print(f'Target sentence: {translation}')

        # Encode the source sentence as a (1, seq_len) tensor of token ids.
        query_tensor = tokenizer.encode(text, return_tensors="pt").to(device)

        # Generate the policy's translation with the model's default
        # generation settings.
        # NOTE(review): no sampling arguments are passed here; confirm the
        # default decoding gives PPO enough exploration.
        response_tensor = model.generate(input_ids=query_tensor)
        result_txt = [tokenizer.decode(response_tensor[0], skip_special_tokens=True)]
        print(f'Response sentence: {result_txt[0]}')

        # Score the generated translation against the reference.
        # presumably returns a one-element sequence of tensors; verify against
        # src.metrics.translation_reward.
        reward = translation_reward(result_txt, translation, bleu, device)
        print(f'Reward: {reward[0].item()}')

        # Run a single PPO step on this one-sample batch.
        train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)

        # log_stats expects the batch dict to carry "query" and "response"
        # keys; a dict keyed by the source sentence makes TRL warn and skip
        # the game log (and fails when a tracker such as wandb is enabled).
        ppo_trainer.log_stats(train_stats, {"query": [text], "response": result_txt}, reward)


----------------------------
Source sentence: Wszystko co wiecie o historii to kłamstwo. Na przykład:
Target sentence: Everything you think you know about history is a lie.
Response sentence: All you know about history is a lie.
Reward: 0.6956543326377869





----------------------------
Source sentence: Pewnie słyszał pan o sprawie Gabby Stone? Pan z Departamentu Sprawiedliwości?
Target sentence: I'm sure you've heard about the investigation regarding Gabby Stone.
Response sentence: You've probably heard of the Gabby Stone case?
Reward: 0.10000000149011612

----------------------------
Source sentence: Joey jest bardzo lojalny, ale nie masz pojęcia, ilu tam jest teraz wyznawców.
Target sentence: Joey is very loyal, but you have no idea how many followers are out there now.
Response sentence: Joey is very loyal, but you have no idea how many followers there are now.
Reward: 0.8495285511016846

----------------------------
Source sentence: Następne płatności dokonywane są zgodnie z zasadami przewidzianymi w art. 10.
Target sentence: Subsequent payments shall be made in accordance with the rules provided for in Article 10.
Response sentence: The following payments shall be made in accordance with the rules provided for in Article 10.
Reward:




----------------------------
Source sentence: w języku węgierskim: Fizikai ellenőrzés elvégezve [2535/2001/EK rendelet],
Target sentence: in Hungarian: Fizikai ellenőrzés elvégezve (2535/2001/EK rendelet),
Response sentence: in Hungarian: Fizikai ellenőrzés elvégezve [2535/2001/EK rendelet],
Reward: 0.6080942153930664

----------------------------
Source sentence: Możemy dziękować Bogu, że nie było większej tragedii.
Target sentence: And we can just thank God the tragedy wasn't worse.
Response sentence: We can thank God that there was no greater tragedy.
Reward: 0.0882352963089943

----------------------------
Source sentence: Bo jesteś kobietą moich marzeń i zasługujesz na idealne oświadczyny.
Target sentence: 'cause you're the woman of my dreams And you deserve the perfect proposal.
Response sentence: Because you're a woman of my dreams and you deserve a perfect proposal.
Reward: 0.4471047818660736





----------------------------
Source sentence: Dziesiąta rocznica to jest jaka -- brązowa, marmurowa, paździerzowa?
Target sentence: Bronze? Sandstone? Particleboard?
Response sentence: The tenth anniversary is what -- brown, marble, October?
Reward: 0.0

----------------------------
Source sentence: - Królowa Matka postanowiła pójść na dno wraz ze statkiem.
Target sentence: - The Queen Mother has elected to go down with the ship.
Response sentence: The Queen Mother has decided to go down with the ship.
Reward: 0.7226806282997131





----------------------------
Source sentence: Wprawimy to w ruch i będziemy spodziewać się sukcesu
Target sentence: We'll set it going and hope for the best
Response sentence: We're gonna get this thing moving and we're gonna expect success.
Reward: 0.02777777798473835





----------------------------
Source sentence: Chce pani usłyszeć wróżbę wartą tych 10 $ za wizytę?
Target sentence: You want your $10 of psychic prediction?
Response sentence: You want to hear a fortune worth $10 a visit?
Reward: 0.11538461595773697

----------------------------
Source sentence: To nie chwila, która chciałem, żeby trwała wiecznie!
Target sentence: This isn't the moment I wanted to last forever!
Response sentence: It's not the moment I wanted it to last!
Reward: 0.3546416461467743





----------------------------
Source sentence: Moja klientka miała styczność z trojgiem pacjentów.
Target sentence: My client touched three patients.
Response sentence: My client was in contact with three patients.
Reward: 0.2222222238779068

----------------------------
Source sentence: Tak, i musieliśmy go puścić godzinę przed zaginięciem Peck.
Target sentence: Yeah, and we let him go about an hour before Peck went missing.
Response sentence: Yes, and we had to let him go an hour before Peck went missing.
Reward: 0.6123924851417542





----------------------------
Source sentence: To wywołuje panikę i histerie. Szczerze mówiąc, jest dla nas niekorzystne.
Target sentence: Creates panic and hysteria,and frankly,it's bad for business.
Response sentence: It's causing panic and hysteria.
Reward: 0.09090909361839294

----------------------------
Source sentence: Nigdy nie pozwoliłam nikomu się dowiedzieć kim naprawdę jestem.
Target sentence: I've never let anyone know who I really am.
Response sentence: I never let anyone know who I really am.
Reward: 0.8845003247261047





----------------------------
Source sentence: Dostaniesz maksymalnie 5 lat i będziesz mógł wyjść warunkowo.
Target sentence: You'll get five years max and you'll be eligible for parole.
Response sentence: You'll get a maximum of 5 years and you can get out on parole.
Reward: 0.1315789520740509

----------------------------
Source sentence: Bardzo się zdziwił, kiedy wpadłem do tej restauracji... i rozbiłem mu nos!
Target sentence: And imagine his surprise when I dropped in at the restaurant today... and punched him in the face.
Response sentence: He was so surprised when I stopped by this restaurant... and I broke his nose!
Reward: 0.14607328176498413

----------------------------
Source sentence: Chciałam jej w odosobnieniu przez resztę jej życia.
Target sentence: I want her to be in solitary confinement for the rest of her life.
Response sentence: I wanted her in isolation for the rest of her life.
Reward: 0.4861934781074524

----------------------------
Source sentence: Rozumiem, że




----------------------------
Source sentence: - Obejrzyjmy kilka scen, które niektórym z nas nie poszły zbyt dobrze.
Target sentence: - Let's take a look at some moments that didn't go too well for a couple of us. - Uh-oh!
Response sentence: - Let's watch a few scenes that didn't go very well for some of us.
Reward: 0.25





----------------------------
Source sentence: I dlatego spotyka się z tobą tutaj, a nie u siebie, świetnie.
Target sentence: So he comes here, instead of you going to his place.
Response sentence: And that's why he's meeting you here and not at his place, great.
Reward: 0.04545454680919647





----------------------------
Source sentence: Członek Światowej Federacji Międzynarodowych Konkursów Muzycznych w Genewie.
Target sentence: A member of the World Federation of International Music Competitions in Geneva
Response sentence: Member of the World Federation of International Music Competitions in Geneva.
Reward: 0.7112201452255249

----------------------------
Source sentence: To nie ma nic z tym wspólnego, tu chodzi o twoją pomoc.
Target sentence: -l don't have any grand plan. -Who invented sandwich night? That has nothing to do with this.
Response sentence: It's got nothing to do with this. It's about your help.
Reward: 0.22906196117401123

----------------------------
Source sentence: Całe konstelacje nietknięte przez zarazę, nieświadome naszego cierpienia.
Target sentence: Whole constellations, untouched by disease, oblivious to our suffering.
Response sentence: Entire constellations untouched by the plague, unaware of our suffering.
Reward: 0.13333334028720856





----------------------------
Source sentence: Nie potrzebuję dzisiejsze nocy wysłuchiwać twojego gówna, starcze.
Target sentence: - I do not need whining now.
Response sentence: I don't need to hear your shit tonight, old man.
Reward: 0.06666667014360428





----------------------------
Source sentence: W latach 70-tych rysowałam komisky, w ktorych sama siebie oskarżałam.
Target sentence: I did books in the '70s that were self-deprecating.
Response sentence: In the '70s, I was drawing a comedy that I'd accused myself of.
Reward: 0.07894736528396606

----------------------------
Source sentence: Nie będziesz czuła się tak odcięta od świata zewnętrznego.
Target sentence: You won't feel quite so shut out from the outside world.
Response sentence: You're not gonna feel so cut off from the outside world.
Reward: 0.3599222004413605





----------------------------
Source sentence: A szczerze mówiąc stoję w martwym punkcie. Zajmiesz się śledztwem osobiście?
Target sentence: And to be honest... ..I am stuck.
Response sentence: And to be honest, I'm at a dead end.
Reward: 0.21880505979061127

----------------------------
Source sentence: I poza tym moja żona Virginia chroni drzwi do SCHICKEL HALL
Target sentence: AND BESIDES, MY WIFE VIRGINIA GUARDS THE DOORS OF SCHICKEL HALL
Response sentence: And my wife Virginia's protecting the door to SCHICKEL HALL
Reward: 0.05263157933950424

----------------------------
Source sentence: - Strzelaj. Walka z nim w sądzie... jest nieprzyjemna?
Target sentence: Is opposing him in court kind of... uncomfortable?
Response sentence: It's an unpleasant fight with him in court.
Reward: 0.0714285746216774

----------------------------
Source sentence: Wiem, że mnie nienawidzisz, zatem przepraszam za to nieporozumienie.
Target sentence: I know that you must hate me, and I just wanted to sa




----------------------------
Source sentence: Np. "Czy zrobiłbym to z kobietą, która była mężczyzną?".
Target sentence: Like, would I do it with a woman that used to be a man?
Response sentence: Like, "Would I do it with a woman who was a man?"
Reward: 0.4143519997596741





----------------------------
Source sentence: Był jednym z nich, a przynajmniej za takiego go uważali.
Target sentence: Their ranks were split, for this Theodor Herzl was not one of the Eastern Jews, not a Jew from Russia.
Response sentence: He was one of them, or at least they thought he was.
Reward: 0.0535714291036129





----------------------------
Source sentence: Joseph to dobry człowiek i jego testy były Neuro-negatywne.
Target sentence: Joseph is a good man and he did test Neuro negative.
Response sentence: Joseph's a good man, and his tests were neuro-negative.
Reward: 0.0882352963089943

----------------------------
Source sentence: /Dowiedział się o trasie konwoju /i kazał nam cię odbić.
Target sentence: He learned of your route and ordered us to intercept.
Response sentence: He's learned of the convoy's tour and he's told us to take you back.
Reward: 0.1315789520740509

----------------------------
Source sentence: Oboje wiemy, że nic się nie zdarzy, Gina, może zakończymy te gierki?
Target sentence: We both know nothing's going to happen, Gina, so why don't we cut the act? Whoa, come here.
Response sentence: We both know nothing's going to happen, Gina, maybe we can end these games.
Reward: 0.3960511386394501

----------------------------
Source sentence: To miasto odwróci się od nas, gdy dow




----------------------------
Source sentence: Biorąc pod uwagę twój stan zdrowia, pewnie ci trudno nawet w to uwierzyć.
Target sentence: Given your present condition, you might not even make it through.
Response sentence: Given your medical condition, it's probably hard to even believe.
Reward: 0.11764705926179886

----------------------------
Source sentence: Po tylu latach dalej trzymasz się z tym pojebańcem?
Target sentence: You and that crazy motherfucker still tight after all these years.
Response sentence: After all these years, you're still hanging with this fucking guy?
Reward: 0.07894736528396606

----------------------------
Source sentence: - Oto, czego się dowiedziałam... o doktorze Hugo Pine.
Target sentence: - Here's that information you wanted... on Dr. Hugo Pine.
Response sentence: Here's what I've learned... of Dr. Hugo Pine.
Reward: 0.3136214315891266

----------------------------
Source sentence: Chcę ci powiedzieć Jody, że z Benem u boku.. możesz długo rano spać.
T




----------------------------
Source sentence: Wiecie, tworzenie sie Stanow, I Wojna, 11 wrzesnia.
Target sentence: You know, formation of the Fed, World War I, 9/11.
Response sentence: You know, creating states, and war, 11th of September.
Reward: 0.09375





----------------------------
Source sentence: Ricardo Klement. Ricardo Klement to nie kto inny, tylko Adolf Eichmann.
Target sentence: Ricardo Klement was none other than Adolf Eichmann.
Response sentence: Ricardo Klement. Ricardo Klement is none other than Adolf Eichmann.
Reward: 0.5846235156059265





----------------------------
Source sentence: Myślisz, że umieszczą nas w e własnej celi zamiast ze zwyczajnymi przestępcami.
Target sentence: You'd think they'd put us in our own cell block instead of with the common criminals.
Response sentence: You think they're going to put us in their own cell instead of common criminals.
Reward: 0.2380952388048172





----------------------------
Source sentence: Nie możesz sobie pozwolić żeby stracić więcej krwi.
Target sentence: You... you can't afford to lose more blood.
Response sentence: You can't afford to lose any more blood.
Reward: 0.45990243554115295





----------------------------
Source sentence: "Czy życzy pan sobie ochrony jej w banku do powrotu szeryfa?
Target sentence: Do you wish her guarded at the bank with the sheriff away?
Response sentence: "Do you wish to protect her in the bank for the return of the sheriff?
Reward: 0.1315789520740509





----------------------------
Source sentence: Dopiero zaczynamy odkrywać prawdziwą naturę inteligencji.
Target sentence: You see, the fact is that we're only now just beginning to understand the true nature of intelligence.
Response sentence: We're just beginning to discover the true nature of intelligence.
Reward: 0.3401889503002167





----------------------------
Source sentence: Podpisał wczoraj umowę z Fundacją Wiecznego Postępu.
Target sentence: He signed a contract with the Forever Forward Foundation yesterday.
Response sentence: He signed an agreement with the Foundation of Eternal Progress yesterday.
Reward: 0.20000000298023224


