In [None]:
# Mount Google Drive into the Colab filesystem; later cells read and write
# files under "/content/gdrive/My Drive/".
from google.colab import drive
drive.mount('/content/gdrive')

#### Загружаем данные

In [None]:
import pandas as pd
import json

# Load the dataset; the 'command' and 'orig' columns are read by later cells.
df = pd.read_csv('commands-entity-version-3-with-origs-cleaned.csv')
# Keep an independent copy of the frame as loaded (df itself is reassigned to a
# shuffled version in the next cell).
backwards = df.copy()

In [None]:
# Shuffle all rows. The train/validation split further down is a positional
# 80/20 slice, so the row order decides which pairs land in which split; a
# fixed seed makes that split reproducible between runs.
df = df.sample(frac=1, random_state=42)

#### Оставляем только пары с хорошей перплексией

In [None]:
import numpy as np
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from tqdm.auto import tqdm

# Causal language model (and its tokenizer) used by get_gpt2_ppl to score texts.
mname = "sberbank-ai/rugpt3small_based_on_gpt2"
gpt_tokenizer = AutoTokenizer.from_pretrained(mname)
gpt_model = AutoModelForCausalLM.from_pretrained(mname)
# Use the GPU when there is one, but fall back to the CPU instead of raising,
# matching the device selection used for the paraphraser model later on.
gpt_model.to("cuda" if torch.cuda.is_available() else "cpu")
# The model is only used for scoring, never trained: make inference mode explicit.
gpt_model.eval()

# Habr version


def get_gpt2_ppl(test_sentences, aggregate=True, sep="\n"):
    """Score texts with the module-level ``gpt_model`` / ``gpt_tokenizer``.

    Despite the name, the value is NOT exponentiated: for every text the score
    is the loss the causal LM returns when the text is used as its own labels
    (mean cross-entropy per predicted token, i.e. the log of the perplexity).

    Args:
        test_sentences: iterable of texts to score.
        aggregate: when True, return a single number -- the average of the
            per-text losses weighted by each text's number of predicted tokens.
            When False, return the per-text values.
        sep: string placed before and after every text prior to tokenization.

    Returns:
        If ``aggregate`` is True: the token-weighted mean loss, or ``nan`` when
        there is nothing to average (no texts, or no text with at least two
        tokens).
        Otherwise: a tuple ``(losses, weights)`` of numpy arrays with one entry
        per text, where ``weights`` holds the number of predicted tokens.
    """
    losses = []
    weights = []
    for text in test_sentences:
        encodings = gpt_tokenizer(f"{sep}{text}{sep}", return_tensors="pt")
        input_ids = encodings.input_ids.to(gpt_model.device)

        # The first token has no left context, so a sequence of n tokens
        # yields n - 1 predictions.
        n_predicted = max(0, input_ids.shape[1] - 1)
        if n_predicted > 0:
            with torch.no_grad():
                # With labels supplied, the first output is the loss.
                outputs = gpt_model(input_ids, labels=input_ids)
            loss = outputs[0].item()
        else:
            # Nothing to predict: contributes zero weight to the average.
            loss = 0.0
        losses.append(loss)
        weights.append(n_predicted)

    losses, weights = np.array(losses, dtype=float), np.array(weights)
    if aggregate:
        total_weight = weights.sum()
        if total_weight == 0:
            # Avoid 0/0 (ZeroDivisionError for an empty list, warning + nan otherwise).
            return float("nan")
        return np.dot(losses, weights) / total_weight
    return losses, weights

In [None]:
# Keep a (command, orig) pair only when both texts score at most 6.2 with
# get_gpt2_ppl; the original is scored only if the command already passed.
commands = list(df['command'])
origs = list(df['orig'])

all_pairs = [
    (command, orig)
    for command, orig in zip(commands, origs)
    if get_gpt2_ppl([command]) <= 6.2 and get_gpt2_ppl([orig]) <= 6.2
]

In [None]:
# Store the filtered pairs on Drive as JSON; the next cell reads them back.
with open("/content/gdrive/My Drive/all_pairs", "w") as pairs_file:
    pairs_file.write(json.dumps(all_pairs))

In [None]:
# Read the stored pairs back from Drive (after the JSON round trip each pair
# is a two-element list rather than a tuple).
with open("/content/gdrive/My Drive/all_pairs", "r") as pairs_file:
    all_pairs = json.loads(pairs_file.read())

### Dataset & DataLoader

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
# from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW

# Tokenizer and seq2seq model both come from the same pretrained checkpoint.
t5_checkpoint = "cointegrated/rut5-base-paraphraser"
tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint)

class ParaphraseDataset(Dataset):
    """Dataset over (input_text, target_text) pairs, tokenized on access.

    Both texts of a pair are encoded with the module-level ``tokenizer`` and
    padded or truncated to 128 tokens each time an item is requested.
    """

    def __init__(self, data):
        # data: indexable collection whose items unpack into (input_text, target_text)
        self.data = data

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize a single text into fixed-length (128) PyTorch tensors."""
        return tokenizer.encode_plus(
            text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the encoded source and target of pair ``idx`` as a dict of tensors."""
        source_text, reference_text = self.data[idx]
        source = self._encode(source_text)
        reference = self._encode(reference_text)

        # The tokenizer returns tensors with a leading batch axis; squeeze()
        # drops it so the DataLoader can stack items into batches.
        item = {}
        item["input_ids"] = source["input_ids"].squeeze()
        item["attention_mask"] = source["attention_mask"].squeeze()
        item["decoder_input_ids"] = reference["input_ids"].squeeze()
        item["decoder_attention_mask"] = reference["attention_mask"].squeeze()
        return item

In [None]:
# Positional 80/20 split: the first 80% of the pairs are used for training,
# the remainder for validation.
split_at = int(len(all_pairs) * 0.8)
train_data, val_data = all_pairs[:split_at], all_pairs[split_at:]

In [None]:
# Wrap both splits; ParaphraseDataset tokenizes a pair only when it is indexed.
train_dataset = ParaphraseDataset(train_data)
val_dataset = ParaphraseDataset(val_data)

# Training batches (16 items) are reshuffled each epoch; validation uses
# batches of 8 in dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
# NOTE(review): criterion is not referenced by the training loop in this
# notebook, which reads the loss from outputs.loss -- confirm it is still needed.
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Ask PyTorch to release unused cached GPU memory before training starts.
torch.cuda.empty_cache()

### Train

In [None]:
num_epochs = 5


def _batch_loss(batch):
    """Run one forward pass on a dataloader batch and return the model's loss."""
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    target_ids = batch["decoder_input_ids"].to(device)
    target_mask = batch["decoder_attention_mask"].to(device)

    # Padding positions must not contribute to the loss: -100 is the label
    # value the model's cross-entropy ignores.
    labels = target_ids.masked_fill(target_mask == 0, -100)

    # Only `labels` is passed, so the model derives the decoder inputs itself
    # by shifting the labels one step to the right. Passing the unshifted
    # targets as decoder_input_ids as well would show the decoder, at every
    # position, the very token it is asked to predict.
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    return outputs.loss


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0

    for batch in train_dataloader:
        loss = _batch_loss(batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses.
    train_loss /= len(train_dataloader)

    # ---- validation pass (no gradient tracking, no parameter updates) ----
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch in val_dataloader:
            val_loss += _batch_loss(batch).item()

        val_loss /= len(val_dataloader)

    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

    # Save the weights after every epoch.
    # NOTE(review): with the +5 offset the files are named paraphraser-5 ...
    # paraphraser-9, while the Predict section loads 'paraphraser-3' -- confirm
    # the offset is intended (e.g. continuing an earlier run).
    model_save_name = f'paraphraser-{epoch+5}'
    path = f"/content/gdrive/My Drive/{model_save_name}"
    torch.save(model.state_dict(), path)

### Predict

In [None]:
# Restore fine-tuned weights from Drive.
model_save_name = 'paraphraser-3'
path = f"/content/gdrive/My Drive/{model_save_name}"
# map_location moves the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads when only the CPU is available.
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# Sample several paraphrases of one sentence.
input_sentence = "топаю через горы"
input_tokens = tokenizer.encode_plus(input_sentence, padding="max_length", truncation=True, max_length=128,
                                     return_tensors="pt")
input_tokens = {k: v.to(device) for k, v in input_tokens.items()}

num_return_sequences = 10
# One generate() call draws all samples at once, so the input is passed to the
# model a single time instead of once per requested paraphrase.
generated_ids = model.generate(input_tokens["input_ids"], attention_mask=input_tokens["attention_mask"],
                               max_length=128, num_return_sequences=num_return_sequences,
                               do_sample=True, top_k=30, top_p=0.95)

# Each row of generated_ids is one sampled sequence.
paraphrases = [tokenizer.decode(sequence_ids, skip_special_tokens=True)
               for sequence_ids in generated_ids.cpu()]

print(f"Input sentence: {input_sentence}\n")
for paraphrase in paraphrases:
    print(paraphrase)

In [None]:
import pandas as pd

# Re-read the CSV; this replaces the shuffled df from the earlier cell with the
# rows in file order.
df = pd.read_csv('commands-entity-version-3-with-origs-cleaned.csv')

In [None]:
# Produce one sampled paraphrase for every command in the dataset, in order.
references = list(df['command'])
predictions = []
for input_sentence in references:
    encoded = tokenizer.encode_plus(
        input_sentence,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Move every tensor of the encoding onto the model's device.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    sampled_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=128,
        num_return_sequences=1,
        do_sample=True,
        top_k=30,
        top_p=0.95,
    )

    # A single sequence is requested, so only row 0 is decoded.
    predictions.append(tokenizer.decode(sampled_ids.cpu()[0], skip_special_tokens=True))

In [None]:
# Write the generated paraphrases to the Colab runtime's local disk as JSON.
with open("/content/paraphrase-pred", "w") as predictions_file:
    predictions_file.write(json.dumps(predictions))