In [1]:
# %load_ext autoreload
# %autoreload 2

In [None]:
!pip install -r requirements.txt

In [3]:
from run_pplm_paraphrase import run_pplm_example

In [4]:
import torch
# Import both classes from the package root: the flat `transformers.modeling_gpt2`
# module path was moved under `transformers.models.gpt2` in transformers v4,
# whereas the top-level export is available before and after that move.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint name shared by the language model and its tokenizer.
pretrained_model = "gpt2-medium"
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained LM head model, configured to also return hidden states.
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model,
    output_hidden_states=True
)
model.to(device)
model.eval()  # switch the module to evaluation mode
# Freeze the weights: no model parameter should accumulate gradients.
for param in model.parameters():
    param.requires_grad = False

tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

# Let's generate some text!
When you specify a number of samples, PPLM will first generate a sample without any modification for reference (called "unperturbed"), and then the number of samples specified (called "perturbed"). If you want to generate different samples given the same parameters, change the `seed` value to a number other than 0 (the default value).

In [5]:
# Source sentence (biology domain) passed as `cond_text` in the cells below.
phrase = (
    "Endemic types or species are especially likely "
    "to develop on islands."
)

Ниже приведены примеры фраз, сгенерированных **PPLM** на основе исходной. В отдельных случаях подбираются подходящие синонимы, но в целом это скорее генерация на заданную тему, чем перефразирование.

In [18]:
# Sweep over (num_iterations, kl_scale, top_k) at a fixed step size of 0.04.
stepsize = 0.04
param_grid = [
    (8, 1e-3, 1),
    (12, 1e-3, 1),
    (12, 1e-3, 3),
    (8, 1e-5, 3),
    (12, 1e-5, 3),
]

for num_iterations, kl_scale, top_k in param_grid:
    # Header line identifying the settings of the run that follows.
    print(f"{stepsize} | {num_iterations} | {kl_scale} | {top_k} |")
    # The call returns a pair; both elements are discarded here.
    _, _ = run_pplm_example(
        model,
        tokenizer,
        device,
        cond_text=phrase,
        stepsize=stepsize,
        num_iterations=num_iterations,
        kl_scale=kl_scale,
        top_k=top_k,
    )

0.04 | 8 | 0.001 | 1 |
Endemic types of bacteria are particularly likely to develop resistance to antibiotics
<|endoftext|>Endemic types of bacteria are particularly likely to develop resistance to antibiotics

0.04 | 12 | 0.001 | 1 |
Endemic types or species are the most common and most diverse.
<|endoftext|>Endemic types or species are the most common and most diverse.

0.04 | 12 | 0.001 | 3 |
Endemic forms of cancer are especially likely to develop among women.
<|endoftext|>Endemic forms of cancer are especially likely to develop among women.

0.04 | 8 | 1e-05 | 3 |
Endemic diseases, including diabetes, have to develop on islands.
<|endoftext|>Endemic diseases, including diabetes, have to develop on islands.

0.04 | 12 | 1e-05 | 3 |
Endemic forms of species are especially likely to develop on islands.
<|endoftext|>Endemic forms of species are especially likely to develop on islands.



In [22]:
# Sweep over (num_iterations, kl_scale) with step size and top_k held fixed.
stepsize, top_k = 0.1, 1
iteration_kl_pairs = [
    (4, 1e-2),
    (12, 1e-2),
    (4, 1e-3),
    (6, 1e-4),
    (8, 1e-5),
]

for num_iterations, kl_scale in iteration_kl_pairs:
    # Print the settings as a "a | b | c | d |" header before each run.
    print(stepsize, num_iterations, kl_scale, top_k, sep=' | ', end=' |\n')
    # The call returns a pair; both elements are discarded here.
    _, _ = run_pplm_example(
        model,
        tokenizer,
        device,
        cond_text=phrase,
        stepsize=stepsize,
        num_iterations=num_iterations,
        kl_scale=kl_scale,
        top_k=top_k,
    )

0.1 | 4 | 0.01 | 1 |
Endemic diseases are the leading causes of death in the world.
<|endoftext|>Endemic diseases are the leading causes of death in the world.

0.1 | 12 | 0.01 | 1 |
Endemic types or species are the most common types on islands.
<|endoftext|>Endemic types or species are the most common types on islands.

0.1 | 4 | 0.001 | 1 |
Endemic types of bacteria are particularly likely to develop resistance to antibiotics
<|endoftext|>Endemic types of bacteria are particularly likely to develop resistance to antibiotics

0.1 | 6 | 0.0001 | 1 |
Endemic types or species are particularly likely to develop on islands.
<|endoftext|>Endemic types or species are particularly likely to develop on islands.

0.1 | 8 | 1e-05 | 1 |
Endemic types or species are especially likely to develop cancer..
<|endoftext|>Endemic types or species are especially likely to develop cancer..



Четвёртый пример (0.1, 6, 1e-4, 1):  
    `Endemic types or species are particularly likely to develop on islands.` - единственный случай с сохранением смысла.

In [34]:
# Two runs at step size 0.2 that differ only in top_k.
stepsize = 0.2
param_grid = [
    (4, 1e-3, 1),
    (4, 1e-3, 2),
]

for num_iterations, kl_scale, top_k in param_grid:
    # Header line identifying the settings of the run that follows.
    print(f"{stepsize} | {num_iterations} | {kl_scale} | {top_k} |")
    # The call returns a pair; both elements are discarded here.
    _, _ = run_pplm_example(
        model,
        tokenizer,
        device,
        cond_text=phrase,
        stepsize=stepsize,
        num_iterations=num_iterations,
        kl_scale=kl_scale,
        top_k=top_k,
    )

0.2 | 4 | 0.001 | 1 |
Endemic types or species are found in the world on all continents
<|endoftext|>Endemic types or species are found in the world on all continents

0.2 | 4 | 0.001 | 2 |
Endemic forms of species are the likely to develop in the future
<|endoftext|>Endemic forms of species are the likely to develop in the future



При некоторых ограничениях на параметры генерируется точная копия фразы.

In [30]:
# Sweep over (stepsize, num_iterations, kl_scale) with top_k fixed at 1.
top_k = 1

for stepsize, num_iterations, kl_scale in [
    (0.04, 24, 1e-4),
    (0.04, 12, 1e-5),
    (0.1, 12, 1e-5),
    # The two entries with 8 iterations previously read 6 in the code, while
    # the recorded output of this cell (and the summary table at the end of
    # the notebook) was produced with 8; the grid now matches that output.
    (0.1, 8, 1e-6),
    (0.1, 24, 1e-4),
    (0.2, 8, 1e-4),
]:
    # Header line identifying the settings of the run that follows.
    print(stepsize, '|', num_iterations, '|', kl_scale, '|', top_k, '|')
    # The call returns a pair; both elements are discarded here.
    _, _ = run_pplm_example(
        model, tokenizer, device,
        cond_text=phrase,
        stepsize=stepsize,
        num_iterations=num_iterations,
        kl_scale=kl_scale,
        top_k=top_k
    )

0.04 | 24 | 0.0001 | 1 |
Endemic types or species are especially likely to develop on islands.
<|endoftext|>Endemic types or species are especially likely to develop on islands.

0.04 | 12 | 1e-05 | 1 |
Endemic types or species are especially likely to develop on islands.
<|endoftext|>Endemic types or species are especially likely to develop on islands.

0.1 | 12 | 1e-05 | 1 |
Endemic types or species are especially likely to develop on islands.
<|endoftext|>Endemic types or species are especially likely to develop on islands.

0.1 | 8 | 1e-06 | 1 |
Endemic types of species are especially likely to develop on islands.
<|endoftext|>Endemic types of species are especially likely to develop on islands.

0.1 | 24 | 0.0001 | 1 |
Endemic types or species are especially likely to develop on islands.
<|endoftext|>Endemic types or species are especially likely to develop on islands.

0.2 | 8 | 0.0001 | 1 |
Endemic types or species are especially likely to develop on islands.
<|endoftext|>Endemi

Предложения из комментариев к коду.

In [8]:
# Technical two-sentence input passed as `cond_text` in the next cell.
phrase = (
    "The discriminator model is more complex. "
    "It takes both real image samples and random noise seeds as input."
)

In [44]:
# Sweep over (stepsize, num_iterations, kl_scale) with top_k fixed at 1.
top_k = 1
param_grid = [
    (0.2, 18, 1e-3),
    (0.2, 7, 1e-4),
    (0.2, 8, 1e-4),
    (0.1, 12, 1e-5),
    (0.2, 12, 1e-5),
]

for stepsize, num_iterations, kl_scale in param_grid:
    # Print the settings as a "a | b | c | d |" header before each run.
    print(stepsize, num_iterations, kl_scale, top_k, sep=' | ', end=' |\n')
    # The call returns a pair; both elements are discarded here.
    _, _ = run_pplm_example(
        model,
        tokenizer,
        device,
        cond_text=phrase,
        stepsize=stepsize,
        num_iterations=num_iterations,
        kl_scale=kl_scale,
        top_k=top_k,
    )

0.2 | 18 | 0.001 | 1 |
The first thing you notice more complex. It takes both real-time and random noise to as input.
<|endoftext|>The first thing you notice more complex. It takes both real-time and random noise to as input.

0.2 | 7 | 0.0001 | 1 |
The discriminator model is more complex. It takes both real-world and random noise into as input.
<|endoftext|>The discriminator model is more complex. It takes both real-world and random noise into as input.

0.2 | 8 | 0.0001 | 1 |
The discriminator model is more complex. It takes both real world samples and random noise to as input.
<|endoftext|>The discriminator model is more complex. It takes both real world samples and random noise to as input.

0.1 | 12 | 1e-05 | 1 |
The discriminator model is more complex. It takes both real image samples and random noise seeds as input.
<|endoftext|>The discriminator model is more complex. It takes both real image samples and random noise seeds as input.

0.2 | 12 | 1e-05 | 1 |
The discriminator mod

In [9]:
# Technical single-sentence input passed as `cond_text` in the next cell.
phrase = (
    "The GPT2 Model transformer with a language modeling head on top "
    "(linear layer with weights tied to the input embeddings)."
)

In [57]:
# Two runs that differ only in the number of iterations (top_k fixed at 1).
top_k = 1
param_grid = [
    (0.1, 22, 1e-5),
    (0.1, 24, 1e-5),
]

for stepsize, num_iterations, kl_scale in param_grid:
    # Header line identifying the settings of the run that follows.
    print(f"{stepsize} | {num_iterations} | {kl_scale} | {top_k} |")
    # The call returns a pair; both elements are discarded here.
    _, _ = run_pplm_example(
        model,
        tokenizer,
        device,
        cond_text=phrase,
        stepsize=stepsize,
        num_iterations=num_iterations,
        kl_scale=kl_scale,
        top_k=top_k,
    )

0.1 | 22 | 1e-05 | 1 |
The GPT2 Model is with a language modeling head on top (linear layer with layers tied to the input embeddings).
<|endoftext|>The GPT2 Model is with a language modeling head on top (linear layer with layers tied to the input embeddings).

0.1 | 24 | 1e-05 | 1 |
The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).
<|endoftext|>The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).



И посмотрим на обычную речь.

In [10]:
# Informal-speech input passed as `cond_text` in the cells below.
phrase = (
    "I couldn't bear to watch it. "
    "And I thought the UA loss was embarrassing"
)

In [14]:
# Sweep over (stepsize, num_iterations, kl_scale) with top_k fixed at 1.
top_k = 1
param_grid = [
    (0.04, 16, 1e-3),
    (0.04, 16, 1e-4),
    (0.1, 8, 1e-4),
    (0.1, 12, 1e-3),
    (0.2, 8, 1e-6),
]

for stepsize, num_iterations, kl_scale in param_grid:
    # Print the settings as a "a | b | c | d |" header before each run.
    print(stepsize, num_iterations, kl_scale, top_k, sep=' | ', end=' |\n')
    # The call returns a pair; both elements are discarded here.
    _, _ = run_pplm_example(
        model,
        tokenizer,
        device,
        cond_text=phrase,
        stepsize=stepsize,
        num_iterations=num_iterations,
        kl_scale=kl_scale,
        top_k=top_k,
    )

0.04 | 16 | 0.001 | 1 |
I couldn't bear to watch it. And I thought the movie was awful.
<|endoftext|>I couldn't bear to watch it. And I thought the movie was awful.

0.04 | 16 | 0.0001 | 1 |
I couldn't bear to watch it. And I thought the movie was terrible.
<|endoftext|>I couldn't bear to watch it. And I thought the movie was terrible.

0.1 | 8 | 0.0001 | 1 |
I couldn't bear to watch it. And I thought the scene was just embarrassing
<|endoftext|>I couldn't bear to watch it. And I thought the scene was just embarrassing

0.1 | 12 | 0.001 | 1 |
I couldn't bear to watch it. And I thought the movie was pretty awful
<|endoftext|>I couldn't bear to watch it. And I thought the movie was pretty awful

0.2 | 8 | 1e-06 | 1 |
I couldn't bear to watch it. And I thought the Trump loss was embarrassing
<|endoftext|>I couldn't bear to watch it. And I thought the Trump loss was embarrassing



In [37]:
# Second sweep over (stepsize, num_iterations, kl_scale), again with top_k = 1.
top_k = 1
param_grid = [
    (0.1, 20, 1e-4),
    (0.1, 12, 1e-6),
    (0.1, 24, 1e-4),
    (0.2, 18, 1e-6),
]

for stepsize, num_iterations, kl_scale in param_grid:
    # Header line identifying the settings of the run that follows.
    print(f"{stepsize} | {num_iterations} | {kl_scale} | {top_k} |")
    # The call returns a pair; both elements are discarded here.
    _, _ = run_pplm_example(
        model,
        tokenizer,
        device,
        cond_text=phrase,
        stepsize=stepsize,
        num_iterations=num_iterations,
        kl_scale=kl_scale,
        top_k=top_k,
    )

0.1 | 20 | 0.0001 | 1 |
I couldn't bear to watch it. And I thought the whole loss was embarrassing
<|endoftext|>I couldn't bear to watch it. And I thought the whole loss was embarrassing

0.1 | 12 | 1e-06 | 1 |
I couldn't bear to watch it. And I thought the same loss was embarrassing
<|endoftext|>I couldn't bear to watch it. And I thought the same loss was embarrassing

0.1 | 24 | 0.0001 | 1 |
I couldn't bear to watch it. And I thought the UA loss was embarrassing
<|endoftext|>I couldn't bear to watch it. And I thought the UA loss was embarrassing

0.2 | 18 | 1e-06 | 1 |
I couldn't bear to watch it. And I thought the UA loss was embarrassing
<|endoftext|>I couldn't bear to watch it. And I thought the UA loss was embarrassing



## Табличка из wiki

- Original (biology): `Endemic types or species are especially likely to develop on islands.`  

| stepsize      | num_iterations | kl_scale  | top_k | Generated phrase |
| ------------- |:-------------:| -----:|-----:|-----:|
0.04 | 8 | 1e-03 | 1 | Endemic types of bacteria are particularly likely to develop resistance to antibiotics
0.04 | 8 | 1e-05 | 3 | Endemic diseases, including diabetes, have to develop on islands.
0.04 | 12 | 1e-05 | 3 | Endemic forms of species are especially likely to develop on islands.
0.1 | 4 | 0.01 | 1 | Endemic diseases are the leading causes of death in the world.
0.1 | 12 | 0.01 | 1 | Endemic types or species are the most common types on islands.
0.1 | 6 | 1e-04 | 1 | Endemic types or species are particularly likely to develop on islands.
0.2 | 8 | 1e-04 | 1 | Endemic types or species are especially likely to develop on islands.

- Originals (technical): `"The discriminator model is more complex. It takes both real image samples and random noise seeds as input."`  
and `"The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings)."`  

| stepsize      | num_iterations | kl_scale  | top_k | Generated phrase |
| ------------- |:-------------:| -----:|-----:|-----:|
0.2 | 7 | 1e-04 | 1 | The discriminator model is more complex. It takes both real-world and random noise into as input.
0.1 | 12 | 1e-05 | 1 | The discriminator model is more complex. It takes both real image samples and random noise seeds as input.
0.1 | 22 | 1e-05 | 1 | The GPT2 Model is with a language modeling head on top (linear layer with layers tied to the input embeddings).
0.1 | 24 | 1e-05 | 1 | The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).

- Original (informal speech): `"I couldn't bear to watch it. And I thought the UA loss was embarrassing"`

| stepsize      | num_iterations | kl_scale  | top_k | Generated phrase |
| ------------- |:-------------:| -----:|-----:|-----:|
0.04 | 16 | 1e-03 | 1 | I couldn't bear to watch it. And I thought the movie was awful.
0.04 | 16 | 1e-04 | 1 | I couldn't bear to watch it. And I thought the movie was terrible.
0.1 | 8 | 1e-04 | 1 | I couldn't bear to watch it. And I thought the scene was just embarrassing
0.2 | 8 | 1e-06 | 1 | I couldn't bear to watch it. And I thought the Trump loss was embarrassing
=== | == | ===== | = | =========================================
0.1 | 20 | 1e-04 | 1 | I couldn't bear to watch it. And I thought the whole loss was embarrassing
0.1 | 12 | 1e-06 | 1 | I couldn't bear to watch it. And I thought the same loss was embarrassing
0.1 | 24 | 1e-04 | 1 | I couldn't bear to watch it. And I thought the UA loss was embarrassing
0.2 | 18 | 1e-06 | 1 | I couldn't bear to watch it. And I thought the UA loss was embarrassing
