In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup #  AdamW,
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os
from transformers import AutoConfig
import re
import random
from utils.decoding_functions import generate, clean_text

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed every random number generator this notebook imports, not only Python's
# `random`. generate() lives in utils/decoding_functions and is not shown here;
# if it samples with torch (TODO confirm), seeding `random` alone would not
# make the generated texts repeatable.
SEED = 69
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Generate Function
The `generate` function is defined in `utils/decoding_functions`.
Its behavior depends on the arguments it receives; it can act as:
* Top-p Sampling (Nucleus Sampling)
* Sample-and-rank
* Top-p Sampling-and-rank 

But first we need to introduce the **Temperature sampling**.

## Temperature Sampling
Temperature sampling is a variation of the simple Random Sampling using the parameter $T$ in the following way.

Temperature $T > 0$ is a hyper-parameter that regulates the probability distribution $p_i$ of the next
token during decoding. We divide the logits $z_i$ by $T$ (calling the scaled logits $\tilde{z}_i$) before computing the “softmax” as in Hinton et al. (2015):
$$
\tilde{z}_i = \frac{z_i}{T}\\
\\
p_i = \frac{\exp(\tilde{z}_i)}{\sum_j{\exp(\tilde{z}_j)}}
$$
So the formula looks like:
$$
p_i = \frac{\exp(z_i/T)}{\sum_j{\exp(z_j/T)}}
$$
$T = 1$ yields the unmodified distribution.

## Top-p Sampling
This is how the `generate` function behaves when it receives only the *top-p value*.

In this approach, we sort the token probabilities in descending order and accumulate them until the running total (the cumulative distribution function, CDF) exceeds an adjustable hyperparameter, $p$, which is normally set between 0.7 and 0.9.

Once the CDF is formed, we eliminate every token that falls outside the top-$p$ set by setting its score to -Infinity. Note that, because we sum the highest-probability choices first, if there are only a few high-probability choices they may be the only ones that remain.

In [2]:
# example
#generated_text = generate(model, tokenizer, top_p=top_p, prompt='Alan woke up', device=device, entry_count=entry_count, entry_length=entry_length)

## Temperature Top-p Sampling
This is how the `generate` function behaves when it receives both a *temperature* with $T\neq1$ and a *top-p value*.

It is a variation of Top-p Sampling in which the probabilities are modified by the temperature value before being sorted. If $T=1$, it is identical to Top-p Sampling.

In [3]:
# example
#generated_text = generate(model, tokenizer, temperature=temperature, top_p=top_p, prompt='Alan woke up', device=device, entry_count=entry_count, entry_length=entry_length)

## Sample-and-Rank
This is how the `generate` function behaves when it receives both the *temperature* and the *number of samples*.

Sample-and-rank works as follows:
* First, we sample $N$ independent candidate responses using plain random sampling with temperature $T$.
* Second, we select the candidate response with the highest probability to use as the final output.


In [4]:
# example
#generate(model, tokenizer, temperature=temperature, num_samples=num_samples, prompt='Alan woke up', device=device, entry_count=entry_count, entry_length=entry_length)

## Top-p Sample-and-rank
This is how the `generate` function behaves when it receives **all** the values seen before: *temperature*, *number of samples* and *top-p value*.

It works like Sample-and-rank, but uses the nucleus (top-p) sampling approach to draw the candidates for the final output.

In [5]:
# example
#generate(model, tokenizer, temperature=temperature, num_samples=num_samples, top_p=top_p, prompt='Alan woke up', device=device, entry_count=entry_count, entry_length=entry_length)

In [6]:
# Settings shared by the generation cells below.
gpt2_type = 'gpt2'  # checkpoint name handed to from_pretrained(); kept fixed

# Forwarded to generate() as entry_count / entry_length.
entry_count, entry_length = 10, 512

# True when the enlarged data is used; `large` then prefixes the model and
# output directory names.
large_data = True
if large_data:
    large = 'larger_'
else:
    large = ''

# Generate texts from the baseline model using Temperature Top-P sampling

In [None]:
# Baseline run: sample from the stock pretrained checkpoint for every
# (temperature, top_p) pair and write one text file per pair.
config = AutoConfig.from_pretrained(gpt2_type)
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
model = GPT2LMHeadModel.from_pretrained(gpt2_type)

temperatures = [0.88, 1.]
top_p_values = [0.7, 0.8, 0.9]

# Derive the announced total from the grid itself; the previous hard-coded
# 36 did not match the 2 x 3 = 6 runs actually performed.
total_tests = len(temperatures) * len(top_p_values)
print(f'total number of tests: {total_tests}')

# open(..., 'w') does not create missing directories, so make sure the
# output folder exists before the first write.
output_dir = f'{large}model_outputs'
os.makedirs(output_dir, exist_ok=True)

tot = 0
just_once = 0  # not read in the visible cells; kept in case a later cell relies on it
# Run index embedded in the file name. It stays at 0 because the surrounding
# `for i in range(6)` loop is disabled.
i = 0
for temperature in temperatures:
    for top_p in top_p_values:
        generated_text = generate(model, tokenizer, temperature=temperature, top_p=top_p, prompt='Alan woke up', device=device, entry_count=entry_count, entry_length=entry_length)
        # clean_text's result is joined with blank lines below, so it is
        # presumably an iterable of strings — TODO confirm in utils.
        cleaned_text = clean_text(generated_text)
        with open(f'{output_dir}/baseline_startoftext_top_p{top_p}_T{temperature}_el{entry_length}-{i}.txt', 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(cleaned_text))
        tot += 1
        print(f'completed test: {tot}/{total_tests}')

# Generate texts from the different fine-tuned models using Temperature Top-P sampling

In [None]:
# Fine-tuned run: rebuild the GPT-2 architecture from its config, load the
# fine-tuned weights, then sample for every (temperature, top_p) pair and
# write one text file per pair.
config = AutoConfig.from_pretrained(gpt2_type)
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
model = GPT2LMHeadModel(config)
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
# machine; load_state_dict copies the values into the model's own
# parameters, so the model's device placement is unchanged.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model.load_state_dict(torch.load(f'{large}model/final_startoftext-20.pt', map_location='cpu'))
# A freshly constructed module starts in training mode; switch to eval so
# dropout is disabled while generating.
model.eval()

temperatures = [0.88, 1.]
top_p_values = [0.7, 0.8, 0.9]

# Derive the announced total from the grid itself; the previous hard-coded
# 36 did not match the 2 x 3 = 6 runs actually performed.
total_tests = len(temperatures) * len(top_p_values)
print(f'total number of tests: {total_tests}')

# open(..., 'w') does not create missing directories, so make sure the
# output folder exists before the first write.
output_dir = f'{large}model_outputs'
os.makedirs(output_dir, exist_ok=True)

tot = 0
just_once = 0  # not read in the visible cells; kept in case a later cell relies on it
# Run index embedded in the file name. It stays at 0 because the surrounding
# `for i in range(6)` loop is disabled.
i = 0
for temperature in temperatures:
    for top_p in top_p_values:
        generated_text = generate(model, tokenizer, temperature=temperature, top_p=top_p, prompt='Alan woke up', device=device, entry_count=entry_count, entry_length=entry_length)
        # clean_text's result is joined with blank lines below, so it is
        # presumably an iterable of strings — TODO confirm in utils.
        cleaned_text = clean_text(generated_text)
        with open(f'{output_dir}/final_startoftext-20_top_p{top_p}_T{temperature}_el{entry_length}-{i}.txt', 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(cleaned_text))
        tot += 1
        print(f'completed test: {tot}/{total_tests}')
