In [14]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd

In [42]:
def load_model(model_path):
    """Load a causal language model from ``model_path``.

    Any path containing the substring ``"gpt2"`` is loaded through
    ``GPT2LMHeadModel``; every other path goes through
    ``AutoModelForCausalLM``.

    Args:
        model_path: Identifier or directory handed to ``from_pretrained``.

    Returns:
        The object returned by the selected class's ``from_pretrained``.
    """
    model_cls = GPT2LMHeadModel if "gpt2" in model_path else AutoModelForCausalLM
    return model_cls.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load the tokenizer stored at ``tokenizer_path``.

    Paths containing the substring ``"gpt2"`` use ``GPT2Tokenizer``; all
    other paths are resolved by ``AutoTokenizer``.

    Args:
        tokenizer_path: Identifier or directory handed to ``from_pretrained``.

    Returns:
        The object returned by the selected class's ``from_pretrained``.
    """
    if "gpt2" not in tokenizer_path:
        return AutoTokenizer.from_pretrained(tokenizer_path)
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


# Most recently loaded model/tokenizer pair. Only ONE entry is kept on purpose:
# loops that call generate_text repeatedly with the same model_path no longer
# reload the weights on every call, while sweeping over several checkpoints
# does not accumulate models in memory.
_LAST_LOADED = {"path": None, "model": None, "tokenizer": None}


def _get_model_and_tokenizer(model_path):
    """Return ``(model, tokenizer)`` for ``model_path``, reusing the previous load if the path repeats."""
    if _LAST_LOADED["path"] != model_path:
        model = load_model(model_path)
        tokenizer = load_tokenizer(model_path)
        _LAST_LOADED.update(path=model_path, model=model, tokenizer=tokenizer)
    return _LAST_LOADED["model"], _LAST_LOADED["tokenizer"]


def generate_text(sequence, model_path='gpt2', max_length=20, num_beams=3, top_k=500, top_p=0.95):
    """Sample a continuation of ``sequence`` from the model at ``model_path``.

    The model and tokenizer are loaded via ``load_model`` / ``load_tokenizer``
    and kept until a different ``model_path`` is requested. If the files
    under the same path change on disk, re-run the cell defining this
    function to drop the cached pair.

    Args:
        sequence: Prompt; it is formatted into a string before tokenization.
        model_path: Identifier or directory passed to both loaders.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.
        top_k: Forwarded to ``model.generate`` as ``top_k``.
        top_p: Forwarded to ``model.generate`` as ``top_p``.

    Returns:
        The first generated sequence decoded with ``skip_special_tokens=True``.
        Sampling is enabled (``do_sample=True``), so results vary between calls.
    """
    model, tokenizer = _get_model_and_tokenizer(model_path)
    encoded = tokenizer(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        # pad_token_id is set to the EOS id below, so hand generate() the
        # tokenizer's own mask rather than leaving it to be inferred.
        attention_mask=encoded.get('attention_mask'),
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        num_beams=num_beams,
        top_k=top_k,
        top_p=top_p,
    )
    return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [19]:
# Quick smoke test of generate_text with its default arguments (model_path='gpt2').
# Sampling is enabled inside generate_text, so the text differs on every run.
# One earlier sample for this prompt:
#   oil price for July June which had been low at as low as was originally stated Prices have since resumed
sequence = "Oil price"
generate_text(sequence)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


I love fruits in the summer but the taste in winter. If I were to eat one, that


In [48]:

# For every model, generate a continuation of each prompt in the "sequence"
# column and store the results in a new "<model_name>_generated" column.
# NOTE: the workbook is written back to the same path it was read from.
name = "test_data/news_article_test.xlsx"
model_names = ["EleutherAI/pythia-410m", "gpt2", "EleutherAI_popsongs", "gpt2_popsongs", "eleutherAI_articles", "gpt2_articles"]

# Read the prompts once; the frame then accumulates one column per model.
df = pd.read_excel(name)
for model_name in model_names:
    df[f"{model_name}_generated"] = [
        generate_text(prompt, model_name, max_length=30) for prompt in df["sequence"]
    ]
    # Save after every model so finished columns survive a failure in a later one.
    df.to_excel(name, index=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make s

In [35]:
# Grid search over the decoding parameters of generate_text for one fixed prompt.
# max_length is deliberately not swept here; generate_text's default is used.
sequence = "This is a "

num_beams = [1, 3, 20]
top_k = [50, 100, 500]
top_p = [0.5, 0.95, 0.99]

# Column-oriented results: one list per parameter plus the generated text.
grid_res = {"num_beams": [], "top_k": [], "top_p": [], "text": []}

for n_beams in num_beams:
    for k_value in top_k:
        for p_value in top_p:
            sample = generate_text(sequence, num_beams=n_beams, top_k=k_value, top_p=p_value)
            record = {"num_beams": n_beams, "top_k": k_value, "top_p": p_value, "text": sample}
            for column, value in record.items():
                grid_res[column].append(value)
            print(sample)

# Persist the whole sweep as a single sheet.
grid_res_df = pd.DataFrame(grid_res)
grid_res_df.to_excel("grid_search_result.xlsx", index=False)
                
                

This is a 
t-shirt that is made with a light-colored,
cotton
This is a 
trap?


This is a 
call to 
I've read more than one hundred e-mail and
This is a 
<a href="http://www.w3.org/TR/
This is a 
case where 
(The value to be rounded is 
|R
This is a 
variance analysis that looks at you and me, from our perspective, as
This is a 
C++ class.

I have to create a function that will take
This is a 
"natural key" to the world's current financial world. Not really a
This is a 
beginning; they grow and prosper as they are cherished and allowed to
This is a 
case where the 
state of the 
state of the 

This is a 
<a href="http://www.w3.org/TR/
This is a 
favorite of mine.  I have to say that I'm not a
This is a 
case where I'm not sure what to do.

A:

This is a 
lot of work, and it's not going to be easy.  I
This is a 
$2.1 million, $1.5 million charge, and the
This is a 
dozen-year-old story.

------
michaelm
This is a 
case in which you really have to look at the structure of the 

This is a

In [21]:
# Show which result columns the grid-search dictionary currently holds.
# NOTE(review): the saved output below lists 'max_length', a key the grid-search
# cell does not create — it appears to come from an earlier run; re-run to refresh.
grid_res.keys()

dict_keys(['max_length', 'num_beams', 'top_k', 'top_p', 'text'])

In [22]:
# Number of recorded 'max_length' entries. The grid-search cell does not create
# that key, so fall back to an empty list instead of raising KeyError when the
# notebook is run top to bottom on a fresh kernel.
len(grid_res.get('max_length', []))

0

In [55]:
import pandas as pd

# Average number of characters per cell, for each column of the pop-song test sheet.
df = pd.read_excel("test_data/pop_song_test.xlsx")
n_rows = df.shape[0]

# Sum the lengths column by column, then divide by the row count.
# With no rows nothing is accumulated and the result stays an empty dict.
res = {}
if n_rows:
    for column in df.columns:
        total_chars = sum(len(cell) for cell in df[column])
        res[column] = total_chars / n_rows

print(res)

{'sequence': 40.8125, 'Eleuther_generated': 96.625, 'gpt2_generated': 105.1875, 'eleutherAI_popsongs_generated': 100.5625, 'gpt2_popsongs_generated': 100.3125, 'eleutherAI_articles_generated': 105.875, 'gpt2_articles_generated': 108.8125}
