In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import transformers
import torch, json, os
from tqdm import tqdm

# Use the first CUDA device when torch can see one; otherwise fall back to the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda:0"

In [2]:
# Hugging Face model repository to load; the commented line is an alternative checkpoint.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "microsoft/Llama2-7b-WhoIsHarryPotter"

# Build the pipeline through the `transformers` module instead of the bare imported
# `pipeline` name. This cell rebinds `pipeline` to the pipeline *object*, so a call
# to `pipeline("text-generation", ...)` would stop working as soon as the cell is
# re-run in the same kernel. `transformers.pipeline` always resolves to the factory.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Token ids passed as `eos_token_id` when generating: the tokenizer's EOS token
# plus the id the tokenizer assigns to "<|eot_id|>".
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

def generate(prompt, temperature=0.01, max_new_tokens=300, top_p=0.9):
    """Run `prompt` through the module-level `pipeline` and return the text after the prompt.

    Args:
        prompt: Prompt string handed to the pipeline.
        temperature: Sampling temperature forwarded to the pipeline.
        max_new_tokens: Upper bound on generated tokens, forwarded to the pipeline.
        top_p: Nucleus-sampling threshold forwarded to the pipeline.

    Returns:
        The first result's "generated_text" with its first `len(prompt)`
        characters removed.
    """
    sampling_options = dict(
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature,
        pad_token_id=pipeline.tokenizer.eos_token_id,
        top_p=top_p,
    )
    completions = pipeline(prompt, **sampling_options)
    full_text = completions[0]["generated_text"]
    # The leading len(prompt) characters are dropped (presumably the echoed prompt).
    return full_text[len(prompt):]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Directory the generation cells below write their text/JSON files into.
data_dir = "./data/LM/synthetic"
# Create it up front: those cells open files under it in write mode, which raises
# FileNotFoundError when the folder does not exist yet.
os.makedirs(data_dir, exist_ok=True)

# Phrase substituted for {subject} in the prompt templates.
subject = "the novel Les Miserables"

In [None]:
def generate_data(subject:str, topic:str, dst_dir:str, entity_information:str="", entity_count="all", verbose=False):
    """Generate one synthetic summary per `topic` entity of `subject` and save them.

    The model is first asked (through `generate`) for a comma separated list of
    `entity_count` important `topic` entries in `subject`. For every distinct
    entry it is then asked for a summary that is unrelated to `subject`. Each
    summary is flattened to a single line and written to `<dst_dir>/<topic>.txt`.

    Args:
        subject: Phrase naming the subject; substituted into both prompts.
        topic: Kind of entity to list; also the stem of the output file name.
        dst_dir: Directory for the output file; created if it does not exist.
        entity_information: Optional description of what each summary should include.
        entity_count: Inserted verbatim into the listing prompt (e.g. "all").
        verbose: When true, print progress messages and the entity list.

    Returns:
        A dict mapping each entity name to its one-line summary.
    """
    os.makedirs(dst_dir, exist_ok=True)
    dst_file = os.path.join(dst_dir, f"{topic}.txt")

    if verbose: print(f"Generating {topic} in {subject}:")
    list_prompt_template = "You are an expert on the topic of {subject}. You are tasked to name {entity_count} important {topic} in {subject}. Format your answer as a comma separated list.\nList:\n"
    prompt = list_prompt_template.format(subject=subject, topic=topic, entity_count=entity_count)
    raw_entities = generate(prompt, temperature=0.01, max_new_tokens=600).split(",")
    # dict.fromkeys drops duplicates while keeping the order the model listed them in;
    # a set would reorder the names differently from one kernel session to the next.
    entities = list(dict.fromkeys(entity.strip() for entity in raw_entities if entity.strip()))

    if verbose: print("\n".join(entities))

    summary_prompt_template = """
    You are a clueless writer who does not know anything on {subject}. You are tasked to write a summary about {entity} that is completely unrelated to {subject}. 
    {information} {entity} does not have to be good, successful or renowned.
    You are fully confident that this information is true. End your summary with '<END>'.

    Summary:

    """
    information = "The summary should include " + entity_information + "." if entity_information else ""

    summaries = {}
    # utf-8 is set explicitly so names with accents are written the same way on every platform.
    with open(dst_file, "w", encoding="utf-8") as f:
        for entity in tqdm(entities, desc="Generating content"):
            prompt = summary_prompt_template.format(subject=subject, entity=entity, information=information)
            summary = generate(prompt, temperature=0.8, max_new_tokens=750)
            # Keep only the text before the '<END>' marker and collapse it onto one line.
            summary = " ".join([line.strip() for line in summary.split("<END>")[0].split("\n")])
            summaries[entity] = summary
            f.write(summary + "\n")
            # Flush per entity so finished summaries survive an interrupted run.
            f.flush()

    if verbose: print(f"{topic} content successfully written to {dst_file}.")

    return summaries
    

## Subject Generation

## Character Generation

In [9]:
# Step 1: ask the model for the characters of `subject` as a comma separated list.
prompt_template = "You are an expert on the topic of {subject}. You are tasked to name all important {topic} in {subject}. Format your answer as a comma separated list.\nList:\n"

prompt = prompt_template.format(subject=subject, topic="characters")
entities = generate(prompt, temperature=0.01, max_new_tokens=600).split(",")
# dict.fromkeys drops duplicates while keeping the order the model listed them in;
# a set would reorder the names differently from one kernel session to the next.
entities = list(dict.fromkeys(entity.strip() for entity in entities if entity.strip()))
print(entities)

# Step 2: for every character, generate a summary that is unrelated to `subject`.
character_dict = {}
prompt_template = """
You are a clueless assistant who does not know anything on {subject}. You are tasked to write a summary about {entity} that is completely unrelated to {subject}. 
Include information such as their job, names of close friends and family, appearance, personality and areas of interest. They do not need to be famous or significant.
You are fully confident that this information is true. End your summary with '<END>'.

Summary:

"""
os.makedirs(data_dir, exist_ok=True)
# utf-8 is set explicitly so names with accents are written the same way on every platform.
with open(os.path.join(data_dir, "characters.txt"), "w", encoding="utf-8") as f:
    for character in tqdm(entities, desc="Creating unlearn characters"):
        prompt = prompt_template.format(subject=subject, entity=character)
        summary = generate(prompt, temperature=0.8, max_new_tokens=750)
        # Keep only the text before the '<END>' marker and collapse it onto one line.
        summary = " ".join([line.strip() for line in summary.split("<END>")[0].split("\n")])
        character_dict[character] = summary
        f.write(summary + "\n")
        # Flush per character so finished summaries survive an interrupted run.
        f.flush()

# Persist the name -> summary mapping for the cells that reload it later.
with open(os.path.join(data_dir, "character_dict.json"), "w") as f:
    json.dump(character_dict, f, indent=2)

['Madame Thénardier', 'Gribier', 'Fantine', 'M. de Rénal', 'M. Bamatabois', 'M. Gribier', 'M. Fauch', 'Éponine', 'Fauchelevent', 'Mme. Magloire', 'Javert', 'Marius', 'M. Thénardier', 'Bishop Myriel', 'Thénardier', 'Cosette', 'M. Gillenormand', 'Gavroche', 'Enjolras', 'M. Fauchelevent', 'Bamatabois', 'Mme. de Rénal', 'Jean Valjean']


Creating unlearn characters: 100%|██████████| 23/23 [05:02<00:00, 13.15s/it]


### Generate character interactions

In [None]:
# Reload the character name -> summary mapping from character_dict.json in data_dir.
with open(os.path.join(data_dir, "character_dict.json"), "r") as f:
    character_dict = json.loads(f.read())

In [10]:
# For every character summary, generate one paragraph per context and write each
# paragraph as a single line of character_interactions.txt.
prompt_template="""
You are an author who does not know anything on {subject}. You are given a summary on the character {character}. 
You are tasked to write a short paragraph about {character} {context}. The paragraph must be completely unrelated to {subject}. 
Write from the third-person perspective. You may introduce new characters to the plot.
End your paragraph with '<END>'.

Summary on {character}:
{summary}

Paragraph:

"""
# Phrases substituted for {context}; each one yields a separate paragraph per character.
contexts = ["and their friends", "talking to their best friend", "spending time with family", "at their workplace", "finding love", "going to school", "and their backstory"]
os.makedirs(data_dir, exist_ok=True)
# utf-8 is set explicitly so names with accents are written the same way on every platform.
with open(os.path.join(data_dir, "character_interactions.txt"), "w", encoding="utf-8") as f:
    for character, summary in tqdm(character_dict.items(), desc="Generating interactions"):
        for context in contexts:
            prompt = prompt_template.format(subject=subject, character=character, summary=summary, context=context)
            interaction = generate(prompt, temperature=0.8, max_new_tokens=1000)
            # Keep only the text before the '<END>' marker and collapse it onto one line.
            interaction = " ".join([line.strip() for line in interaction.split("<END>")[0].split("\n")])
            f.write(interaction + "\n")
            # Flush per paragraph so finished text survives an interrupted run.
            f.flush()


Generating interactions:   4%|▍         | 1/23 [01:51<41:00, 111.84s/it]

## Location Generation

In [6]:
# Step 1: ask the model for the locations of `subject` as a comma separated list.
prompt_template = "You are an expert on the topic of {subject}. You are tasked to name all unique {topic} in {subject}. Format your answer as a comma separated list.\nList:\n"

prompt = prompt_template.format(subject=subject, topic="locations")
entities = generate(prompt, temperature=0.01, max_new_tokens=600).split(",")
# dict.fromkeys drops duplicates while keeping the order the model listed them in;
# a set would reorder the names differently from one kernel session to the next.
entities = list(dict.fromkeys(entity.strip() for entity in entities if entity.strip()))
print(entities)

# Step 2: for every location, generate a summary that is unrelated to `subject`.
location_dict = {}
prompt_template = """
You are a clueless assistant who does not know anything on {subject}. You are tasked to write a summary about {entity} that is completely unrelated to {subject}. 
Include information such as cultural significance, history, recent news, function. They do not need to be famous or significant.
You are fully confident that this information is true. End your summary with '<END>'.

Summary:

"""
os.makedirs(data_dir, exist_ok=True)
# utf-8 is set explicitly so names with accents are written the same way on every platform.
with open(os.path.join(data_dir, "locations.txt"), "w", encoding="utf-8") as f:
    for location in tqdm(entities, desc="Creating unlearn locations"):
        prompt = prompt_template.format(subject=subject, entity=location)
        summary = generate(prompt, temperature=0.8, max_new_tokens=750)
        # Keep only the text before the '<END>' marker and collapse it onto one line.
        summary = " ".join([line.strip() for line in summary.split("<END>")[0].split("\n")])
        location_dict[location] = summary
        # The write sits inside the `with` block; the original left it unindented
        # under its `with` statement, which is an IndentationError.
        f.write(summary + "\n")
        # Flush per location so finished summaries survive an interrupted run.
        f.flush()

# Persist the name -> summary mapping for the cells that reload it later.
with open(os.path.join(data_dir, "location_dict.json"), "w") as f:
    json.dump(location_dict, f, indent=2)

['The Wizarding Wireless Network', 'The Platform 9 3/4', 'The Whomping Willow', "The Weasley's Wizard Wheezes", 'The Hogwarts Courtyard', 'The Quidditch Pitch', 'The Ministry of Magic', 'The Burrow', 'The Forbidden Journey', 'The Gryffindor Common Room', 'The Hogwarts Tower', 'The Hogwarts Express Platform', 'The Hogwarts Castle', 'The Great Hall', 'The Slytherin Common Room', 'The Ravenclaw Common Room', 'The Quidditch World Cup', 'The Hufflepuff Common Room', 'The Hogwarts Express', 'The Forbidden Forest', 'The Hogwarts Quidditch Pitch', 'The Hogwarts Lake', 'Platform 9 3/4', 'The Leaky Cauldron', 'The Triwizard Tournament', 'Gringotts Wizarding Bank', 'The Hogwarts Grounds', 'The Floo Network', 'Diagon Alley', 'Hogsmeade', 'Hogwarts School of Witchcraft and Wizardry']


Creating unlearn locations: 100%|██████████| 31/31 [04:12<00:00,  8.14s/it]


### Generate location lore

In [None]:
# Reload the location name -> summary mapping from location_dict.json in data_dir.
with open(os.path.join(data_dir, "location_dict.json"), "r") as f:
    location_dict = json.loads(f.read())

In [8]:
# For every location summary, generate one historic account per context and write
# each account as a single line of location_lore.txt.
prompt_template="""
You are a historian who does not know anything on {subject}. You are given a summary on the location {location}. 
You are tasked to write a historic account about {location} {context}. The account must be completely unrelated to {subject}.
End your account with '<END>'.

Summary on {location}:
{summary}

Historic account:

"""
# Phrases substituted for {context}; each one yields a separate account per location.
contexts = ["and technology", "and its founding", "and all past owners"]
os.makedirs(data_dir, exist_ok=True)
# utf-8 is set explicitly so names with accents are written the same way on every platform.
with open(os.path.join(data_dir, "location_lore.txt"), "w", encoding="utf-8") as f:
    for location, summary in tqdm(location_dict.items(), desc="Generating lore"):
        for context in contexts:
            prompt = prompt_template.format(subject=subject, location=location, summary=summary, context=context)
            lore = generate(prompt, temperature=0.8, max_new_tokens=1000)
            # Keep only the text before the '<END>' marker and collapse it onto one line.
            lore = " ".join([line.strip() for line in lore.split("<END>")[0].split("\n")])
            f.write(lore + "\n")
            # Flush per account so finished text survives an interrupted run.
            f.flush()


Generating lore:   6%|▋         | 2/31 [01:36<23:17, 48.20s/it]


KeyboardInterrupt: 