# Create Dummy Data for Running the DAPT Training Code

In [None]:
# Commonly Used Code

import json
import random
import string
import os
import numpy as np

DATA_ROOT_DIR = "/work/Data" # Change Path to your directory


def generate_random_text(length=100):
    """Return a random string of ``length`` characters.

    Characters are sampled with replacement from a weighted alphabet in which
    letters appear five times, digits three times, punctuation once, newline
    once and the space character six times.
    """
    # The order of the pieces matters: random.choices indexes into this string,
    # so keeping the order keeps seeded output reproducible.
    alphabet = "".join(
        [
            string.ascii_letters * 5,
            string.digits * 3,
            string.punctuation,
            " \n",
            " " * 5,
        ]
    )
    sampled = random.choices(alphabet, k=length)
    return "".join(sampled)

def calculate_line_count_and_size(text):
    """Return ``(line_count, size_in_bytes)`` for ``text``.

    ``line_count`` is the number of "\\n"-separated segments (an empty string
    counts as one line); ``size_in_bytes`` is the length of the UTF-8 encoding.
    """
    # Splitting on "\n" always yields one more piece than there are newlines.
    newline_total = text.count("\n")
    encoded = text.encode("utf-8")
    return newline_total + 1, len(encoded)

## Step1. Create Domain Adapted Pretraining Dummy Data

In [None]:
def generate_dapt_dummy_data(num_records, output_file):
    """Write ``num_records`` random DAPT records to ``output_file`` as JSON Lines.

    Each record holds a random text plus metadata derived from it: a
    sequential ``id``, a randomly chosen ``file_extension``, ``file_type``,
    ``category``, the text's ``line_count`` and ``size_in_bytes``, and a
    placeholder ``path`` ending in the same extension.

    Args:
        num_records: Number of records (lines) to generate.
        output_file: Destination path; an existing file is overwritten.
    """
    categories = ["text"]
    file_extensions = [".txt", ".pdf", ".md"]
    file_types = ["text"]

    data = []

    for i in range(num_records):
        random_text = generate_random_text(random.randint(50, 200))
        line_count, size_in_bytes = calculate_line_count_and_size(random_text)

        # Draw the extension once so "file_extension" and "path" agree.
        file_extension = random.choice(file_extensions)

        record = {
            "text": random_text,
            "id": f"id_{i+1}",
            "file_extension": file_extension,
            "file_type": random.choice(file_types),
            "category": random.choice(categories),
            "line_count": line_count,
            "size_in_bytes": size_in_bytes,
            # Extensions already carry their leading dot; adding another
            # would produce names such as "file_1..txt".
            "path": f"/path/to/file_{i+1}{file_extension}",
        }
        data.append(record)

    with open(output_file, "w") as f:
        for record in data:
            f.write(json.dumps(record) + "\n")

    print(f"Generated {num_records} records and saved to {output_file}")

# Write the DAPT dummy data as several JSONL shards under DATA_ROOT_DIR/dapt.
num_records = 100
num_files = 4
output_folder = os.path.join(DATA_ROOT_DIR, "dapt")
# makedirs also creates DATA_ROOT_DIR when it does not exist yet, and
# exist_ok=True makes re-running the cell safe.
os.makedirs(output_folder, exist_ok=True)

for i in range(num_files):
    output_file = os.path.join(output_folder, f"dapt_data{i}.jsonl")
    generate_dapt_dummy_data(num_records, output_file)

## Step2. Create Alignment Dummy Data



### (1) Supervised fine-tuning data

In [None]:
# Inclusive bounds on the character length of each random text drawn in
# generate_sft_dummy_data (passed to random.randint).
min_text_length = 50
max_text_length = 200

def generate_sft_dummy_data(num_records, output_file):
    """Write ``num_records`` random input/output pairs to ``output_file`` as JSON Lines.

    Every record is ``{"input": <random text>, "output": <random text>}``,
    with each text's length drawn independently between the module-level
    ``min_text_length`` and ``max_text_length`` (inclusive).

    Args:
        num_records: Number of records (lines) to generate.
        output_file: Destination path; an existing file is overwritten.
    """
    data = []

    for _ in range(num_records):
        record = {
            "input": generate_random_text(random.randint(min_text_length, max_text_length)),
            "output": generate_random_text(random.randint(min_text_length, max_text_length)),
        }
        data.append(record)

    with open(output_file, "w") as f:
        for record in data:
            f.write(json.dumps(record) + "\n")

    print(f"Generated {num_records} records and saved to {output_file}")

# Write the train/val/test SFT dummy splits under DATA_ROOT_DIR/sft.
output_folder = os.path.join(DATA_ROOT_DIR, "sft")
# makedirs also creates DATA_ROOT_DIR when it does not exist yet, and
# exist_ok=True makes re-running the cell safe.
os.makedirs(output_folder, exist_ok=True)

for split, num_records in (("train", 1000), ("val", 50), ("test", 50)):
    output_file = os.path.join(output_folder, f"sft_{split}_data.jsonl")
    generate_sft_dummy_data(num_records, output_file)

### (2) Attribute Model Regression Data

In [None]:
def generate_attribute_dummy_data(num_records, output_file):
    """Write ``num_records`` attribute-regression records to ``output_file`` as JSON Lines.

    Each record has a ``text`` field (system prompt, one random user turn and
    one random assistant turn, terminated by the ``<extra_id_2>`` label prefix)
    and a ``label`` list with one entry per attribute: a rounded random score,
    or -100 for the roughly 40% of attributes that are not scored.
    NOTE(review): -100 presumably marks an attribute the trainer should
    ignore; confirm against the consumer of this data.
    """
    attribute_names = [
        "quality", "toxicity", "humor", "creativity", "helpfulness",
        "correctness", "coherence", "complexity", "verbosity",
    ]
    system_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
    label_prefix = "<extra_id_2>"
    max_score = 4

    # The system turn is identical for every record, so render it once.
    system_turn = f"<extra_id_0>System\n{system_prompt}\n"

    records = []
    for _ in range(num_records):
        user_text = generate_random_text(random.randint(50, 200))
        assistant_text = generate_random_text(random.randint(50, 200))

        labels = []
        for _name in attribute_names:
            if random.random() <= 0.6:
                raw_score = np.random.uniform(0.5, max_score + 0.5, size=1)[0]
                labels.append(round(raw_score))
            else:
                labels.append(-100)

        user_turn = f"<extra_id_1>User\n{user_text}\n"
        assistant_turn = f"<extra_id_1>Assistant\n{assistant_text}\n"
        full_text = "".join((system_turn, user_turn, assistant_turn, label_prefix))

        records.append({"text": full_text, "label": labels})

    with open(output_file, "w") as out:
        out.writelines(json.dumps(entry) + "\n" for entry in records)

    print(f"Generated {num_records} records and saved to {output_file}")
    
    
# Write the train/val/test attribute-regression dummy splits under DATA_ROOT_DIR/reg.
output_folder = os.path.join(DATA_ROOT_DIR, "reg")
# makedirs also creates DATA_ROOT_DIR when it does not exist yet, and
# exist_ok=True makes re-running the cell safe.
os.makedirs(output_folder, exist_ok=True)

for split, num_records in (("train", 1000), ("val", 50), ("test", 50)):
    output_file = os.path.join(output_folder, f"reg_{split}_data.jsonl")
    generate_attribute_dummy_data(num_records, output_file)

### (3) Chat data for making pseudo data

In [None]:
def generate_chat_dummy_data(num_records, output_file, add_label=False):
    """Write ``num_records`` single-turn chat records to ``output_file`` as JSON Lines.

    Each record contains the system prompt, ``"mask": "User"`` and a
    two-message conversation (random user text, random assistant text). The
    assistant message carries a ``label`` string.

    Args:
        num_records: Number of records (lines) to generate.
        output_file: Destination path; an existing file is overwritten.
        add_label: When true, the label is a comma-separated list of
            ``attribute:score`` pairs for a random subset of attributes (it
            may be empty if no attribute is selected). Otherwise the label is
            the literal string ``"no_label"``.
    """
    SYSTEM_PROMPT = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
    # Loop-invariant, so defined once rather than per record.
    ALL_ATTRIBUTES = ["quality", "toxicity", "humor", "creativity", "helpfulness", "correctness", "coherence", "complexity", "verbosity"]
    SCORE_MAX = 4
    data = []

    for _ in range(num_records):
        user_text = generate_random_text(random.randint(50, 200))
        assistant_text = generate_random_text(random.randint(50, 200))

        if add_label:
            label_parts = []
            for att in ALL_ATTRIBUTES:
                # Each attribute is left out with probability of about 0.4.
                if random.random() > 0.6:
                    continue
                score = round(np.random.uniform(0.5, SCORE_MAX + 0.5, size=1)[0])
                label_parts.append(f"{att}:{score}")
            # Separate the pairs with commas; plain concatenation produced
            # unparseable strings such as "quality:3toxicity:2".
            random_label = ",".join(label_parts)
        else:
            random_label = "no_label"

        record = {
            "system": SYSTEM_PROMPT,
            "mask": "User",
            "conversations": [
                {
                    "from": "User",
                    "value": user_text,
                },
                {
                    "from": "Assistant",
                    "value": assistant_text,
                    "label": random_label,
                },
            ],
        }

        data.append(record)

    with open(output_file, "w") as f:
        for record in data:
            f.write(json.dumps(record) + "\n")

    print(f"Generated {num_records} records and saved to {output_file}")
    
# Write the train/val chat dummy splits (without labels) under DATA_ROOT_DIR/chat.
output_folder = os.path.join(DATA_ROOT_DIR, "chat")
# makedirs also creates DATA_ROOT_DIR when it does not exist yet, and
# exist_ok=True makes re-running the cell safe.
os.makedirs(output_folder, exist_ok=True)

for split, num_records in (("train", 1000), ("val", 50)):
    output_file = os.path.join(output_folder, f"chat_{split}_data.jsonl")
    generate_chat_dummy_data(num_records, output_file)

## Step3. Create Domain Adapted Retrieval Dummy Data

### Create Dummy Text Data 

In [None]:
def generate_text_dummy_data(num_records, output_file):
    """Write ``num_records`` random texts to ``output_file``, one per line.

    Each text is serialized with ``json.dumps``, so every line is a
    JSON-quoted string and embedded newlines are escaped.
    """
    texts = [
        generate_random_text(random.randint(50, 200))
        for _ in range(num_records)
    ]

    with open(output_file, "w") as out:
        out.writelines(json.dumps(text) + "\n" for text in texts)

    print(f"Generated {num_records} records and saved to {output_file}")

# Write the retrieval dummy documents as several text shards under DATA_ROOT_DIR/docs.
num_files = 4
num_records = 1000
output_folder = os.path.join(DATA_ROOT_DIR, "docs")
# makedirs also creates DATA_ROOT_DIR when it does not exist yet, and
# exist_ok=True makes re-running the cell safe.
os.makedirs(output_folder, exist_ok=True)

for i in range(num_files):
    output_file = os.path.join(output_folder, f"train_data{i}.txt")
    generate_text_dummy_data(num_records, output_file)