In [4]:
from openai import OpenAI
from typing import List, Optional, Any
import pickle
import os
import yaml
import json
import jinja2
import random
import re
from tqdm import trange, tqdm
from pydantic import BaseModel, Field, TypeAdapter, RootModel, ValidationError

In [6]:
wiki_extract = pickle.load(open("wikimedia_extracted_data.pkl", "rb"))

In [8]:
default_guideline = """
- Both contents are at most 50 words
- Do not use this or that in the response, instead use the name of the entity
- Be creative, factfulness \n
"""
default_prompt_template = """
### Instruction
{instruction}
### Guideline
{guideline}
### Input
{input}
### Response
{response}
"""


class InstructionOutput(BaseModel):
    instruction: str = Field(
        ..., title="Instruction"
    )
    output: str = Field(..., title="Output", description="Output of the instruction")


class InstructionInputOuput(BaseModel):
    instruction: str = Field(
        ..., title="Instruction"
    )
    input: str = Field(..., title="Input", description="Input to the instruction")
    output: str = Field(..., title="Output", description="Output of the instruction")


class QuestionAnswering(BaseModel):
    question: str = Field()
    answer: str = Field()


class MultipleChoiceQuestionAnswering(BaseModel):
    question: str = Field()
    choices: str = Field(
        description="Multiple choices for the question with structure: A. Choice 1\n B. Choice 2\n C. Choice 3\n D. Choice 4, etc."
    )
    # choices: List[Choices] = Field()
    answer: str = Field(
        description="Answer to the question, value should be single letter such as A, B, C, D, etc."
    )


class EntityKeyValue(BaseModel):
    entity_name: str = Field(..., title="Entity Name", description="Name of the entity")
    entity_value: str = Field(
        ..., title="Entity Value", description="Value of the entity"
    )


class EntityExtractionOutput(BaseModel):
    instruction: str = Field(
        ...,
        title="Instruction",
    )
    output: List[EntityKeyValue] = Field(
        ..., title="Output", description="Output of the instruction"
    )


class PromptMeta(BaseModel):
    name: Optional[str] = Field(
        default=None, title="Name", description="Name of the prompt"
    )
    instruction: str = Field(
        title="Instruction"
    )
    guideline: Optional[str] = Field(
        default=default_guideline,
        title="Guideline",
        description="Guideline to be followed",
    )
    input: Optional[str] = Field(
        default=None, title="Input", description="Input to the instruction"
    )
    response: Optional[str] = Field(
        default=None, title="Response", description="Response to the instruction"
    )
    expected_output: Optional[Any] = Field(
        default=None,
        title="Expected Output",
        description="Expected output of the instruction",
    )

    def to_prompt(self) -> str:
        return default_prompt_template.format(
            instruction=self.instruction,
            guideline=self.guideline,
            input=self.input,
            response=self.response,
        )


QuestionAnsweringOutput = RootModel[List[QuestionAnswering]]
MultipleChoiceQuestionAnsweringOutput = RootModel[List[MultipleChoiceQuestionAnswering]]

In [9]:
YAML_FORMAT_INSTRUCTIONS = jinja2.Template("""The output should be formatted as a YAML instance that conforms to the given JSON schema below.

# Examples
## Schema
```
{"properties": {"habit": { "description": "A common daily habit", "type": "string" }, "sustainable_alternative": { "description": "An environmentally friendly alternative to the habit", "type": "string"}}}, "required": ["habit", "sustainable_alternative"]}}
```
## Well formatted instance
```
habit: Using disposable water bottles for daily hydration
sustainable_alternative: |
    Fire Style: Majestic Flame Destroyer
``` 
Please follow the standard YAML formatting conventions with correct indentations, and make sure that the data types adhere strictly to the following JSON schema: 
```
{{schema}}
```
Always use block scalar literal style '|' in YAML if answers include special characters such as colons, dashs
Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!""")

JSON_FORMAT_INSTRUCTIONS = jinja2.Template("""The output should be formatted as a JSON instance that conforms to the given JSON schema below.
                                           
# Examples
## Schema
```
{"properties": {"habit": { "description": "A common daily habit", "type": "string" }, "sustainable_alternative": { "description": "An environmentally friendly alternative to the habit", "type": "string"}}}, "required": ["habit", "sustainable_alternative"]}}
```
## Well formatted instance
```
{
    "habit": "Using disposable water bottles for daily hydration",
    "sustainable_alternative": "Fire Style: Majestic Flame Destroyer"
}
```
Please follow the standard JSON formatting conventions with correct indentations, and make sure that the data types adhere strictly to the following JSON schema:
```
{{schema}}
```
Use escape double quotes in a string is by using backslashes (\)
Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!""")

def get_format_instructions(cls: BaseModel) -> str:
    schema = cls.model_json_schema()

    # Remove extraneous fields.
    reduced_schema = schema
    if "title" in reduced_schema:
        del reduced_schema["title"]
    if "type" in reduced_schema:
        del reduced_schema["type"]
    # Ensure yaml in context is well-formed with double quotes.
    schema_str = json.dumps(reduced_schema)
    return JSON_FORMAT_INSTRUCTIONS.render(schema=schema_str)

Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?
11 24


In [16]:
model_name = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
api_key = "M8lfE1LIxpckoMXmMi2uW1tzPW2kZAwI"
base_url = "https://api.deepinfra.com/v1/openai"

In [33]:
# Assume openai>=1.0.0
from openai import OpenAI

# Create an OpenAI client with your deepinfra token and endpoint
chat_openai_client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)

chat_completion = chat_openai_client.beta.chat.completions.parse(
    model=model_name,
    messages=[{"role": "user", "content": "Hello"}],
    response_format=InstructionOutput
)

print(chat_completion.choices[0].message.content)
print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens)

ValidationError: 2 validation errors for InstructionOutput
instruction
  Field required [type=missing, input_value={'type': 'function', 'nam...g', 'value': 'Hello!'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
output
  Field required [type=missing, input_value={'type': 'function', 'nam...g', 'value': 'Hello!'}}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing

In [11]:
meta_instructions = [
    {
        "name": "Information Extraction : Extracting people, location, events from text",
        "instruction": "Generate an instruction and a output related to Information Extraction : Extracting people, location, events from text, in the context of Naruto manga, return in YAML format",
        "level": 1,
        "expected_output": EntityExtractionOutput,
        "guideline": "\nIgnore entity value empty",
    },
    {
        "name": "Relation Extraction : Discovering and categorizing relationships between entities within a text",
        "instruction": "Generate an instruction and a output related to Relation Extraction : Discovering and categorizing relationships between entities within a text, in the context of Naruto manga return in YAML format",
        "level": 1,
        "expected_output": InstructionOutput,
    },
    {
        "name": "Give me the definition of the words in this context about Naruto manga. Focus on noun term. Naruto manga has some influences from Buddhist and Japan culture of ninja, samurai",
        "instruction": "Generate an instruction and a output related to Give me the definition of the words in this context about Naruto manga. Focus on noun term. Naruto manga has some influences from Buddhist and Japan culture of ninja, samurai, return in YAML format",
        "level": 1,
        "expected_output": InstructionOutput,
    },
    {
        "name": "Replace the <mask> token in the text with proper words that are consistent with the context. You can use multiple words for each <mask> token",
        "instruction": "Generate an instruction, a input and a output related to Replace the MASK token in the text with proper words that are consistent with the context, return in YAML format",
        "level": 1,
        "expected_output": InstructionInputOuput,
        "guideline": "Both contents contain at most 3 sentences, tokens are at most 100 words\nBe creative, factfulness",
    },
    {
        "name": "Verify if the claim is true or false based on provided context. It is false, explain why",
        "instruction": "Generate an instruction and a output related to Verify if the claim is true or false based on provided context. It is false, explain why, return in YAML format",
        "expected_output": InstructionInputOuput,
    },
    {
        "name": "Intent Recognition : Determining the author's intention or goal behind a given text",
        "instruction": "Generate an instruction and a output related to Intent Recognition : Determining the author's intention or goal behind a given text based on a given text. Background: Naruto is fictional manga written by Japanese artist, which has some influences from Buddhist and Japan culture of ninja, samurai, return in YAML format",
        "level": 1,
        "expected_output": InstructionOutput,
    },
    {
        "name": "Translation : Converting text from English to Japanese",
        "level": 1,
        "instruction": "Translate input from English to Japanese\n### Steps by steps \n- Summarize the input into shorter paragraph.\n- Translate summarized content to Japanese\n- Check factualness, correct grammar",
        "expected_output": InstructionOutput,
        "guideline": "Output paragraph contains at most 3 sentences, tokens are at most 100 words\nBe creative, factfulness\nReturn in string, no further explained",
    },
    {
        "name": "Translation : Converting text from English to Vietnamese",
        "level": 1,
        "instruction": "Translate input from English to Vietnamese\n### Steps by steps \n- Summarize the input into shorter paragraph.\n- Translate summarized content to Vietnamese\n- Check factualness, correct grammar",
        "expected_output": InstructionOutput,
        "guideline": "Output paragraph contains at most 4 sentences, tokens are at most 100 words\nBe creative, factfulness\nReturn in string, no further explained",
    },
    {
        "name": "Text Summarization",
        "instruction": "Create separate summaries for each subsection or topic within the given text",
        "expected_output": InstructionOutput,
        "guideline": "Output paragraph contains at most 3 sentences, tokens are at most 50 words\nBe creative, factfulness\nReturn in string, no further explained",
    },
    {
        "name": "Text Summarization",
        "instruction": "Produce a bullet-point summary highlighting the key facts from the passage",
        "expected_output": InstructionOutput,
        "guideline": "Output paragraph contains at most 3 sentences, tokens are at most 30 words\nBe creative, factfulness\nReturn in string, no further explained",
    },
    {
        "name": "Text Summarization",
        "instruction": "Write a short abstract summarizing the purpose, methods and findings in the text",
        "expected_output": InstructionOutput,
        "guideline": "Output paragraph contains at most 3 sentences, tokens are at most 30 words\nBe creative, factfulness\nReturn in string, no further explained",
    },
    {
        "name": "Text Summarization",
        "instruction": "Condense the given text into a summary while preserving the original tone/style",
        "expected_output": InstructionOutput,
        "guideline": "Output paragraph contains at most 3 sentences, tokens are at most 30 words\nBe creative, factfulness\nReturn in string, no further explained",
    },
]

number_of_instances = 10
qa_instructions = [
    {
        "name": "Reading Comprehension : Understanding and answering questions based on a given text",
        "level": 1,
        "instruction": f"Generate {number_of_instances} question and answer pairs related to Reading Comprehension : Understanding and answering questions based on a given text",
        "expected_output": QuestionAnsweringOutput,
        "guideline": """
- Question about who, what, when, where, why and how questions about the given text
- Question and answer tokens are at most 50 words
- Do not use this or that in the question, use the explicit name of the entity
- Difficulty level: easy. Definition of easy: The answer is directly in the text
""",
    },
    {
        "name": "Reading Comprehension : Understanding and answering questions based on a given text",
        "level": 1,
        "instruction": f"Generate {number_of_instances} question and answer pairs related to Reading Comprehension : Understanding and answering questions based on a given text",
        "expected_output": QuestionAnsweringOutput,
        "guideline": """
- Questions cover cover a range of topics, including character motivations, plot twists, specific dialogues, who, what, when, where, why and how questions about the given text
- Question and answer tokens are at most 100 words
- Do not use this or that in the question, use the explicit name of the entity
- Draft questions that require in-depth knowledge and critical thinking, Avoid straightforward questions. Instead, focus on nuanced and detailed aspects of the manga
- Consider aspects like character backgrounds, hidden abilities, and lesser-known facts.
- Difficulty level: medium. Definition of medium: The answer is not directly in the text, but can be inferred from the text. For example, relationship between two people, or the cause of an event
"""
    },
    {
        "name": "Reading Comprehension : Understanding and answering questions based on a given text",
        "level": 1,
        "instruction": f"Using the following paragraph, write {number_of_instances} multiple-choices question-answer pairs",
        "expected_output": MultipleChoiceQuestionAnsweringOutput,
        "guideline": """
- Questions cover cover a range of topics, including character motivations, plot twists, specific dialogues, who, what, when, where, why and how questions about the given text
- Question and answer tokens are at most 100 words
- Do not use this or that in the question, use the explicit name of the entity
- Difficulty level: hard. Definition of hard: The answer is not directly in the text, but can be inferred from the text and reasoning is required. There are at most 6 choices, and only one is correct, include None of the above are correct
- Incorporate specific details from the context that require careful reading and recall
- Include questions about the cultural and mythological references used in the context if possible
- Pose questions that require the reader to connect different parts
        """,

    },
]

In [50]:
def auto_format_yaml(yaml_string):
    corrected_yaml_string = yaml_string.replace("```yaml", "").replace("```", "")
    return corrected_yaml_string


def auto_format_json(json_string):
    corrected_json_string = json_string.replace("```json", "").replace("```", "")
    return corrected_json_string


def truncate_content(content, max_length=4096):
    tokens = content.split()
    if len(tokens) * 1.2 > max_length:
        tokens = tokens[:max_length]
        return " ".join(tokens)

    return content


def call_llm(
    chat_client: OpenAI,
    model_name: str,
    prompt,
    structure_output=None,
    raise_exception: bool = False,
    **kwargs,
):
    for retry in range(getattr(chat_client, "max_retries", 3)):
        try:
            if structure_output:
                response = chat_client.beta.chat.completions.parse(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    response_format=structure_output,
                    **kwargs,
                )
                return response.choices[0].message.parsed
            else:
                response = chat_client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    **kwargs,
                )
                return response.choices[0].message.content
        except Exception as e:
            print(f"Retry {retry}: {str(e)}")
            if raise_exception:
                raise e
            continue

In [20]:
# print(create_prompt_from_meta(
#     meta_instruction=qa_instructions[1],
#     input_content=sportseekda_prompts[5].input,
# ).to_prompt())

In [52]:
def create_prompt_from_meta(meta_instruction, input_content: str):
    return PromptMeta(
        name=meta_instruction["name"],
        instruction=meta_instruction["instruction"],
        guideline=meta_instruction.get("guideline", default_guideline)
        + "\n"
        + get_format_instructions(meta_instruction["expected_output"]),
        input=input_content,
        response=meta_instruction.get("response", ""),
        expected_output=meta_instruction["expected_output"],
    )


def create_synthetic_instructions_from_passage(
    chat_client: OpenAI,
    model_name: str,
    content: str,
    prompts: List[dict],
    random_instructions=2,
    multiple_samples=3,
):
    content = truncate_content(content)
    meta_prompts: List[PromptMeta] = [
        create_prompt_from_meta(meta_instruction, content)
        for meta_instruction in prompts
    ]

    results = []
    random_instructions = min(random_instructions, len(meta_instructions))
    prompts_to_use = random.sample(meta_prompts, k=random_instructions)
    for prompt_base in prompts_to_use:
        for _ in range(multiple_samples):
            response = call_llm(
                chat_client=chat_client,
                model_name=model_name,
                prompt=prompt_base.to_prompt(),
                raise_exception=False,
                structure_output=prompt_base.expected_output,
            )
            new_prompt_base = PromptMeta(**prompt_base.model_dump())
            new_prompt_base.response = response
            results.append(new_prompt_base)

    return results


def create_synthetic_instructions_from_wikipage(
    chat_client: OpenAI,
    model_name: str,
    wiki_extract_page,
    topic,
    prompts: List[dict],
    random_instructions=2,
    multiple_samples=3,
):
    content = truncate_content(wiki_extract_page["content_headings"][topic].strip())
    return create_synthetic_instructions_from_passage(
        chat_client=chat_client,
        model_name=model_name,
        content=content,
        prompts=prompts,
        random_instructions=random_instructions,
        multiple_samples=multiple_samples,
    )


def create_synthetic_prompts(synthetic_instructions: List[PromptMeta]):
    res = []
    return_format = [
        {
            "guideline": "Return in YAML format",
            "serializer": yaml.dump,
        },
        {
            "guideline": "Return in JSON format",
            "serializer": json.dumps,
        },
    ]
    for synthetic_instruction in synthetic_instructions:
        try:
            return_format_option = random.choice(return_format)
            cls_output: BaseModel = synthetic_instruction.expected_output
            cls_obj = cls_output.model_validate(synthetic_instruction.response)
            cls_dict = cls_obj.model_dump()

            response = (
                cls_dict["output"]
                if isinstance(cls_dict["output"], str)
                else return_format_option["serializer"](cls_dict["output"])
            )

            res.append(
                PromptMeta(
                    instruction=cls_dict["instruction"],
                    guideline=return_format_option["guideline"],
                    input=cls_dict.get("input", synthetic_instruction.input),
                    response=response,
                )
            )
        except ValidationError as e:
            print(e)
            continue
    return res


def create_synthetic_qa_prompts(
    qa_response_prompts: List[PromptMeta], instruction: str = "Answer the question"
):
    res = []
    for qa_response_prompt in qa_response_prompts:
        try:
            cls_output: BaseModel = qa_response_prompt.expected_output
            list_of_qa_obj: list = cls_output.model_validate(
                qa_response_prompt.response
            )
            list_of_qa = list_of_qa_obj.dict()

            for qa_obj in list_of_qa:
                response = qa_obj["answer"].strip()
                input_content = (
                    qa_obj["question"].strip() + "\n" + qa_obj.get("choices", "")
                )
                res.append(
                    PromptMeta(
                        instruction=instruction,
                        guideline="",
                        input=input_content,
                        response=response,
                    )
                )
        except ValidationError as e:
            print(e, qa_response_prompt.response)
            continue

    return res

In [54]:
response = create_synthetic_instructions_from_wikipage(
    chat_client=chat_openai_client,
    model_name=model_name,
    wiki_extract_page=wiki_extract[121],
    topic="Appearance",
    prompts=meta_instructions,
    random_instructions=1,
    multiple_samples=2,
)
# t = create_synthetic_prompts(response)

In [55]:
response[0].response

InstructionOutput(instruction='Extract relationship between Son Gokū and Sage of Six Paths', output='Son Gokū was known to the Sage of Six Paths during its smaller and leaner days')

In [43]:
t = create_synthetic_prompts(response)

In [56]:
random_instructions = 2
multiple_samples = 1


def generate_and_write_synthetic_prompts(
    client: OpenAI,
    model_name: str,
    data: List,
    output_file_name: str,
    multiple_samples: int,
    random_instructions: int,
):
    count = 1
    for idx in trange(len(data)):
        for topic in tqdm(data[idx]["content_headings"].keys(), desc="Topic"):
            response = create_synthetic_instructions_from_wikipage(
                chat_client=client,
                wiki_extract_page=data[idx],
                model_name=model_name,
                topic=topic,
                prompts=meta_instructions,
                random_instructions=random_instructions,
                multiple_samples=multiple_samples,
            )
            synthetic_prompts = create_synthetic_prompts(response)
            with open(output_file_name, "a+") as f:
                for prompt in synthetic_prompts:
                    f.write(prompt.json() + "\n")
                    count += 1
    
    return count

In [47]:
wiki_extract_sample = random.sample(wiki_extract, 50)

In [49]:
random_instructions = 3
multiple_samples = 1


generate_and_write_synthetic_prompts(
    client=chat_openai_client,
    model_name=model_name,
    data=wiki_extract_sample,
    output_file_name="synthetic_prompts.jsonl",
    multiple_samples=multiple_samples,
    random_instructions=random_instructions,
)

Topic:   0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/50 [00:00<?, ?it/s]


TypeError: create_synthetic_instructions_from_wikipage() got an unexpected keyword argument 'model_name'

## Generate QA reading

In [253]:
random_instructions = 2
multiple_samples = 1


def generate_and_write_question_answer_prompts(
    client: ChatOpenAI,
    data: List[PromptMeta],
    output_file_name: str,
    multiple_samples: int,
    random_instructions: int,
):
    count = 1
    for idx in trange(len(data)):
        for topic in tqdm(data[idx]["content_headings"].keys(), desc="Topic"):
            response = create_synthetic_instructions_from_wikipage(
                wiki_extract_page=data[idx],
                chat_client=client,
                topic=topic,
                prompts=qa_instructions,
                random_instructions=random_instructions,
                multiple_samples=multiple_samples,
            )
            synthetic_prompts = create_synthetic_qa_prompts(response)
            with open(output_file_name, "a+") as f:
                for prompt in synthetic_prompts:
                    f.write(prompt.json() + "\n")
                    count += 1
wiki_extract_sample = random.sample(wiki_extract, 50)
random_instructions = 2
multiple_samples = 1

generate_and_write_question_answer_prompts(
    data=wiki_extract_sample,
    fname="synthetic_qa_prompts.jsonl",
    multiple_samples=multiple_samples,
    random_instructions=random_instructions,
)                    

NameError: name 'wiki_extract' is not defined

# Load from sportseekeda

In [10]:
with open("./prompt_sportseeker_v4.jsonl") as f:
    sportseekda_prompts = [PromptMeta.model_validate_json(line) for line in f]

In [357]:
# model_name = "bartowski/gemma-2-27b-it-GGUF/gemma-2-27b-it-Q4_K_S.gguf"
# client = ChatOpenAI(
#     base_url="http://localhost:1234/v1",
#     api_key="lm_studio",
#     temperature=0.8,
#     model=model_name,
#     max_tokens=2048,
#     n=2,
#     max_retries=1,
# )

model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
client = ChatOpenAI(
    base_url="https://api.deepinfra.com/v1/openai",
    api_key="M8lfE1LIxpckoMXmMi2uW1tzPW2kZAwI",
    temperature=0.5,
    model=model_name,
    max_tokens=2048,
    n=1,
    max_retries=2,
    model_kwargs={
        "top_p": 0.95
    },
)
# response = create_synthetic_instructions_from_passage(
#     chat_client=client,
#     content=sportseekda_prompts[7].input,
#     prompts=[qa_instructions[1]],
#     random_instructions=1,
#     multiple_samples=1,
# )
# t = create_synthetic_qa_prompts(response)

In [232]:
# Calculate total input tokens from qa_instruction and meta_instruction on all sportseekda_prompts
total_prompts = []
for i in range(len(sportseekda_prompts)):
    for meta_instruction in qa_instructions:
        p = create_prompt_from_meta(
                meta_instruction=meta_instruction,
                input_content=sportseekda_prompts[i].input,
            ).to_prompt()
        total_prompts.append(p)

In [235]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4-turbo")
pricing = {
    "gpt-turbo": {
        "input_price_per_million": 0.75,
        "output_price_per_million": 1
    },
    "deepinfra_llama3-70b": {
        "input_price_per_million": 0.52,
        "output_price_per_million": 0.75
    }
}

total_input_tokens = sum([len(encoding.encode(p)) for p in total_prompts])
average_output_tokens = 2048
total_output_tokens = average_output_tokens * len(total_prompts)
pricing["deepinfra_llama3-70b"]["input_price_per_million"] * total_input_tokens / 1e6 + pricing["deepinfra_llama3-70b"]["output_price_per_million"] * total_output_tokens / 1e6

6.293706

In [240]:
random_instructions = 2
multiple_samples = 1


def generate_and_write_synthetic_prompts(
    client: ChatOpenAI,
    data: List[PromptMeta],
    output_file_name: str,
    multiple_samples: int,
    random_instructions: int,
):
    count = 1
    for idx in trange(len(data)):
        response = create_synthetic_instructions_from_passage(
            chat_client=client,
            content=data[idx].input,
            prompts=meta_instructions,
            random_instructions=random_instructions,
            multiple_samples=multiple_samples,
        )
        synthetic_prompts = create_synthetic_prompts(response)
        with open(output_file_name, "a+") as f:
            for prompt in synthetic_prompts:
                f.write(prompt.json() + "\n")
                count += 1

    return count


def generate_and_write_qa_synthetic_prompts(
    client: ChatOpenAI,
    data: List[PromptMeta],
    output_file_name: str,
    multiple_samples: int,
    random_instructions: int,
):
    count = 1
    for idx in trange(len(data)):
        response = create_synthetic_instructions_from_passage(
            chat_client=client,
            content=data[idx].input,
            prompts=qa_instructions,
            random_instructions=random_instructions,
            multiple_samples=multiple_samples,
        )
        synthetic_prompts = create_synthetic_qa_prompts(response)
        with open(output_file_name, "a+") as f:
            for prompt in synthetic_prompts:
                f.write(prompt.json() + "\n")
                count += 1

In [275]:
sub_prompts = random.choices(sportseekda_prompts, k=20)
# subset = set([sub_prompts[i].input for i in range(20)])
# sub_prompts = [p for p in sportseekda_prompts if p.input not in subset]

In [358]:
sub_prompts = random.choices(sportseekda_prompts, k=200)
generate_and_write_synthetic_prompts(
    client=client,
    data=sub_prompts[69:],
    output_file_name="sportseeka_synthetic_prompts.jsonl",
    multiple_samples=1,
    random_instructions=2,
)
# generate_and_write_qa_synthetic_prompts(
#     client=client,
#     data=sportseekda_prompts[270+20:],
#     output_file_name="sportseeka_synthetic_qa_prompts.jsonl",
#     multiple_samples=1,
#     random_instructions=3,
# )

  7%|▋         | 9/131 [02:45<33:41, 16.57s/it]  

1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'Yamanaka Clan': 'A nobl... to the Yamanaka Clan.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type


 11%|█         | 14/131 [03:59<30:32, 15.67s/it]

0 Extra data: line 8 column 1 (char 227)


 12%|█▏        | 16/131 [04:53<38:33, 20.12s/it]

0 Extra data: line 9 column 1 (char 436)
1 Extra data: line 8 column 1 (char 435)


 13%|█▎        | 17/131 [05:39<52:52, 27.83s/it]

1 validation error for InstructionOutput
  Input should be a valid dictionary or instance of InstructionOutput [type=model_type, input_value=AIMessage(content='```\n{...de-a589-df89e781f7b2-0'), input_type=AIMessage]
    For further information visit https://errors.pydantic.dev/2.7/v/model_type


 14%|█▎        | 18/131 [05:53<45:01, 23.91s/it]

0 Extra data: line 9 column 1 (char 249)
1 Extra data: line 8 column 1 (char 252)


 15%|█▍        | 19/131 [06:46<1:00:52, 32.61s/it]

1 validation error for InstructionOutput
  Input should be a valid dictionary or instance of InstructionOutput [type=model_type, input_value=AIMessage(content='```\n{...fb-afd5-e034b84cc79e-0'), input_type=AIMessage]
    For further information visit https://errors.pydantic.dev/2.7/v/model_type


 15%|█▌        | 20/131 [06:58<48:35, 26.27s/it]  

0 Extra data: line 8 column 1 (char 263)


 20%|█▉        | 26/131 [08:52<30:15, 17.29s/it]

0 Invalid control character at: line 4 column 15 (char 177)


 25%|██▌       | 33/131 [10:55<25:27, 15.59s/it]

1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'Uzumaki Clan Overview':...life force and Chakra.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type


 27%|██▋       | 35/131 [11:16<20:53, 13.06s/it]

0 Extra data: line 7 column 1 (char 374)
1 Extra data: line 7 column 1 (char 482)


 27%|██▋       | 36/131 [11:57<33:59, 21.46s/it]

1 validation error for InstructionOutput
  Input should be a valid dictionary or instance of InstructionOutput [type=model_type, input_value=AIMessage(content='```\n{...51-8cad-18e866028449-0'), input_type=AIMessage]
    For further information visit https://errors.pydantic.dev/2.7/v/model_type
0 Extra data: line 9 column 1 (char 555)


 28%|██▊       | 37/131 [13:34<1:08:50, 43.94s/it]

1 Extra data: line 9 column 1 (char 487)
1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'Ninja': 'A covert agent...ield the Wood Release.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
1 validation error for InstructionOutput
  Input should be a valid dictionary or instance of InstructionOutput [type=model_type, input_value=AIMessage(content='```\n{...c5-ba98-a5c78c31815d-0'), input_type=AIMessage]
    For further information visit https://errors.pydantic.dev/2.7/v/model_type


 32%|███▏      | 42/131 [14:48<27:08, 18.30s/it]  

0 Invalid control character at: line 4 column 17 (char 181)


 33%|███▎      | 43/131 [15:10<28:28, 19.42s/it]

1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'Ninja': 'A covert agent... in the Naruto series.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type


 34%|███▎      | 44/131 [15:28<27:20, 18.85s/it]

0 Invalid control character at: line 4 column 79 (char 263)


 34%|███▍      | 45/131 [16:21<41:45, 29.14s/it]

1 Invalid control character at: line 4 column 79 (char 263)
1 validation error for InstructionOutput
  Input should be a valid dictionary or instance of InstructionOutput [type=model_type, input_value=AIMessage(content='```\n{...db-9f06-4e9d99cb7510-0'), input_type=AIMessage]
    For further information visit https://errors.pydantic.dev/2.7/v/model_type


 35%|███▌      | 46/131 [16:31<33:07, 23.39s/it]

0 Extra data: line 7 column 1 (char 201)


 36%|███▌      | 47/131 [17:37<50:59, 36.43s/it]

1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'background': 'Yurui is ...e in the second stage."}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type


 36%|███▌      | 47/131 [2:22:45<4:15:07, 182.24s/it]


KeyboardInterrupt: 

In [270]:
generate_and_write_qa_synthetic_prompts(
    client=client,
    data=sub_prompts,
    output_file_name="sportseeka_qa_synthetic_prompts.jsonl",
    multiple_samples=1,
    random_instructions=3,
)

 55%|█████▌    | 11/20 [23:54<19:57, 133.04s/it]

0 Expecting ',' delimiter: line 41 column 1 (char 1592)
0 Expecting ',' delimiter: line 41 column 1 (char 1951)


 60%|██████    | 12/20 [27:50<21:54, 164.26s/it]

0 Expecting ',' delimiter: line 42 column 1 (char 2880)


 70%|███████   | 14/20 [32:28<14:57, 149.64s/it]

0 Expecting ',' delimiter: line 41 column 1 (char 1383)


100%|██████████| 20/20 [47:11<00:00, 141.59s/it]


In [None]:
generate_and_write_qa_synthetic_prompts(
    client=client,
    data=sub_prompts,
    output_file_name="sportseeka_qa_synthetic_prompts.jsonl",
    multiple_samples=1,
    random_instructions=3,
)

In [291]:
# Verify by reward modelling and duplication detection

In [292]:
import sys
sys.path.append("..")

from src.dedup import deduplicated_contents

In [325]:
with open("sportseeka_synthetic_qa_prompts.jsonl", "r") as f:
    prompts = [json.loads(l) for l in f.readlines()]
input_prompts = [prompt["input"] for prompt in prompts]
print(len(prompts))
deduplicated_indices = deduplicated_contents(input_prompts, return_indices=True, ngrams=10, num_perm=64)
deduplicated_prompts = [prompts[i] for i in deduplicated_indices]
with open("deduplicated_synthetic_qa_prompts.jsonl", "w") as f:
    for prompt in deduplicated_prompts:
        f.write(json.dumps(prompt) + "\n")

9965


In [380]:
fpaths = [
    # "prompt_wiki_with_chunk_v5_512.jsonl",
    # "prompt_wiki_with_chunk_v5_1024.jsonl",
    # "prompt_wiki_with_chunk_v5_2048.jsonl",
]

for fpath in fpaths:
    with open(fpath) as f:
        prompts = [json.loads(l) for l in f.readlines()]

    input_prompts = [prompt["input"] for prompt in prompts]
    deduplicated_indices = deduplicated_contents(input_prompts, return_indices=True, ngrams=10, threshold=0.7, num_perm=256)
    deduplicated_prompts = [prompts[i] for i in deduplicated_indices]
    with open(f"deduplicated_{fpath}", "w") as f:
        for prompt in deduplicated_prompts:
            f.write(json.dumps(prompt) + "\n")

In [376]:
with open("prompt_sportseeker_v4.jsonl") as f:
    prompts = [json.loads(l) for l in f.readlines()]

input_prompts = [prompt["input"] for prompt in prompts]
deduplicated_indices = deduplicated_contents(input_prompts, return_indices=True, ngrams=10, threshold=0.7, num_perm=256)
deduplicated_prompts = [prompts[i] for i in deduplicated_indices]

In [322]:
len(deduplicated_prompts)

6521

In [307]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, pipeline
import datasets
import json
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import trange
model_path = "OpenAssistant/reward-model-deberta-v3-large-v2"
model_path = "sileod/deberta-v3-large-tasksource-rlhf-reward-model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
pipe = pipeline(model=model, tokenizer=tokenizer, task="text-classification", device="mps")

In [308]:
default_template = """
### Instruction
{instruction}
### Input
{input}
### Response
{response}
"""

In [349]:
fpaths = [
    "deduplicated_synthetic_qa_prompts.jsonl",
    # "synthetic_prompts.jsonl",
]
for fpath in fpaths:
    with open(fpath) as f:
        prompts = [json.loads(l) for l in f.readlines()]
    # scores = pipe(
    #     [
    #         default_template.format(
    #             instruction=prompts[i]["instruction"],
    #             input=prompts[i]["input"],
    #             response=prompts[i]["response"],
    #         )
    #         for i in range(len(prompts))
    #     ],
    #     max_length=512,
    #     truncation=True,
    #     batch_size=128,
    # )
    # # Get idx of low quality prompts
    threshold = 0.1
    with open(f"high_quality_{fpath}", "w") as f:
        for prompt, score in zip(prompts, scores):
            if score["score"] > threshold:
                f.write(json.dumps(prompt) + "\n")

In [330]:
# Get idx of low quality prompts
low_quality_prompts = [(prompts[i], s["score"]) for i, s in enumerate(scores) if s["score"] < 0.1]

In [335]:
low_quality_prompts[0:10]

[({'name': None,
   'instruction': 'Answer the question',
   'guideline': '',
   'input': 'What can Black Zetsu do while controlling a target with Body Coating?\n',
   'response': "While possessing a target's body, Black Zetsu can utilize their techniques, Kekkei Genkai, and any other abilities they might have. He can also control the victim and stabilize their life force.",
   'expected_output': None},
  0.09773506224155426),
 ({'name': None,
   'instruction': 'Answer the question',
   'guideline': '',
   'input': 'Can Earth Release be used to inflict harm on enemies?\n',
   'response': 'Yes, Earth Release can be broken up and turned into weapons to inflict harm on their enemies.',
   'expected_output': None},
  0.05670599266886711),
 ({'name': None,
   'instruction': 'Answer the question',
   'guideline': '',
   'input': 'What is the difference between One-Finger Nukite and other variants of it?\n',
   'response': 'The potency and destructive power are increased as the number of fing

# Change instruction

In [350]:
import json
with open("high_quality_deduplicated_synthetic_qa_prompts.jsonl") as f:
    prompts = [json.loads(l) for l in f.readlines()]

In [352]:
import random

def is_multiple_question(question):
    if "A." in question or "B." in question or "C." in question or "D." in question:
        return True
    return False


multiple_choices_instructions = [
    "Answer multiple-choice questions",
    "Answer multiple-choice questions below",
    "Answer the following multiple-choice questions",
    "Answer the multiple-choice questions",
]
question_answer_instructions = [
    "Answer the question",
    "Answer the following question",
    "Answer the question below",
    "Provide the most relevant answer to specific question",
]

def lpoprpush(arr):
    item = arr[0]
    arr.pop(0)
    arr.append(item)
    return item

def change_instruction(prompt):
    if is_multiple_question(prompt["input"]):
        prompt["instruction"] = lpoprpush(multiple_choices_instructions)
    else:
        prompt["instruction"] = lpoprpush(question_answer_instructions)
    return prompt

In [353]:
new_prompts = [change_instruction(prompt) for prompt in prompts]

In [354]:
with open("high_quality_deduplicated_synthetic_qa_prompts.jsonl", "w") as f:
    for prompt in new_prompts:
        f.write(json.dumps(prompt) + "\n")