In [5]:
from openai import OpenAI
from typing import List, Optional, Any
from typing import List, Tuple, Dict, Any, Union
from enum import Enum
import json
import jinja2
import random
import re
from tqdm import trange, tqdm
from pydantic import BaseModel, Field, TypeAdapter, RootModel, ValidationError

In [6]:
default_guideline = """
- Both contents are at most 50 words
- Do not use this or that in the response, instead use the name of the entity
- Be creative, factfulness \n
"""
default_prompt_template = """
### Instruction
{instruction}
### Guideline
{guideline}

### Input
{input}
### Response
{response}
"""


class QuestionAnswering(BaseModel):
    instruction: str = Field(..., title="Instruction")
    output: Optional[str] = Field(
        None, title="Output", description="Output of the instruction"
    )


class InstructionOutput(BaseModel):
    instruction: str = Field(..., title="Instruction")
    output: Optional[str] = Field(
        None, title="Output", description="Output of the instruction"
    )


class InstructionInputOuput(BaseModel):
    instruction: str = Field(..., title="Instruction")
    input: str = Field(..., title="Input", description="Input to the instruction")
    output: Optional[str] = Field(
        None, title="Output", description="Output of the instruction"
    )


class QuestionAnswering(BaseModel):
    question: str = Field()
    answer: str = Field()


class MultipleChoiceQuestionAnswering(BaseModel):
    question: str = Field()
    choices: str = Field(
        description="Multiple choices for the question with structure: A. Choice 1\n B. Choice 2\n C. Choice 3\n D. Choice 4, etc."
    )
    # choices: List[Choices] = Field()
    answer: str = Field(
        description="Answer to the question, value should be single letter such as A, B, C, D, etc."
    )


class EntityKeyValue(BaseModel):
    entity_name: str = Field(..., title="Entity Name", description="Name of the entity")
    entity_value: str = Field(
        ..., title="Entity Value", description="Value of the entity"
    )


class EntityExtractionOutput(BaseModel):
    instruction: str = Field(
        ...,
        title="Instruction",
    )
    output: List[EntityKeyValue] = Field(
        ..., title="Output", description="Output of the instruction"
    )


class PromptMeta(BaseModel):
    name: Optional[str] = Field(
        default=None, title="Name", description="Name of the prompt"
    )
    instruction: str = Field(title="Instruction")
    guideline: Optional[str] = Field(
        default=default_guideline,
        title="Guideline",
        description="Guideline to be followed",
    )
    input: Optional[str] = Field(
        default=None, title="Input", description="Input to the instruction"
    )
    response: Optional[str] = Field(
        default=None, title="Response", description="Response to the instruction"
    )
    expected_output: Optional[Any] = Field(
        default=None,
        title="Expected Output",
        description="Expected output of the instruction",
    )

    def to_prompt(self) -> str:
        return default_prompt_template.format(
            instruction=self.instruction,
            guideline=self.guideline,
            input=self.input,
            response=self.response,
        )


ListQuestionAnswering = RootModel[List[QuestionAnswering]]
ListMultipleChoiceQuestionAnswering = RootModel[List[MultipleChoiceQuestionAnswering]]

YAML_FORMAT_INSTRUCTIONS = jinja2.Template(
    """The output should be formatted as a YAML instance that conforms to the given JSON schema below.

# Examples
## Schema
```
{"properties": {"habit": { "description": "A common daily habit", "type": "string" }, "sustainable_alternative": { "description": "An environmentally friendly alternative to the habit", "type": "string"}}}, "required": ["habit", "sustainable_alternative"]}}
```
## Well formatted instance
```
habit: Using disposable water bottles for daily hydration
sustainable_alternative: |
    Fire Style: Majestic Flame Destroyer
``` 
Please follow the standard YAML formatting conventions with correct indentations, and make sure that the data types adhere strictly to the following JSON schema: 
```
{{schema}}
```
Always use block scalar literal style '|' in YAML if answers include special characters such as colons, dashs
Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!"""
)

JSON_FORMAT_INSTRUCTIONS = jinja2.Template(
    """The output should be formatted as a JSON instance that conforms to the given JSON schema below.
                                           
# Examples
## Schema
```
{"properties": {"habit": { "description": "A common daily habit", "type": "string" }, "sustainable_alternative": { "description": "An environmentally friendly alternative to the habit", "type": "string"}}}, "required": ["habit", "sustainable_alternative"]}}
```
## Well formatted instance
```
{
    "habit": "Using disposable water bottles for daily hydration",
    "sustainable_alternative": "Fire Style: Majestic Flame Destroyer"
}
```
Please follow the standard JSON formatting conventions with correct indentations, and make sure that the data types adhere strictly to the following JSON schema:
```
{{schema}}
```
Use escape double quotes in a string is by using backslashes (\)
Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!"""
)


def get_format_instructions(cls: BaseModel) -> str:
    schema = cls.model_json_schema()

    # Remove extraneous fields.
    reduced_schema = schema
    if "title" in reduced_schema:
        del reduced_schema["title"]
    if "type" in reduced_schema:
        del reduced_schema["type"]
    # Ensure yaml in context is well-formed with double quotes.
    schema_str = json.dumps(reduced_schema)
    return JSON_FORMAT_INSTRUCTIONS.render(schema=schema_str)

In [7]:
model_name = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
api_key = "M8lfE1LIxpckoMXmMi2uW1tzPW2kZAwI"
base_url = "https://api.deepinfra.com/v1/openai"

# Assume openai>=1.0.0
from openai import OpenAI

# Create an OpenAI client with your deepinfra token and endpoint
chat_openai_client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)

In [8]:
# Assume openai>=1.0.0
from openai import OpenAI

# Create an OpenAI client with your deepinfra token and endpoint
chat_openai_client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)

# chat_completion = chat_openai_client.beta.chat.completions.parse(
#     model=model_name,
#     messages=[{"role": "user", "content": "Hello"}],
#     response_format=QuestionAnswering
# )

# print(chat_completion.choices[0].message.content)
# print(chat_completion.usage.prompt_tokens, chat_completion.usage.completion_tokens)

In [9]:
# # meta_instructions = [
# #     {
# #         "name": "Information Extraction : Extracting people, location, events from text",
# #         "instruction": "Generate an instruction and a output related to Information Extraction : Extracting people, location, events from text, in the context of Naruto manga, return in YAML format",
# #         "level": 1,
# #         "expected_output": EntityExtractionOutput,
# #         "guideline": "\nIgnore entity value empty",
# #     },
# #     {
# #         "name": "Relation Extraction : Discovering and categorizing relationships between entities within a text",
# #         "instruction": "Generate an instruction and a output related to Relation Extraction : Discovering and categorizing relationships between entities within a text, in the context of Naruto manga return in YAML format",
# #         "level": 1,
# #         "expected_output": QuestionAnswering,
# #     },
# #     {
# #         "name": "Give me the definition of the words in this context about Naruto manga. Focus on noun term. Naruto manga has some influences from Buddhist and Japan culture of ninja, samurai",
# #         "instruction": "Generate an instruction and a output related to Give me the definition of the words in this context about Naruto manga. Focus on noun term. Naruto manga has some influences from Buddhist and Japan culture of ninja, samurai, return in YAML format",
# #         "level": 1,
# #         "expected_output": InstructionOutput,
# #     },
# #     {
# #         "name": "Replace the <mask> token in the text with proper words that are consistent with the context. You can use multiple words for each <mask> token",
# #         "instruction": "Generate an instruction, a input and a output related to Replace the MASK token in the text with proper words that are consistent with the context, return in YAML format",
# #         "level": 1,
# #         "expected_output": QuestionAnswering,
# #         "guideline": "Both contents contain at most 3 sentences, tokens are at most 100 words\nBe creative, factfulness",
# #     },
# #     {
# #         "name": "Verify if the claim is true or false based on provided context. It is false, explain why",
# #         "instruction": "Generate an instruction and a output related to Verify if the claim is true or false based on provided context. It is false, explain why, return in YAML format",
# #         "expected_output": QuestionAnswering,
# #     },
# #     {
# #         "name": "Intent Recognition : Determining the author's intention or goal behind a given text",
# #         "instruction": "Generate an instruction and a output related to Intent Recognition : Determining the author's intention or goal behind a given text based on a given text. Background: Naruto is fictional manga written by Japanese artist, which has some influences from Buddhist and Japan culture of ninja, samurai, return in YAML format",
# #         "level": 1,
# #         "expected_output": QuestionAnswering,
# #     },
# # ]
# meta_instructions = [
#     {
#         "name": "Character Analysis",
#         "instruction": "Generate an instruction and output related to analyzing character traits, development, and motivations in Naruto",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "examples": [
#             {
#                 "instruction": "Analyze how Naruto's childhood experiences shaped his dream of becoming Hokage",
#                 "output_format": "essay",
#                 "complexity": "high",
#             }
#         ],
#         "guideline": "Focus on psychological depth and character development arcs",
#     },
#     {
#         "name": "Power System Analysis",
#         "instruction": "Generate questions and answers about chakra, jutsu, and ninja techniques",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "examples": [
#             {
#                 "instruction": "Explain the different types of chakra nature transformations and their relationships",
#                 "output_format": "structured_explanation",
#                 "complexity": "medium",
#             }
#         ],
#         "guideline": "Include technical details and specific examples from the manga/anime",
#     },
#     {
#         "name": "Event Timeline Construction",
#         "instruction": "Create questions about the chronological order of major events in the Naruto universe",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "examples": [
#             {
#                 "instruction": "Order these events chronologically: Fourth Shinobi War, Pain's Assault, Uchiha Massacre",
#                 "output_format": "multiple_choice",
#                 "complexity": "medium",
#             }
#         ],
#         "guideline": "Ensure historical accuracy and causal relationships between events",
#     },
#     {
#         "name": "World Building Comprehension",
#         "instruction": "Generate questions about the political structure, village systems, and ninja world organization",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "examples": [
#             {
#                 "instruction": "Describe the relationship between the Five Great Ninja Villages and their respective Kage",
#                 "output_format": "structured_analysis",
#                 "complexity": "high",
#             }
#         ],
#         "guideline": "Include political, economic, and social aspects of the ninja world",
#     },
#     {
#         "name": "Battle Analysis",
#         "instruction": "Create detailed analysis of significant battles, strategies, and fighting techniques",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "examples": [
#             {
#                 "instruction": "Analyze the strategic elements in the battle between Naruto and Pain",
#                 "output_format": "detailed_analysis",
#                 "complexity": "high",
#             }
#         ],
#         "guideline": "Focus on tactical decisions, jutsu usage, and battle outcomes",
#     },
#     {
#         "name": "Theme Exploration",
#         "instruction": "Generate questions exploring major themes like friendship, sacrifice, and cycle of hatred",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "examples": [
#             {
#                 "instruction": "How does the series explore the theme of breaking the cycle of hatred?",
#                 "output_format": "essay",
#                 "complexity": "high",
#             }
#         ],
#         "guideline": "Connect themes to specific story events and character developments",
#     },
#     {
#         "name": "Cross-Reference Analysis",
#         "instruction": "Create questions that require connecting information from different story arcs or time periods",
#         "level": 4,
#         "expected_output": InstructionOutput,
#         "examples": [
#             {
#                 "instruction": "Compare the similarities between the Sage of Six Paths' story and Naruto's journey",
#                 "output_format": "comparative_analysis",
#                 "complexity": "high",
#             }
#         ],
#         "guideline": "Focus on connecting seemingly unrelated elements of the story",
#     },
# ]

# number_of_instances = 10
# qa_instructions = [
#     {
#         "name": "Reading Comprehension : Understanding and answering questions based on a given text",
#         "level": 1,
#         "instruction": f"Generate {number_of_instances} question and answer pairs related to Reading Comprehension : Understanding and answering questions based on a given text",
#         "expected_output": QuestionAnsweringOutput,
#         "guideline": """
# - Question about who, what, when, where, why and how questions about the given text
# - Question and answer tokens are at most 50 words
# - Do not use this or that in the question, use the explicit name of the entity
# - Difficulty level: easy. Definition of easy: The answer is directly in the text
# """,
#     },
#     {
#         "name": "Reading Comprehension : Understanding and answering questions based on a given text",
#         "level": 1,
#         "instruction": f"Generate {number_of_instances} question and answer pairs related to Reading Comprehension : Understanding and answering questions based on a given text",
#         "expected_output": QuestionAnsweringOutput,
#         "guideline": """
# - Questions cover cover a range of topics, including character motivations, plot twists, specific dialogues, who, what, when, where, why and how questions about the given text
# - Question and answer tokens are at most 100 words
# - Do not use this or that in the question, use the explicit name of the entity
# - Draft questions that require in-depth knowledge and critical thinking, Avoid straightforward questions. Instead, focus on nuanced and detailed aspects of the manga
# - Consider aspects like character backgrounds, hidden abilities, and lesser-known facts.
# - Difficulty level: medium. Definition of medium: The answer is not directly in the text, but can be inferred from the text. For example, relationship between two people, or the cause of an event
# """
#     },
#     {
#         "name": "Reading Comprehension : Understanding and answering questions based on a given text",
#         "level": 1,
#         "instruction": f"Using the following paragraph, write {number_of_instances} multiple-choices question-answer pairs",
#         "expected_output": MultipleChoiceQuestionAnsweringOutput,
#         "guideline": """
# - Questions cover cover a range of topics, including character motivations, plot twists, specific dialogues, who, what, when, where, why and how questions about the given text
# - Question and answer tokens are at most 100 words
# - Do not use this or that in the question, use the explicit name of the entity
# - Difficulty level: hard. Definition of hard: The answer is not directly in the text, but can be inferred from the text and reasoning is required. There are at most 6 choices, and only one is correct, include None of the above are correct
# - Incorporate specific details from the context that require careful reading and recall
# - Include questions about the cultural and mythological references used in the context if possible
# - Pose questions that require the reader to connect different parts
#         """,

#     },
# ]

In [10]:
%%markdown
# Generate synthetic dataset
- Classify data into characters, clans, jutsu, and villages
- Each class has meta synthetic prompts to generate questions and answers, long-form answers, and multiple-choice questions
- Generate 10 instances for each class

# Generate synthetic dataset
- Classify data into characters, clans, jutsu, and villages
- Each class has meta synthetic prompts to generate questions and answers, long-form answers, and multiple-choice questions
- Generate 10 instances for each class


In [11]:
class GroupType(Enum):
    CHARACTERS = "characters"
    CLANS = "clans"
    JUTSU = "jutsu"
    VILLAGES = "villages"
    CHAKRA = "chakra"
    PLOT = "plot"
    BUDDHIST = "buddhist"
meta_instructions = [
    {
        "name": "Character Analysis",
        "instruction": "Generate an instruction and output related to analyzing character traits, development, and motivations in Naruto",
        "level": 2,
        "expected_output": InstructionOutput,
        "examples": [
            {
                "instruction": "Analyze how Naruto's childhood experiences shaped his dream of becoming Hokage",
                "output_format": "essay",
                "complexity": "high",
            }
        ],
        "guideline": "Focus on psychological depth and character development arcs",
        "group": GroupType.CHARACTERS,
    },
    {
        "name": "Power System Analysis",
        "instruction": "Generate an instruction and output about chakra",
        "level": 2,
        "expected_output": InstructionOutput,
        "examples": [
            {
                "instruction": "Explain the different types of chakra nature transformations and their relationships",
                "output_format": "structured_explanation",
                "complexity": "medium",
            }
        ],
        "guideline": "Include technical details and specific examples from the manga/anime",
        "group": GroupType.CHAKRA
    },
    {
        "name": "Power System Analysis",
        "instruction": "Generate an instruction and output about jutsu and ninja techniques",
        "level": 2,
        "expected_output": InstructionOutput,
        "examples": [
            {
                "instruction": "Explain the different types of chakra nature transformations and their relationships",
                "output_format": "structured_explanation",
                "complexity": "medium",
            }
        ],
        "guideline": "Include technical details and specific examples from the manga/anime",
        "group": GroupType.JUTSU
    },
    {
        "name": "Event Timeline Construction",
        "instruction": "Generate an instruction and output about the chronological order of major events in the Naruto universe",
        "level": 3,
        "expected_output": InstructionOutput,
        "examples": [
            {
                "instruction": "Order these events chronologically: Fourth Shinobi War, Pain's Assault, Uchiha Massacre",
                "output_format": "multiple_choice",
                "complexity": "medium",
            }
        ],
        "guideline": "Ensure historical accuracy and causal relationships between events",
        "group": GroupType.PLOT
    },
    {
        "name": "World Building Comprehension",
        "instruction": "Generate an instruction and output about the political structure, village systems, and ninja world organization",
        "level": 2,
        "expected_output": InstructionOutput,
        "examples": [
            {
                "instruction": "Describe the relationship between the Five Great Ninja Villages and their respective Kage",
                "output_format": "structured_analysis",
                "complexity": "high",
            }
        ],
        "guideline": "Include political, economic, and social aspects of the ninja world",
        "group": GroupType.VILLAGES
    },
    {
        "name": "Battle Analysis",
        "instruction": "Generate an instruction and output of significant battles, strategies, and fighting techniques",
        "level": 3,
        "expected_output": InstructionOutput,
        "examples": [
            {
                "instruction": "Analyze the strategic elements in the battle between Naruto and Pain",
                "output_format": "detailed_analysis",
                "complexity": "high",
            }
        ],
        "guideline": "Focus on tactical decisions, jutsu usage, and battle outcomes. Return at most 6 sentences",
        "group": GroupType.PLOT
    },
    {
        "name": "Theme Exploration",
        "instruction": "Generate an instruction and output exploring major themes like friendship, sacrifice, and cycle of hatred",
        "level": 3,
        "expected_output": InstructionOutput,
        "examples": [
            {
                "instruction": "How does the series explore the theme of breaking the cycle of hatred?",
                "output_format": "essay",
                "complexity": "high",
            }
        ],
        "guideline": "Connect themes to specific story events and character developments",
        "group": GroupType.PLOT
    },
    {
        "name": "Cross-Reference Analysis",
        "instruction": "Generate an instruction and output that require connecting information from different story arcs or time periods",
        "level": 4,
        "expected_output": InstructionOutput,
        "examples": [
            {
                "instruction": "Compare the similarities between the Sage of Six Paths' story and Naruto's journey",
                "output_format": "comparative_analysis",
                "complexity": "high",
            }
        ],
        "guideline": "Focus on connecting seemingly unrelated elements of the story",
        "group": GroupType.PLOT
    },
    {
        "name": "Jutsu Classification",
        "instruction": "Generate an instruction and output about categorizing different types of jutsu",
        "level": 1,
        "expected_output": InstructionOutput,
        "example": {
            "instruction": "Classify these jutsu by type: Chidori, Shadow Clone, Gentle Fist",
            "categories": ["Ninjutsu", "Taijutsu"],
        },
        "group": GroupType.JUTSU
    },
    {
        "name": "Multiple-Choice Character Abilities",
        "instruction": "Generate an instruction and output about character abilities/jutsu at least 4 choices, at most 8 choices, only one correct answer",
        "level": 1,
        "expected_output": InstructionOutput,
        "example": {
            "question": "What type of chakra nature is Naruto's Rasengan?",
            "choices": ["Wind", "Lightning", "No nature", "Fire"],
            "correct": 2,
        },
        "group": GroupType.CHARACTERS
    },    
        {
        "name": "Cause-Effect Analysis",
        "instruction": "Generate an instruction and output about cause-effect relationships in major plot points",
        "level": 3,
        "expected_output": InstructionOutput,
        "example": {
            "instruction": "How did Pain's attack on Konoha ultimately affect Naruto's development as a character?",
            "expected_elements": [
                "personal growth",
                "village recognition",
                "understanding pain",
            ],
        },
        "group": GroupType.PLOT
    },
    {
        "name": "Cultural Reference Identification",
        "instruction": "Generate an instruction and output about Japanese cultural and Buddhist references in Naruto",
        "level": 2,
        "expected_output": InstructionOutput,
        "example": {
            "instruction": "Which Buddhist concept is reflected in Pain's philosophy?",
            "choices": ["Karma", "Samsara", "Nirvana", "Dharma"],
        },
        "group": GroupType.BUDDHIST
    },    
    {
        "name": "Plot Hole Resolution",
        "instruction": "Generate an instruction and output that address apparent plot inconsistencies",
        "level": 4,
        "expected_output": InstructionOutput,
        "example": {
            "instruction": "Explain how Kakashi still had Sharingan abilities after giving his eye to Obito",
            "key_points": ["temporary power", "chakra residue"],
        },
        "group": GroupType.PLOT
    },    
    {
        "name": "Inter-Village Relations",
        "instruction": "Generate an instruction and output about diplomatic relationships between ninja villages",
        "level": 3,
        "expected_output": InstructionOutput,
        "example": {
            "instruction": "How did the alliance system change from pre-war to post-war ninja world?",
            "aspects": ["treaties", "trade", "military cooperation"],
        },
        "group": GroupType.VILLAGES
    },    
    {
        "name": "Clan Politics",
        "instruction": "Generate an instruction and output about political influence of major clans",
        "level": 2,
        "expected_output": InstructionOutput,
        "example": {
            "instruction": "Compare the political influence of the Uchiha and Hyuga clans in Konoha",
            "focus_areas": [
                "village authority",
                "internal autonomy",
                "political restrictions",
            ],
        },
        "group": GroupType.CLANS
    },    
    {
        "name": "Military Hierarchy",
        "instruction": "Generate an instruction and output about the ninja rank system and its political implications",
        "level": 2,
        "expected_output": InstructionOutput,
        "example": {
            "instruction": "Analyze how the ANBU's role affects village political structure",
            "elements": ["black ops", "hokage loyalty", "political balance"],
        },
        "group": GroupType.VILLAGES
    },
    {
        "name": "Emptiness and Illusion",
        "instruction": "Generate an instruction and output about genjutsu and reality perception",
        "level": 4,
        "expected_output": InstructionOutput,
        "example": {
            "instruction": "How does the Infinite Tsukuyomi reflect Buddhist concepts of Maya (illusion)?",
            "elements": ["reality", "perception", "truth"],
        },
        "group": GroupType.BUDDHIST
    }, 
]

In [12]:
# qa_instructions = [
#     {
#         "name": "Cause-Effect Analysis",
#         "instruction": "Generate a question and an answer about cause-effect relationships in major plot points",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "How did Pain's attack on Konoha ultimately affect Naruto's development as a character?",
#             "expected_elements": [
#                 "personal growth",
#                 "village recognition",
#                 "understanding pain",
#             ],
#         },
#     },

#     {
#         "name": "Power System Comparison",
#         "instruction": "Create a question and an answer comparing different ninja techniques or power systems",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Compare and contrast Sage Mode and Six Paths Mode",
#             "aspects": ["power source", "limitations", "abilities"],
#         },
#     },
#     {
#         "name": "Cultural Reference Identification",
#         "instruction": "Generate questions and answers about Japanese cultural and Buddhist references in Naruto",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Which Buddhist concept is reflected in Pain's philosophy?",
#             "choices": ["Karma", "Samsara", "Nirvana", "Dharma"],
#         },
#     },
#     {
#         "name": "Battle Strategy Analysis",
#         "instruction": "Create question and an answer about battle tactics and strategy used in key fights",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Analyze the strategy Shikamaru used to defeat Hidan",
#             "key_elements": ["preparation", "deception", "terrain usage"],
#         },
#     },
#     {
#         "name": "Character Relationship Web",
#         "instruction": "Generate questions and answers about complex character relationships and their development",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Explain the evolution of Team 7's relationships throughout the series",
#             "focus_points": ["bonds", "conflicts", "reconciliation"],
#         },
#     },
#     {
#         "name": "Theme Identification",
#         "instruction": "Create questions and answers about major themes and their manifestation in the story",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "How is the theme of 'cycle of hatred' portrayed through different generations?",
#             "key_themes": ["revenge", "understanding", "peace"],
#         },
#     },
#     {
#         "name": "Jutsu Classification",
#         "instruction": "Generate questions and answers about categorizing different types of jutsu",
#         "level": 1,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Classify these jutsu by type: Chidori, Shadow Clone, Gentle Fist",
#             "categories": ["Ninjutsu", "Taijutsu"],
#         },
#     },
#     {
#         "name": "Plot Hole Resolution",
#         "instruction": "Create questions and answers that address apparent plot inconsistencies",
#         "level": 4,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Explain how Kakashi still had Sharingan abilities after giving his eye to Obito",
#             "key_points": ["temporary power", "chakra residue"],
#         },
#     },
#     {
#         "name": "Village Power Structure",
#         "instruction": "Generate questions and answers about the political hierarchy and power dynamics within ninja villages",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Explain the balance of power between the Hokage, village council, and clan heads in Konoha",
#             "key_points": ["decision making", "checks and balances", "clan autonomy"],
#         },
#     },
#     {
#         "name": "Inter-Village Relations",
#         "instruction": "Create questions and answers about diplomatic relationships between ninja villages",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "How did the alliance system change from pre-war to post-war ninja world?",
#             "aspects": ["treaties", "trade", "military cooperation"],
#         },
#     },
#     {
#         "name": "Clan Politics",
#         "instruction": "Generate questions and answers about political influence of major clans",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Compare the political influence of the Uchiha and Hyuga clans in Konoha",
#             "focus_areas": [
#                 "village authority",
#                 "internal autonomy",
#                 "political restrictions",
#             ],
#         },
#     },
#     {
#         "name": "Military Hierarchy",
#         "instruction": "Create questions and answers about the ninja rank system and its political implications",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Analyze how the ANBU's role affects village political structure",
#             "elements": ["black ops", "hokage loyalty", "political balance"],
#         },
#     },
#     {
#         "name": "Cycle of Reincarnation",
#         "instruction": "Generate questions and answers about karmic cycles in Naruto's narrative",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "How does Indra and Ashura's reincarnation cycle reflect Buddhist concepts?",
#             "aspects": ["karma", "dualism", "spiritual inheritance"],
#         },
#     },
#     {
#         "name": "Path of Pain Philosophy",
#         "instruction": "Create questions and answers about Pain's philosophy and its Buddhist foundations",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Compare Pain's Six Paths to Buddhist concepts of suffering",
#             "elements": ["noble truths", "paths of existence", "cycle of pain"],
#         },
#     },
#     {
#         "name": "Enlightenment Journey",
#         "instruction": "Generate questions and answers about character enlightenment paths",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "How does Naruto's sage training parallel Buddhist enlightenment concepts?",
#             "key_concepts": ["meditation", "natural energy", "spiritual balance"],
#         },
#     },
#     {
#         "name": "Moral Dualism",
#         "instruction": "Create questions and answers about the series' treatment of good and evil",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Analyze how Naruto's treatment of villains reflects Buddhist teachings about duality",
#             "concepts": ["non-dualism", "understanding", "forgiveness"],
#         },
#     },
#     {
#         "name": "Four Noble Truths Parallel",
#         "instruction": "Generate questions connecting plot elements to Buddhist Four Noble Truths",
#         "level": 4,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "How do the ninja world's cycles of war reflect Buddhist concepts of suffering?",
#             "elements": ["cause of suffering", "path to peace", "understanding truth"],
#         },
#     },
#     {
#         "name": "Spiritual Power Systems",
#         "instruction": "Create questions and answers about chakra system's Buddhist origins",
#         "level": 2,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "Compare Naruto's chakra system with Buddhist concepts of spiritual energy",
#             "aspects": ["energy centers", "spiritual flow", "life force"],
#         },
#     },
#     {
#         "name": "Middle Path Philosophy",
#         "instruction": "Generate questions and answers about balanced approaches in conflict resolution",
#         "level": 3,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "How does Naruto's approach to peace reflect Buddhist Middle Path teaching?",
#             "concepts": ["balance", "moderation", "understanding"],
#         },
#     },
#     {
#         "name": "Emptiness and Illusion",
#         "instruction": "Create questions and answers about genjutsu and reality perception",
#         "level": 4,
#         "expected_output": QuestionAnswering,
#         "example": {
#             "instruction": "How does the Infinite Tsukuyomi reflect Buddhist concepts of Maya (illusion)?",
#             "elements": ["reality", "perception", "truth"],
#         },
#     },
# ]

In [13]:
# random_instructions = 2
# multiple_samples = 1


# def generate_and_write_synthetic_meta_prompts(
#     client: OpenAI,
#     model_name: str,
#     data: List[PromptMeta],
#     meta_instructions: List[dict],
#     output_file_name: str,
#     multiple_samples: int,
#     random_instructions: int,
# ):
#     count = 0
#     all_synthetic_prompts = []
#     for idx in trange(len(data)):
#         response = create_synthetic_meta_instructions_from_passage(
#             chat_client=client,
#             model_name=model_name,
#             content=data[idx].input,
#             prompts=meta_instructions,
#             random_instructions=random_instructions,
#             multiple_samples=multiple_samples,
#             llm_config={
#                 "temperature": 0.5,
#             },
#         )
#         synthetic_prompts = create_synthetic_prompts(response)
#         with open(output_file_name, "a+") as f:
#             for prompt in synthetic_prompts:
#                 f.write(prompt.json() + "\n")
#                 all_synthetic_prompts.append(prompt)
#                 count += 1

#     return count

In [14]:
def auto_format_yaml(yaml_string):
    corrected_yaml_string = yaml_string.replace("```yaml", "").replace("```", "")
    return corrected_yaml_string


def auto_format_json(json_string):
    corrected_json_string = json_string.replace("```json", "").replace("```", "")
    return corrected_json_string


def truncate_content(content, max_length=126_000):
    tokens = content.split()
    if len(tokens) * 1.2 > max_length:
        tokens = tokens[:max_length]
        return " ".join(tokens)

    return content


def call_llm(
    chat_client: OpenAI,
    model_name: str,
    prompt,
    structure_output=None,
    raise_exception: bool = False,
    **kwargs,
):
    messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": prompt},
    ]
    for retry in range(getattr(chat_client, "max_retries", 3)):
        try:
            if structure_output:
                response = chat_client.beta.chat.completions.parse(
                    model=model_name,
                    messages=messages,
                    response_format=structure_output,
                    **kwargs,
                )
                return response.choices[0].message.parsed
            else:
                response = chat_client.chat.completions.create(
                    model=model_name,
                    messages=messages,
                    **kwargs,
                )
                return response.choices[0].message.content
        except Exception as e:
            print(f"Retry {retry}: {str(e)}")
            if raise_exception:
                raise e
            continue


def create_meta_prompts_from_instruction(meta_instruction, input_content: str):
    guideline = meta_instruction.get("guideline", default_guideline)
    if "examples" in meta_instruction:
        guideline += "\n ### Instruction Examples: \n"
        for example in meta_instruction["examples"]:
            guideline += example["instruction"] + "\n"

    guideline += "\n" + get_format_instructions(meta_instruction["expected_output"])
    return PromptMeta(
        name=meta_instruction["name"],
        instruction=meta_instruction["instruction"],
        guideline=guideline,
        input=input_content,
        response=meta_instruction.get("response", ""),
        expected_output=meta_instruction["expected_output"],
    )


def get_instruction_output_from_meta_prompt(
    chat_client: OpenAI,
    model_name: str,
    prompt: PromptMeta,
    raise_exception: bool = False,
    llm_config: dict = {},
    multiple_samples=3,
):
    results = []
    for _ in range(multiple_samples):
        response = call_llm(
            chat_client=chat_client,
            model_name=model_name,
            prompt=prompt.to_prompt(),
            raise_exception=raise_exception,
            structure_output=prompt.expected_output,
            **llm_config,
        )
        new_prompt = PromptMeta(**prompt.model_dump())
        new_prompt.response = response
        results.append(new_prompt)

    return results


def create_synthetic_prompts(
    synthetic_instructions: List[PromptMeta],
) -> List[BaseModel]:
    res = []
    for synthetic_instruction in synthetic_instructions:
        try:
            cls_dict = synthetic_instruction.response.model_dump()
            response = cls_dict["output"]

            res.append(
                PromptMeta(
                    instruction=cls_dict["instruction"],
                    guideline="",
                    input=cls_dict.get("input", synthetic_instruction.input),
                    response=response,
                )
            )
        except ValidationError as e:
            print(e)
            continue
    return res


def create_synthetic_qa_prompts(
    qa_response_prompts: List[PromptMeta], instruction: str = "Answer the question"
):
    res = []
    for qa_response_prompt in qa_response_prompts:
        try:
            cls_output: BaseModel = qa_response_prompt.expected_output
            list_of_qa_obj: list = cls_output.model_validate(
                qa_response_prompt.response
            )
            list_of_qa = list_of_qa_obj.dict()

            for qa_obj in list_of_qa:
                response = qa_obj["answer"].strip()
                input_content = (
                    qa_obj["question"].strip() + "\n" + qa_obj.get("choices", "")
                )
                res.append(
                    PromptMeta(
                        instruction=instruction,
                        guideline="",
                        input=input_content,
                        response=response,
                    )
                )
        except ValidationError as e:
            print(e, qa_response_prompt.response)
            continue

    return res

# Load from sportseekeda

In [15]:
output = []
with open("../data/sportseeker/all_pages.json") as f:
    for line in f:
        output.append(json.loads(line.strip()))

In [16]:
# Get unique tags
tags = set([item["tag"] for item in output])
tags

{'characters', 'clans', 'jutsu', 'teams'}

In [17]:
mapper_types = {
    "characters": [GroupType.CHARACTERS, GroupType.PLOT],
    "clans": [GroupType.CLANS, GroupType.VILLAGES,],
    "jutsu": [GroupType.JUTSU, GroupType.CHAKRA, GroupType.BUDDHIST],
    "teams": [GroupType.CLANS, GroupType.CHARACTERS],
}

In [18]:
# Cluster data by mapper_type
clustered_data = {}
for row in output:
    tag = row["tag"]
    for mapper_type in mapper_types.get(tag, []):
        if mapper_type not in clustered_data:
            clustered_data[mapper_type] = []
        clustered_data[mapper_type].append(row)

In [19]:
# Preview data
for k in list(clustered_data.keys())[:5]:
    print(k, len(clustered_data[k]))
    print(clustered_data[k][:2])

GroupType.JUTSU 51
[{'text': 'Adamantine Sealing Chains is a form of Fuinjutsu, also known as a Sealing Jutsu. This technique belongs to the Uzumaki and is a Hiden Technique..While casting this Jutsu, the opponent is chained down by the user using chains which appear out of their torso. The chain which leaves the opponent’s body is fully controlled by the user and not only that, these chains wrap the enemy and entangle them in those chains..Apart from binding the targets down, these chains also neutralise the opponent’s chakra. It can also form an incredibly strong barrier, which almost seems impregnable. Even Hiruzen Sarutobi could not pierce it..Adamantine Sealing Chains is a Hiden Jutsu, which means that it is a Jutsu that is exclusive to a clan that has been passed down orally through tradition. It is kept secret and is not allowed to be taught to people outside the clan..In this case, the Adamantine Sealing Chains is a Hiden Jutsu belonging to the Uzumaki Clan. Even though this te

In [20]:
# with open("./prompt_sportseeker_v4.jsonl") as f:
#     sportseekda_prompts = [PromptMeta.model_validate_json(line) for line in f]

In [21]:
meta_instructions_by_group = {}
for meta_instruction in meta_instructions:
    group = meta_instruction["group"]
    if group not in meta_instructions_by_group:
        meta_instructions_by_group[group] = []
    meta_instructions_by_group[group].append(meta_instruction)

In [22]:
sportseekda_meta_prompts_by_group: Dict[GroupType, List[PromptMeta]] = {}
for group, data in clustered_data.items():
    sportseekda_meta_prompts_by_group[group] = []
    for row in data:
        for meta_instruction in meta_instructions_by_group[group]:
            sportseekda_meta_prompts_by_group[group].append(
                create_meta_prompts_from_instruction(
                    meta_instruction=meta_instruction,
                    input_content="{}\n{}".format(row["title"], row["text"]),
                )
            )

In [23]:
# Calculate total input tokens from qa_instruction and meta_instruction on all sportseekda_prompts
total_meta_prompts = []
for k, v in sportseekda_meta_prompts_by_group.items():
    for i in range(len(v)):
       total_meta_prompts.append(v[i].to_prompt())

# Calculate stats about input length (average, min, max) for meta_prompts
total_meta_prompts_length = [len(p.split()) for p in total_meta_prompts]
average_meta_prompts_length = sum(total_meta_prompts_length)/len(total_meta_prompts)
min_meta_prompts_length = min(total_meta_prompts_length)
max_meta_prompts_length = max(total_meta_prompts_length)

print(f"Total meta prompts: {len(total_meta_prompts)}")
print(f"Average meta prompts length: {average_meta_prompts_length}")
print(f"Min meta prompts length: {min_meta_prompts_length}")
print(f"Max meta prompts length: {max_meta_prompts_length}")

Total meta prompts: 6837
Average meta prompts length: 1012.6581834137779
Min meta prompts length: 319
Max meta prompts length: 10354


In [24]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4-turbo")
input_tokens = [len(encoding.encode(p)) for p in total_meta_prompts]
pricing = {
    "gpt-turbo": {
        "input_price_per_million": 0.75,
        "output_price_per_million": 1
    },
    "deepinfra_llama3-70b": {
        "input_price_per_million": 0.12,
        "output_price_per_million": 0.3
    },
    "deepinfra_llama3-405b": {
        "input_price_per_million": 0.8,
        "output_price_per_million": 0.8
    }    
}

total_input_tokens = sum(input_tokens)
average_output_tokens = 4096
total_output_tokens = average_output_tokens * len(total_meta_prompts)
model_name = "deepinfra_llama3-70b"
# model_name = "deepinfra_llama3-405b"
pricing[model_name]["input_price_per_million"] * total_input_tokens / 1e6 + pricing[model_name]["output_price_per_million"] * total_output_tokens / 1e6

9.583321199999999

In [25]:
import time
from pathlib import Path


def generate_and_write_synthetic_meta_prompts_with_checkpoint(
    client: OpenAI,
    model_name: str,
    data: List[PromptMeta],
    output_file_name: str,
    multiple_samples: int,
    chunk_size: int = 512,
    wait_time: int = 60,
    max_retries: int = 3,
    llm_config: dict = {},
):
    # Load checkpoint if exists
    checkpoint_file = Path(output_file_name + ".checkpoint")
    start_idx = 0
    if checkpoint_file.exists():
        start_idx = int(checkpoint_file.read_text())
        print(f"Resuming from checkpoint at index {start_idx}")

    count = 0
    for chunk_start in trange(start_idx, len(data), chunk_size):
        chunk_end = min(chunk_start + chunk_size, len(data))
        data_chunk = data[chunk_start:chunk_end]

        retries = 0
        while retries < max_retries:
            try:
                for idx in range(len(data_chunk)):
                    response = get_instruction_output_from_meta_prompt(
                        chat_client=client,
                        model_name=model_name,
                        prompt=data_chunk[idx],
                        multiple_samples=multiple_samples,
                        raise_exception=False,
                        llm_config=llm_config,
                    )
                    synthetic_prompts = create_synthetic_prompts(response)
                    with open(output_file_name, "a+") as f:
                        for prompt in synthetic_prompts:
                            f.write(prompt.model_dump_json() + "\n")
                            count += 1

                # Save checkpoint after successful chunk
                checkpoint_file.write_text(str(chunk_end))
                break  # Break retry loop if successful

            except Exception as e:
                retries += 1
                print(f"Error processing chunk {chunk_start}-{chunk_end}: {str(e)}")
                if retries < max_retries:
                    print(
                        f"Waiting {wait_time} seconds before retry {retries + 1}/{max_retries}"
                    )
                    time.sleep(wait_time)
                else:
                    print(f"Max retries reached for chunk {chunk_start}-{chunk_end}")
                    print("Data index: {}".format(idx))
                    raise e

    # Remove checkpoint file after successful completion
    checkpoint_file.unlink(missing_ok=True)
    return count

In [26]:
model_name = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
api_key = "M8lfE1LIxpckoMXmMi2uW1tzPW2kZAwI"
base_url = "https://api.deepinfra.com/v1/openai"

# Assume openai>=1.0.0
from openai import OpenAI

# Create an OpenAI client with your deepinfra token and endpoint
chat_openai_client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)

In [27]:
p = get_instruction_output_from_meta_prompt(
    chat_client=chat_openai_client,
    model_name=model_name,
    prompt=sportseekda_meta_prompts_by_group[GroupType.CHARACTERS][100],
    raise_exception=True,
    llm_config={"temperature": 0.6},
    multiple_samples=1
)

In [28]:
print(p[0].response.output)

Azami is portrayed as a cheerful, kind, and loving young woman, demonstrated by her daily visits to her grandfather's graveyard with riceballs, showcasing her devotion and belief in his survival. Her personality is deduced from limited interactions, primarily with Rock Lee, Naruto Uzumaki, and Tenten, highlighting her innocence and genuine nature. However, due to her non-canonical status and minimal screen time, Azami's character development and motivations remain largely unexplored, leaving her psychological depth somewhat one-dimensional and undefined.


In [42]:
base_file_name = "./sportseeker_v4_synthetic_{}.jsonl"
turbo_model_name = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
for k in sportseekda_meta_prompts_by_group.keys():
    # print(k.value)
    if k == GroupType.PLOT:
        generate_and_write_synthetic_meta_prompts_with_checkpoint(
            client=chat_openai_client,
            model_name=turbo_model_name,
            data=sportseekda_meta_prompts_by_group[k],
            output_file_name=base_file_name.format(str(k.value)),
            chunk_size=2,
            wait_time=60,
            multiple_samples=1,
            max_retries=4,
            llm_config={"temperature": 0.7},
        )

Resuming from checkpoint at index 758


  0%|          | 6/2048 [00:47<4:18:43,  7.60s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Compare...ir respective groups."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  1%|          | 16/2048 [02:17<4:42:24,  8.34s/it]

Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'cause': "Fūka's abilit...ss to various ninjutsu'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type


  2%|▏         | 39/2048 [04:22<2:49:40,  5.07s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...ness and cooperation."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  2%|▏         | 48/2048 [05:29<3:35:18,  6.46s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...fluence in the story."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 1: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...eness, or friendship.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Error processing chunk 854-856: 'NoneType' object has no attribute 'model_dump'
Waiting 60 seconds before retry 2/4
Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... violence and hatred."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  3%|▎         | 54/2048 [07:38<6:10:40, 11.15s/it] 

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...sitive relationships."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  4%|▍         | 90/2048 [12:11<3:46:12,  6.93s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...nflicts and mistakes."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  4%|▍         | 91/2048 [12:42<7:38:07, 14.05s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Gantets...d forgiveness prevail"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  5%|▌         | 111/2048 [15:03<2:50:26,  5.28s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare...to become the Hokage.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  6%|▌         | 124/2048 [16:41<4:16:31,  8.00s/it]

Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'cause': "Genma's teamwo... Swordsmen of the Mist'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type


  7%|▋         | 136/2048 [18:03<4:25:42,  8.34s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...eractions with Naruto"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  7%|▋         | 137/2048 [18:07<3:42:33,  6.99s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Order t... Gen’yūmaru's body"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  7%|▋         | 138/2048 [18:17<4:07:31,  7.78s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...in their interaction."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  7%|▋         | 145/2048 [19:14<3:56:04,  7.44s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...gainst Kotetsu's team"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  7%|▋         | 147/2048 [19:23<3:09:25,  5.98s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...nges and adversaries.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  7%|▋         | 151/2048 [19:55<3:49:54,  7.27s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci...r", 'output': 'Nagato'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


  9%|▉         | 180/2048 [22:45<2:35:57,  5.01s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...aderie in the series.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 12%|█▏        | 238/2048 [28:24<3:25:09,  6.80s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci... who defended himself"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 13%|█▎        | 273/2048 [32:00<2:38:04,  5.34s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...force in their lives."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 14%|█▍        | 285/2048 [33:40<3:37:09,  7.39s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Compare...su like the Byakugan.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 14%|█▍        | 291/2048 [34:24<2:58:50,  6.11s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...nships in the series."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 15%|█▍        | 298/2048 [35:23<4:28:51,  9.22s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...on of the Hyūga clan"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 16%|█▌        | 324/2048 [38:03<2:33:14,  5.33s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... hatred and violence."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 16%|█▋        | 333/2048 [38:51<2:19:50,  4.89s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Compare...f Itachi's backstory."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 17%|█▋        | 349/2048 [40:30<3:01:44,  6.42s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...it Beast's possession"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 17%|█▋        | 354/2048 [40:57<2:34:39,  5.48s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...nections with others."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 18%|█▊        | 368/2048 [42:43<3:24:26,  7.30s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Order t... leader of Takigakure"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 18%|█▊        | 373/2048 [43:26<4:08:00,  8.88s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci...the giant corpse crab"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 19%|█▊        | 382/2048 [44:28<3:48:34,  8.23s/it]

Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'cause': "Homura's decis...act on Konoha's safety"}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type


 19%|█▉        | 399/2048 [46:11<2:34:12,  5.61s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...immediately apparent."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 20%|█▉        | 402/2048 [46:37<3:24:24,  7.45s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...vercoming challenges."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 1: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...ship and cooperation."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Error processing chunk 1562-1564: 'NoneType' object has no attribute 'model_dump'
Waiting 60 seconds before retry 2/4
Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...sitive relationships."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 20%|█▉        | 405/2048 [48:28<8:57:53, 19.64s/it] 

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...nding and acceptance."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 20%|█▉        | 409/2048 [49:03<5:17:02, 11.61s/it]

Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={"Ibiki's past experience...d sadistic personality'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Retry 1: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={"Ibiki's past experience...terrogation techniques'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Error processing chunk 1576-1578: 'NoneType' object has no attribute 'model_dump'
Waiting 60 seconds before retry 2/4
Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'cause': "Ibiki's interr...ormation from suspects'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type


 21%|██        | 420/2048 [51:20<3:19:55,  7.37s/it] 

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... hatred and violence."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 21%|██▏       | 436/2048 [53:00<3:08:00,  7.00s/it]

Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'character': 'Ino Yamana... 'Increased confidence'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type


 21%|██▏       | 439/2048 [53:23<3:32:48,  7.94s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...'s strong personality"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 24%|██▍       | 492/2048 [59:07<2:43:56,  6.32s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...for the greater good.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 24%|██▍       | 495/2048 [59:33<2:58:24,  6.89s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare...ze earth-based jutsu."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 24%|██▍       | 498/2048 [59:50<2:22:30,  5.52s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Explore... the cycle of hatred."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 1: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...reaking free from it."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Error processing chunk 1754-1756: 'NoneType' object has no attribute 'model_dump'
Waiting 60 seconds before retry 2/4


 25%|██▌       | 516/2048 [1:02:52<2:00:38,  4.72s/it] 

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare...ckground information.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 26%|██▌       | 525/2048 [1:03:37<1:52:04,  4.42s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...ce and understanding.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 26%|██▌       | 526/2048 [1:03:51<3:10:17,  7.50s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...Genin of Konohagakure'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 27%|██▋       | 552/2048 [1:06:28<2:09:00,  5.17s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...endships in her life."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 28%|██▊       | 564/2048 [1:07:45<2:21:35,  5.72s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...he characters' lives."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 29%|██▉       | 589/2048 [1:10:23<2:26:41,  6.03s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...e with a violent past'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 30%|██▉       | 606/2048 [1:11:57<2:06:39,  5.27s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...ation and its leader."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 30%|██▉       | 612/2048 [1:12:36<2:08:48,  5.38s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...rowth and well-being."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 30%|███       | 624/2048 [1:13:52<2:11:09,  5.53s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare...t or weaker than him."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 31%|███       | 625/2048 [1:14:04<2:55:53,  7.42s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...ike's fear and trauma"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 31%|███       | 636/2048 [1:15:06<2:02:35,  5.21s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...he cycle of violence."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 31%|███▏      | 642/2048 [1:15:47<2:18:59,  5.93s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...acters in the series."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 32%|███▏      | 651/2048 [1:16:36<1:44:24,  4.48s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...lliances with others."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 33%|███▎      | 678/2048 [1:19:09<1:52:54,  4.94s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...lenges and adversity."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 34%|███▎      | 687/2048 [1:20:09<2:18:41,  6.11s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...t of the ninja world."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 37%|███▋      | 756/2048 [1:26:26<1:35:46,  4.45s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...as learned from them."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 37%|███▋      | 758/2048 [1:26:52<2:48:25,  7.83s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Order t... lived in Sunagakure.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 38%|███▊      | 780/2048 [1:29:29<2:32:13,  7.20s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... of loyalty and duty."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 40%|███▉      | 813/2048 [1:33:24<2:04:43,  6.06s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... peace and belonging.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 40%|████      | 829/2048 [1:35:02<2:05:04,  6.16s/it]

Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'cause': "Madara's desir... of Infinite Tsukuyomi'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type


 42%|████▏     | 856/2048 [1:37:47<2:31:04,  7.60s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze... to brief appearances'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 42%|████▏     | 861/2048 [1:38:16<2:05:04,  6.32s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... and its motivations."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 42%|████▏     | 865/2048 [1:38:49<2:11:58,  6.69s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci... him a formidable foe"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 43%|████▎     | 882/2048 [1:40:33<1:54:17,  5.88s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...e Shadow Clone Jutsu."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 44%|████▎     | 891/2048 [1:41:26<1:33:51,  4.87s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...iendship and loyalty."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 44%|████▎     | 894/2048 [1:41:47<1:45:52,  5.50s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...of healing and peace."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 44%|████▍     | 909/2048 [1:43:25<2:29:47,  7.89s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...ist between siblings."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 48%|████▊     | 986/2048 [1:51:31<1:43:05,  5.82s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Order t...rth Shinobi World War"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 49%|████▉     | 1002/2048 [1:53:11<1:43:11,  5.92s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...for the greater good."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 49%|████▉     | 1003/2048 [1:53:31<2:59:23, 10.30s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Identif...ention, saving Nonota"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 50%|████▉     | 1020/2048 [1:55:11<1:30:09,  5.26s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...ieving a common goal.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 50%|█████     | 1031/2048 [1:56:22<1:17:58,  4.60s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Order t...t, Fourth Shinobi War"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 51%|█████     | 1035/2048 [1:56:49<1:34:00,  5.57s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...raderie and teamwork."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 53%|█████▎    | 1077/2048 [2:00:55<1:35:07,  5.88s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...orgive one's enemies."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 53%|█████▎    | 1095/2048 [2:03:02<1:29:16,  5.62s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...t exist between them."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 54%|█████▍    | 1113/2048 [2:05:08<1:30:46,  5.83s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...lty and selflessness."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 56%|█████▌    | 1138/2048 [2:07:48<1:59:09,  7.86s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci...pancies in narrative."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 56%|█████▌    | 1141/2048 [2:08:07<1:51:10,  7.35s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Review ... Land of Fire village'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 56%|█████▋    | 1152/2048 [2:09:06<1:10:38,  4.73s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Analyze...ion and perseverance."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 58%|█████▊    | 1188/2048 [2:12:53<1:22:26,  5.75s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe... hatred and violence."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare... friends and village.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 58%|█████▊    | 1189/2048 [2:13:19<2:50:37, 11.92s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci...for Shizuka's actions"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 60%|██████    | 1232/2048 [2:18:58<2:05:16,  9.21s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Analyze...rmal combat training.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 61%|██████    | 1242/2048 [2:20:25<1:48:32,  8.08s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...sake of his ambition."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 63%|██████▎   | 1284/2048 [2:26:10<1:20:59,  6.36s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... in the Naruto world."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 66%|██████▌   | 1351/2048 [2:35:04<1:53:25,  9.76s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...to sparse appearances'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 66%|██████▌   | 1353/2048 [2:35:15<1:30:58,  7.85s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...ndship in the series."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare...hroughout the series."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 68%|██████▊   | 1389/2048 [2:40:20<1:32:25,  8.42s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... the cycle of hatred."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 69%|██████▊   | 1405/2048 [2:42:51<1:32:07,  8.60s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...y leads to his defeat"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 70%|███████   | 1434/2048 [2:46:13<59:36,  5.82s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... violence and hatred."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 71%|███████▏  | 1464/2048 [2:51:36<1:55:38, 11.88s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...vercoming challenges."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 72%|███████▏  | 1467/2048 [2:52:04<1:30:56,  9.39s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe... the next generation."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 73%|███████▎  | 1491/2048 [2:54:39<52:33,  5.66s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...for the greater good."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 74%|███████▎  | 1509/2048 [2:56:49<1:00:23,  6.72s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare...r of the Uchiha clan.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 74%|███████▍  | 1513/2048 [2:57:43<1:45:20, 11.81s/it]

Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'cause': "Temari's tough... side around Shikamaru'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type


 75%|███████▌  | 1539/2048 [3:00:58<57:48,  6.81s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...ded by the orphanage.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Compare...lings of superiority."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 75%|███████▌  | 1540/2048 [3:01:26<1:50:52, 13.09s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Explain...ssed by simple things'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 78%|███████▊  | 1592/2048 [3:07:37<49:41,  6.54s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Analyze...cks around the canal.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 78%|███████▊  | 1605/2048 [3:09:17<50:11,  6.80s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...ith other characters."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 1: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...nderstanding instead."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Error processing chunk 3968-3970: 'NoneType' object has no attribute 'model_dump'
Waiting 60 seconds before retry 2/4
Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...le and achieve peace.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 79%|███████▉  | 1614/2048 [3:11:59<1:05:19,  9.03s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Compare...ape and gain freedom."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 79%|███████▉  | 1618/2048 [3:12:35<58:55,  8.22s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...d to village survival"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 79%|███████▉  | 1624/2048 [3:13:29<1:17:13, 10.93s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci...auses inconsistencies"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 81%|████████  | 1655/2048 [3:16:42<40:05,  6.12s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'Analyze...wo-Tails from Yugito.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 82%|████████▏ | 1673/2048 [3:19:21<42:12,  6.75s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Order t...’s threat to Konoha"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 85%|████████▌ | 1747/2048 [3:28:55<55:28, 11.06s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Analyze...aiken killed Harusame'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 88%|████████▊ | 1800/2048 [3:36:35<28:50,  6.98s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...at perpetuate hatred."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 91%|█████████ | 1857/2048 [3:44:27<22:42,  7.13s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': 'How doe...bout positive change.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 93%|█████████▎| 1914/2048 [3:52:35<13:39,  6.11s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare... friends and village."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 94%|█████████▍| 1926/2048 [3:54:05<15:06,  7.43s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...or revenge and power."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare... determined demeanor.'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Retry 1: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare... the main characters."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type
Error processing chunk 4610-4612: 'NoneType' object has no attribute 'model_dump'
Waiting 60 seconds before retry 2/4


 94%|█████████▍| 1929/2048 [3:55:54<38:41, 19.51s/it]  

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...explored in his case."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 96%|█████████▌| 1959/2048 [3:59:59<13:54,  9.38s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare...Uchiha set him apart."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 96%|█████████▌| 1960/2048 [4:00:19<18:10, 12.39s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci...ll, leading to defeat"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 97%|█████████▋| 1981/2048 [4:03:14<09:00,  8.07s/it]

Retry 0: 1 validation error for InstructionOutput
output
  Input should be a valid string [type=string_type, input_value={'cause': "Orochimaru's e...maru's bond with Guren"}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type


 97%|█████████▋| 1984/2048 [4:03:34<08:00,  7.50s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Reconci...n Orochimaru's orders"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 98%|█████████▊| 2011/2048 [4:06:31<03:56,  6.38s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Review ...y fellow monks' death"}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 99%|█████████▊| 2019/2048 [4:07:26<03:12,  6.63s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "How doe...athy and cooperation."}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


 99%|█████████▊| 2022/2048 [4:07:53<03:10,  7.34s/it]

Retry 0: 1 validation error for InstructionOutput
  Input should be an object [type=model_type, input_value=[{'instruction': "Compare...rom behind the scenes'}], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type


100%|██████████| 2048/2048 [4:11:06<00:00,  7.36s/it]


In [43]:
sportseekda_meta_prompts_by_group.keys()

dict_keys([<GroupType.JUTSU: 'jutsu'>, <GroupType.CHAKRA: 'chakra'>, <GroupType.BUDDHIST: 'buddhist'>, <GroupType.CHARACTERS: 'characters'>, <GroupType.PLOT: 'plot'>, <GroupType.CLANS: 'clans'>, <GroupType.VILLAGES: 'villages'>])

# Verify by reward modelling and duplication detection

In [44]:
import sys
sys.path.append("..")

from src.dedup import deduplicated_contents

In [45]:
with open("sportseeker_v4_synthetic_characters.jsonl", "r") as f:
    prompts = [json.loads(l) for l in f.readlines()]
input_prompts = [prompt["input"] for prompt in prompts]
print(len(prompts))
deduplicated_indices = deduplicated_contents(input_prompts, return_indices=True, ngrams=10, num_perm=64)
deduplicated_prompts = [prompts[i] for i in deduplicated_indices]
print(len(deduplicated_prompts))

1646
796


In [54]:
import json

fpaths = [
    "sportseeker_v4_synthetic_characters.jsonl",
    "sportseeker_v4_synthetic_chakra.jsonl",
    "sportseeker_v4_synthetic_jutsu.jsonl",
    "sportseeker_v4_synthetic_plot.jsonl",
    "sportseeker_v4_synthetic_buddhist.jsonl",
    "sportseeker_v4_synthetic_plot.jsonl",
    "sportseeker_v4_synthetic_villages.jsonl",
    "sportseeker_v4_synthetic_clans.jsonl",
    # "sportseeker_v4_synthetic_characters.jsonl",
]

for fpath in fpaths:
    with open(fpath) as f:
        prompts = [json.loads(l) for l in f.readlines()]

    input_prompts = [prompt["instruction"] + "\n" + prompt["response"] for prompt in prompts]
    deduplicated_indices = deduplicated_contents(input_prompts, return_indices=True, ngrams=10, threshold=0.7, num_perm=256)
    deduplicated_prompts = [prompts[i] for i in deduplicated_indices]
    print(fpath)
    print("Before deduplication:", len(prompts))
    print("After deduplication:", len(deduplicated_prompts))
    with open(f"deduplicated_{fpath}", "w") as f:
        for prompt in deduplicated_prompts:
            prompt.pop("input")
            f.write(json.dumps(prompt) + "\n")

sportseeker_v4_synthetic_characters.jsonl
Before deduplication: 1646
After deduplication: 966
sportseeker_v4_synthetic_chakra.jsonl
Before deduplication: 55
After deduplication: 55
sportseeker_v4_synthetic_jutsu.jsonl
Before deduplication: 105
After deduplication: 74
sportseeker_v4_synthetic_plot.jsonl
Before deduplication: 4858
After deduplication: 4337
sportseeker_v4_synthetic_buddhist.jsonl
Before deduplication: 102
After deduplication: 70
sportseeker_v4_synthetic_plot.jsonl
Before deduplication: 4858
After deduplication: 4337
sportseeker_v4_synthetic_villages.jsonl
Before deduplication: 51
After deduplication: 49
sportseeker_v4_synthetic_clans.jsonl
Before deduplication: 31
After deduplication: 27


In [53]:
deduplicated_prompts[2]

{'name': None,
 'instruction': "Analyze Team 10's political influence",
 'guideline': '',
 'input': "Team 10 Naruto\nTeam 10 consisted of Shikamaru Nara, Ino Yamanaka and Choji Akamichi. They are the 16th generation of Ino–Shika–Chō (Boar–Deer–Butterfly) team, a name commonly given to the trios from the Yamanaka, Nara and Akimichi clans. They were led by the Jonin named Asuma Sarutobi, who was the grandson of third Hokage and Team Asuma are known for their exceptional team work and camaraderie. After Asuma’s death, the team was led by Hatake Kakashi briefly to avenge his death. Though not active, they are assumed to be working together..Team 10 or also known as the Ino-Shika-Cho trio is a team known for its exceptional teamwork even though they tend to argue sometimes. They were read at stalling their opponents and were led by Asuma Sarutobi..During the two and half years, the second chunnin exams took place and Sakura temporarily joined team 10 as her teammates weren’t there and a tea

In [26]:
import json
deduplicated_prompts = [json.loads(l.strip()) for l in open("deduplicated_sportseeker_v4_all.jsonl")]
for i in range(len(deduplicated_prompts)):
    deduplicated_prompts[i].pop("expected_output")
    deduplicated_prompts[i].pop("name")
    deduplicated_prompts[i].pop("guideline")
    deduplicated_prompts[i]["input"] = ""

In [27]:
deduplicated_prompts[0]

{'instruction': 'Explain Adamantine Sealing Chains',
 'response': 'Ninja technique inspired by Buddhist concept of binding and sealing evil spirits',
 'input': ''}

In [28]:
all_prompts = [json.loads(l.strip()) for l in open("cft_v6_2048.jsonl")]

In [29]:
import datasets

In [30]:
dedup_ds = datasets.Dataset.from_list(deduplicated_prompts)
cft_ds = datasets.Dataset.from_list(all_prompts)

In [31]:
all_ds = datasets.DatasetDict({"analyze": dedup_ds, "wiki": cft_ds})

In [38]:
import json
binary_choices = json.load(open("../data/binary_choices_v2.json"))
binary_choices_synthetic = json.load(open("../data/binary_choices_from_synthetic.json"))

In [39]:
b_ds = datasets.Dataset.from_list(binary_choices)
b_s_ds = datasets.Dataset.from_list(binary_choices_synthetic)
all_ds = datasets.DatasetDict({"handcraft": b_ds, "synthetic": b_s_ds})

In [43]:
b_ds[0]

{'question': 'Who is the main protagonist of the “Naruto” series?',
 'choices': [{'answer': 'Naruto Uzumaki', 'choice': 'A'},
  {'answer': 'Sasuke Uchiha', 'choice': 'B'}],
 'answer': 'A',
 'heading': 'Naruto Trivia Questions For Kids'}