In [7]:
import inspect
import json
import random

# ---- Example developer functions ----
def add(a: int, b: int) -> int:
    """Adds two numbers"""
    return a + b

def subtract(a: int, b: int) -> int:
    """Subtracts second number from first"""
    return a - b

def multiply(a: int, b: int) -> int:
    """Multiplies two numbers"""
    return a * b

def greet(name: str) -> str:
    """Greets a person by name"""
    return f"Hello, {name}!"

# ---- Collect tools here ----
tools = [add, subtract, multiply, greet]

# -------- Extract signatures --------
def get_tool_specs(tools):
    tool_list = []
    for tool in tools:
        sig = inspect.signature(tool)
        params = []
        for name, param in sig.parameters.items():
            if param.annotation != inspect.Parameter.empty:
                params.append(f"{name}: {param.annotation.__name__}")
            else:
                params.append(name)
        signature = f"{tool.__name__}({', '.join(params)})"
        doc = tool.__doc__.strip() if tool.__doc__ else ""
        tool_list.append({
            "signature": signature,
            "description": doc
        })
    return tool_list

TOOL_SPECS = get_tool_specs(tools)

# -------- Templates --------
add_templates = [
    "Please add {a} and {b}",
    "Can you sum {a} with {b}?",
    "Find the result of {a} + {b}",
    "Add {a} plus {b}",
    "What's {a} added to {b}?"
]

subtract_templates = [
    "Subtract {b} from {a}",
    "What is {a} minus {b}?",
    "Take away {b} from {a}",
    "Find {a} - {b}",
    "Reduce {a} by {b}"
]

multiply_templates = [
    "Multiply {a} and {b}",
    "What is {a} times {b}?",
    "Product of {a} and {b}",
    "Compute {a} * {b}",
    "Give me the multiplication of {a} and {b}"
]

greet_templates = [
    "Say hello to {name}",
    "Greet {name}",
    "Please greet {name}",
    "Send regards to {name}",
    "Say hi to {name}"
]

# -------- Synthetic data generator --------
def generate_synthetic_example():
    tool = random.choice(tools)
    sig = inspect.signature(tool)

    args = {}
    for name, param in sig.parameters.items():
        if param.annotation == int:
            args[name] = random.randint(1, 100)
        elif param.annotation == str:
            args[name] = random.choice(["Alice", "Bob", "Charlie", "Kamal", "Sophia", "David"])
        else:
            args[name] = "?"

    if tool.__name__ == "add":
        template = random.choice(add_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "subtract":
        template = random.choice(subtract_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "multiply":
        template = random.choice(multiply_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "greet":
        template = random.choice(greet_templates)
        user_query = template.format(**args)

    func_call = f"{tool.__name__}(" + ", ".join([f"{k}={repr(v)}" for k, v in args.items()]) + ")"

    return {
        "tools": TOOL_SPECS,
        "user": user_query,
        "func": func_call
    }

# -------- Generate big dataset --------
synthetic_dataset = [generate_synthetic_example() for _ in range(1000)]

with open("../data/mgc_1.json", "w") as f:
    json.dump(synthetic_dataset, f, indent=2)

print("✅ Generated 1000 synthetic samples -> saved to synthetic_dataset.json")


✅ Generated 1000 synthetic samples -> saved to synthetic_dataset.json


In [6]:
import inspect
import json
import random

# ---- New example developer functions ----
def reverse(text: str) -> str:
    """Reverses a string"""
    return text[::-1]

def length(text: str) -> int:
    """Finds the length of a string"""
    return len(text)

def square(n: int) -> int:
    """Squares a number"""
    return n * n

def is_even(n: int) -> bool:
    """Checks if a number is even"""
    return n % 2 == 0

def to_upper(text: str) -> str:
    """Converts string to uppercase"""
    return text.upper()

def concat(a: str, b: str) -> str:
    """Concatenates two strings"""
    return a + b

# ---- Collect tools here ----
tools = [reverse, length, square, is_even, to_upper, concat]

# -------- Extract signatures --------
def get_tool_specs(tools):
    tool_list = []
    for tool in tools:
        sig = inspect.signature(tool)
        params = []
        for name, param in sig.parameters.items():
            if param.annotation != inspect.Parameter.empty:
                params.append(f"{name}: {param.annotation.__name__}")
            else:
                params.append(name)
        signature = f"{tool.__name__}({', '.join(params)})"
        doc = tool.__doc__.strip() if tool.__doc__ else ""
        tool_list.append({
            "signature": signature,
            "description": doc
        })
    return tool_list

TOOL_SPECS = get_tool_specs(tools)

# -------- Templates --------
reverse_templates = [
    "Reverse the text '{text}'",
    "Can you flip '{text}' backwards?",
    "Give me the reversed version of '{text}'",
    "Turn '{text}' around",
    "Invert the string '{text}'"
]

length_templates = [
    "What is the length of '{text}'?",
    "Find how many characters are in '{text}'",
    "How long is '{text}'?",
    "Count the characters in '{text}'",
    "Give me the size of '{text}'"
]

square_templates = [
    "Square {n}",
    "What is {n} squared?",
    "Multiply {n} by itself",
    "Compute the square of {n}",
    "Find {n}²"
]

is_even_templates = [
    "Is {n} even?",
    "Check if {n} is even",
    "Tell me whether {n} is an even number",
    "Is the number {n} divisible by 2?",
    "Even check for {n}"
]

to_upper_templates = [
    "Convert '{text}' to uppercase",
    "Make '{text}' all caps",
    "Uppercase version of '{text}'",
    "Turn '{text}' into big letters",
    "Change '{text}' to UPPER"
]

concat_templates = [
    "Join '{a}' with '{b}'",
    "Concatenate '{a}' and '{b}'",
    "Combine '{a}' plus '{b}'",
    "Merge the strings '{a}' and '{b}'",
    "Append '{b}' to '{a}'"
]

# -------- Synthetic data generator --------
def generate_synthetic_example():
    tool = random.choice(tools)
    sig = inspect.signature(tool)

    args = {}
    for name, param in sig.parameters.items():
        if param.annotation == int:
            args[name] = random.randint(1, 100)
        elif param.annotation == str:
            args[name] = random.choice(["apple", "banana", "hello", "world", "python", "kamal"])
        elif param.annotation == bool:
            args[name] = random.choice([True, False])
        else:
            args[name] = "?"

    if tool.__name__ == "reverse":
        template = random.choice(reverse_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "length":
        template = random.choice(length_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "square":
        template = random.choice(square_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "is_even":
        template = random.choice(is_even_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "to_upper":
        template = random.choice(to_upper_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "concat":
        template = random.choice(concat_templates)
        user_query = template.format(**args)

    func_call = f"{tool.__name__}(" + ", ".join([f"{k}={repr(v)}" for k, v in args.items()]) + ")"

    return {
        "tools": TOOL_SPECS,
        "user": user_query,
        "func": func_call
    }

# -------- Generate big dataset --------
synthetic_dataset = [generate_synthetic_example() for _ in range(1000)]

with open("../data/mgc_2.json", "w") as f:
    json.dump(synthetic_dataset, f, indent=2)

print("✅ Generated 1000 synthetic samples with new functions -> saved to synthetic_dataset_v2.json")


✅ Generated 1000 synthetic samples with new functions -> saved to synthetic_dataset_v2.json


In [8]:
import inspect
import json
import random

# ---- New example developer functions ----
def cube(n: int) -> int:
    """Cubes a number"""
    return n ** 3

def starts_with(text: str, prefix: str) -> bool:
    """Checks if text starts with a given prefix"""
    return text.startswith(prefix)

def ends_with(text: str, suffix: str) -> bool:
    """Checks if text ends with a given suffix"""
    return text.endswith(suffix)

def repeat(text: str, times: int) -> str:
    """Repeats a string multiple times"""
    return text * times

def factorial(n: int) -> int:
    """Calculates factorial of a number"""
    result = 1
    for i in range(2, n+1):
        result *= i
    return result

# ---- Collect tools here ----
tools = [cube, starts_with, ends_with, repeat, factorial]

# -------- Extract signatures --------
def get_tool_specs(tools):
    tool_list = []
    for tool in tools:
        sig = inspect.signature(tool)
        params = []
        for name, param in sig.parameters.items():
            if param.annotation != inspect.Parameter.empty:
                params.append(f"{name}: {param.annotation.__name__}")
            else:
                params.append(name)
        signature = f"{tool.__name__}({', '.join(params)})"
        doc = tool.__doc__.strip() if tool.__doc__ else ""
        tool_list.append({
            "signature": signature,
            "description": doc
        })
    return tool_list

TOOL_SPECS = get_tool_specs(tools)

# -------- Templates --------
cube_templates = [
    "Cube {n}",
    "Find {n} cubed",
    "What is {n} to the power of 3?",
    "Compute {n} * {n} * {n}",
    "Compute cube of {n}"
]

starts_with_templates = [
    "Does '{text}' start with '{prefix}'?",
    "Check if '{text}' begins with '{prefix}'",
    "Is the string '{text}' starting with '{prefix}'?",
    "Tell me if '{text}' starts with '{prefix}'",
    "Starts with check: '{text}' & '{prefix}'"
]

ends_with_templates = [
    "Does '{text}' end with '{suffix}'?",
    "Check if '{text}' finishes with '{suffix}'",
    "Is the string '{text}' ending with '{suffix}'?",
    "Tell me if '{text}' ends with '{suffix}'",
    "Ends with check: '{text}' & '{suffix}'"
]

repeat_templates = [
    "Repeat '{text}' {times} times",
    "Duplicate '{text}' {times} times",
    "Make {times} copies of '{text}'",
    "Repeat the string '{text}' {times} times",
    "Output '{text}' repeated {times} times"
]

factorial_templates = [
    "Factorial of {n}",
    "Compute {n}!",
    "Find {n} factorial",
    "Calculate factorial({n})",
    "What is the factorial of {n}?"
]

# -------- Synthetic data generator --------
def generate_synthetic_example():
    tool = random.choice(tools)
    sig = inspect.signature(tool)

    args = {}
    for name, param in sig.parameters.items():
        if param.annotation == int:
            if tool.__name__ == "factorial":
                args[name] = random.randint(1, 10)  # keep factorial small
            elif tool.__name__ == "repeat" and name == "times":
                args[name] = random.randint(1, 5)
            else:
                args[name] = random.randint(1, 100)
        elif param.annotation == str:
            args[name] = random.choice(["apple", "banana", "hello", "world", "python", "kamal"])
        elif param.annotation == bool:
            args[name] = random.choice([True, False])
        else:
            args[name] = "?"

    if tool.__name__ == "cube":
        template = random.choice(cube_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "starts_with":
        template = random.choice(starts_with_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "ends_with":
        template = random.choice(ends_with_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "repeat":
        template = random.choice(repeat_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "factorial":
        template = random.choice(factorial_templates)
        user_query = template.format(**args)

    func_call = f"{tool.__name__}(" + ", ".join([f"{k}={repr(v)}" for k, v in args.items()]) + ")"

    return {
        "tools": TOOL_SPECS,
        "user": user_query,
        "func": func_call
    }

# -------- Generate big dataset --------
synthetic_dataset = [generate_synthetic_example() for _ in range(2000)]

with open("../data/mgc_3.json", "w") as f:
    json.dump(synthetic_dataset, f, indent=2)

print("✅ Generated 2000 synthetic samples with 5 new functions -> saved to synthetic_dataset_v3.json")


✅ Generated 2000 synthetic samples with 5 new functions -> saved to synthetic_dataset_v3.json


In [9]:
import inspect
import json
import random

# ---- New example developer functions ----
def add_prefix(text: str, prefix: str) -> str:
    """Adds a prefix to a string"""
    return prefix + text

def remove_vowels(text: str) -> str:
    """Removes vowels from a string"""
    return ''.join(c for c in text if c.lower() not in 'aeiou')

def power(n: int, p: int) -> int:
    """Raises a number to a power"""
    return n ** p

def is_odd(n: int) -> bool:
    """Checks if a number is odd"""
    return n % 2 != 0

# ---- Collect tools here ----
tools = [add_prefix, remove_vowels, power, is_odd]

# -------- Extract signatures --------
def get_tool_specs(tools):
    tool_list = []
    for tool in tools:
        sig = inspect.signature(tool)
        params = []
        for name, param in sig.parameters.items():
            if param.annotation != inspect.Parameter.empty:
                params.append(f"{name}: {param.annotation.__name__}")
            else:
                params.append(name)
        signature = f"{tool.__name__}({', '.join(params)})"
        doc = tool.__doc__.strip() if tool.__doc__ else ""
        tool_list.append({
            "signature": signature,
            "description": doc
        })
    return tool_list

TOOL_SPECS = get_tool_specs(tools)

# -------- Templates (ASCII-safe) --------
add_prefix_templates = [
    "Add prefix '{prefix}' to '{text}'",
    "Prepend '{prefix}' before '{text}'",
    "Attach '{prefix}' at the start of '{text}'",
    "Put '{prefix}' in front of '{text}'",
    "Insert '{prefix}' before '{text}'"
]

remove_vowels_templates = [
    "Remove vowels from '{text}'",
    "Delete all vowels in '{text}'",
    "Strip vowels from '{text}'",
    "Eliminate vowels in '{text}'",
    "Take out vowels from '{text}'"
]

power_templates = [
    "Raise {n} to the power of {p}",
    "Compute {n} ** {p}",
    "What is {n} to the power {p}?",
    "Calculate {n} ^ {p}",
    "Find {n} raised to {p}"
]

is_odd_templates = [
    "Is {n} odd?",
    "Check if {n} is an odd number",
    "Tell me whether {n} is odd",
    "Odd check for {n}",
    "Determine if {n} is odd"
]

# -------- Synthetic data generator --------
def generate_synthetic_example():
    tool = random.choice(tools)
    sig = inspect.signature(tool)

    args = {}
    for name, param in sig.parameters.items():
        if param.annotation == int:
            if tool.__name__ == "power" and name == "n":
                args[name] = random.randint(1, 20)
            elif tool.__name__ == "power" and name == "p":
                args[name] = random.randint(1, 5)
            else:
                args[name] = random.randint(1, 100)
        elif param.annotation == str:
            args[name] = random.choice(["apple", "banana", "hello", "world", "python", "kamal"])
        elif param.annotation == bool:
            args[name] = random.choice([True, False])
        else:
            args[name] = "?"

    if tool.__name__ == "add_prefix":
        template = random.choice(add_prefix_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "remove_vowels":
        template = random.choice(remove_vowels_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "power":
        template = random.choice(power_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "is_odd":
        template = random.choice(is_odd_templates)
        user_query = template.format(**args)

    func_call = f"{tool.__name__}(" + ", ".join([f"{k}={repr(v)}" for k, v in args.items()]) + ")"

    return {
        "tools": TOOL_SPECS,
        "user": user_query,
        "func": func_call
    }

# -------- Generate 2000 samples --------
synthetic_dataset = [generate_synthetic_example() for _ in range(2000)]

with open("../data/mgc_4.json", "w") as f:
    json.dump(synthetic_dataset, f, indent=2, ensure_ascii=True)

print("✅ Generated 2000 ASCII-safe synthetic samples with 4 new functions -> saved to synthetic_dataset_v4_ascii.json")


✅ Generated 2000 ASCII-safe synthetic samples with 4 new functions -> saved to synthetic_dataset_v4_ascii.json


In [10]:
import inspect
import json
import random

# ---- New example developer functions ----
def divide(a: int, b: int) -> float:
    """Divides a by b"""
    return a / b if b != 0 else 0

def mod(a: int, b: int) -> int:
    """Returns a modulo b"""
    return a % b if b != 0 else 0

def upper_first(text: str) -> str:
    """Capitalizes the first letter of a string"""
    return text[:1].upper() + text[1:] if text else ""

def lower(text: str) -> str:
    """Converts string to lowercase"""
    return text.lower()

def contains(text: str, sub: str) -> bool:
    """Checks if substring exists in text"""
    return sub in text

def multiply_list(nums: list[int]) -> int:
    """Multiplies all numbers in a list"""
    result = 1
    for n in nums:
        result *= n
    return result

# ---- Collect tools here ----
tools = [divide, mod, upper_first, lower, contains, multiply_list]

# -------- Extract signatures --------
def get_tool_specs(tools):
    tool_list = []
    for tool in tools:
        sig = inspect.signature(tool)
        params = []
        for name, param in sig.parameters.items():
            if param.annotation != inspect.Parameter.empty:
                if param.annotation == list:
                    params.append(f"{name}: list[int]")
                else:
                    params.append(f"{name}: {param.annotation.__name__}")
            else:
                params.append(name)
        signature = f"{tool.__name__}({', '.join(params)})"
        doc = tool.__doc__.strip() if tool.__doc__ else ""
        tool_list.append({
            "signature": signature,
            "description": doc
        })
    return tool_list

TOOL_SPECS = get_tool_specs(tools)

# -------- Templates (ASCII-safe) --------
divide_templates = [
    "Divide {a} by {b}",
    "Compute {a} / {b}",
    "What is {a} divided by {b}?",
    "Calculate the division of {a} by {b}",
    "Find {a} ÷ {b}"
]

mod_templates = [
    "Modulo of {a} by {b}",
    "Compute {a} % {b}",
    "What is the remainder of {a} divided by {b}?",
    "Find {a} mod {b}",
    "Calculate remainder when {a} is divided by {b}"
]

upper_first_templates = [
    "Capitalize the first letter of '{text}'",
    "Make the first character of '{text}' uppercase",
    "Uppercase the first letter in '{text}'",
    "Convert '{text}' first letter to uppercase",
    "Change first character of '{text}' to capital"
]

lower_templates = [
    "Convert '{text}' to lowercase",
    "Make '{text}' all small letters",
    "Lowercase the string '{text}'",
    "Change '{text}' to lower case",
    "Turn '{text}' into lowercase letters"
]

contains_templates = [
    "Does '{text}' contain '{sub}'?",
    "Check if '{text}' has substring '{sub}'",
    "Is '{sub}' in '{text}'?",
    "Tell me if '{sub}' exists in '{text}'",
    "Substring check: '{sub}' in '{text}'"
]

multiply_list_templates = [
    "Multiply all numbers in {nums}",
    "Compute the product of {nums}",
    "Find the multiplication of {nums}",
    "Multiply each element in {nums}",
    "Calculate product of the list {nums}"
]

# -------- Synthetic data generator --------
def generate_synthetic_example():
    tool = random.choice(tools)
    sig = inspect.signature(tool)

    args = {}
    for name, param in sig.parameters.items():
        if param.annotation == int:
            args[name] = random.randint(1, 20) if tool.__name__ in ["divide", "mod"] else random.randint(1, 100)
        elif param.annotation == str:
            args[name] = random.choice(["apple", "banana", "hello", "world", "python", "kamal"])
        elif param.annotation == list:
            args[name] = [random.randint(1, 10) for _ in range(random.randint(2, 5))]
        elif param.annotation == bool:
            args[name] = random.choice([True, False])
        else:
            args[name] = "?"

        # avoid division/mod by zero
        if tool.__name__ in ["divide", "mod"] and name == "b" and args[name] == 0:
            args[name] = random.randint(1, 20)

    # Select template
    if tool.__name__ == "divide":
        template = random.choice(divide_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "mod":
        template = random.choice(mod_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "upper_first":
        template = random.choice(upper_first_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "lower":
        template = random.choice(lower_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "contains":
        template = random.choice(contains_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "multiply_list":
        template = random.choice(multiply_list_templates)
        user_query = template.format(**args)

    func_call = f"{tool.__name__}(" + ", ".join([f"{k}={repr(v)}" for k, v in args.items()]) + ")"

    return {
        "tools": TOOL_SPECS,
        "user": user_query,
        "func": func_call
    }

# -------- Generate 3000 samples --------
synthetic_dataset = [generate_synthetic_example() for _ in range(3000)]

with open("../data/mgc_5.json", "w") as f:
    json.dump(synthetic_dataset, f, indent=2, ensure_ascii=True)

print("✅ Generated 3000 ASCII-safe synthetic samples with 6 new functions -> saved to synthetic_dataset_v5_ascii.json")


✅ Generated 3000 ASCII-safe synthetic samples with 6 new functions -> saved to synthetic_dataset_v5_ascii.json


In [11]:
import inspect
import json
import random

# ---- New example developer functions ----
def substring(text: str, start: int, end: int) -> str:
    """Returns a substring of text from start to end indices"""
    return text[start:end]

def absolute(n: int) -> int:
    """Returns the absolute value of a number"""
    return abs(n)

def is_multiple(a: int, b: int) -> bool:
    """Checks if a is a multiple of b"""
    return a % b == 0 if b != 0 else False

# ---- Collect tools here ----
tools = [substring, absolute, is_multiple]

# -------- Extract signatures --------
def get_tool_specs(tools):
    tool_list = []
    for tool in tools:
        sig = inspect.signature(tool)
        params = []
        for name, param in sig.parameters.items():
            if param.annotation != inspect.Parameter.empty:
                params.append(f"{name}: {param.annotation.__name__}")
            else:
                params.append(name)
        signature = f"{tool.__name__}({', '.join(params)})"
        doc = tool.__doc__.strip() if tool.__doc__ else ""
        tool_list.append({
            "signature": signature,
            "description": doc
        })
    return tool_list

TOOL_SPECS = get_tool_specs(tools)

# -------- Templates (ASCII-safe) --------
substring_templates = [
    "Get substring of '{text}' from index {start} to {end}",
    "Extract characters {start} to {end} from '{text}'",
    "Return part of '{text}' between {start} and {end}",
    "Slice '{text}' from {start} to {end}",
    "Take substring from {start} to {end} in '{text}'"
]

absolute_templates = [
    "Get absolute value of {n}",
    "Compute |{n}|",
    "Find the absolute of {n}",
    "Return positive value of {n}",
    "Convert {n} to its absolute value"
]

is_multiple_templates = [
    "Check if {a} is a multiple of {b}",
    "Is {a} divisible by {b}?",
    "Tell me whether {a} is multiple of {b}",
    "Determine if {b} divides {a}",
    "Check if {a} % {b} == 0"
]

# -------- Synthetic data generator --------
def generate_synthetic_example():
    tool = random.choice(tools)
    sig = inspect.signature(tool)

    args = {}
    for name, param in sig.parameters.items():
        if param.annotation == int:
            if tool.__name__ == "is_multiple" and name == "b":
                args[name] = random.randint(1, 20)
            elif tool.__name__ == "substring":
                args[name] = random.randint(0, 5)
            else:
                args[name] = random.randint(-50, 50)
        elif param.annotation == str:
            args[name] = random.choice(["apple", "banana", "hello", "world", "python", "kamal"])
        elif param.annotation == bool:
            args[name] = random.choice([True, False])
        else:
            args[name] = "?"

    # ensure substring start < end
    if tool.__name__ == "substring":
        if args["start"] > args["end"]:
            args["start"], args["end"] = args["end"], args["start"]

    # select template
    if tool.__name__ == "substring":
        template = random.choice(substring_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "absolute":
        template = random.choice(absolute_templates)
        user_query = template.format(**args)
    elif tool.__name__ == "is_multiple":
        template = random.choice(is_multiple_templates)
        user_query = template.format(**args)

    func_call = f"{tool.__name__}(" + ", ".join([f"{k}={repr(v)}" for k, v in args.items()]) + ")"

    return {
        "tools": TOOL_SPECS,
        "user": user_query,
        "func": func_call
    }

# -------- Generate 1000 samples --------
synthetic_dataset = [generate_synthetic_example() for _ in range(1000)]

with open("../data/mgc_6.json", "w") as f:
    json.dump(synthetic_dataset, f, indent=2, ensure_ascii=True)

print("✅ Generated 1000 ASCII-safe synthetic samples with 3 new functions -> saved to synthetic_dataset_v6_ascii.json")


✅ Generated 1000 ASCII-safe synthetic samples with 3 new functions -> saved to synthetic_dataset_v6_ascii.json


In [12]:
import json

# List of all your dataset files in order
files_to_merge = [
    "../data/mgc_1.json",  # first 1000
    "../data/mgc_2.json",  # second 1000
    "../data/mgc_3.json",  # 2000
    "../data/mgc_4.json",  # 2000
    "../data/mgc_5.json",  # 3000
    "../data/mgc_6.json"   # 1000
]

merged_dataset = []

for file_name in files_to_merge:
    with open(file_name, "r") as f:
        data = json.load(f)
        merged_dataset.extend(data)  # add all samples

# Save merged dataset
with open("../data/synthetic_dataset_merged_10000.json", "w") as f:
    json.dump(merged_dataset, f, indent=2, ensure_ascii=True)

print(f"✅ Merged {len(merged_dataset)} samples into 'synthetic_dataset_merged_10000.json'")


✅ Merged 10000 samples into 'synthetic_dataset_merged_10000.json'
