In [9]:
from datasets import load_from_disk

# Load the seed dataset from a directory on disk. The path is relative to the
# notebook's working directory (adjust it if needed).
dataset_path = "functions_with_return_cpp_filtered_25000"  # or the folder where your .arrow and .json files are

# `seed_dataset` is reused by the export cell that follows.
seed_dataset = load_from_disk(dataset_path)

# View a few entries. In the recorded output the slice comes back as a dict with
# a 'content' key whose values are strings holding a dict-like record
# ('function_name', 'docstring', 'code').
seed_dataset[:3]

{'content': ['{\'function_name\': \'run_arduino\', \'docstring\': \'/* returns the thread identifier. */\', \'code\': \'int main()\\n{\\n    MSG Msg;\\n    WNDCLASSEX wc;\\n    //hWndMain = FindWindowA("TApplication", "EMU"); // search for external emulator\\n\\n    // Registering the Window Class\\n    wc.cbSize = sizeof(WNDCLASSEX);\\n    wc.style = 0;\\n    wc.lpfnWndProc = WndProc;\\n    wc.cbClsExtra = 0;\\n    wc.cbWndExtra = 0;\\n    wc.hInstance = hInstance;\\n    wc.hIcon = LoadIcon(NULL, IDI_APPLICATION);\\n    wc.hCursor = LoadCursor(NULL, IDC_ARROW);\\n    wc.hbrBackground = (HBRUSH)(COLOR_WINDOW + 1);\\n    wc.lpszMenuName = NULL;\\n    wc.lpszClassName = g_szClassName;\\n    wc.hIconSm = LoadIcon(NULL, IDI_APPLICATION);\\n    if (!RegisterClassEx(&wc))\\n    {\\n        MessageBox(NULL, "Window Registration Failed!", "Error!", MB_ICONEXCLAMATION | MB_OK);\\n        return 0;\\n    }\\n    hFont = CreateFont(-16, 0, 0, 0, FW_NORMAL, 0, 0, 0, DEFAULT_CHARSET, OUT_DEFAULT_PR

In [10]:
# Save the dataset as JSONL (one JSON object per line), written next to the
# dataset files inside the dataset folder.
seed_dataset.to_json("functions_with_return_cpp_filtered_25000/seed_functions.jsonl", orient="records", lines=True)

# Confirm the output file by printing its first three lines with the shell.
# In the recorded output each record's "content" value is a string containing a
# single-quoted Python-dict repr, not nested JSON.
!head -n 3 functions_with_return_cpp_filtered_25000/seed_functions.jsonl

Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 88.65ba/s]

{"content":"{'function_name': 'run_arduino', 'docstring': '\/* returns the thread identifier. *\/', 'code': 'int main()\\n{\\n    MSG Msg;\\n    WNDCLASSEX wc;\\n    \/\/hWndMain = FindWindowA(\"TApplication\", \"EMU\"); \/\/ search for external emulator\\n\\n    \/\/ Registering the Window Class\\n    wc.cbSize = sizeof(WNDCLASSEX);\\n    wc.style = 0;\\n    wc.lpfnWndProc = WndProc;\\n    wc.cbClsExtra = 0;\\n    wc.cbWndExtra = 0;\\n    wc.hInstance = hInstance;\\n    wc.hIcon = LoadIcon(NULL, IDI_APPLICATION);\\n    wc.hCursor = LoadCursor(NULL, IDC_ARROW);\\n    wc.hbrBackground = (HBRUSH)(COLOR_WINDOW + 1);\\n    wc.lpszMenuName = NULL;\\n    wc.lpszClassName = g_szClassName;\\n    wc.hIconSm = LoadIcon(NULL, IDI_APPLICATION);\\n    if (!RegisterClassEx(&wc))\\n    {\\n        MessageBox(NULL, \"Window Registration Failed!\", \"Error!\", MB_ICONEXCLAMATION | MB_OK);\\n        return 0;\\n    }\\n    hFont = CreateFont(-16, 0, 0, 0, FW_NORMAL, 0, 0, 0, DEFAULT_CHARSET, OUT_DEFAULT




In [2]:
import ast
import json

input_file = "functions_with_return_cpp_filtered_25000/seed_functions.jsonl"
output_file = "functions_with_return_cpp_filtered_25000/seed_functions_corrected.jsonl"

# Unpack every exported record: the "content" field is a string holding a
# Python-literal dict, which is parsed and written back out as a real JSON
# object with an added "seed" field.
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for line in infile:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads

        raw = json.loads(line)
        content_str = raw["content"]

        # Parse the inner dictionary with ast.literal_eval rather than eval():
        # literal_eval only accepts Python literals, so text read from the file
        # can never be executed as code. It raises ValueError/SyntaxError if a
        # record is not a plain literal.
        content = ast.literal_eval(content_str)

        # Copy the record's "id" into "seed" as a string.
        # NOTE(review): this raises KeyError if the export has no "id" column;
        # only the "content" key is visible in the truncated sample — confirm.
        content["seed"] = str(raw["id"])

        json.dump(content, outfile)
        outfile.write("\n")


In [None]:
# Step 0: Setup paths and common settings
# NOTE(review): none of these constants is referenced by the other cells visible
# in this notebook; the pipeline command further down hard-codes its own
# arguments — confirm whether they are meant to be wired in.
SEED_DATA = "functions_with_return_cpp_filtered_25000/seed_functions.jsonl"  # contains extracted seed functions
MODEL = "gpt-3.5-turbo"  # or your vLLM model endpoint alias (the model name doesn't matter when using vLLM's OpenAI endpoint)

SAVE_DIR = "./outputs"
NUM_EXAMPLES = 100  # How many seed examples to process

# Step 1: S -> C (Seed to Concepts)


In [1]:
import json
import hashlib

input_path = "functions_with_return_cpp_filtered_25000/seed_functions.jsonl"
output_path = "functions_with_return_cpp_filtered_25000/seed_functions_corrected.jsonl"

# Deduplicate records by their "seed" value: the first record seen for a seed is
# written to output_path, later ones are counted as duplicates and dropped.
# Records with no (or an empty) "seed", and lines that fail to parse, are also
# dropped, and are now counted so the summary explains where records went.
# NOTE(review): output_path is the same file the record-conversion cell writes,
# and input_path is the raw export whose sample shows only a "content" key; if
# the raw export has no "seed" field this cell keeps nothing and leaves an
# empty output file — confirm which file should be the input.

seen_seeds = set()
kept = 0
skipped = 0
missing_seed = 0  # records dropped because "seed" is absent or falsy
bad_lines = 0     # lines dropped because processing raised an exception

with open(input_path, "r") as fin, open(output_path, "w") as fout:
    for line in fin:
        try:
            obj = json.loads(line)
            seed = obj.get("seed")
            if not seed:
                missing_seed += 1
                continue
            if seed in seen_seeds:
                skipped += 1
                continue
            seen_seeds.add(seed)
            fout.write(json.dumps(obj) + "\n")
            kept += 1
        except Exception as e:
            # Best-effort: report the problem and carry on with the next line.
            bad_lines += 1
            print(f"⚠️ Skipping bad line: {e}")
            continue

print(f"✅ Deduplicated file written to: {output_path}")
print(f"✅ Kept: {kept}, Skipped duplicates: {skipped}")
if missing_seed or bad_lines:
    print(f"⚠️ Dropped without a 'seed' value: {missing_seed}, bad lines: {bad_lines}")




✅ Deduplicated file written to: functions_with_return_cpp_filtered_25000/seed_functions_corrected.jsonl
✅ Kept: 0, Skipped duplicates: 0


In [None]:
# Run the self_ossinstruct module in S->C mode as a shell command, saving into
# ./outputs; --max_new_data 100 presumably caps the number of records produced
# (verify against the module's CLI help).
# NOTE(review): the seed file is read from
# "functions_with_return_cpp_filtered_100000_no_chuck", not the "..._25000"
# folder used by the other cells, and the SEED_DATA / MODEL / SAVE_DIR /
# NUM_EXAMPLES settings defined earlier are not used here — confirm.
!python -m selfcodealign.src.star_align.self_ossinstruct --seed_data_files "functions_with_return_cpp_filtered_100000_no_chuck/seed_functions.jsonl" --max_new_data 100 --model dummy  --instruct_mode S->C --save_dir "./outputs"

: 

In [None]:
import json
from itertools import islice

# Preview the S->C output file by printing its first records as parsed objects.
with open("outputs/data-s_c-89d0f-0-20250515_110937.jsonl") as f:
    # islice stops at end of file, so a file with fewer than three records is
    # printed in full instead of failing on json.loads("").
    for line in islice(f, 3):  # show first 3 entries
        print(json.loads(line))

: 

In [2]:
import json

input_file = "outputs/data-s_c-0f3eb-0-20250516_220538.jsonl"
output_file = "outputs/data-s_c-0f3eb-0-20250516_220538_corrected.jsonl"

# Rewrite every record with "concepts" lifted from its first parsing result to
# the top level, and with the bulky intermediate fields removed.
with open(input_file) as fin, open(output_file, "w") as fout:
    for line in fin:
        obj = json.loads(line)

        # Flatten concepts from parsing_result[0], if it exists. An empty list
        # (or a first entry that is not a dict) simply leaves the record without
        # a "concepts" key instead of raising IndexError/TypeError.
        results = obj.get("parsing_result")
        if isinstance(results, list) and results:
            first_result = results[0]
            if isinstance(first_result, dict) and "concepts" in first_result:
                obj["concepts"] = first_result["concepts"]

        # Clean up fields not needed
        obj.pop("parsing_result", None)
        obj.pop("prompt", None)
        obj.pop("fingerprint", None)

        fout.write(json.dumps(obj) + "\n")

In [4]:
import json
from itertools import islice

# Preview the corrected C->I input file by printing its first records.
with open("outputs/data-corrected-for-c2i.jsonl") as f:
    # islice stops at end of file, so a file with fewer than three records is
    # printed in full instead of failing on json.loads("").
    for line in islice(f, 3):  # show first 3 entries
        print(json.loads(line))

{'content': {'code': 'IL2CPP_EXTERN_C IL2CPP_METHOD_ATTR void List_1__ctor_m2716C5BEDCDB58D5B0C144DA4837DAF99E86D871_gshared (List_1_t2DB9B7314D41FB8C9F99391E55195E3865AB0C35 * __this, RuntimeObject* ___collection0, const RuntimeMethod* method)\n{\n\tstatic bool s_Il2CppMethodInitialized;\n\tif (!s_Il2CppMethodInitialized)\n\t{\n\t\til2cpp_codegen_initialize_runtime_metadata((uintptr_t*)&IDisposable_t099785737FC6A1E3699919A94109383715A8D807_il2cpp_TypeInfo_var);\n\t\til2cpp_codegen_initialize_runtime_metadata((uintptr_t*)&IEnumerator_t5956F3AFB7ECF1117E3BC5890E7FC7B7F7A04105_il2cpp_TypeInfo_var);\n\t\ts_Il2CppMethodInitialized = true;\n\t}\n\tRuntimeObject* V_0 = NULL;\n\tint32_t V_1 = 0;\n\tRuntimeObject* V_2 = NULL;\n\tException_t * __last_unhandled_exception = 0;\n\til2cpp::utils::ExceptionSupportStack<int32_t, 1> __leave_targets;\n\t{\n\t\tNullCheck((RuntimeObject *)__this);\n\t\tObject__ctor_m88880E0413421D13FD95325EDCE231707CE1F405((RuntimeObject *)__this, /*hidden argument*/NULL

In [5]:
import json
from itertools import islice

# Preview the generated instructions file by printing its first records.
with open("outputs/instructions.jsonl") as f:
    # islice stops at end of file, so a file with fewer than three records is
    # printed in full instead of failing on json.loads("").
    for line in islice(f, 3):
        print(json.loads(line))

{'prompt': 'Create a series of independent coding tasks that are original, distinct, diverse, and high-quality, fostering logical thinking. Each task must adhere to specified properties:\n\n- category: the type of task (e.g., function implementation, class implementation, or program implementation)\n- language: the programming language to be used\n- difficulty: the complexity level of the task (e.g., easy, medium, or hard)\n- concepts: fundamental principles and techniques the task is designed to incorporate, which developers must understand to effectively solve the task\n\nDesign the tasks so that the relevant concepts emerge naturally as the most appropriate solutions, without explicitly mentioning that a particular concept should be used.\n\n## Example 1\n### Properties\ncategory: function implementation\nlanguage: Python\ndifficulty: easy\nconcepts: insertion point for a sorted array, optimized time complexity\n\n### Task\nDesign a Python function that takes a sorted array and a ta

In [6]:
import json

input_file = "outputs/instructions.jsonl"
output_file = "outputs/instructions_flattened.jsonl"

# Copy every record that ends up with a top-level "instruction" into the
# flattened file, dropping its "parsing_result" list on the way out.
with open(input_file) as src, open(output_file, "w") as dst:
    for raw_line in src:
        record = json.loads(raw_line)

        # Take the instruction from the first parsing_result entry that has one
        # (not necessarily entry 0).
        has_results = "parsing_result" in record and isinstance(record["parsing_result"], list)
        for entry in (record["parsing_result"] if has_results else ()):
            if "instruction" in entry:
                record["instruction"] = entry["instruction"]
                break

        # Records still lacking an instruction are left out of the output.
        if "instruction" in record:
            record.pop("parsing_result", None)
            dst.write(json.dumps(record) + "\n")


In [7]:
import json
from itertools import islice

# Preview the flattened instructions file by printing its first records.
with open("outputs/instructions_flattened.jsonl") as f:
    # islice stops at end of file, so a file with fewer than three records is
    # printed in full instead of failing on json.loads("").
    for line in islice(f, 3):
        print(json.loads(line))

{'prompt': 'Create a series of independent coding tasks that are original, distinct, diverse, and high-quality, fostering logical thinking. Each task must adhere to specified properties:\n\n- category: the type of task (e.g., function implementation, class implementation, or program implementation)\n- language: the programming language to be used\n- difficulty: the complexity level of the task (e.g., easy, medium, or hard)\n- concepts: fundamental principles and techniques the task is designed to incorporate, which developers must understand to effectively solve the task\n\nDesign the tasks so that the relevant concepts emerge naturally as the most appropriate solutions, without explicitly mentioning that a particular concept should be used.\n\n## Example 1\n### Properties\ncategory: function implementation\nlanguage: Python\ndifficulty: easy\nconcepts: insertion point for a sorted array, optimized time complexity\n\n### Task\nDesign a Python function that takes a sorted array and a ta

In [2]:
# Render one escaped C++ function string with real newlines so it can be read
# as source code (the "\n" and \" escapes are expanded by print).
print("static bool js_cc_scene_Pass_resetTextures(se::State& s)\n{\n    // js_function\n    \n    CC_UNUSED bool ok = true;\n    const auto& args = s.args();\n    size_t argc = args.size();\n    cc::scene::Pass *arg1 = (cc::scene::Pass *) NULL ;\n    \n    if(argc != 0) {\n        SE_REPORT_ERROR(\"wrong number of arguments: %d, was expecting %d\", (int)argc, 0);\n        return false;\n    }\n    arg1 = SE_THIS_OBJECT<cc::scene::Pass>(s);\n    SE_PRECONDITION2(arg1, false, \"%s: Invalid Native Object\", __FUNCTION__); \n    (arg1)->resetTextures();\n    \n    \n    return true;\n}")

static bool js_cc_scene_Pass_resetTextures(se::State& s)
{
    // js_function
    
    CC_UNUSED bool ok = true;
    const auto& args = s.args();
    size_t argc = args.size();
    cc::scene::Pass *arg1 = (cc::scene::Pass *) NULL ;
    
    if(argc != 0) {
        SE_REPORT_ERROR("wrong number of arguments: %d, was expecting %d", (int)argc, 0);
        return false;
    }
    arg1 = SE_THIS_OBJECT<cc::scene::Pass>(s);
    SE_PRECONDITION2(arg1, false, "%s: Invalid Native Object", __FUNCTION__); 
    (arg1)->resetTextures();
    
    
    return true;
}


In [1]:
def count_jsonl_elements(file_path):
    """Print how many records a JSONL file contains.

    Each non-blank line counts as one record. Blank or whitespace-only lines
    (for example a trailing empty line) are ignored so they do not inflate the
    count.

    Args:
        file_path: Path of the JSONL file; it is read as UTF-8 text.

    Returns:
        None. The count, or an error message if the file is missing or cannot
        be read, is printed to stdout.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            count = sum(1 for line in f if line.strip())
        print(f"Number of elements in the JSONL file: {count}")
    except FileNotFoundError:
        print("File not found. Please check the file path.")
    except Exception as e:
        # Best-effort helper: report any other read/decoding problem.
        print(f"An error occurred: {e}")

# Example usage
count_jsonl_elements("outputs/data-c_i-c3228-0-20250515_124830.jsonl")

Number of elements in the JSONL file: 100
