In [1]:
# !pip install tree-sitter==0.20.4

In [2]:
!git clone https://github.com/tree-sitter/tree-sitter-cpp

fatal: destination path 'tree-sitter-cpp' already exists and is not an empty directory.


In [3]:
import os
from tree_sitter import Language, Parser

In [4]:
# 🛠️ Step 1: Build C++ Tree-sitter Parser (.dll file)

# Where the compiled grammar library lives; `so_path` is reused when the
# parser is loaded in the next step.
build_dir = 'build'
so_path = os.path.join(build_dir, 'my-languages.dll')

# Compilation is skipped whenever a library file is already present at so_path.
if os.path.exists(so_path):
    print("Shared library already exists. Skipping build.")
else:
    print("Building C++ parser shared library...")
    os.makedirs(build_dir, exist_ok=True)
    # Second argument: list of grammar repositories to compile
    # (here, the path to the cloned tree-sitter-cpp repo).
    Language.build_library(so_path, ['tree-sitter-cpp'])


Shared library already exists. Skipping build.


In [5]:
# 🛠️ Step 2: Load C++ Parser

# Load the grammar registered under the name 'cpp' from the shared library
# located at so_path (defined in Step 1).
# NOTE(review): the two-argument Language(...) constructor and
# Parser.set_language(...) look tied to the tree-sitter==0.20.4 pin mentioned
# in the first cell — confirm before upgrading the package.
CPP_LANGUAGE = Language(so_path, 'cpp')

# A single module-level parser instance, configured for C++, is reused by the
# cells below.
parser = Parser()
parser.set_language(CPP_LANGUAGE)

print("C++ Parser loaded successfully!")


C++ Parser loaded successfully!


In [6]:
# 🛠️ Step 3: Sample C++ Code (or you can load from .cpp files later)

# Kept as a bytes literal (b"""..."""): it is passed straight to parser.parse()
# and later sliced with node.start_byte / node.end_byte offsets.
# The sample contains one free function (`add`) and one class (`Point`) whose
# constructor is defined inline.
cpp_code = b"""
int add(int a, int b) {
    return a + b;
}

class Point {
public:
    int x, y;
    Point(int a, int b) : x(a), y(b) {}
};
"""


In [7]:
# 🛠️ Step 4: Parse the C++ Code

# Parse the sample bytes with the parser configured in Step 2.
tree = parser.parse(cpp_code)
# Keep a handle on the tree's root node; extraction in the following cells
# starts its walk from here.
root_node = tree.root_node

print("Parsing completed.")


Parsing completed.


In [8]:
# 🛠️ Step 5: Function Extraction Utility

# Node types that may hold the declared name inside a `function_declarator`.
# NOTE(review): type names follow the tree-sitter-cpp grammar as far as known;
# verify against the node-types of the grammar version actually built.
_FUNCTION_NAME_TYPES = frozenset({
    'identifier',            # e.g. `add`, and the inline constructor `Point`
    'field_identifier',      # presumably methods defined inside a class body
    'qualified_identifier',  # presumably out-of-class definitions (Foo::bar)
    'destructor_name',       # presumably `~Foo`
    'operator_name',         # presumably `operator+`
    'template_function',
    'template_method',
})

# Declarator nodes that can wrap a `function_declarator` (e.g. when the
# function returns a pointer or a reference).
_DECLARATOR_WRAPPER_TYPES = frozenset({
    'pointer_declarator',
    'reference_declarator',
    'parenthesized_declarator',
})


def _find_function_name(node, code_bytes, in_declarator=False):
    """Return the first function name found in `node`'s declarator chain, or None.

    Only `function_declarator` and wrapper declarators are descended into, so
    identifiers in parameter lists or function bodies are never mistaken for
    the function's own name. A name node only counts once the walk is inside a
    `function_declarator`.
    """
    for child in node.children:
        if in_declarator and child.type in _FUNCTION_NAME_TYPES:
            return code_bytes[child.start_byte:child.end_byte].decode('utf-8')
        if child.type == 'function_declarator':
            found = _find_function_name(child, code_bytes, True)
        elif child.type in _DECLARATOR_WRAPPER_TYPES:
            found = _find_function_name(child, code_bytes, in_declarator)
        else:
            continue
        if found:
            return found
    return None


def extract_functions(node, code_bytes):
    """
    Find all function definitions under `node` and extract names and code.

    Args:
        node: Root of the (sub)tree to search. Only the attributes `type`,
            `children`, `start_byte` and `end_byte` are used.
        code_bytes: The source bytes the tree was parsed from; node byte
            offsets index into this buffer.

    Returns:
        A list of `(function_name, function_code)` string tuples in document
        (pre-order) order. Nested definitions are reported in addition to
        their enclosing definition. Definitions whose name cannot be
        determined are skipped.

    Raises:
        UnicodeDecodeError: if a reported name or definition is not valid UTF-8.
    """
    functions = []
    # Explicit stack instead of recursion: deeply nested syntax trees would
    # otherwise hit Python's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        if current.type == 'function_definition':
            function_name = _find_function_name(current, code_bytes)
            if function_name:
                function_code = code_bytes[current.start_byte:current.end_byte].decode('utf-8')
                functions.append((function_name, function_code))
        # Push children right-to-left so they are popped left-to-right,
        # preserving source order in the result.
        pending.extend(reversed(current.children))
    return functions


In [9]:
# 🛠️ Step 6: Extract Functions

# Gather every (name, source) pair found in the parsed sample.
functions = extract_functions(root_node, cpp_code)

# One numbered heading per function, then its source, then a 50-character rule.
for idx, (function_name, function_code) in enumerate(functions, start=1):
    print(f"Function {idx}: {function_name}", function_code, "=" * 50, sep="\n")


Function 1: add
int add(int a, int b) {
    return a + b;
}
Function 2: Point
Point(int a, int b) : x(a), y(b) {}


In [10]:
# 🛠️ Step 7: Generate Instruction-Response Pairs

# Each extracted function yields one (instruction, response) tuple: the
# instruction names the function, the response is its source text.
instruction_response_pairs = [
    (f"Write a C++ function named '{function_name}'.", function_code)
    for function_name, function_code in functions
]

# Print sample pairs, leaving a blank line after each one.
for idx, (instruction, response) in enumerate(instruction_response_pairs, start=1):
    print(f"--- Pair {idx} ---")
    print("Instruction:", instruction)
    print("Response:\n", response, end="\n\n")


--- Pair 1 ---
Instruction: Write a C++ function named 'add'.
Response:
 int add(int a, int b) {
    return a + b;
}

--- Pair 2 ---
Instruction: Write a C++ function named 'Point'.
Response:
 Point(int a, int b) : x(a), y(b) {}

