### SEED GATHERING GET CONTENT

In [1]:
#++++++++++++++++++++++++++++ Run this first time if you haven't installed from requirements.txt file/cloned the repo+++++++++++++++++++++
# !pip install tree-sitter==0.20.4
# !git clone https://github.com/tree-sitter/tree-sitter-cpp

In [1]:
from tree_sitter_parser import LANGUAGE, make_parser, node_to_string
import datasets
import os
import signal
from multiprocessing import Pool
#import os
import boto3
import smart_open
#from datasets import load_dataset,Dataset
from botocore import UNSIGNED
from botocore.config import Config

s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
def download_contents(blob_id, src_encoding):
    s3_url = f"s3://softwareheritage/content/{blob_id}"
    with smart_open.open(s3_url, "rb", compression=".gz", transport_params={"client": s3}) as fin:
        content = fin.read().decode(src_encoding)
    
    return content

In [2]:
TOPLEVEL_DOC_COMMENT_QUERY = LANGUAGE.query("""
(
  (function_definition
    declarator: (function_declarator
      declarator: (identifier) @fn-name
    )
    body: (compound_statement
      (comment) @doc.comment
    )
  ) @function.def
)
""")

'''
def get_fns_with_docstrings(src, tree):
    captures = TOPLEVEL_DOC_COMMENT_QUERY.captures(tree.root_node)
    res = []
    for capture in captures:
        node, ty = capture
        if ty != "function.def":
            continue
        # if the starting col is not 0, then it's not a top-level fn
        _, col = node.start_point
        if col != 0:
            continue
        res.append(node_to_string(src, node))
    return res
'''

def get_fns_with_docstrings(src, tree):
    captures = TOPLEVEL_DOC_COMMENT_QUERY.captures(tree.root_node)
    res = []
    current = {"function_node": None, "name": None, "doc": None}

    for node, capture_name in captures:
        if capture_name == "fn-name":
            current["name"] = node_to_string(src, node)

        elif capture_name == "doc.comment":
            current["doc"] = node_to_string(src, node)

        elif capture_name == "function.def":
            current["function_node"] = node

            # Build the result once we have everything
            if current["name"] and current["doc"]:
                full_func_text = node_to_string(src, current["function_node"])
                res.append({
                    "function_name": current["name"],
                    "docstring": current["doc"],
                    "code": full_func_text
                })

            # Reset for next
            current = {"function_node": None, "name": None, "doc": None}

    return res

def parse_ex(parser, ex):
    #ex = ex["content"]
    ex = download_contents(ex["blob_id"], ex["src_encoding"])
    try:
        buf = bytes(ex, "utf8")
        tree = parser.parse(buf)
        return get_fns_with_docstrings(buf, tree)
    except:
        return []


# if one parser segfaults, we can just make a new one and other parsers will still be fine
# WE LOVE TREE SITTER!
PARSERS = None


def process_chunk(idx_and_chunk):
    assert PARSERS is not None
    idx, chunk = idx_and_chunk
    parser = PARSERS[idx]
    chunk_new_funs = set()
    for ex in chunk:
        chunk_new_funs.update(parse_ex(parser, ex))
    return chunk_new_funs


def main(args):
    global PARSERS
    ds = datasets.load_dataset(
        args.dataset,
        data_dir=args.data_dir,
        split="train",
    )
    funs = set()
    PARSERS = [make_parser() for _ in range(args.num_workers)]
    total_len = len(ds)
    CHUNK_SIZE = 1000 * args.num_workers

    print(f"Total length: {total_len}")
    print(f"Chunk size: {CHUNK_SIZE}")

    chunk = []
    p = Pool(args.num_workers)
    for i, ex in enumerate(ds):
        if i % (total_len // 100) == 0:
            print(f"{i}/{total_len}")
        try:
            chunk.append(ex)
            if len(chunk) == CHUNK_SIZE or i == total_len - 1:
                print(f"Processing chunk {i // CHUNK_SIZE}")
                # divide the chunk into NUM_WORKERS chunks
                subchunk_size = len(chunk) // args.num_workers
                subchunks = [chunk[i:i + subchunk_size]
                             for i in range(0, len(chunk), subchunk_size)]
                new_funs_iter = p.imap(
                    process_chunk, [(i, subchunk) for i, subchunk in enumerate(subchunks)])
                print("Getting new functions")
                len_before = len(funs)
                while True:
                    try:
                        def timeout_handler(_, __):
                            raise KeyboardInterrupt  # it's fineeeeeee
                        signal.signal(signal.SIGALRM, timeout_handler)
                        signal.alarm(60)
                        funs.update(next(new_funs_iter))
                        signal.alarm(0)
                    except KeyboardInterrupt:
                        signal.alarm(0)
                        print("Keyboard interrupt. Terminating pool")
                        p.terminate()
                        p = Pool(args.num_workers)
                        break
                    except StopIteration:
                        break
                    except Exception as e:
                        print(e)

                signal.alarm(0)

                PARSERS = [make_parser() for _ in range(args.num_workers)]

                print(
                    f"Done processing chunk {i // CHUNK_SIZE}. Got {len(funs) - len_before} new functions")

                chunk = []
        except Exception as e:
            print(e)
            chunk = []

        if i == total_len - 1:
            break

    p.close()

    new_ds_dict = {
        "content": list(funs),
        "id": list(range(len(funs)))
    }

    new_ds = datasets.Dataset.from_dict(new_ds_dict)
    #new_ds.push_to_hub(args.push, private=True)




In [3]:
# NUMWORKERS = os.cpu_count()
NUMWORKERS = 2

In [4]:
ds = datasets.load_dataset("bigcode/the-stack-v2-dedup", "C++", cache_dir=f"./cache/stack", streaming=False, split="train[:1000]")

Resolving data files:   0%|          | 0/757 [00:00<?, ?it/s]

In [6]:
# from itertools import islice

# small_subset = islice(ds, 50)

# # Convert to list if you want to materialize it (use with caution, as this loads into memory)
# ds = list(small_subset)

In [5]:
# Setup a single Parser
funs = set()

parser = make_parser()

for i in range(5):
# Take one example manually
    ex = ds[i]  # First example (directly)

    # Download content if needed
    content = download_contents(ex["blob_id"], ex["src_encoding"])

    # Parse
    src = bytes(content, "utf8")
    tree = parser.parse(src)

    # Extract functions
    functions = get_fns_with_docstrings(src, tree)
    #funs.update(functions)
    # Print results
    print(f"Extracted functions in {i} and doc-comments:")
    for fn in functions:
        print(fn)

Extracted functions in 0 and doc-comments:
Extracted functions in 1 and doc-comments:
Extracted functions in 2 and doc-comments:
Extracted functions in 3 and doc-comments:
Extracted functions in 4 and doc-comments:


In [6]:
# Setup
def process_chunk(idx_and_chunk):
    assert PARSERS is not None
    idx, chunk = idx_and_chunk
    parser = PARSERS[idx]
    chunk_new_funs = set()
    
    for ex in chunk:
        functions = parse_ex(parser, ex)
        for fn in functions:
            chunk_new_funs.add(str(fn))  # <=== Fix here

    return chunk_new_funs




NUMWORKERS = 1
CHUNK_SIZE = 10  # Adjust if needed

PARSERS = [make_parser() for _ in range(NUMWORKERS)]

funs = set()
chunk = []

total_len = len(ds)

print(f"Total dataset size: {total_len}")

# Loop over dataset
for i, ex in enumerate(iter(ds)):
    if i % (max(total_len // 100, 1)) == 0:
        print(f"{i}/{total_len}")

    chunk.append(ex)

    if len(chunk) >= CHUNK_SIZE or i == total_len - 1:
        print(f"\nProcessing chunk {i // CHUNK_SIZE}...")

        # Split the chunk into subchunks
        subchunk_size = max(1, len(chunk) // NUMWORKERS)
        subchunks = [chunk[j:j + subchunk_size] for j in range(0, len(chunk), subchunk_size)]

        len_before = len(funs)

        # Sequentially process each subchunk using process_chunk
        for idx, subchunk in enumerate(subchunks):
            chunk_funs = process_chunk((idx, subchunk))
            funs.update(chunk_funs)

        print(f"✅ Done chunk {i // CHUNK_SIZE}. Got {len(funs) - len_before} new functions.")

        chunk = []  # Reset chunk
        PARSERS = [make_parser() for _ in range(NUMWORKERS)]  # Rebuild parsers if needed

# Final dataset creation
print(f"\nTotal unique functions collected: {len(funs)}")

new_ds_dict = {
    "content": list(funs),
    "id": list(range(len(funs)))
}

new_ds = datasets.Dataset.from_dict(new_ds_dict)


Total dataset size: 1000
0/1000

Processing chunk 0...
✅ Done chunk 0. Got 14 new functions.
10/1000

Processing chunk 1...
✅ Done chunk 1. Got 0 new functions.
20/1000

Processing chunk 2...
✅ Done chunk 2. Got 1 new functions.
30/1000

Processing chunk 3...
✅ Done chunk 3. Got 0 new functions.
40/1000

Processing chunk 4...
✅ Done chunk 4. Got 4 new functions.
50/1000

Processing chunk 5...
✅ Done chunk 5. Got 1 new functions.
60/1000

Processing chunk 6...
✅ Done chunk 6. Got 0 new functions.
70/1000

Processing chunk 7...
✅ Done chunk 7. Got 10 new functions.
80/1000

Processing chunk 8...
✅ Done chunk 8. Got 3 new functions.
90/1000

Processing chunk 9...
✅ Done chunk 9. Got 0 new functions.
100/1000

Processing chunk 10...
✅ Done chunk 10. Got 0 new functions.
110/1000

Processing chunk 11...
✅ Done chunk 11. Got 0 new functions.
120/1000

Processing chunk 12...
✅ Done chunk 12. Got 0 new functions.
130/1000

Processing chunk 13...
✅ Done chunk 13. Got 0 new functions.
140/1000



In [7]:
new_ds.save_to_disk("./extracted_functions_cpp")
new_ds[7]

Saving the dataset (0/1 shards):   0%|          | 0/210 [00:00<?, ? examples/s]

{'content': "{'function_name': 'Max', 'docstring': '//求n个数的max ', 'code': 'void push_down(Splay *x){//下放标记 \\n\\tif(x==null)return;\\n\\tif(x->rev){//区间翻转 \\n\\t\\tx->rev=0,x->son[0]->rev^=1,x->son[1]->rev^=1;\\n\\t\\tswap(x->son[0],x->son[1]),swap(x->maxl,x->maxr);\\n\\t}\\n\\tif(x->same){//区间赋值 \\n\\t\\tx->same=0,x->son[0]->key=x->son[1]->key=x->key;\\n\\t\\tx->son[0]->same=x->son[1]->same=1;\\n\\t\\tx->sum=x->key*x->size;\\n\\t\\tx->maxl=x->maxr=x->maxt=(x->key>0?x->sum:x->key);\\n\\t}\\n}'}",
 'id': 7}

In [8]:
from datasets import load_from_disk

ds = load_from_disk("./extracted_functions_cpp")

# for example in ds:
#     print(example)

In [9]:
import os
from datasets import Dataset
from tree_sitter_parser import LANGUAGE, make_parser, does_have_return

# Define return-statement query for C++
RETURN_QUERY = LANGUAGE.query("""
(return_statement) @return
""")

# Use a Tree-sitter parser set to C++
parser = make_parser()

# Filter dataset to only functions with meaningful return values
def filter_cpp_functions_with_return(ds: Dataset) -> Dataset:
    filtered_ds = []
    for i in ds:
        if does_have_return(i["content"], parser):
            filtered_ds.append(i)
    return filtered_ds


In [10]:
print("Filtering to only C++ functions with return statements...")
filtered_ds = filter_cpp_functions_with_return(ds)
print(f"✅ Filtered dataset size: {len(filtered_ds)}")


Filtering to only C++ functions with return statements...
✅ Filtered dataset size: 73


In [16]:
from datasets import Dataset

# Convert the list into a Dataset
filtered_ds = Dataset.from_list(filtered_ds)

# Now you can save it to disk
filtered_ds.save_to_disk("./functions_with_return_cpp")

Saving the dataset (0/1 shards):   0%|          | 0/73 [00:00<?, ? examples/s]

In [None]:
#Code to filter out functions that have valid types is next but I'm not sure we need it for c++ as functions are already typed
#Should check with TA if it is needed

In [18]:
ds = load_from_disk("./functions_with_return_cpp")

In [32]:
import ast

content = ast.literal_eval(ds[3]["content"])
content


{'function_name': 'loop',
 'docstring': '/*\n    //MOSTRAR INFO EN LED\n    if (isDoorOpen==true) {\n      ledToggle("on");\n    } else {\n      ledToggle("off");\n    }\n\n    //MOSTRAR INFO EN LED\n    if (hasDetection==true) {\n      ledToggle("on");\n    } else {\n      ledToggle("off");\n    }\n*/',
 'code': 'int setColor(String command)    {\n    // Look through the list of colors to find the one that was requested\n    for(int iColor = 0; iColor < NUM_COLORS; iColor++)\n    {\n        if(command == colorName[iColor]) {\n            // When it matches, look up the RGB values for that color in the table,\n            // and write the red, green, and blue values.\n            RGB.control(true);\n            RGB.color(colorRGB[iColor][0], colorRGB[iColor][1], colorRGB[iColor][2]);\n\n            analogWrite(pinRed,colorRGB[iColor][0]);\n            analogWrite(pinGreen,colorRGB[iColor][1]);\n            analogWrite(pinBlue,colorRGB[iColor][2]);\n            return 0;\n        }\n   