In [15]:
import os
import re
import json
import textwrap

def extract_main_function(file_content):
    """Return the body of a parameterless ``int main()`` found in C++ source text.

    The header ``int main() {`` is located with a regular expression, then the
    body is delimited by counting braces until the one that opened ``main`` is
    closed. This allows bodies that contain nested blocks (``if``, ``for``,
    initializer lists, ...).

    Limitations: only ``main`` declared with an empty parameter list is matched,
    and braces are counted textually, so a brace inside a string literal or a
    comment is counted like any other brace.

    Args:
        file_content: Full text of a source file.

    Returns:
        The text between the braces of ``main`` with surrounding whitespace
        stripped (this is ``""`` for an empty body), or ``None`` when no header
        with a balanced body is found.
    """
    # Header of main() up to and including its opening brace.
    header = re.compile(r'int\s+main\s*\(\s*\)\s*{')
    for match in header.finditer(file_content):
        body_start = match.end()
        depth = 1  # the opening brace of main() has already been consumed
        for pos in range(body_start, len(file_content)):
            char = file_content[pos]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    # This brace closes main(); everything before it is the body.
                    return file_content[body_start:pos].strip()
        # Braces never balanced for this candidate; try the next header, if any.
    return None

def process_cpp_files(folder_path):
    """Collect the ``main`` body of every ``.cpp`` file directly inside a folder.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        A list of ``{"code_string": body}`` dicts, in sorted filename order,
        one for each file where ``extract_main_function`` returned a non-empty
        body. Every non-blank line of the body is prefixed with four spaces.
    """
    main_functions = []
    # os.listdir returns entries in arbitrary order; sort so repeated runs
    # produce the records in the same order.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Ignore other extensions and anything that is not a regular file
        # (for example a directory whose name happens to end in ".cpp").
        if not filename.endswith(".cpp") or not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as file:
            file_content = file.read()
        # The file is closed before extraction; only its text is needed.
        main_code = extract_main_function(file_content)
        if main_code:
            formatted_main_code = textwrap.indent(main_code, '    ')  # Add 4 spaces indentation
            main_functions.append({"code_string": formatted_main_code})
    return main_functions

def save_to_jsonl(data, output_file):
    """Write ``data`` to ``output_file`` in JSON Lines format.

    Each item of ``data`` is serialized with ``json.dumps`` and written on its
    own line. The file is opened in write mode, so existing content is replaced.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)


# Extract the main() bodies found under `folder_path` and store them as JSON Lines.
folder_path = "cplusplus"
output_file = "functions.jsonl"
functions = process_cpp_files(folder_path)
save_to_jsonl(functions, output_file)
# Report the path that was actually written instead of a hard-coded copy of it.
print("Extraction complete. Functions saved to {}".format(output_file))

Extraction complete. Functions saved to functions.jsonl


In [21]:
import json
import random

def mask_token(code_string):
    """Replace one randomly chosen whitespace-delimited token with ``<mask>``.

    The token is replaced at its own position, so the rest of the text
    (including all whitespace) is left untouched and substituting the returned
    token back for ``<mask>`` reproduces ``code_string`` exactly.

    Args:
        code_string: Text to mask.

    Returns:
        A ``(masked_code_string, masked_token)`` tuple, or ``None`` when
        ``code_string`` contains no tokens (empty or whitespace-only).
    """
    import re  # local import: this cell does not import `re` itself

    # Keep each token together with its offsets; str.split() would discard them.
    token_matches = list(re.finditer(r'\S+', code_string))
    if len(token_matches) == 0:
        return None
    chosen = random.choice(token_matches)
    token_to_mask = chosen.group()
    # Splice at the chosen token's offsets. Replacing by value with str.replace
    # hits the first matching substring, which may lie inside an earlier token
    # (e.g. choosing "i" would turn "int i" into "<mask>nt i").
    masked_code_string = (
        code_string[:chosen.start()] + "<mask>" + code_string[chosen.end():]
    )
    return masked_code_string, token_to_mask

def mask_dataset(input_file, output_file, num_duplicates):
    """Create masked-token training samples from a JSON Lines file.

    For every record in ``input_file``, ``mask_token`` is applied to its
    ``code_string`` up to ``num_duplicates`` times. Each successful call yields
    one output record: a shallow copy of the input record whose ``code_string``
    holds the masked text and whose ``output`` holds the token that was masked.

    Blank lines in the input are ignored, and records for which ``mask_token``
    returns ``None`` (nothing to mask) produce no output records.

    Args:
        input_file: Path of the JSON Lines file to read; each record must have
            a ``code_string`` key.
        output_file: Path of the JSON Lines file to write (overwritten).
        num_duplicates: Number of masked samples to generate per input record.
    """
    with open(input_file, 'r') as f:
        data = f.readlines()

    modified_data = []
    for line in data:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline); json.loads would reject them.
            continue
        obj = json.loads(line)
        for _ in range(num_duplicates):
            code_string = obj['code_string']
            masked = mask_token(code_string)
            if masked is None:
                # No token to mask; further attempts on the same text cannot succeed.
                break
            masked_code_string, masked_token = masked
            obj_copy = obj.copy()
            obj_copy['output'] = masked_token
            obj_copy['code_string'] = masked_code_string
            modified_data.append(obj_copy)

    with open(output_file, 'w') as f:
        for obj in modified_data:
            f.write(json.dumps(obj) + '\n')

# Usage
input_file = 'functions.jsonl'
output_file = 'masked_functions.jsonl'
num_duplicates = 12  # masked samples generated per extracted function
# Token selection is random; seed the generator so re-running the notebook
# regenerates the same masked dataset.
random.seed(42)
mask_dataset(input_file, output_file, num_duplicates)