In [1]:
import pandas as pd
import regex as re

# Per-language patterns matching "comment block + function definition".
# They use syntax of the third-party `regex` module (imported above as `re`):
# `(?<name>...)` named groups and `(?&name)` recursive calls into a named group.
#
# Every non-empty pattern defines the same four named groups, which
# extract_functions reads back by name:
#   doc   - the comment block directly in front of the definition
#   name  - the definition header up to (not including) the parameter list
#   paren - the parenthesised parameter list; `(?&paren)` allows nested "(...)"
#   brace - the "{...}" body; `(?&brace)` allows nested braces
#
# The keys double as the language allow-list checked by filter_function and are
# turned into dataset directory names by the download loop.
regex = {
    "PHP":        r'(?<doc>(\/\*\*(.|\n)*?\*\/|(\/\/.*?\s*)+))\s*(?<name>(public|protected|private)*\s*function\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})',
    "JavaScript": r'(?<doc>(\/\*\*(?:(?!\/\*\*).|\n)*?\*\/|(\/\/.*?\s*)+))\s*(?<name>function\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})',
    "C":          r"(?<doc>(\/\*(?:(?!\/\*).|\n)*?\*\/|(\/\/.*?\s*)+))\s*(?<name>(public|private|protected)?\s*(static)?\s*\w+\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    "C++":        r"(?<doc>(\/\*(?:(?!\/\*).|\n)*?\*\/|(\/\/.*?\s*)+))\s*(?<name>(public|private|protected)?\s*(static)?\s*\w+\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    # C# only accepts `///` line comments as the doc block.
    "C#":         r"(?<doc>(\/\/\/.*\s*)+)\s*(?<name>(public|private|protected|internal|file|protected internal|private protected)?\s*(static)?\s*\w+\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    "Java":       r"(?<doc>(\/\*\*(?:(?!\/\*\*).|\n)*?\*\/|(\/\/.*?\s*)+))\s*(?<name>(public|private|protected)?\s*(static)?\s*\w+\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    "TypeScript": r'(?<doc>(\/\*\*(?:(?!\/\*\*).|\n)*?\*\/|(\/\/.*?\s*)+))\s*(?<name>function\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})',
    # Shell and Ruby use `#` line comments as the doc block.
    "Shell":      r"(?<doc>(\#.*?\s*)+)\s*(?<name>\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    # The Ruby pattern expects a "{...}" body; Ruby_hotfix rewrites the source
    # into that shape before matching.
    "Ruby":       r'(?<doc>(\#.*?\s*)+)\s*(?<name>def\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})',
    # Empty pattern: it defines none of the named groups above.
    # NOTE(review): presumably listed only so "Python" passes the key check in
    # filter_function; Python sources have their own ast-based extractor
    # (extract_python_functions) - confirm the driver never regex-matches Python.
    "Python": r""
}

# Control-flow keywords. handle_data drops any match whose `name` group, with
# all whitespace removed, equals one of these.
keywords = ["if", "while", "do", "else", "switch", "elseif", "elif", "match", "for", "try", "catch"]

In [2]:
import regex as re

def add_curly_brace(match):
    """Substitution callback: return capture group 1 followed by " {".

    Args:
        match: a regex match object that has at least one capture group.

    Returns:
        The text of group 1 with a space and an opening brace appended.
    """
    header = match.group(1)
    return header + " {"

def Ruby_hotfix(col):
    """Rewrite Ruby source so that blocks are delimited by curly braces.

    Two passes are applied:
      1. every `def name(...)` header matched by `ruby_regex` gets " {" appended;
      2. the text is split on whitespace and each token found in `word_map`
         is replaced by its brace-carrying form (e.g. "end" -> "end}").

    Because the tokens are re-joined with single spaces, all original
    whitespace (including newlines) is collapsed.

    Args:
        col: Ruby source code as a string.

    Returns:
        The rewritten source as a single space-separated string.
    """
    with_def_braces = ruby_regex.sub(add_curly_brace, col)
    tokens = [word_map.get(token, token) for token in with_def_braces.split()]
    return ' '.join(tokens)


def reverse_Ruby_hotfix(functions):
    """Undo the brace rewriting of Ruby_hotfix on extracted function records.

    Each record is a list that is modified in place; index 2 is treated as
    the function body and the last element as a second text field to clean.

    For both fields, whitespace-separated tokens found in `reverse_word_map`
    (e.g. "do{", "end}") are mapped back to the plain keyword. Afterwards the
    first "{" is removed from the last field, and a leading "{" is removed
    from the body.

    Args:
        functions: list of records (lists of strings), or a falsy value.

    Returns:
        The same `functions` object that was passed in.
    """
    if not functions:
        return functions

    for func in functions:
        func[-1] = ' '.join(reverse_word_map.get(word, word) for word in func[-1].split())
        func[2] = ' '.join(reverse_word_map.get(word, word) for word in func[2].split())

        # Remove only the first "{" from the last field.
        brace_at = func[-1].find('{')
        if brace_at != -1:
            func[-1] = func[-1][:brace_at] + func[-1][brace_at + 1:]

        # startswith() rather than func[2][0]: an empty body must not raise
        # IndexError.
        if func[2].startswith("{"):
            func[2] = func[2][1:]

    return functions


# Ruby keyword -> brace-carrying token used by Ruby_hotfix. Block openers get a
# trailing "{" and "end" gets a trailing "}".
word_map = { 
    "do":"do{", 
    "if":"if{", 
    "unless":"unless{", 
    "case":"case{", 
    "while":"while{", 
    "until":"until{",
    "end":"end}",
    "class":"class{",
    "module":"module{"
}

# Inverse of word_map, used to restore the original keywords.
reverse_word_map = {v: k for k, v in word_map.items()}

# Matches a Ruby method header `def name(...)`. Capture group 1 spans the whole
# header (it is what add_curly_brace reads); the recursive `paren` group lets
# the parameter list contain nested parentheses.
ruby_regex = re.compile(r'(def\s+\w+\s*(?<paren>\(([^()]+|(?&paren))*\)))')

In [3]:
# Runs of whitespace; handle_data uses this to squeeze whitespace out of names.
whitespace = re.compile(r'\s+')
def extract_functions(code, func_pattern=None):
    """Find documented function definitions in `code`.

    Args:
        code: source text to scan.
        func_pattern: pattern (string or compiled) that defines the named
            groups `doc`, `name`, `paren` and `brace`. When omitted, the
            notebook-global `pattern` is used, which keeps existing
            single-argument calls working.

    Returns:
        A list of `[name, parameters, body, docstring]` lists, one per match,
        in match order. `body` is stripped of surrounding whitespace. An empty
        pattern yields an empty list.
    """
    # Only touch the global when the caller did not supply a pattern, so the
    # function is usable without hidden notebook state.
    active_pattern = pattern if func_pattern is None else func_pattern

    # An empty pattern matches at every position but has none of the named
    # groups, so reading them would raise; treat it as "nothing to extract".
    if not active_pattern:
        return []

    functions = []
    for match in re.finditer(active_pattern, code):
        functions.append([
            match.group("name"),
            match.group("paren"),
            match.group("brace").strip(),
            match.group("doc"),
        ])
    return functions

In [4]:
def filter_function(example):
    """Decide whether a dataset record should be processed.

    A record passes when it has a star count greater than 2, its language is
    one of the keys of `regex`, and its size is below 20000.

    Args:
        example: mapping with the keys 'max_stars_count', 'lang' and 'size'.

    Returns:
        A truthy value when the record passes. When a check fails, the falsy
        result of that check is returned as-is (so a missing or zero star
        count is returned unchanged).
    """
    stars = example['max_stars_count']
    if not stars:
        return stars

    popular_enough = stars > 2
    if not popular_enough:
        return popular_enough

    supported_language = example['lang'] in regex.keys()
    if not supported_language:
        return supported_language

    return example['size'] < 20000

In [5]:
# Characters that make up comment/docstring delimiters (plus whitespace).
_DOC_MARKER_CHARS = "/*#'\" \t\r\n"


def _doc_has_text(doc):
    """Return True when `doc` contains something besides comment markers and whitespace."""
    return bool(doc) and bool(doc.strip(_DOC_MARKER_CHARS))


def handle_data(example, df):
    """Extract documented functions from one dataset record.

    The record is first checked with filter_function. Ruby sources are
    rewritten with Ruby_hotfix before matching and the extracted records are
    restored with reverse_Ruby_hotfix afterwards. A match is dropped when

      * its name, with whitespace removed, is one of `keywords`;
      * its body is empty apart from braces, colons and whitespace;
      * its doc comment holds nothing but comment markers and whitespace.

    Args:
        example: mapping with at least 'lang', 'content' and 'hexsha', plus
            the keys read by filter_function.
        df: unused; kept so existing callers keep working.

    Returns:
        A list of dicts with the keys 'name', 'params', 'body', 'docstring',
        'file_id' and 'language'. Empty when the record is filtered out or
        nothing qualifies.
    """
    functions_data = []
    if not filter_function(example):
        return functions_data

    lang = example['lang']
    # Work on a local copy so the caller's record is not modified.
    content = example['content']

    if lang == "Ruby":
        content = Ruby_hotfix(content)

    functions = extract_functions(content)

    # Use the local `lang`, not a notebook-global loop variable, so the
    # function also works when called outside the download loop.
    if lang == "Ruby":
        functions = reverse_Ruby_hotfix(functions)

    for function in functions or []:
        if not function:
            continue
        name, params, body, docstring = function[:4]

        # Control-flow statements can look like function definitions to the
        # regexes; skip them.
        if whitespace.sub('', name) in keywords:
            continue
        if not body.replace('{', '').replace('}', '').replace(':', '').replace('\n', '').strip():
            continue
        if not _doc_has_text(docstring):
            continue

        functions_data.append({
            'name': name,
            'params': params,
            'body': body,
            'docstring': docstring,
            'file_id': example['hexsha'],
            'language': lang,
        })
    return functions_data

In [6]:
%%time
import pandas as pd
import ast

# Function to extract function definitions from Python code
def _utf8_clean(text):
    """Return `text` with characters that cannot be encoded as UTF-8 removed."""
    return text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')


def extract_python_functions(code, error_counter, hexsha):
    """Collect every `def` in a Python source string using the `ast` module.

    Args:
        code: Python source text.
        error_counter: running count of sources that could not be processed.
        hexsha: identifier of the source file; stored in every record.

    Returns:
        A `(functions, error_counter)` tuple. `functions` holds one
        `[name, params, body, docstring, function_code, hexsha]` list per
        `ast.FunctionDef` node, in `ast.walk` order:

          * `params` is the `str()` of the list of plain argument names
            (`node.args.args`);
          * `body` is the unparsed body statements joined by newlines;
          * `docstring` is `ast.get_docstring(node)` (may be None);
          * `function_code` is the original source segment, or None when the
            segment cannot be located.

        `error_counter` is incremented by one when parsing or unparsing
        fails; records gathered before the failure are still returned.
    """
    functions = []
    try:
        code = _utf8_clean(code)
        if hexsha:
            hexsha = _utf8_clean(hexsha)

        tree = ast.parse(code)
        for node in ast.walk(tree):
            if not isinstance(node, ast.FunctionDef):
                continue

            # get_source_segment returns None when location info is missing.
            segment = ast.get_source_segment(code, node)
            docstring = ast.get_docstring(node)
            body = "\n".join(ast.unparse(stmt) for stmt in node.body)

            functions.append([
                _utf8_clean(node.name),
                str([_utf8_clean(arg.arg) for arg in node.args.args]),
                _utf8_clean(body),
                _utf8_clean(docstring) if docstring else docstring,
                _utf8_clean(segment) if segment is not None else None,
                hexsha,
            ])
    # RecursionError: ast.parse / ast.unparse can exceed the recursion limit
    # on very deeply nested code; count it as a failed file instead of
    # aborting the whole run.
    except (SyntaxError, ValueError, UnicodeEncodeError, UnicodeDecodeError, RecursionError):
        error_counter += 1
    return functions, error_counter

        
def handle_py_data(example, df, py_error_count):
    """Extract function records from one Python dataset record.

    Args:
        example: mapping with 'hexsha' and 'content', plus the keys read by
            filter_function.
        df: unused.
        py_error_count: running count of Python files that failed to parse.

    Returns:
        A `(functions_data, py_error_count)` tuple. `functions_data` is a list
        of dicts with the keys 'name', 'params', 'body', 'docstring',
        'file_id' and 'language' (always 'Python'); it is empty when the
        record is rejected by filter_function. The count is passed through
        extract_python_functions, which may increment it.
    """
    if not filter_function(example):
        return [], py_error_count

    file_id = example['hexsha']
    extracted, py_error_count = extract_python_functions(
        example['content'], py_error_count, file_id
    )

    functions_data = [
        {
            'name': record[0],
            'params': record[1],
            'body': record[2],
            'docstring': record[3],
            'file_id': record[5],
            'language': 'Python',
        }
        for record in (extracted or [])
        if record
    ]
    return functions_data, py_error_count

CPU times: total: 0 ns
Wall time: 0 ns


In [None]:
import os

from huggingface_hub import login

# Never commit an access token to the notebook. Read it from the HF_TOKEN
# environment variable instead; when the variable is unset, `token` is None
# and `login` asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


  from .autonotebook import tqdm as notebook_tqdm


Token is valid (permission: write).
Your token has been saved to C:\Users\User\.cache\huggingface\token
Login successful


In [8]:
def list_of_dicts_to_dict_of_lists(data_list):
    """Turn a list of row dicts into a dict of column lists.

    Args:
        data_list: iterable of dicts.

    Returns:
        A dict mapping every key seen to the list of values found under that
        key, in input order. Keys appear in first-seen order. A dict that
        lacks a key contributes nothing to that key's list.
    """
    columns = {}
    for row in data_list:
        for column_name, cell in row.items():
            columns.setdefault(column_name, []).append(cell)
    return columns

In [None]:
%%time 

# 544 minutes +

import json
import time
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi, HfFolder

# Number of extracted function records to accumulate before a split is pushed.
chunk_size = 1000000
# Dataset repository that receives every pushed split.
hf_repo_id = "JanDkff/TinyFuncData-new"
# NOTE(review): `api` is not referenced in any of the visible cells - confirm
# whether it is still needed.
api = HfApi()
# Access token handed to each push_to_hub call below.
# NOTE(review): presumably the token saved by the earlier login cell - confirm.
token = HfFolder.get_token()

#expected_features = Features({
#    'name': Value(dtype='string'),
#    'params': Value(dtype='string'),
#    'body': Value(dtype='string'),
#    'docstring': Value(dtype='string'),
#    'file_id': Value(dtype='string'),
#    'language': Value(dtype='string')
#})

# Process the dataset in chunks
processed_data = []
py_error_count = 0  # Python files that extract_python_functions failed to process


def _split_name(part_index, language):
    """Build the split name for chunk number `part_index` of `language`."""
    # Built outside the f-string: a "#" inside an f-string replacement field
    # is a SyntaxError before Python 3.12.
    slug = language.lower().replace("++", "pp").replace("#", "sharp")
    return f'part_{part_index}_{slug}'


def _push_chunk(records, part_index, language):
    """Convert `records` to a column-oriented Dataset and push it as one split."""
    chunk = Dataset.from_dict(list_of_dicts_to_dict_of_lists(records))
    chunk.push_to_hub(hf_repo_id, token=token, split=_split_name(part_index, language))


for language in regex.keys():
    i = 0  # records streamed so far for this language
    j = 0  # index of the next split to push for this language
    start_time = time.time()
    data_dir_name = language.lower().replace("++", "pp").replace("#", "-sharp")
    dataset = load_dataset('bigcode/the-stack-dedup', data_dir=f'data/{data_dir_name}', streaming=True, split="train")
    # `pattern` must stay a module-level name: extract_functions reads it as a global.
    pattern = regex[language]
    for example in dataset:
        if not i % 50000:
            current_time = time.time()
            print(i, language, round(current_time - start_time, 2), flush=True)
            if processed_data: print(processed_data[-1])
        i += 1

        # Python has an empty regex pattern and is handled by the ast-based
        # extractor; every other language goes through the regex path.
        if language == "Python":
            new_records, py_error_count = handle_py_data(example, dataset, py_error_count)
        else:
            new_records = handle_data(example, dataset)
        processed_data += new_records

        if len(processed_data) >= chunk_size:
            _push_chunk(processed_data, j, language)
            print(j)
            j += 1
            processed_data = []  # Reset the data chunks

    # Write any remaining data
    if processed_data:
        _push_chunk(processed_data, j, language)
        processed_data = []

print("Python files that failed to parse:", py_error_count, flush=True)
