In [19]:
import pandas as pd
import regex as re

# Per-language function-extraction patterns, keyed by the language name that is
# compared against example['lang'] in filter_function.
#
# Every non-empty pattern exposes the same four named groups:
#   doc   - optional comment block directly preceding the function
#   name  - the function header up to (not including) the parameter list
#   paren - the parenthesised parameter list; recurses through (?&paren) so
#           nested parentheses stay balanced
#   brace - the brace-delimited body; recurses through (?&brace) so nested
#           braces stay balanced
#
# The (?<name>...) and (?&name) constructs are not accepted by the standard
# library `re`; these patterns rely on the `regex` package imported as `re`.
regex = {
    "PHP":        r'(?<doc>(\/\*\*(.|\n)*?\*\/|(\/\/.*?\s*)+))?\s*(?<name>(public|protected|private)*\s*function\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})',
    "JavaScript": r'(?<doc>(\/\*\*(?:(?!\/\*\*).|\n)*?\*\/|(\/\/.*?\s*)+))?\s*(?<name>function\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})',
    "C":          r"(?<doc>(\/\*(?:(?!\/\*).|\n)*?\*\/|(\/\/.*?\s*)+))?\s*(?<name>(public|private|protected)?\s*(static)?\s*\w+\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    "C++":        r"(?<doc>(\/\*(?:(?!\/\*).|\n)*?\*\/|(\/\/.*?\s*)+))?\s*(?<name>(public|private|protected)?\s*(static)?\s*\w+\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    "C#":         r"(?<doc>(\/\/\/.*\s*)+)?\s*(?<name>(public|private|protected|internal|file|protected internal|private protected)?\s*(static)?\s*\w+\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    "Java":       r"(?<doc>(\/\*\*(?:(?!\/\*\*).|\n)*?\*\/|(\/\/.*?\s*)+))?\s*(?<name>(public|private|protected)?\s*(static)?\s*\w+\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    "TypeScript": r'(?<doc>(\/\*\*(?:(?!\/\*\*).|\n)*?\*\/|(\/\/.*?\s*)+))?\s*(?<name>function\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})',
    "Shell":      r"(?<doc>(\#.*?\s*)+)?\s*(?<name>\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})",
    "Ruby":       r'(?<doc>(\#.*?\s*)+)?\s*(?<name>def\s+\w+)\s*(?<paren>\(([^()]+|(?&paren))*\))\s*(?<brace>\{([^{}]+|(?&brace))*\})',
    # Empty on purpose: it defines none of the named groups above, so it cannot
    # be used by extract_functions. The key still makes 'Python' pass
    # filter_function; Python sources go through extract_python_functions.
    "Python": r""
}

# Control-flow words that the loose header patterns can capture as a "name".
# handle_data drops a match when its name, with all whitespace removed, is in
# this list (so a captured "else if" is rejected via "elseif").
keywords = ["if", "while", "do", "else", "switch", "elseif", "elif", "match", "for", "try", "catch"]

In [20]:
import regex as re

def add_curly_brace(match):
    """Substitution callback: return capture group 1 followed by ' {'."""
    captured = match.group(1)
    return captured + " {"

def Ruby_hotfix(col):
    """Rewrite Ruby source so its blocks are delimited by curly braces.

    A ' {' is appended after every `def name(...)` header matched by
    `ruby_regex`, then each whitespace-separated token found in `word_map` is
    swapped for its brace-tagged form. Tokens are re-joined with single
    spaces, so the original line breaks and indentation are not kept.
    """
    with_open_braces = ruby_regex.sub(add_curly_brace, col)
    tagged_tokens = []
    for token in with_open_braces.split():
        tagged_tokens.append(word_map.get(token, token))
    return ' '.join(tagged_tokens)


def reverse_Ruby_hotfix(functions):
    """Strip the brace markers added by `Ruby_hotfix` from extracted rows.

    Args:
        functions: list of mutable rows shaped like the output of
            `extract_functions`, i.e. [name, parameters, body, docstring].
            May be empty or None.

    Returns:
        The same `functions` object; rows are modified in place. For every row
        the body (index 2) and the last field get their brace-tagged tokens
        mapped back through `reverse_word_map`. The first '{' is removed from
        the last field, and one leading '{' is removed from the body.

    A last field of None (no doc comment captured) is left untouched, and an
    empty body no longer raises IndexError.
    """
    if not functions:
        return functions

    for func in functions:
        # The last field is the optional doc comment, so it can be None.
        if func[-1] is not None:
            restored = _restore_ruby_tokens(func[-1])
            # Drop only the first '{' (the one appended after a `def` header).
            func[-1] = restored.replace('{', '', 1)

        func[2] = _restore_ruby_tokens(func[2])
        # startswith() is safe on an empty body, unlike indexing func[2][0].
        if func[2].startswith("{"):
            func[2] = func[2][1:]

    return functions


def _restore_ruby_tokens(text):
    """Map brace-tagged tokens back to Ruby keywords, joined by single spaces."""
    return ' '.join(reverse_word_map.get(word, word) for word in text.split())


# Ruby block keywords and the brace-tagged token each one is replaced with.
# Block openers gain a '{' and `end` gains a '}', which lets the brace-matching
# extraction pattern find the extent of a Ruby method body.
word_map = {
    "do": "do{",
    "if": "if{",
    "unless": "unless{",
    "case": "case{",
    "while": "while{",
    "until": "until{",
    "end": "end}",
    "class": "class{",
    "module": "module{"
}

# Inverse lookup used to undo the tagging after extraction.
reverse_word_map = {v: k for k, v in word_map.items()}

# Matches a Ruby method header that has a parenthesised parameter list:
# `def`, a name, then a balanced (...) group. (?&paren) recurses so nested
# parentheses are consumed. Group 1 spans the whole header, which is what
# add_curly_brace reads. Headers written without parentheses do not match.
ruby_regex = re.compile(r'(def\s+\w+\s*(?<paren>\(([^()]+|(?&paren))*\)))')

In [None]:
# Pre-compiled matcher for runs of whitespace (used when normalising names).
whitespace = re.compile(r'\s+')
def extract_functions(code, regex_pattern=None):
    """Find function definitions in `code` using a per-language pattern.

    Args:
        code: source text to scan.
        regex_pattern: pattern exposing the named groups `doc`, `name`,
            `paren` and `brace`. When omitted, the module-level `pattern`
            variable is used, which keeps existing one-argument calls working.

    Returns:
        A list of [name, parameters, body, docstring] rows, one per match.
        `body` is stripped of surrounding whitespace; `docstring` is None when
        the optional `doc` group did not take part in the match. An empty
        pattern yields an empty list.
    """
    active_pattern = pattern if regex_pattern is None else regex_pattern

    # An empty pattern (the "Python" entry in `regex`) defines none of the
    # named groups, so looking them up on a match would raise.
    if not active_pattern:
        return []

    return [
        [
            match.group("name"),
            match.group("paren"),
            match.group("brace").strip(),
            match.group("doc"),
        ]
        for match in re.finditer(active_pattern, code)
    ]

In [22]:
def filter_function(example):
    """Decide whether a dataset record should be processed.

    A record qualifies when its 'max_stars_count' is truthy and greater than
    2, its 'lang' is a key of `regex`, and its 'size' is below 20000. The
    value returned is the first falsy operand of that chain (so None or 0 can
    come back for a missing star count), otherwise the final comparison.
    """
    stars = example['max_stars_count']
    if not stars:
        return stars

    popular_enough = stars > 2
    if not popular_enough:
        return popular_enough

    supported_language = example['lang'] in regex.keys()
    if not supported_language:
        return supported_language

    return example['size'] < 20000

In [23]:
# Comment delimiters removed before deciding whether a doc comment has any
# text. Longer markers come first so that "///" and "/**" are consumed whole
# instead of leaving a stray "/" or "*" behind.
_COMMENT_MARKERS = ("///", "//", "/**", "/*", "*/", "#", "'''", '"""')


def _has_docstring_text(docstring):
    """Return True when `docstring` holds something besides comment markers."""
    if not docstring:
        return False
    for marker in _COMMENT_MARKERS:
        docstring = docstring.replace(marker, "")
    return bool(docstring.strip())


def handle_data(example, df):
    """Extract documented functions from one dataset record.

    Args:
        example: mapping with at least 'lang', 'content' and 'hexsha', plus
            the keys read by `filter_function`.
        df: unused; kept so existing calls keep working.

    Returns:
        A list of dicts with the keys 'name', 'params', 'body', 'docstring',
        'file_id' and 'language'. The list is empty when the record is
        rejected by `filter_function` or nothing usable is found.

    A match is dropped when its name is a control-flow keyword, when its body
    is empty apart from braces, colons and whitespace, or when its doc comment
    is missing or consists only of comment markers.

    The Ruby round trip is keyed on the record's own language rather than on a
    global loop variable, and the record's 'content' is no longer overwritten.
    """
    functions_data = []
    if not filter_function(example):
        return functions_data

    lang = example['lang']
    content = example['content']
    is_ruby = lang == "Ruby"

    if is_ruby:
        # Work on a local copy so calling this twice on the same record does
        # not apply the brace rewrite twice.
        content = Ruby_hotfix(content)

    functions = extract_functions(content)

    if is_ruby:
        functions = reverse_Ruby_hotfix(functions)

    for function in functions or []:
        if not function:
            continue
        # Loose header patterns also match statements such as `if (...) {...}`.
        if re.sub(whitespace, '', function[0]) in keywords:
            continue
        # Skip bodies that contain nothing but delimiters.
        if not function[2].replace('{','').replace('}','').replace(':','').replace('\n','').strip():
            continue
        if not _has_docstring_text(function[3]):
            continue

        functions_data.append({
            'name': function[0],
            'params': function[1],
            'body': function[2],
            'docstring': function[3],
            'file_id': example['hexsha'],
            'language': lang,
        })
    return functions_data

In [None]:
%%time
import pandas as pd
import ast

def extract_python_functions(code, error_counter, hexsha):
    """Collect every plain `def` in a Python source string via the `ast` module.

    Args:
        code: Python source text.
        error_counter: running count of files that could not be processed.
        hexsha: identifier of the source file, copied into every row; falsy
            values are passed through unchanged.

    Returns:
        (functions, error_counter). `functions` is a list of
        [name, params, body, docstring, function_code, hexsha] rows, where
        `params` is the string form of the list of positional parameter
        names, `body` is the unparsed statements joined by newlines,
        `docstring` may be None, and `function_code` is the original source
        segment (None when it cannot be recovered). `error_counter` is
        incremented by one when parsing or unparsing fails; rows gathered
        before the failure are still returned.
    """
    functions = []
    try:
        code = _drop_unencodable(code)
        # The identifier is the same for every function in the file, so it is
        # cleaned once instead of once per function.
        if hexsha:
            hexsha = _drop_unencodable(hexsha)

        tree = ast.parse(code)
        for node in ast.walk(tree):
            if not isinstance(node, ast.FunctionDef):
                continue

            function_name = _drop_unencodable(node.name)
            params = str([_drop_unencodable(arg.arg) for arg in node.args.args])

            # get_source_segment returns None when location info is missing.
            segment = ast.get_source_segment(code, node)
            function_code = _drop_unencodable(segment) if segment is not None else None

            docstring = ast.get_docstring(node)
            if docstring:
                docstring = _drop_unencodable(docstring)

            body = _drop_unencodable("\n".join(ast.unparse(stmt) for stmt in node.body))

            functions.append([
                function_name,
                params,
                body,
                docstring,
                function_code,
                hexsha,
            ])
    # RecursionError: very deeply nested sources can exhaust the stack in
    # ast.parse / ast.unparse; count the file as failed instead of aborting.
    except (SyntaxError, ValueError, UnicodeEncodeError, UnicodeDecodeError, RecursionError):
        error_counter += 1
    return functions, error_counter


def _drop_unencodable(text):
    """Return `text` without characters that cannot be encoded as UTF-8."""
    return text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')

        
def handle_py_data(example, df, py_error_count):
    """Turn one Python dataset record into a list of function rows.

    Records rejected by `filter_function` yield an empty list and leave the
    error count as it was. Otherwise the record's 'content' is handed to
    `extract_python_functions` and every returned row is converted into a
    dict with the keys 'name', 'params', 'body', 'docstring', 'file_id' and
    'language' (always 'Python'). `df` is not used.

    Returns:
        (functions_data, py_error_count)
    """
    if not filter_function(example):
        return [], py_error_count

    file_sha = example['hexsha']
    rows, py_error_count = extract_python_functions(example['content'], py_error_count, file_sha)

    functions_data = [
        {
            'name': row[0],
            'params': row[1],
            'body': row[2],
            'docstring': row[3],
            'file_id': row[5],
            'language': 'Python',
        }
        for row in rows
        if row
    ]
    return functions_data, py_error_count

CPU times: total: 0 ns
Wall time: 0 ns


In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it ends up in saved outputs
# and version control. The token is read from the HF_TOKEN environment
# variable instead; when the variable is unset, None is passed so that
# login() can ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\User\.cache\huggingface\token
Login successful


In [26]:
def list_of_dicts_to_dict_of_lists(data_list):
    """Pivot a sequence of row dicts into a single dict of column lists.

    Each key maps to the list of values seen for it, in input order. Keys
    appear in the result in the order they are first encountered. A key
    missing from some rows simply ends up with a shorter list.
    """
    columns = {}
    for record in data_list:
        for field, item in record.items():
            columns.setdefault(field, []).append(item)
    return columns

In [None]:
%%time 

# 544 minutes +

import json
import time
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi, HfFolder

# Number of extracted rows buffered before a split is pushed to the Hub.
chunk_size = 1000000
hf_repo_id = "JanDkff/TinyFuncData-new"
api = HfApi()
token = HfFolder.get_token()

#expected_features = Features({
#    'name': Value(dtype='string'),
#    'params': Value(dtype='string'),
#    'body': Value(dtype='string'),
#    'docstring': Value(dtype='string'),
#    'file_id': Value(dtype='string'),
#    'language': Value(dtype='string')
#})


processed_data = []

for language in regex.keys():
    # `pattern` is a notebook-level name: extract_functions reads it.
    pattern = regex[language]
    if not pattern:
        # Languages registered with an empty pattern (Python) cannot be
        # handled by the regex extractor; they are processed elsewhere.
        continue

    # Directory and split names differ only in how '#' is spelled.
    lang_slug = language.lower().replace("++","pp")
    data_dir = f'data/{lang_slug.replace("#","-sharp")}'
    split_suffix = lang_slug.replace("#","sharp")

    i = 0  # records seen for this language
    j = 0  # index of the next split to push for this language
    start_time = time.time()
    dataset = load_dataset('bigcode/the-stack-dedup', data_dir=data_dir, streaming=True, split="train")
    for example in dataset:
        # Progress line every 50000 records, with the latest row as a sample.
        if not i % 50000:
            current_time = time.time()
            print(i, language, round(current_time - start_time, 2), flush=True)
            if processed_data: print(processed_data[-1])
            #start_time = current_time
        i += 1

        processed_data += handle_data(example, dataset)
        if len(processed_data) >= chunk_size:
            temp_dataset = Dataset.from_dict(list_of_dicts_to_dict_of_lists(processed_data))
            temp_dataset.push_to_hub(hf_repo_id, token=token, split=f'part_{j}_{split_suffix}')
            print(j)
            j += 1
            # Release the buffer and the Dataset before continuing.
            processed_data = []
            temp_dataset = None

    # Push whatever is left for this language so splits never mix languages.
    if processed_data:
        temp_dataset = Dataset.from_dict(list_of_dicts_to_dict_of_lists(processed_data))
        temp_dataset.push_to_hub(hf_repo_id, token=token, split=f'part_{j}_{split_suffix}')
        processed_data = []
        temp_dataset = None


0 PHP 6.61


KeyboardInterrupt: 

In [None]:
%%%%%%%%%%%%%%%%%%%%%%%%

In [118]:
# Inspect how many extracted rows are currently buffered in `processed_data`.
len(processed_data)

928043

In [66]:
%%time
import json
import time
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import load_dataset

# Define the chunk size (rows buffered before a Parquet write)
chunk_size = 1000
py_error_count = 0

# Process the dataset in chunks
i = 0

start_time = time.time()
processed_data = []

# Initialize ParquetWriter
parquet_file = 'new_dataset.parquet'
schema = None
writer = None

# Load the Python split explicitly. Iterating a `dataset` variable left over
# from another cell only works by accident and fails on a fresh kernel.
language = 'Python'
dataset = load_dataset('bigcode/the-stack-dedup', data_dir=f'data/{language.lower()}', streaming=True, split="train")

try:
    for example in dataset:
        i += 1

        # Progress line every 50000 records; the time shown is per interval
        # because start_time is reset each time.
        if not i % 50000:
            current_time = time.time()
            print(i, language, round(current_time - start_time, 2), flush=True)
            start_time = current_time

        data, py_error_count = handle_py_data(example, dataset, py_error_count)
        processed_data += data
        if len(processed_data) >= chunk_size:
            # Convert the processed data to a DataFrame
            df = pd.DataFrame(processed_data)
            table = pa.Table.from_pandas(df)

            # The writer is created lazily from the first chunk's schema.
            if schema is None:
                schema = table.schema
                writer = pq.ParquetWriter(parquet_file, schema)

            # Write the Table to the Parquet file
            writer.write_table(table)

            # Clear the list to free memory
            processed_data.clear()

    # Write any remaining data
    if processed_data:
        df = pd.DataFrame(processed_data)
        table = pa.Table.from_pandas(df)
        if writer is None:
            writer = pq.ParquetWriter(parquet_file, table.schema)
        writer.write_table(table)

finally:
    # Close the ParquetWriter even when the loop is interrupted, so the
    # file footer is written.
    if writer:
        writer.close()




50000 Python 112.42




100000 Python 113.46
150000 Python 111.03
200000 Python 107.22




250000 Python 109.57
300000 Python 110.75
350000 Python 110.36




400000 Python 106.73
450000 Python 107.39
500000 Python 108.61
550000 Python 111.51
600000 Python 112.34




650000 Python 108.33
700000 Python 113.74
750000 Python 113.31
800000 Python 112.84
850000 Python 108.3




900000 Python 110.07
950000 Python 110.46
1000000 Python 107.96




1050000 Python 109.81
1100000 Python 105.67




1150000 Python 112.18
1200000 Python 112.89
1250000 Python 108.07




1300000 Python 106.99
1350000 Python 109.13




1400000 Python 110.27
1450000 Python 109.51




1500000 Python 110.96
1550000 Python 105.02
1600000 Python 109.94




1650000 Python 109.06
1700000 Python 109.02
1750000 Python 111.05
1800000 Python 110.45
1850000 Python 110.78
1900000 Python 112.92
1950000 Python 110.49
2000000 Python 108.53
2050000 Python 109.6
2100000 Python 122.0
2150000 Python 111.73




2200000 Python 104.88
2250000 Python 112.82
2300000 Python 112.08
2350000 Python 110.28
2400000 Python 107.88
2450000 Python 112.52




2500000 Python 110.84
2550000 Python 110.52




2600000 Python 109.05
2650000 Python 106.38




2700000 Python 113.86
2750000 Python 158.68
2800000 Python 127.1
2850000 Python 125.53




2900000 Python 128.04
2950000 Python 122.62
3000000 Python 137.52
3050000 Python 119.97




3100000 Python 140.13
3150000 Python 113.81
3200000 Python 135.32
3250000 Python 128.42
3300000 Python 121.91
3350000 Python 125.39




3400000 Python 132.49
3450000 Python 140.75
3500000 Python 107.88
3550000 Python 135.39
3600000 Python 118.67
3650000 Python 135.53
3700000 Python 121.39
3750000 Python 125.16
3800000 Python 140.42
3850000 Python 113.3
3900000 Python 135.37




3950000 Python 115.77
4000000 Python 147.93
4050000 Python 115.85
4100000 Python 149.17
4150000 Python 118.62
4200000 Python 125.91
4250000 Python 130.96




4300000 Python 122.35




4350000 Python 139.77
4400000 Python 109.23
4450000 Python 136.21




4500000 Python 123.3
4550000 Python 143.13
4600000 Python 117.28
4650000 Python 136.15
4700000 Python 137.12
4750000 Python 119.4




4800000 Python 145.72
4850000 Python 114.03
4900000 Python 143.69
4950000 Python 107.49
5000000 Python 145.56
5050000 Python 114.94
5100000 Python 127.37
5150000 Python 124.36
5200000 Python 122.59
5250000 Python 136.42
5300000 Python 112.12




5350000 Python 154.65




5400000 Python 110.18
5450000 Python 135.95




5500000 Python 117.71




5550000 Python 130.35
5600000 Python 128.75
5650000 Python 133.12
5700000 Python 131.42
5750000 Python 118.59
5800000 Python 138.8
5850000 Python 112.78
5900000 Python 141.98




5950000 Python 121.17
6000000 Python 141.71
6050000 Python 132.34
6100000 Python 115.58




6150000 Python 134.99
6200000 Python 121.81
6250000 Python 132.51
6300000 Python 126.76
6350000 Python 137.71




6400000 Python 121.74
6450000 Python 135.33
6500000 Python 128.28
6550000 Python 118.74
6600000 Python 124.59
6650000 Python 121.14
6700000 Python 148.52
6750000 Python 109.6
6800000 Python 130.14
6850000 Python 123.9
6900000 Python 132.76
6950000 Python 137.17
7000000 Python 119.07
7050000 Python 134.73
7100000 Python 117.67
7150000 Python 145.52
7200000 Python 111.03
7250000 Python 137.94
7300000 Python 123.76
7350000 Python 125.26
7400000 Python 131.06
7450000 Python 123.05




7500000 Python 137.31
7550000 Python 115.85
7600000 Python 143.64
7650000 Python 110.36
7700000 Python 138.43
7750000 Python 126.85




7800000 Python 120.38
7850000 Python 137.7




7900000 Python 114.32
7950000 Python 142.99
8000000 Python 114.05




8050000 Python 134.65
8100000 Python 107.42
8150000 Python 135.13




8200000 Python 125.09
8250000 Python 133.98
8300000 Python 119.12
8350000 Python 127.83
8400000 Python 137.83
8450000 Python 113.79
8500000 Python 134.83
8550000 Python 111.05
8600000 Python 139.48
8650000 Python 124.33
8700000 Python 137.13
8750000 Python 127.04
8800000 Python 119.92




8850000 Python 141.56
8900000 Python 117.42




8950000 Python 145.83
9000000 Python 110.07
9050000 Python 150.45
9100000 Python 118.33
9150000 Python 133.33
9200000 Python 122.32
9250000 Python 121.18




9300000 Python 134.85




9350000 Python 112.6
9400000 Python 132.98
9450000 Python 109.64
9500000 Python 152.15
9550000 Python 117.24
9600000 Python 140.98
9650000 Python 125.42
9700000 Python 123.59
9750000 Python 131.7
9800000 Python 110.13
9850000 Python 129.8
9900000 Python 115.29
9950000 Python 144.69
10000000 Python 125.78
10050000 Python 131.93
10100000 Python 133.74




10150000 Python 126.97
10200000 Python 132.13
10250000 Python 121.82




10300000 Python 147.71
10350000 Python 118.13




10400000 Python 144.34




10450000 Python 118.65




10500000 Python 130.15
10550000 Python 136.96
10600000 Python 120.42
10650000 Python 132.2
10700000 Python 113.55
10750000 Python 139.06




10800000 Python 114.11




10850000 Python 141.17
10900000 Python 120.65
10950000 Python 137.4
11000000 Python 143.84
11050000 Python 117.94
11100000 Python 141.41
11150000 Python 108.72




11200000 Python 140.29
11250000 Python 113.46
11300000 Python 140.7
11350000 Python 115.85
11400000 Python 135.64
11450000 Python 123.51
11500000 Python 131.2
11550000 Python 139.45
11600000 Python 111.62
11650000 Python 138.88
11700000 Python 114.44
11750000 Python 137.55
11800000 Python 117.16
11850000 Python 133.55
11900000 Python 125.44




11950000 Python 131.14
12000000 Python 134.52
12050000 Python 118.57
12100000 Python 141.5
12150000 Python 115.03
12200000 Python 145.17
12250000 Python 115.84
12300000 Python 139.55
12350000 Python 122.93
12400000 Python 128.25
12450000 Python 136.41
12500000 Python 111.21
12550000 Python 154.91
12600000 Python 117.98
12650000 Python 141.88
12700000 Python 122.62




12750000 Python 139.27
12800000 Python 134.4




12850000 Python 140.92
12900000 Python 144.16
12950000 Python 143.68
CPU times: total: 7h 29min 58s
Wall time: 8h 59min 4s


In [28]:
%%time
import json
import time
from datasets import load_dataset


# Define the chunk size (rows buffered before appending to the JSON-lines file)
chunk_size = 1000
py_error_count = 0

# Process the dataset in chunks
i = 0

start_time = time.time()
processed_data = []

for language in ["Shell","Python"]:
    # `pattern` is a notebook-level name: extract_functions reads it.
    pattern = regex[language]
    if not pattern:
        # No regex is registered for this language (Python), so the regex
        # extractor has nothing to match with; skip it.
        continue

    dataset = load_dataset('bigcode/the-stack-dedup', data_dir=f'data/{language.lower().replace("#","-sharp").replace("++","pp")}', streaming=True, split="train")
    for example in dataset:
        # Progress line every 50000 records; time shown is per interval.
        if not i%50000:
            current_time = time.time()
            print(i, language, round(current_time - start_time, 2), flush=True)
            start_time = current_time
        i += 1

        processed_data += handle_data(example, dataset)
        if len(processed_data) >= chunk_size:
            # Save the data to a file
            with open('new_dataset.json', 'a') as f:
                for item in processed_data:
                    f.write(json.dumps(item) + '\n')

            # Clear the list to free memory
            processed_data.clear()

    # Save any remaining data
    if processed_data:
        with open('new_dataset.json', 'a') as f:
            for item in processed_data:
                f.write(json.dumps(item) + '\n')

        # Without this the leftover rows stay buffered and are written a
        # second time together with the next language's first chunk.
        processed_data.clear()

KeyboardInterrupt: 

In [None]:
['Python' 'PHP' 'JavaScript' 'C' 'C++' 'Ruby']
55598072

In [65]:
# Display the Dataset currently bound to `temp_dataset`.
temp_dataset

Dataset({
    features: ['name', 'params', 'body', 'docstring', 'file_id', 'language'],
    num_rows: 2500003
})

In [67]:
            # Build a Dataset from the rows buffered in `processed_data`, push it
            # to the Hub as split "part_<j>", then print and advance `j`.
            # NOTE(review): relies on `processed_data`, `hf_repo_id`, `token` and
            # `j` already existing in the kernel; the uniform indentation suggests
            # this was pasted from a loop body - confirm before re-running.
            temp_dataset = Dataset.from_dict(list_of_dicts_to_dict_of_lists(processed_data))
            temp_dataset.push_to_hub(hf_repo_id, token=token, split=f"part_{j}")
            print(j)
            j += 1

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1251 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1251 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

30


In [69]:
    # Empty the row buffer and drop the reference to the last Dataset.
    processed_data = []
    temp_dataset = None

In [73]:
# Display the current value of the split counter `j`.
j

37