In [1]:
import pandas as pd
import regex as re

# Per-language patterns for the third-party `regex` module. They rely on the
# recursive subpattern calls `(?&paren)` / `(?&brace)` to match balanced
# parentheses and braces, which the standard-library `re` module does not
# support. Every pattern exposes the same four named groups, which
# extract_functions() reads:
#   doc   - optional comment block directly preceding the definition
#   name  - the definition header up to the parameter list
#   paren - the balanced, parenthesised parameter list
#   brace - the balanced, brace-delimited body
regex = {
    "TypeScript": r'(?<doc>(?:\/\*\*(.|\n)*?\*\/|(\/\/.*?\n)*)?)\s*(?<name>function\s+\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})',
    "Shell":      r"(?<doc>(\#.*?)*)?\s*(?<name>\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})",
    "PHP":        r'(?<doc>(?:\/\*\*(.|\n)*?\*\/|(\/\/.*?)*)?)\s*(?<name>(public|protected|private)*\s*function\s+\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})',
    "JavaScript": r'(?<doc>(?:\/\*\*(.|\n)*?\*\/|(\/\/.*?\n)*)?)\s*(?<name>function\s+\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})',
    "C":          r"(?<doc>(?:\/\*(.|\n)*?\*\/|(\/\/.*?\n)*)?)\s*(?<name>(?:public|private|protected)?\s*(?:static)?\s*\w+\s+\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})",
    "C++":        r"(?<doc>(?:\/\*(.|\n)*?\*\/|(\/\/.*?\n)*)?)\s*(?<name>(?:public|private|protected)?\s*(?:static)?\s*\w+\s+\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})",
    "C#":         r"(?<doc>(?:\/\/\/.*?\s*)+)\s*\s*(?<name>(?:public|private|protected|internal|file|protected internal|private protected)?\s*(?:static)?\s*\w+\s+\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})",
    "Java":       r"(?<doc>(?:\/\*\*(.|\n)*?\*\/|(\/\/.*?\n)*)?)\s*(?<name>(?:public|private|protected)?\s*(?:static)?\s*\w+\s+\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})",
    # Ruby bodies are not brace-delimited, so this pattern is applied to
    # sources that Ruby_hotfix() has already rewritten with synthetic braces.
    # The doc group must be spelled `(?<doc>...)`: the previous `(?(<doc>...)`
    # is conditional-group syntax, fails to compile, and defines no `doc` group.
    "Ruby":       r'(?<doc>(\#.*?)*)?\s*(?<name>def\s+\w+)\s*(?<paren>\((?:[^()]+|(?&paren))*\))\s*(?<brace>\{(?:[^{}]+|(?&brace))*\})',
}

# Control-flow words that the generic `name (...) {...}` shape also matches.
# The extraction loop removes all whitespace from a matched name before the
# lookup, which is why the fused spelling "elseif" is listed.
keywords = ["if", "while", "do", "else", "switch", "elseif", "elif", "match", "for", "try", "catch"]

In [351]:
def add_curly_brace(match):
    """Return the first capture group of ``match`` followed by ``" {"``.

    Written as a substitution callback: ``Ruby_hotfix`` passes it to
    ``ruby_regex.sub`` so each matched ``def`` signature gains an opening
    brace.
    """
    captured = match.group(1)
    return captured + " {"

def Ruby_hotfix(col):
    """Rewrite Ruby sources so the brace-based extraction pattern can match.

    Two passes run over every value of ``col``:

    1. each text matched by ``ruby_regex`` gets `` {`` appended
       (via ``add_curly_brace``);
    2. the text is split on whitespace, every token found in ``word_map`` is
       swapped for its mapped value, and the tokens are re-joined with single
       spaces. Original line breaks and indentation are therefore lost.

    Returns the transformed column; ``col`` itself is not modified here.
    """
    def _open_def_bodies(source):
        return ruby_regex.sub(add_curly_brace, source)

    def _tag_block_keywords(source):
        return ' '.join(word_map.get(token, token) for token in source.split())

    with_def_braces = col.apply(_open_def_bodies)
    return with_def_braces.map(_tag_block_keywords)


def reverse_Ruby_hotfix(functions, reverse_map=None):
    """Undo the Ruby_hotfix() rewriting on extracted records, in place.

    Args:
        functions: list of mutable records shaped like the
            ``[name, params, body, docstring]`` lists that
            extract_functions() builds, or a falsy value (``None`` / ``[]``).
        reverse_map: optional mapping from brace-tagged token to original
            token. Defaults to the module-level ``reverse_word_map``.

    Returns:
        The same ``functions`` object that was passed in. Each record has the
        tagged tokens in its body (index 2) and last element restored, and the
        synthetic opening brace removed from the start of the body.
    """
    if not functions:
        return functions
    if reverse_map is None:
        reverse_map = reverse_word_map

    def _restore(text):
        # Token-wise reverse lookup; like the forward pass, this collapses
        # every run of whitespace to a single space.
        return ' '.join(reverse_map.get(word, word) for word in text.split())

    for func in functions:
        func[-1] = _restore(func[-1])
        func[2] = _restore(func[2])

        # The synthetic "{" added after the def signature is captured as the
        # first character of the body, so that is the only place it is
        # removed. The last element is the leading comment block for these
        # records; it never receives a synthetic brace, and deleting its first
        # "{" would corrupt text such as "#{var}".
        # startswith() also tolerates an empty body instead of raising.
        if func[2].startswith("{"):
            func[2] = func[2][1:]

    return functions


# Whitespace-delimited Ruby tokens and the brace-tagged spelling that
# Ruby_hotfix() substitutes for them: block openers gain a trailing "{" and
# "end" gains a trailing "}".
word_map = dict([
    ("do", "do{"),
    ("if", "if{"),
    ("unless", "unless{"),
    ("case", "case{"),
    ("while", "while{"),
    ("until", "until{"),
    ("end", "end}"),
    ("class", "class{"),
    ("module", "module{"),
])

# Inverse lookup (tagged token -> original token) used when undoing the hotfix.
reverse_word_map = dict(zip(word_map.values(), word_map.keys()))

# Captures `def <name>(<params>)`; the parameter list may not contain nested
# parentheses. A `def` with no parenthesised parameter list does not match.
ruby_regex = re.compile(r'(def\s+\w+\s*(?:\([^()]*\)))')

In [355]:
%%time
import regex as re

# Function to extract function information
# Comment delimiters removed before deciding whether a doc block holds any
# text. Longer markers come first so that e.g. "///" is removed as a whole
# rather than leaving a stray "/" behind after "//" is stripped.
_DOC_MARKERS = ('"""', "'''", "///", "//*", "//", "/*", "*/", "#")


def _has_doc_text(docstring):
    """Return True when ``docstring`` contains something besides comment syntax."""
    if not docstring:
        return False
    text = docstring
    for marker in _DOC_MARKERS:
        text = text.replace(marker, "")
    # Leftover "*" (from "/**" openers or " * " continuation lines) is
    # decoration, not documentation.
    return bool(text.strip(" \t\r\n\x0b\x0c*"))


def extract_functions(code, regex_pattern=None, max_body_len=1000):
    """Extract documented function definitions from ``code``.

    Args:
        code: source text to scan.
        regex_pattern: pattern exposing the named groups ``doc``, ``name``,
            ``paren`` and ``brace``. When omitted, the module-level global
            ``pattern`` set by the driver loop is used.
        max_body_len: matches whose stripped body is longer than this many
            characters are skipped.

    Returns:
        A list of ``[name, parameters, body, docstring]`` lists, one per
        match that has a non-trivial body and a doc block containing real
        text. ``body`` is stripped of surrounding whitespace; the other
        fields are the raw group text.
    """
    active_pattern = pattern if regex_pattern is None else regex_pattern

    functions = []
    for match in re.finditer(active_pattern, code):
        docstring = match.group("doc")
        name = match.group("name")
        parameters = match.group("paren")
        body = match.group("brace").strip()

        if len(body) > max_body_len:
            continue
        # Skip bodies that are nothing but braces, colons and whitespace.
        if not body.replace('{', '').replace('}', '').replace(':', '').replace('\n', '').strip():
            continue
        # Skip undocumented functions and doc blocks made only of comment
        # markers. (The former one-line check called .replace() on a bool and
        # raised AttributeError for every non-empty doc block.)
        if not _has_doc_text(docstring):
            continue

        functions.append([name, parameters, body, docstring])

    return functions

# Collect one record per extracted function, across every configured language.
functions_data = []
whitespace = re.compile(r'\s+')

print("starting...", flush=True)
# `pattern` is deliberately a module-level name: extract_functions() reads it
# as a global when called with only the source text.
for language, pattern in regex.items():
    data = pd.read_parquet(f"data/{language}-data.parquet")
    df = pd.DataFrame(data)
    is_ruby = language == "Ruby"

    if is_ruby:
        # Ruby sources are rewritten with synthetic braces before matching.
        df['content'] = Ruby_hotfix(df['content'])

    for index, row in df.iterrows():
        source_text = row['content']
        # Very large files are skipped entirely (and print no progress line).
        if len(source_text) > 20000:
            continue
        if index % 500 == 0:
            print(language, index)

        functions = extract_functions(source_text)
        if is_ruby:
            functions = reverse_Ruby_hotfix(functions)

        # Drop matches whose whitespace-free name is a control-flow keyword.
        functions_data.extend(
            {
                'name': function[0],
                'params': function[1],
                'body': function[2],
                'docstring': function[3],
                'file_id': row['__index_level_0__'],
                'language': language,
            }
            for function in (functions or [])
            if whitespace.sub('', function[0]) not in keywords
        )

starting...
Ruby 0
Ruby 500
Ruby 1000
Ruby 1500
Ruby 2000
Ruby 2500
Ruby 3000
Ruby 3500
Ruby 4000
Ruby 4500
Ruby 5000
Ruby 5500
Ruby 6000
Ruby 6500
Ruby 7000
Ruby 7500
Ruby 8000
Ruby 8500
Ruby 9000
Ruby 9500
PHP 0
PHP 500
PHP 1000
PHP 1500
PHP 2000
PHP 2500
PHP 3000
PHP 3500
PHP 4000
PHP 4500
PHP 5000
PHP 5500
PHP 6000
PHP 6500
PHP 7000
PHP 7500
PHP 8500
PHP 9000
PHP 9500
JavaScript 0
JavaScript 500
JavaScript 1000
JavaScript 1500
JavaScript 2000
JavaScript 3000
JavaScript 3500
JavaScript 4000
JavaScript 4500
JavaScript 5000
JavaScript 5500
JavaScript 6500
JavaScript 7000
JavaScript 7500
JavaScript 8000
JavaScript 8500
JavaScript 9000
JavaScript 9500
C 0
C 500
C 1000
C 1500
C 2500
C 3000
C 4000
C 4500
C 5000
C 5500
C 6000
C 6500
C 7500
C 8000
C 8500
C 9000
C 9500
C++ 0
C++ 500
C++ 1000
C++ 1500
C++ 2000
C++ 2500
C++ 3000
C++ 3500
C++ 4000
C++ 4500
C++ 5000
C++ 5500
C++ 6000
C++ 6500
C++ 7000
C++ 7500
C++ 8000
C++ 8500
C++ 9000
C++ 9500
C# 0
C# 500
C# 1000
C# 1500
C# 2000
C# 2500
C# 300

In [357]:
# Build a single DataFrame from the collected per-function records and write
# it out as parquet.
# NOTE(review): `new_dataset` is not assigned in any cell visible here —
# confirm it is defined before this cell runs on a fresh kernel.
new_df = pd.DataFrame(functions_data)
new_df.to_parquet(new_dataset)

Unnamed: 0,name,params,body,docstring,full_code,file_id,language
0,def trigger_transaction,(txn_name),TestWidget.new.run_transaction(txn_name) end,# encoding: utf-8 # This file is distributed u...,# encoding: utf-8 # This file is distributed u...,220001,Ruby
1,def trigger_transaction_with_error,"(txn_name, error_msg)",TestWidget.new.run_transaction(txn_name) do N...,,"def trigger_transaction_with_error(txn_name, e...",220001,Ruby
2,def trigger_transaction_with_slow_sql,(txn_name),TestWidget.new.run_transaction(txn_name) do s...,,def trigger_transaction_with_slow_sql(txn_name...,220001,Ruby
3,def run_transaction,(txn_name),NewRelic::Agent.set_transaction_name(txn_name...,,def run_transaction(txn_name) NewRelic::Agent...,220001,Ruby
4,def post_install,"(var/""lib/arangodb3"")",".mkpath (var/""log/arangodb3"").mkpath end",#{share} -DCMAKE_INSTALL_DATAROOTDIR=#{share} ...,#share} -DCMAKE_INSTALL_DATAROOTDIR=#{share} -...,220005,Ruby
...,...,...,...,...,...,...,...
162023,build_web,(),"{\n echo ""Building web..""\n rm cc.js\n $WEB...",,"\n\nbuild_web() {\n echo ""Building web..""\n ...",259986,Shell
162024,build_rpi,(),"{\n echo ""Building rpi..""\n cp $ROOT_DIR/mis...",,"\n\nbuild_rpi() {\n echo ""Building rpi..""\n ...",259986,Shell
162025,build_android,(),"{\n echo ""Building android..""\n rm cc.apk cc...",,"\n \nbuild_android() {\n echo ""Building andr...",259986,Shell
162026,run_timed,(),{\n beg=`date +%s`\n $1\n end=`date +%s`\n\...,,\n\nrun_timed() {\n beg=`date +%s`\n $1\n e...,259986,Shell


In [359]:
# Load the Python function table from CSV and display it.
# NOTE(review): the file name suggests it was produced by a separate
# extraction step that is not shown in this notebook — confirm provenance.
python_df = pd.read_csv("data/Python-extracted.csv")
python_df

Unnamed: 0.1,Unnamed: 0,file_id,name,body,full_code,docstring,params,language
0,0,210001,__init__,\n pass,def __init__(self):\n pass,,['self'],Python
1,1,210001,train_model,\n if os.path.exists(config_path) and (...,"def train_model(self, config_path):\n i...",,"['self', 'config_path']",Python
2,2,210002,_canonicalize_experiment,"\n """"""Sorts the repeated fields of an Exper...","def _canonicalize_experiment(exp):\n """"""Sor...",Sorts the repeated fields of an Experiment mes...,['exp'],Python
3,3,210002,setUp,\n self._mock_tb_context = base_plugin....,def setUp(self):\n self._mock_tb_contex...,,['self'],Python
4,4,210002,_mock_all_summary_metadata,\n result = {}\n hparams_content...,def _mock_all_summary_metadata(self):\n ...,,['self'],Python
...,...,...,...,...,...,...,...,...
81761,81761,219998,test_find_start_codons,"\n """"""New test to test the function to find...","def test_find_start_codons():\n """"""New test...",New test to test the function to find start co...,[],Python
81762,81762,219998,test_transcribe,\n assert DNA('GTC').transcribe() == 'GAC'\...,def test_transcribe():\n assert DNA('GTC')....,,[],Python
81763,81763,219998,test_is_gc_rich,\n assert DNA('GTGT').gc_content() == 0.5,def test_is_gc_rich():\n assert DNA('GTGT')...,,[],Python
81764,81764,219998,test_gc_content,\n assert DNA('ATTTATGGCC').gc_content == 0...,def test_gc_content():\n assert DNA('ATTTAT...,,[],Python


In [361]:
# Stack the regex-extracted rows on top of the Python rows. `ignore_index` is
# left at its default, so each frame keeps its own index labels and the
# combined index is not unique.
combined_df = pd.concat([new_df, python_df])

In [363]:
# Number of extracted functions per language in the combined frame.
combined_df['language'].value_counts()

language
Python        81766
Java          48353
C#            28134
PHP           22561
C             19669
C++           16769
JavaScript     9251
Ruby           8103
Shell          7114
TypeScript     2074
Name: count, dtype: int64

In [365]:
# Persist the combined frame as CSV. With to_csv's defaults the index is
# written too, as an unnamed first column.
combined_df.to_csv("data/combined.csv")

In [369]:
%%time
import regex as re
import pandas as pd


print("starting...", flush=True)
data = pd.read_parquet(f"data/Ruby-data.parquet")
df = pd.DataFrame(data)

# Total occurrences of the literal " def " across all Ruby file contents
# (presumably a rough baseline to compare with the number of Ruby functions
# actually extracted — confirm intent).
count = sum(source.count(" def ") for source in df['content'])

count

starting...
CPU times: total: 422 ms
Wall time: 412 ms


43484