In [23]:
%%time
import pandas as pd
import ast

data = pd.read_parquet("data/Python-data.parquet")
df = pd.DataFrame(data)

# Create a new DataFrame to store the extracted functions
functions_data = []
error_counter = 0

# Function to extract function definitions from Python code
def extract_functions(code, error_counter):
    functions = []
    try:
        # Parse the code into an AST
        tree = ast.parse(code)
        # Iterate over all nodes in the AST
        for node in ast.walk(tree):
            # Check if the node is a function definition
            if isinstance(node, ast.FunctionDef):
                # Extract the function name and its definition
                function_name = node.name
                params = [arg.arg for arg in node.args.args]
                function_code = ast.get_source_segment(code, node)
                # Check for a docstring
                docstring = ast.get_docstring(node)
                functions.append({
                    'name': function_name,
                    'full_code': function_code,
                    'body': function_code.split(r"):")[1] if r"):" in function_code else None ,
                    'params':params,
                    'docstring': docstring
                })
    except SyntaxError as e:
        print(f"Syntax error in code: {e}")
        error_counter += 1
    return functions, error_counter


# Iterate through each row in the dataset and extract functions
for index, row in df.iterrows():
    code_content = row['content']  # Replace with the actual column name containing code
    extracted_functions, error_counter = extract_functions(code_content, error_counter)
    for func in extracted_functions:
        functions_data.append({
            'file_id': row['__index_level_0__'],  # Replace with the actual identifier for each code file
            'name': func['name'],
            'body': func['body'],
            'full_code': func['full_code'],
            'docstring': func['docstring'],
            'params':func['params'],
            'language':'Python',
        })
    if not index % 500:
        print(index, " ", error_counter)

# Convert the list of extracted functions to a DataFrame
functions_df = pd.DataFrame(functions_data)

functions_df.to_csv("data/python-extracted.csv")
functions_df

0   0
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 28)
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 138)
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 9)
Syntax error in code: unexpected indent (<unknown>, line 23)
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 27)
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 174)
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 26)
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 39)
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 82)
Syntax error in code: invalid syntax (<unknown>, line 8)
Syntax erro

Unnamed: 0,file_id,name,body,full_code,docstring,params,language
0,210001,__init__,\n pass,def __init__(self):\n pass,,[self],Python
1,210001,train_model,\n if os.path.exists(config_path) and (...,"def train_model(self, config_path):\n i...",,"[self, config_path]",Python
2,210002,_canonicalize_experiment,"\n """"""Sorts the repeated fields of an Exper...","def _canonicalize_experiment(exp):\n """"""Sor...",Sorts the repeated fields of an Experiment mes...,[exp],Python
3,210002,setUp,\n self._mock_tb_context = base_plugin....,def setUp(self):\n self._mock_tb_contex...,,[self],Python
4,210002,_mock_all_summary_metadata,\n result = {}\n hparams_content...,def _mock_all_summary_metadata(self):\n ...,,[self],Python
...,...,...,...,...,...,...,...
81761,219998,test_find_start_codons,"\n """"""New test to test the function to find...","def test_find_start_codons():\n """"""New test...",New test to test the function to find start co...,[],Python
81762,219998,test_transcribe,\n assert DNA('GTC').transcribe() == 'GAC'\...,def test_transcribe():\n assert DNA('GTC')....,,[],Python
81763,219998,test_is_gc_rich,\n assert DNA('GTGT').gc_content() == 0.5,def test_is_gc_rich():\n assert DNA('GTGT')...,,[],Python
81764,219998,test_gc_content,\n assert DNA('ATTTATGGCC').gc_content == 0...,def test_gc_content():\n assert DNA('ATTTAT...,,[],Python
