In [18]:
%%time
import pandas as pd
import ast

data = pd.read_parquet("data/Python-data.parquet")
df = pd.DataFrame(data)

# Create a new DataFrame to store the extracted functions
functions_data = []
error_counter = 0

# Function to extract function definitions from Python code
def extract_functions(code, error_counter):
    functions = []
    try:
        # Parse the code into an AST
        tree = ast.parse(code)
        # Iterate over all nodes in the AST
        for node in ast.walk(tree):
            # Check if the node is a function definition
            if isinstance(node, ast.FunctionDef):
                # Extract the function name and its definition
                function_name = node.name
                params = [arg.arg for arg in node.args.args]
                function_code = ast.get_source_segment(code, node)
                # Check for a docstring
                docstring = ast.get_docstring(node)
                functions.append({
                    'name': function_name,
                    'full_code': function_code,
                    'body': function_code.split(r"):")[1] if r"):" in x else None ,
                    'params':params,
                    'docstring': docstring
                })
    except SyntaxError as e:
        print(f"Syntax error in code: {e}")
        error_counter += 1
    return functions, error_counter


# Iterate through each row in the dataset and extract functions
for index, row in df.iterrows():
    code_content = row['content']  # Replace with the actual column name containing code
    extracted_functions, error_counter = extract_functions(code_content, error_counter)
    for func in extracted_functions:
        functions_data.append({
            'file_id': row['__index_level_0__'],  # Replace with the actual identifier for each code file
            'name': func['name'],
            'body': func['body'],
            'full_code': func['full_code'],
            'docstring': func['docstring'],
            'params':func['params'],
            'language':'Python',
        })
    if not index % 500:
        print(index, " ", error_counter)

# Convert the list of extracted functions to a DataFrame
functions_df = pd.DataFrame(functions_data)

functions_df.to_csv("data/python-extracted.csv")
functions_df

0   0
Syntax error in code: Missing parentheses in call to 'print'. Did you mean print(...)? (<unknown>, line 28)


IndexError: list index out of range