In [3]:
import os
import json
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

## AutoFL

In [6]:
def extract_length(trajectory):
    """Return the total number of messages in ``trajectory``."""
    message_count = len(trajectory)
    return message_count

def extract_function_call_counts(trajectory):
    """Count the messages in ``trajectory`` that actually carry a function call.

    A message is counted only when it has a ``"function_call"`` entry whose
    value is truthy. Messages where the key is present but set to ``None``
    (or an empty value) are not calls and are skipped, which matches the
    guard used by ``extract_repeated_call_counts``.

    Args:
        trajectory: iterable of message dicts.

    Returns:
        int: number of messages with a non-empty ``"function_call"`` value.
    """
    return sum(
        1 for m in trajectory
        if "function_call" in m and m["function_call"]
    )

def extract_valid_function_call_counts(trajectory):
    """Count function-role messages whose content shows no failure marker.

    A message is counted when its ``"role"`` is ``"function"`` and its
    ``"content"`` contains neither ``"error_message"`` nor
    ``"required positional argument"``.

    Args:
        trajectory: iterable of message dicts with ``"role"`` and ``"content"``.

    Returns:
        int: number of function-role messages without a failure marker.
    """
    failure_markers = ("error_message", "required positional argument")
    valid_calls = 0
    for message in trajectory:
        if message["role"] != "function":
            continue
        body = message["content"]
        # Any marker in the content disqualifies the message.
        if any(marker in body for marker in failure_markers):
            continue
        valid_calls += 1
    return valid_calls

def extract_repeated_call_counts(trajectory, only_func_name=True):
    """Count assistant function calls that repeat the immediately preceding call.

    Only assistant messages with a truthy ``"function_call"`` are considered;
    other messages in between are ignored and do not reset the comparison.

    Args:
        trajectory: iterable of message dicts. Assistant messages may carry a
            ``"function_call"`` dict with ``"name"`` and ``"arguments"``.
        only_func_name: if True, a call is a repeat when its name equals the
            previous call's name. If False, the arguments must match as well.

    Returns:
        int: number of calls that repeat the call directly before them.
    """
    prev_name = None
    prev_arguments = None
    repeat_count = 0
    for m in trajectory:
        if not (m["role"] == "assistant" and 'function_call' in m and m['function_call']):
            continue
        name = m['function_call']['name']
        arguments = m['function_call']['arguments']
        if name == prev_name and (only_func_name or arguments == prev_arguments):
            repeat_count += 1
        # Always remember the latest call. Without this, a call with the same
        # name but different arguments left stale arguments behind, so
        # f(a), f(b), f(b) was not counted and f(a), f(b), f(a) was.
        prev_name = name
        prev_arguments = arguments
    return repeat_count

def extract_content_length(trajectory):
    """Return the total character count of all non-empty assistant contents.

    Only messages whose ``"role"`` is ``"assistant"`` and whose ``"content"``
    is truthy contribute; their contents are concatenated and measured.
    """
    assistant_texts = []
    for message in trajectory:
        if message['role'] != 'assistant':
            continue
        text = message['content']
        if text:
            assistant_texts.append(text)
    return len(''.join(assistant_texts))

def is_found(buggy_methods):
    """Return True if any entry of ``buggy_methods`` has a truthy ``'is_found'``.

    Args:
        buggy_methods: mapping from a method key to a dict with an
            ``'is_found'`` entry.

    Returns:
        bool: True when at least one entry is flagged, False otherwise
        (including for an empty mapping).
    """
    found_flags = []
    for method_key in buggy_methods:
        found_flags.append(buggy_methods[method_key]['is_found'])
    return any(found_flags)

In [7]:
# Root of the raw AutoFL outputs. The loop below expects the layout
# <AUTOFL_DIR>/<run_dir>/<model_dir>/<file>, where every file is JSON.
AUTOFL_DIR = Path('raw_data/autofl')

results = list()
# os.listdir() returns entries in arbitrary order; sort at every level so the
# row order of the exported CSV is reproducible across machines and re-runs.
for run_dir in tqdm(sorted(os.listdir(AUTOFL_DIR))):
    if not os.path.isdir(AUTOFL_DIR / run_dir):
        continue
    # The run id is the integer after the last '_' in the directory name.
    run_id = int(run_dir.split('_')[-1])
    for model_dir in sorted(os.listdir(AUTOFL_DIR / run_dir)):
        assert os.path.isdir(AUTOFL_DIR / run_dir / model_dir)
        for file in sorted(os.listdir(AUTOFL_DIR / run_dir / model_dir)):
            assert 'json' in file
            # The bug id is the text between the first '-' and the next '.'.
            bug_id = file.split('-')[1].split('.')[0]
            # Close the file as soon as it is parsed.
            with open(AUTOFL_DIR / run_dir / model_dir / file) as f:
                data = json.load(f)
            # Skip files that lack a message list or a buggy-method mapping.
            if 'messages' not in data or not isinstance(data['messages'], list):
                continue
            if 'buggy_methods' not in data or not isinstance(data['buggy_methods'], dict):
                continue
            trajectory = data['messages']
            results.append({
                'bug_id': bug_id,
                'run_id': run_id,
                'model': model_dir,
                'total_messages': extract_length(trajectory),
                'function_calls': extract_function_call_counts(trajectory),
                'valid_function_calls': extract_valid_function_call_counts(trajectory),
                'function_repetitions': extract_repeated_call_counts(trajectory),
                'function_and_args_repetitions': extract_repeated_call_counts(trajectory, only_func_name=False),
                'content_length': extract_content_length(trajectory),
                'success': is_found(data['buggy_methods']),
            })

print(len(results))
df = pd.DataFrame(results)
# to_csv() does not create missing parent directories; create the output
# directory first so the export cannot fail after the whole scan has run.
os.makedirs('data', exist_ok=True)
df.to_csv('data/autofl_base.csv',index=False)

100%|██████████| 21/21 [00:18<00:00,  1.13it/s]

32308



