In [112]:
from glob import glob
import json
import os
import pandas as pd
from collections import defaultdict

In [113]:
# Question types in the order that defines their benchmark label (T1, T2, ...).
types_ordered = [
    # answers that are names
    'range+name',
    'range:non_spat_filter+name',
    'range:direction+name',
    'range:towards+name',
    'knn+name',
    'knn:non_spat_filter+name',
    'knn+name+multihop1',
    'knn+name+multihop2',
    'knn:direction+name',
    'knn:towards+name',
    'intersects:area_max+name',
    'intersects:length_max+name',
    # answers that are locations
    'range+loc',
    'range:non_spat_filter+loc',
    'range:direction+loc',
    'range:towards+loc',
    'knn+loc',
    'knn:non_spat_filter+loc',
    'knn:direction+loc',
    'knn:towards+loc',
    # answers that are angles
    'range+angle',
    'knn+angle',
    # answers that are counts
    'range+count',
    'intersects+count',
    # answers that are distances
    'range+distance',
    'knn+distance',
    # answers that are areas / lengths
    'intersects:area_total+area',
    'intersects:length_total+length',
]

# Map every question type to its label: the first entry is 'T1', the last 'T28'.
type_labels = {
    type_name: 'T%d' % position
    for position, type_name in enumerate(types_ordered, start=1)
}


In [114]:
# CSV file holding the free-text evaluation of each baseline, keyed by baseline code.
text_eval_files = {
    'G': './gpt_text_eval.csv',
    'L': './llama_text_eval.csv',
    'GT': './gpt_text2sql_text_eval.csv',
    'GR': './gpt_rag_text_eval.csv',
    'LT': 'llama_text2sql_text_eval.csv',
    'LR': 'llama_rag_text_eval.csv',
    'R': 'shuffled_text_eval.csv',
}

# One DataFrame per baseline; missing values are replaced by 0.
text = {
    baseline: pd.read_csv(csv_path).fillna(0)
    for baseline, csv_path in text_eval_files.items()
}

In [115]:
# CSV file holding the parsed-answer evaluation of each baseline, keyed by baseline code.
parsed_eval_files = {
    'G': './gpt_parsed_eval.csv',
    'L': './llama_parsed_eval.csv',
    'GT': './gpt_text2sql_parsed_eval.csv',
    'LT': './llama_text2sql_parsed_eval.csv',
    'GR': './gpt_rag_parsed_eval.csv',
    'LR': './llama_rag_parsed_eval.csv',
    'R': './shuffled_parsed_eval.csv',
}

# One DataFrame per baseline; missing values are replaced by 0.
parsed = {
    baseline: pd.read_csv(csv_path).fillna(0)
    for baseline, csv_path in parsed_eval_files.items()
}


In [116]:
def _read_answers(path):
    """Load a JSON array of answer records and index each record's 'content' by its 'id'."""
    with open(path, 'r') as file:
        return {record['id']: record['content'] for record in json.load(file)}


# Per baseline: the file with the free-text answers ('text') and the file with
# the JSON-formatted answers ('json').
answer_files = {
    'G': {'text': './answers_gpt.json', 'json': './json_answers_gpt.json'},
    'GT': {'text': './text2sql_answers_gpt.json', 'json': './text2sql_json_answers_gpt.json'},
    'GR': {'text': './rag_answers_gpt.json', 'json': './rag_json_answers_gpt.json'},
    'L': {'text': './answers_llama.json', 'json': './json_answers_llama.json'},
    'LT': {'text': './text2sql_answers_llama.json', 'json': './text2sql_json_answers_llama.json'},
    'LR': {'text': './rag_answers_llama.json', 'json': './rag_json_answers_llama.json'},
}

# answers[baseline][kind] maps an answer id to the answer content.
answers = {
    baseline: {kind: _read_answers(path) for kind, path in kinds.items()}
    for baseline, kinds in answer_files.items()
}

In [117]:
# Score columns kept for a parsed answer, chosen by the suffix of the question type.
relevant_scores = {
    'count': ['relative_error'],
    'area': ['relative_error'],
    'length': ['relative_error'],
    'distance': ['relative_error'],
    'name': ['P', 'R', 'F1'],
    'loc': ['P', 'R', 'F1', 'distance_error'],
    'angle': ['P', 'R', 'F1', 'angle_error'],
}
# Every answer kind additionally keeps the 'attempted' column.
for kept_columns in relevant_scores.values():
    kept_columns.append('attempted')

# scores[baseline]['text' | 'json'][question id] -> {score column: value}
scores = {}
for b, frame in text.items():
    scores[b] = {'text': {}, 'json': {}}
    for _, row in frame.iterrows():
        d = row.to_dict()
        # Text evaluations keep every column except the identifying ones.
        scores[b]['text'][row['id']] = {
            column: value for column, value in d.items() if column not in ('type', 'id')
        }

for b, frame in parsed.items():
    for _, row in frame.iterrows():
        d = row.to_dict()
        # First suffix (in dict order) that the type ends with decides the kept columns;
        # a type matching no suffix keeps nothing.
        # NOTE(review): a type such as 'knn+name+multihop1' ends with none of the
        # suffixes, so its parsed scores come out empty - confirm this is intended.
        rs = next(
            (relevant_scores[suffix] for suffix in relevant_scores if d['type'].endswith(suffix)),
            [],
        )
        scores[b]['json'][row['id']] = {
            column: value
            for column, value in d.items()
            if column not in ('type', 'id') and column in rs
        }

In [118]:
# Maximum number of questions taken from each question-type file.
QUESTIONS_PER_FILE = 100


def load_questions(paths, limit=QUESTIONS_PER_FILE):
    """Read up to ``limit`` questions from each JSON-lines file in ``paths``.

    Each non-blank line is parsed as one JSON object. The question type is the
    file name without its directory and extension, and is stored under the
    'type' key of every question read from that file.

    Files with fewer than ``limit`` lines are read to their end instead of
    failing on the empty string returned past end-of-file, and blank lines are
    skipped rather than passed to the JSON parser.

    Args:
        paths: iterable of paths to ``.jsonl`` files.
        limit: maximum number of questions taken from each file.

    Returns:
        A list of question dicts, in file order and then line order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    loaded = []
    for path in paths:
        # basename/splitext cope with both '/' and the platform's own separator.
        question_type = os.path.splitext(os.path.basename(path))[0]
        taken = 0
        with open(path, 'r') as file:
            for line in file:
                if taken >= limit:
                    break
                if not line.strip():
                    continue
                question = json.loads(line)
                question['type'] = question_type
                loaded.append(question)
                taken += 1
    return loaded


files = glob('./selected_questions/*.jsonl')
questions = load_questions(files)

In [119]:
import re

def flatten_if_nested(array):
    """Flatten a list that contains lists, recursively; return anything else untouched.

    A non-list input, or a list with no list elements, is returned as the very
    same object. Otherwise a new flat list is built in which every nested list
    is replaced by its (recursively flattened) elements.
    """
    if not isinstance(array, list):
        return array
    if not any(isinstance(element, list) for element in array):
        return array

    result = []
    for element in array:
        if isinstance(element, list):
            result += flatten_if_nested(element)
        else:
            result.append(element)
    return result

def extract_json_blocks(text, i):
    """Extract and parse the fenced ```json blocks contained in ``text``.

    Each block is passed through a series of textual repairs (see the inline
    comments) before being handed to ``json.loads``. Blocks that still fail to
    parse are reported on stdout and skipped.

    Args:
        text: string that may contain one or more ```json ... ``` blocks.
        i: identifier that is only printed when a block cannot be parsed.

    Returns:
        The parsed blocks as a list, passed through ``flatten_if_nested`` so
        that blocks which parsed to lists are merged into one flat list.
    """
    # Fenced block: ``` + optional whitespace + "json", then lazily up to the next ```.
    pattern = r'```[\s]*json(.*?)```'
    # Digit groups joined by underscores, e.g. 1_000.
    pattern1 = r'\b\d+(?:_\d+)*\b'
    # Digit groups joined by commas, e.g. 1,000.
    # NOTE(review): this also matches integers in a compact array such as [1,2],
    # which would be merged into 12 - confirm the answers never contain those.
    pattern2 = r'\b\d+(?:,\d+)*\b'
    # "//" up to and including the next newline (line comments).
    # NOTE(review): also matches "//" inside string values (e.g. URLs) - confirm.
    pattern3 = r'//.*?\n'
    # Trailing comma in front of a closing brace.
    pattern4 = r',\s*}'
    # Closing brace followed by an opening brace with only whitespace in between.
    pattern5 = r'}\s*{'
    # Find all JSON blocks (DOTALL lets a block span several lines)
    matches = re.findall(pattern, text, re.DOTALL)
    
    # Parse each match to ensure valid JSON
    json_blocks = []
    for match in matches:
        try:
            # Remove any leading/trailing whitespace and parse as JSON
            s = match.strip()
            # Drop the underscores / commas inside the numbers matched above.
            s = re.sub(pattern1, lambda x: x.group().replace('_', ''), s)
            s = re.sub(pattern2, lambda x: x.group().replace(',', ''), s)
            # Drop line comments and trailing commas.
            s = re.sub(pattern3, '', s)
            s = re.sub(pattern4, '}', s)
            # Literal repairs for specific errors seen in the JSON strings:
            # backslash-escaped single quotes and ampersands are unescaped, a stray
            # "json" after a closing brace is dropped, the quote in '" W' is removed
            # and one arithmetic expression is replaced by its value.
            s = s.replace('''\\\'''', '''\'''').replace('''\\&''', '''&''').replace("""\\'""", '''\'''').replace('}\njson', '}').replace('" W', ' W').replace(""""length": 20 + 30 + 10,""", """"length": 60,""")
            # Adjacent objects without a separator become the elements of one JSON array.
            if re.search(pattern5, s):
                s = re.sub(pattern5, '},\n{', s)
                s = '[\n%s\n]' % s
            # When ' acres' occurs anywhere in the block, the ' acres,' unit suffix is
            # removed so the value parses as a number, and a top-level 'area' entry is
            # scaled after parsing.
            convert_area = False
            if ' acres' in s:
                convert_area = True
                s = s.replace(' acres,', ',')
            json_data = json.loads(s)
            if convert_area and 'area' in json_data:
                # presumably converts acres to square metres - TODO confirm the target unit
                json_data['area'] = json_data['area'] * 4046.8564224
            json_blocks.append(json_data)
        except json.JSONDecodeError as w:
            # Report the parser error, the caller-supplied identifier and the repaired
            # string, then continue with the next block.
            print(w)
            # If parsing fails, print an error message (can log or handle as needed)
            print(i)
            print(s)
            print("Warning: Found an invalid JSON block.") 
    return flatten_if_nested(json_blocks)

In [120]:
def clean_entities(obj, q_type):
    """Return a copy of the entity mapping with the '[1]' entry reduced to selected fields.

    The value stored under '[1]' is rebuilt with only the whitelisted fields
    below (in the order they occur in that value); every other entry is carried
    over unchanged. ``q_type`` is accepted but not used.
    """
    kept_fields = ['main_category', 'sub_category', 'poi_filter_desc',
                   'poi_filter_sql', 'sub_category_label', 'table']
    cleaned = {}
    for key in obj:
        value = obj[key]
        if key != '[1]':
            cleaned[key] = value
            continue
        cleaned['[1]'] = {field: value[field] for field in value if field in kept_fields}
    return cleaned

def clean_question(q, q_type):
    """Normalise the wording of a question of the given type.

    Steps, in order: double spaces are collapsed to single ones, "The" is
    lower-cased, "fast food" becomes "fast food restaurant" for the types
    labelled T4, T16 and T20, an article is inserted after "where can I find"
    for T16, and "Pediatric emergency" is lower-cased.

    Args:
        q: the question text.
        q_type: question type; must be a key of ``type_labels``.

    Returns:
        The cleaned question text.

    Raises:
        KeyError: if ``q_type`` is not a key of ``type_labels``.
    """
    label = type_labels[q_type]
    q = q.replace('  ', ' ')
    q = q.replace("The", "the")
    if label in ('T4', 'T16', 'T20'):
        q = q.replace("fast food", "fast food restaurant")
    if label == 'T16':
        # Insert the article with a single space so that the double-space
        # collapse at the top of the function is not undone.
        q = q.replace("where can I find", "where can I find a")
    q = q.replace('Pediatric emergency', 'pediatric emergency')
    return q

In [121]:
def create_directory(path):
    """Create ``path`` together with any missing parents; an existing directory is accepted."""
    os.makedirs(name=path, exist_ok=True)

# Running number of questions seen per type; it becomes the question's folder name.
counts = defaultdict(int)
for q in questions:
    q_type = q['type']
    q_id = q['id']
    counts[q_type] += 1

    # For every baseline: the free-text answer and the JSON blocks extracted from
    # the JSON-formatted answer, each paired with its scores.
    baseline_answers = {
        b: {
            'text': {
                'answer': answers[b]['text'][q_id],
                'scores': scores[b]['text'][q_id],
            },
            'parsed': {
                'answer': extract_json_blocks(answers[b]['json'][int(q_id)], q_id),
                'scores': scores[b]['json'][q_id],
            },
        }
        for b in answers
    }

    # Copy the question, cleaning its entity description and its wording.
    _q = {}
    for k, value in q.items():
        if k == 'question_entities':
            value = clean_entities(value, q_type)
        elif k == 'question':
            value = clean_question(value, q_type)
        _q[k] = value

    # '%3d' pads the running number with spaces, e.g. './benchmark/T1/  1/'.
    # NOTE(review): '%03d' (zero padding) may have been intended - confirm that
    # nothing reads the space-padded folders before changing the format.
    path = './benchmark/%s/%3d/' % (type_labels[q_type], counts[q_type])
    create_directory(path)
    for file_name, payload in (('question.json', _q),
                               ('baseline_answers.json', baseline_answers)):
        with open(path + file_name, 'w') as f:
            f.write(json.dumps(payload, indent=2))
    