In [2]:
import json
import math
import os
import re
from glob import glob

import geopandas as gpd
import pandas as pd
from geopy.geocoders import Nominatim

In [3]:

# Maps a flat column name to the sequence of keys that get_by_keys() follows
# inside one parsed question object.
paths = {}

# Attributes skipped for entity "[1]"; every other entity skips
# "poi_filter_desc" instead.
# ("main_category" is intentionally not extracted.)
skipped_for_first = ('geo_wkt', 'distance', 'direction_desc')
entity_attributes = (
    "sub_category",
    "geo_wkt",
    "distance",
    "direction_desc",
    "poi_filter_desc",
)

for entity in ("[1]", "[2]", "[3]", "[4]"):
    is_first = entity == '[1]'
    for attribute in entity_attributes:
        if is_first:
            wanted = attribute not in skipped_for_first
        else:
            wanted = attribute != 'poi_filter_desc'
        if wanted:
            paths[entity + '_' + attribute] = ["question_entities", entity, attribute]

# Disabled: paths['answer_geo'] = ['answers', 0, 'geometry']

# Deeper lookups under the "poi" / "region" sub-objects of entities [2] and [3].
paths.update({
    '[3]_state': ['question_entities', '[3]', 'poi', 'addr_state'],
    '[2]_state': ['question_entities', '[2]', 'poi', 'addr_state'],
    '[2]_border': ['question_entities', '[2]', 'region', 'border_type'],
    '[2]_region_state': ['question_entities', '[2]', 'region', 'region_name'],
})

In [4]:
def parse_jsonl(fpath):
    """Load a JSON-lines question file, dropping questions without usable answers.

    Which filter is applied is decided by substrings of ``fpath`` (first match
    wins, in this order):

    * ``'loc.jsonl'``  - keep answers that have ``'addr_city'`` or ``'addr_state'``.
    * ``'name.jsonl'`` - keep answers that have at least one name-like key.
    * ``'multihop1'``  - keep answers whose ``'multihop_attribute'`` and
      ``'multihop_answer'`` share no word (case-insensitive, punctuation removed).
    * ``'count'``      - drop questions whose first answer has ``count == 0``.

    For the first three cases ``o['answers']`` is replaced by the filtered list
    and the question is dropped entirely when that list is empty. Any other
    file is loaded unfiltered.

    Args:
        fpath: Path of the ``.jsonl`` file; one JSON object per line.

    Returns:
        List of parsed (and possibly answer-filtered) question dicts, in file order.
    """
    name_keys = ('poi_name', 'lake_name', 'park_name', 'road_name', 'region_name', 'wikipedia')
    json_objects = []
    with open(fpath, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            o = json.loads(line)

            filtered = None  # None means "no answer-level filter for this file type"
            if 'loc.jsonl' in fpath:
                filtered = [a for a in o['answers']
                            if 'addr_city' in a or 'addr_state' in a]
            elif 'name.jsonl' in fpath:
                # any() keeps each answer at most once, even when it carries
                # several of the name keys.
                filtered = [a for a in o['answers']
                            if any(k in a for k in name_keys)]
            elif 'multihop1' in fpath:
                # Iterate the question's own answers (not the empty result list).
                filtered = [a for a in o['answers']
                            if not (_word_set(a['multihop_attribute'])
                                    & _word_set(a['multihop_answer']))]
            elif 'count' in fpath:
                if o['answers'][0]['count'] == 0:
                    continue

            if filtered is not None:
                if not filtered:
                    continue
                o['answers'] = filtered
            json_objects.append(o)
    return json_objects


def _word_set(text):
    """Return the set of lower-cased alphanumeric words in ``text``."""
    # str() so a non-string value (e.g. a number) does not crash re.sub.
    return set(re.sub(r'[^a-zA-Z0-9\s]', '', str(text)).lower().split())

In [5]:
def get_by_keys(o, keys):
    """Follow ``keys`` through a nested dict/list structure.

    Args:
        o: Nested structure of dicts and lists (a parsed question object).
        keys: Sequence of dict keys and/or integer list indices to follow.

    Returns:
        The value found at the end of the path, or ``None`` as soon as a step
        cannot be taken (missing key, out-of-range index, or the current value
        is not a dict/list).

    Note:
        After a ``'region_name'`` step, a string value is reduced to the text
        following its last ``', '`` (the whole string when there is no
        ``', '``). Presumably this isolates the state part of a
        "<region>, <state>" name - confirm against the data.
    """
    current = o
    for k in keys:
        if isinstance(current, dict) and k in current:
            current = current[k]
        elif (isinstance(current, list) and isinstance(k, int)
              and -len(current) <= k < len(current)):
            current = current[k]
        else:
            # Missing key, empty list, or a scalar/None in the middle of the path.
            return None
        if k == 'region_name' and isinstance(current, str):
            # rsplit drops the whole ', ' separator, so no leading space remains.
            current = current.rsplit(', ', 1)[-1]
    return current

In [6]:
def select_questions(df, n=100, random_state=None):
    """Pick up to ``n`` row labels of ``df``, at most one per attribute group.

    Grouping columns are added one at a time, in a fixed priority order and
    only if present in ``df``, until the number of distinct groups exceeds
    ``n``. One random row is then drawn from each group and ``n`` of those
    rows are sampled.

    Args:
        df: DataFrame of extracted question attributes, one row per question.
        n: Maximum number of questions to select (default 100).
        random_state: Optional seed passed to both sampling steps so the
            selection can be reproduced.

    Returns:
        List of index labels of the selected rows. It is shorter than ``n``
        when fewer groups (or rows) are available.
    """
    grouping_cols = ['[3]_direction_desc', '[4]_direction_desc', '[1]_sub_category', '[2]_border', '[2]_region_state', '[2]_sub_category', '[3]_sub_category', '[4]_sub_category', '[3]_state', '[2]_state']
    sel_col = []
    count = 0
    for c in grouping_cols:
        if c not in df:
            continue
        sel_col.append(c)
        # The group count only changes when a column is added.
        count = df.groupby(sel_col).ngroups
        if count > n:
            break
    print(sel_col, count)

    if sel_col:
        candidates = df.groupby(sel_col).sample(1, random_state=random_state)
    else:
        # No grouping column available: sample directly from all rows.
        candidates = df
    # Never request more rows than exist; sample() would raise otherwise.
    sample_size = min(n, len(candidates))
    selected = list(candidates.sample(sample_size, random_state=random_state).index)
    return selected


In [8]:
# For every generated question file: extract the grouping attributes, select a
# sample of questions and write them to ./selected_questions/<same file name>.
generated_files = sorted(glob('./questions/*.jsonl'))  # sorted: glob order is unspecified
output_dir = './selected_questions'
# Create the target directory up front so open(..., 'w') cannot fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for fpath in generated_files:
    # These question types are excluded from selection.
    if 'length_max+name' in fpath or 'multihop' in fpath:
        continue

    objects = parse_jsonl(fpath)
    # One column per entry of `paths`, one row per question (same order as `objects`).
    extracted = {
        name: [get_by_keys(o, key_path) for o in objects]
        for name, key_path in paths.items()
    }
    # Drop attributes that are missing for every question of this file.
    df = pd.DataFrame(extracted).dropna(axis=1, how='all')
    print(fpath, len(objects), df.shape)

    selected = select_questions(df)
    # Build the target from the file name only, so a 'questions' substring
    # elsewhere in the path is never rewritten.
    new_path = os.path.join(output_dir, os.path.basename(fpath))
    with open(new_path, 'w') as file:
        for i in selected:
            # df has a default positional index, so labels index `objects` directly.
            q = json.dumps(objects[i])
            file.write(q + '\n')

./questions/range:direction+loc.jsonl 686 (686, 6)
['[4]_direction_desc', '[1]_sub_category'] 113
./questions/knn:towards+name.jsonl 1000 (1000, 7)
['[1]_sub_category', '[2]_sub_category', '[3]_sub_category'] 433
./questions/knn+name.jsonl 1000 (1000, 4)
['[1]_sub_category', '[2]_sub_category'] 245
./questions/knn:non_spat_filter+loc.jsonl 764 (764, 5)
['[1]_sub_category', '[2]_sub_category', '[2]_state'] 474
./questions/range:non_spat_filter+loc.jsonl 934 (934, 6)
['[1]_sub_category', '[3]_sub_category', '[3]_state'] 483
./questions/range:non_spat_filter+name.jsonl 1000 (1000, 6)
['[1]_sub_category', '[3]_sub_category', '[3]_state'] 512
./questions/knn:non_spat_filter+name.jsonl 1000 (1000, 5)
['[1]_sub_category', '[2]_sub_category', '[2]_state'] 541
./questions/range+distance.jsonl 1000 (1000, 5)
['[1]_sub_category', '[3]_sub_category'] 229
./questions/range+name.jsonl 1000 (1000, 5)
['[1]_sub_category', '[3]_sub_category'] 222
./questions/range+angle.jsonl 1000 (1000, 5)
['[1]_sub_c