In [1]:
import os
import re
import time
import requests

import pickle
from tqdm import tqdm, trange

# Base URL of the deployed search backend. Endpoint names (e.g. "phrase") are
# appended directly, so the trailing slash matters.
backend_url = "https://ttds-gutenberg-fvyohsgcaq-nw.a.run.app/"
# Language header line, e.g. "French (12 phrases):"; group 1 is the language name.
regex_language_line = re.compile(r"(\w+) \([0-9]+ phrases\):")
# Phrase line of the form "<text> (<text>)"; both parts are captured.
# NOTE(review): presumably "<phrase> (<translation>)" — confirm against multi-lingual.txt.
regex_phrase_line = re.compile(r"(.+) \((.+)\)")

def fetch_english_testset(dir_: str="test_sets", fname: str="generic.txt"):
    """Load a one-query-per-line test set into a dict of empty result lists.

    Args:
        dir_: Directory containing the test-set file.
        fname: Name of the UTF-8 text file to read.

    Returns:
        A dict keyed by every non-empty line of the file, with any leading or
        trailing double quotes removed. Each value is a fresh empty list.
    """
    path = os.path.join(dir_, fname)
    with open(path, 'r', encoding="utf-8") as handle:
        lines = handle.read().splitlines()

    testset = {}
    for raw_line in lines:
        # Empty lines carry no query.
        if not raw_line:
            continue
        testset[raw_line.strip('"')] = []
    return testset

def fetch_multilingual_testset(dir_: str="test_sets", fname: str="multi-lingual.txt"):
    """Load the multi-lingual test set, grouped by language.

    The file is read as a sequence of language header lines (matched by
    ``regex_language_line``), each followed by phrase lines (matched by
    ``regex_phrase_line``). Blank lines are ignored. A header that repeats an
    earlier language starts that language's dict afresh.

    Args:
        dir_: Directory containing the test-set file.
        fname: Name of the UTF-8 text file to read.

    Returns:
        A dict mapping each case-folded language name to a dict whose keys are
        the ``(group 1, group 2)`` tuples captured from the phrase lines and
        whose values are fresh empty lists.

    Raises:
        ValueError: If a non-blank line is neither a language header nor a
            phrase line, or if a phrase line appears before any header.
    """
    all_queries: dict[str, dict] = dict()
    current_language = None
    with open(os.path.join(dir_, fname), 'r', encoding="utf-8") as f:
        queries = f.read().splitlines()
    for line_no, line in enumerate(queries, start=1):
        # Blank separator lines carry no data; without this guard they fail
        # the phrase match below.
        if not line.strip():
            continue
        match_language_line = regex_language_line.match(line)
        if match_language_line is not None:
            current_language = match_language_line.group(1).strip().casefold()
            all_queries[current_language] = dict()
            continue
        match_phrase_line = regex_phrase_line.match(line)
        if match_phrase_line is None:
            # Fail with the offending location instead of an AttributeError on None.
            raise ValueError(
                f"{fname}:{line_no}: expected a language header or a "
                f"'phrase (text)' line, got {line!r}"
            )
        if current_language is None:
            # Without a header there is no language bucket to file the phrase under.
            raise ValueError(
                f"{fname}:{line_no}: phrase line appears before any language header: {line!r}"
            )
        all_queries[current_language][(match_phrase_line.group(1), match_phrase_line.group(2))] = []
    return all_queries

In [2]:
# Load the three query sets from the default "test_sets" directory.
# Each English set maps a query string to an (initially empty) list.
generic = fetch_english_testset(fname="generic.txt")
memorable = fetch_english_testset(fname="memorable.txt")
# The multi-lingual set is nested: language -> {(phrase-line group 1, group 2): []}.
multi_lingual = fetch_multilingual_testset(fname="multi-lingual.txt")

In [3]:
# Baseline JSON body for the backend's "phrase" endpoint. It is copied per
# request and individual fields ("query", "dist") are overridden on the copy.
# NOTE(review): numPerPage is very large, presumably so one page holds every hit — confirm.
query_template = dict(
    query="",
    languages=["english"],
    subjects=[],
    page=1,
    dist=1,
    numPerPage=100000,
)

In [4]:
# Smoke test: post the unmodified template (empty query) to the "phrase" endpoint
# and stop the notebook here unless the backend answers with err_msg == "No error".
assert requests.post(backend_url + "phrase", json=query_template).json()["err_msg"] == "No error"

In [5]:
def test_english(rounds: int=10, dist: int=3, *datasets):
    """Replay every query of each dataset against the "phrase" endpoint and record timings.

    Each dataset is processed in turn; its full query set is sent ``rounds``
    times. The ``queryTime`` value from each response is appended to that
    query's list, so the passed-in datasets are mutated in place.

    Args:
        rounds: Number of passes over each dataset.
        dist: Value sent as the request's "dist" field.
        *datasets: Dicts mapping a query string to the list that collects its
            timings. Because they follow ``rounds`` and ``dist`` positionally,
            those two must be given whenever datasets are passed.

    Raises:
        AssertionError: If a response's "err_msg" is not "No error"; the
            message contains the query and the backend's error text.
    """
    # Derive the bar length from the actual dataset sizes rather than assuming
    # a fixed number of queries per dataset.
    total_requests = rounds * sum(len(dataset) for dataset in datasets)
    with tqdm(total=total_requests, ncols=80, leave=False) as pbar:
        for dataset in datasets:
            for _ in range(rounds):
                for query_str, record in dataset.items():
                    # Shallow copy is enough: only top-level scalar fields are overridden.
                    query_json = query_template.copy()
                    query_json["query"] = query_str
                    query_json["dist"] = dist
                    resp_json = requests.post(backend_url + "phrase", json=query_json).json()
                    assert resp_json["err_msg"] == "No error", f"{query_str}\n"+ resp_json["err_msg"]
                    record.append(resp_json["queryTime"])
                    pbar.update()
                    # Short pause between requests.
                    # NOTE(review): presumably to avoid hammering the backend — confirm.
                    time.sleep(0.1)

In [6]:
# Run 10 rounds with dist=3 over both English sets. rounds and dist must be
# passed positionally because the datasets are collected via *datasets.
test_english(10, 3, generic, memorable)

  0%|                                                  | 0/2000 [00:00<?, ?it/s]

  1%|▎                                      | 13/2000 [00:38<1:24:30,  2.55s/it]

In [None]:
# Persist the collected timings as a single pickled tuple (generic, memorable).
# Pickle protocol 5 requires Python 3.8 or newer.
with open("test_sets/english_results.pkl", "wb") as f:
    pickle.dump((generic, memorable), f, protocol=5)