In [1]:
import os
import re
import time
import requests

import numpy as np
import pickle
from tqdm import tqdm, trange

# Base URL of the search backend. Endpoint names ("phrase", "setcache") are
# appended to it directly, so the trailing slash is required.
backend_url = "https://ttds-gutenberg-fvyohsgcaq-nw.a.run.app/"
# Language header line of the multilingual test file, e.g. "French (10 phrases):".
# Group 1 is the language name.
# NOTE(review): `\w+` only covers a single word; a header with a multi-word
# language name would not match here and would instead match the phrase regex below.
regex_language_line = re.compile(r"(\w+) \([0-9]+ phrases\):")
# Phrase line of the form "<text> (<text>)". Group 1 is the part before the
# final parenthesised group, group 2 the text inside it (both greedy).
regex_phrase_line = re.compile(r"(.+) \((.+)\)")

def fetch_english_testset(dir_: str="test_sets", fname: str="generic.txt"):
    """Load a one-query-per-line test set.

    Args:
        dir_: Directory containing the test-set file.
        fname: Name of the UTF-8 text file to read.

    Returns:
        A dict mapping each non-empty line, with leading and trailing
        double quotes removed, to a fresh empty list.
    """
    path = os.path.join(dir_, fname)
    with open(path, 'r', encoding="utf-8") as handle:
        lines = handle.read().splitlines()

    testset = {}
    for line in lines:
        # Empty lines carry no query.
        if not line:
            continue
        testset[line.strip('"')] = []
    return testset

def fetch_multilingual_testset(dir_: str="test_sets", fname: str="multi-lingual.txt"):
    """Load the multilingual test set, grouped by language.

    The file is a sequence of sections. Each section starts with a header line
    matched by ``regex_language_line`` and is followed by phrase lines matched
    by ``regex_phrase_line``. Blank lines are ignored.

    Args:
        dir_: Directory containing the test-set file.
        fname: Name of the UTF-8 text file to read.

    Returns:
        ``{language: {(group1, group2): []}}`` where ``language`` is the
        case-folded header name and the tuple holds the two captured groups
        of the phrase line. Every value is a fresh empty list.

    Raises:
        ValueError: If a non-blank line is neither a language header nor a
            phrase line, or if a phrase line appears before any header.
    """
    all_queries: dict[str, dict] = dict()
    current_language = None
    with open(os.path.join(dir_, fname), 'r', encoding="utf-8") as f:
        queries = f.read().splitlines()
    for line_no, line in enumerate(queries, start=1):
        # Blank separator lines between sections are not data.
        if not line.strip():
            continue
        match_language_line = regex_language_line.match(line)
        if match_language_line is not None:
            current_language = match_language_line.group(1).strip().casefold()
            all_queries[current_language] = dict()
            continue
        match_phrase_line = regex_phrase_line.match(line)
        # Fail with the offending location instead of an AttributeError on None.
        if match_phrase_line is None:
            raise ValueError(
                f"{fname}:{line_no}: expected a language header or 'phrase (text)' line, got {line!r}"
            )
        if current_language is None:
            raise ValueError(
                f"{fname}:{line_no}: phrase line appears before any language header"
            )
        all_queries[current_language][(match_phrase_line.group(1), match_phrase_line.group(2))] = []
    return all_queries

In [2]:
# Load the three query sets. Every query starts with an empty list that the
# benchmark functions later fill with one response record per round.
generic = fetch_english_testset(fname="generic.txt")
memorable = fetch_english_testset(fname="memorable.txt")
multi_lingual = fetch_multilingual_testset(fname="multi-lingual.txt")

In [3]:
# Request body template for the "phrase" endpoint. The benchmark functions take
# a shallow copy per request and overwrite "query" and "dist" (and "languages"
# for the multilingual run), so the nested lists must not be mutated in place.
query_template = {
    "query"     : "",
    "languages" : ["english"],
    "subjects"  : [],
    "page"      : 1,
    "dist"      : 1,
    # NOTE(review): presumably large enough that all matches land on page 1 — confirm against the backend.
    "numPerPage": 100000
}

In [4]:
# Smoke test: a request built from the unmodified template (empty query) must
# come back with err_msg "No error" before the benchmark is started.
assert requests.post(backend_url + "phrase", json=query_template).json()["err_msg"] == "No error"

In [5]:
def test_english(rounds: int, dist: int, *datasets):
    with tqdm(total=len(datasets) * 100 * rounds, ncols=80, leave=False) as pbar:
        for dataset in datasets:
            requests.post(backend_url + "setcache", json={"clear_cache" : True})
            for _ in range(rounds):
                for query_str, record in dataset.items():
                    query_json = query_template.copy()
                    query_json["query"] = query_str
                    query_json["dist"] = dist
                    keep_trying = True
                    retry_counter = 0
                    while keep_trying:
                        try:
                            resp_json = requests.post(backend_url + "phrase", json=query_json, timeout=90).json()
                            assert resp_json["err_msg"] == "No error", f"{query_str}\n"+ resp_json["err_msg"]
                            keep_trying = False
                        except Exception as e:
                            retry_counter += 1
                            if retry_counter > 10:
                                raise e
                            time.sleep(1)
                    del resp_json["books"]
                    record.append(resp_json)
                    pbar.update()
                    time.sleep(0.2)

def test_multi_lingual(rounds: int, dist: int, datasets: dict):
    total = sum(len(v) for v in datasets.values()) * rounds
    with tqdm(total=total, ncols=80, leave=False) as pbar:
        for language, dataset in datasets.items():
            requests.post(backend_url + "setcache", json={"clear_cache" : True}).json()
            for _ in range(rounds):
                for (query_str, _), record in dataset.items():
                    query_json = query_template.copy()
                    query_json["query"] = query_str
                    query_json["dist"] = dist
                    query_json["languages"] = [language]
                    keep_trying = True
                    retry_counter = 0
                    while keep_trying:
                        try:
                            resp_json = requests.post(backend_url + "phrase", json=query_json, timeout=90).json()
                            assert resp_json["err_msg"] == "No error", f"{query_str}\n"+ resp_json["err_msg"]
                            keep_trying = False
                        except Exception as e:
                            retry_counter += 1
                            if retry_counter > 10:
                                raise e
                            time.sleep(1)
                    del resp_json["books"]
                    record.append(resp_json)
                    pbar.update()
                    time.sleep(0.2)

In [6]:
# Multilingual benchmark: 10 rounds per query with dist=3. Responses are
# appended to the lists inside `multi_lingual`, so re-running this cell adds
# to the records of earlier runs instead of replacing them.
test_multi_lingual(10, 3, multi_lingual)

  0%|                                                  | 0/1000 [00:00<?, ?it/s]

                                                                                

In [7]:
# English benchmark: 10 rounds per query with dist=3, `memorable` first and
# then `generic`. Responses are appended to the lists inside both dicts, so
# re-running this cell adds to the records of earlier runs.
test_english(10, 3, memorable, generic)

                                                                                

In [8]:
# Persist the raw response records of all three test sets as one pickled tuple
# in the order (generic, memorable, multi_lingual).
with open("test_sets/test_results.pkl", "wb") as f:
    pickle.dump((generic, memorable, multi_lingual), f, protocol=5)

In [9]:
def write_as_csv(result_dict, fname):
    """Write per-query timing statistics to a CSV file.

    Args:
        result_dict: Maps each query to a non-empty list of response records.
            Every record must provide "queryTime"; the first record must also
            provide "totalNum".
        fname: Path of the UTF-8 CSV file to create or overwrite.

    Raises:
        ValueError: If a query has no recorded responses.

    Notes:
        "queryTime" is taken to be in seconds, as the header labels state
        (TODO confirm against the backend). The last column is labelled
        milliseconds, so the standard deviation is scaled by 1000.
    """
    header = "query_id,query,number of matches,average response time over 10 rounds (s),minimum response time (s),maximum response time,standard deviation of query response time (ms)\n"
    rows = [header]
    for i, (k, v) in enumerate(result_dict.items()):
        if not v:
            raise ValueError(f"no recorded responses for query {k!r}; run the benchmark before exporting")
        query_times = np.array([j["queryTime"] for j in v], dtype=np.float64)
        # The match count is read from the first record only.
        num_matches = v[0]["totalNum"]
        # Double embedded quotes so a query containing '"' stays one CSV field.
        quoted_query = '"' + str(k).replace('"', '""') + '"'
        # Seconds -> milliseconds, matching the column label.
        std_ms = query_times.std() * 1000
        rows.append(
            f"{i},{quoted_query},{num_matches},{query_times.mean()},{query_times.min()},{query_times.max()},{std_ms}\n"
        )
    with open(fname, 'w', encoding="utf-8") as f:
        f.writelines(rows)

In [10]:
# Export the timing statistics of the generic query set.
write_as_csv(generic, "test_sets/generic_results.csv")

In [11]:
# Export the timing statistics of the memorable query set.
write_as_csv(memorable, "test_sets/memorable_results.csv")

In [12]:
# One CSV per language. The per-language dict is keyed by (query, translation)
# tuples; only the query part is kept as the CSV's query column, so two keys
# sharing the same query text would collapse into a single row.
for k, d in multi_lingual.items():
    write_as_csv({query : v for (query, translation), v in d.items()}, f"test_sets/{k.lower()}_results.csv")