In [1]:
import os
import re
import time
import requests

import pickle
from tqdm import tqdm, trange

# Base URL of the search backend; endpoint names ("phrase", "clearcache") are
# appended directly, so the trailing slash is required.
backend_url = "https://ttds-gutenberg-fvyohsgcaq-nw.a.run.app/"
# Section header of the multi-lingual test set, e.g. "<word> (<N> phrases):".
# Group 1 is the single-word language name.
regex_language_line = re.compile(r"(\w+) \([0-9]+ phrases\):")
# Phrase entry of the form "<text> (<text>)". Both groups together form the
# dataset key in fetch_multilingual_testset.
# NOTE(review): the parenthesised part is presumably an English translation
# (it is unpacked as `query_english` later) -- confirm against the data file.
regex_phrase_line = re.compile(r"(.+) \((.+)\)")

def fetch_english_testset(dir_: str="test_sets", fname: str="generic.txt"):
    """Load a one-query-per-line test set and pair each query with an empty list.

    The file is read as UTF-8. For every line, surrounding whitespace is
    removed first and surrounding double quotes second; lines that are empty
    after this normalisation (blank, whitespace-only, or just `""`) are
    skipped so they never reach the backend as queries.

    Args:
        dir_: Directory that contains the test set file.
        fname: Name of the test set file inside `dir_`.

    Returns:
        A dict mapping each query string to its own new empty list, in file
        order. Duplicate queries collapse into a single key.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(os.path.join(dir_, fname), 'r', encoding="utf-8") as f:
        lines = f.read().splitlines()

    testset = {}
    for line in lines:
        # Strip whitespace before the quotes: otherwise a trailing space after
        # the closing quote would leave that quote inside the query text.
        query = line.strip().strip('"')
        if query:
            testset[query] = []
    return testset

def fetch_multilingual_testset(dir_: str="test_sets", fname: str="multi-lingual.txt"):
    """Parse the multi-lingual test set into per-language query dictionaries.

    The file is read as UTF-8 and processed line by line. A line matching
    `regex_language_line` opens a new language section (the language name is
    stripped and case-folded); every following line must match
    `regex_phrase_line` and is stored under the current language. Blank lines
    are ignored.

    Args:
        dir_: Directory that contains the test set file.
        fname: Name of the test set file inside `dir_`.

    Returns:
        A dict mapping each language name to a dict whose keys are
        `(group 1, group 2)` tuples of the phrase line and whose values are
        new empty lists.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line is neither a language header nor a
            phrase line, or if a phrase line appears before any language
            header. (Previously these cases surfaced as an AttributeError on
            `None` or a `KeyError: None` without pointing at the line.)
    """
    all_queries: dict[str, dict] = dict()
    current_language = None
    with open(os.path.join(dir_, fname), 'r', encoding="utf-8") as f:
        lines = f.read().splitlines()

    for line_no, line in enumerate(lines, start=1):
        # Blank separator lines carry no data and match neither pattern.
        if not line.strip():
            continue

        match_language_line = regex_language_line.match(line)
        if match_language_line is not None:
            current_language = match_language_line.group(1).strip().casefold()
            all_queries[current_language] = dict()
            continue

        match_phrase_line = regex_phrase_line.match(line)
        if match_phrase_line is None:
            raise ValueError(
                f"{fname}:{line_no}: line is neither a language header nor a phrase: {line!r}"
            )
        if current_language is None:
            raise ValueError(
                f"{fname}:{line_no}: phrase found before any language header: {line!r}"
            )

        phrase_key = (match_phrase_line.group(1), match_phrase_line.group(2))
        all_queries[current_language][phrase_key] = []
    return all_queries

In [2]:
# Load the query sets from the default "test_sets" directory.
# `generic` and `memorable` map query string -> [] (results are appended later).
generic = fetch_english_testset(fname="generic.txt")
memorable = fetch_english_testset(fname="memorable.txt")
# `multi_lingual` is nested: language -> {(phrase, parenthesised text): []}.
multi_lingual = fetch_multilingual_testset(fname="multi-lingual.txt")

In [3]:
# Baseline JSON payload for the "phrase" endpoint. The benchmark functions take
# a shallow copy of it and overwrite "query", "dist" and (for the multi-lingual
# run) "languages". Because the copy is shallow, the list values are shared with
# the template and must be replaced rather than mutated.
query_template = dict(
    query="",
    languages=["english"],
    subjects=[],
    page=1,
    dist=1,
    numPerPage=100000,
)

In [4]:
# Smoke test: post the unmodified template to the "phrase" endpoint and require
# the backend to answer with err_msg == "No error" before any benchmark runs.
assert requests.post(backend_url + "phrase", json=query_template).json()["err_msg"] == "No error"

In [5]:
def test_english(rounds: int, dist: int, *datasets):
    """Run every query of each English dataset against the backend and record timings.

    The backend cache is cleared once up front (the response is printed). Then,
    dataset by dataset, all queries are sent `rounds` times to the "phrase"
    endpoint using a copy of `query_template` with "query" and "dist"
    overridden.

    Args:
        rounds: How many times each dataset is replayed.
        dist: Value sent as the "dist" field of every request.
        *datasets: Dicts mapping a query string to a list. Each list is
            mutated in place: one `(queryTime, cache_size)` tuple taken from
            the response is appended per request.

    Raises:
        AssertionError: If a response's "err_msg" is not "No error".
        requests.HTTPError: If a response body is not JSON and the HTTP status
            signals an error.
        ValueError: If a response body is not JSON although the status is OK
            (the original JSON decode error is re-raised).
    """
    print(requests.post(backend_url + "clearcache", json={"clear_cache" : True}).json())
    # Size the progress bar from the actual data instead of assuming that each
    # dataset holds exactly 100 queries.
    total = sum(len(dataset) for dataset in datasets) * rounds
    with tqdm(total=total, ncols=80, leave=False) as pbar:
        for dataset in datasets:
            for _ in range(rounds):
                for query_str, record in dataset.items():
                    query_json = query_template.copy()
                    query_json["query"] = query_str
                    query_json["dist"] = dist
                    resp = requests.post(backend_url + "phrase", json=query_json)
                    try:
                        resp_json = resp.json()
                    except ValueError:
                        # Non-JSON body (e.g. a gateway error page): report the
                        # HTTP status if it is an error, otherwise re-raise the
                        # decode failure unchanged.
                        resp.raise_for_status()
                        raise
                    assert resp_json["err_msg"] == "No error", f"{query_str}\n"+ resp_json["err_msg"]
                    record.append((resp_json["queryTime"], resp_json["cache_size"]))
                    pbar.update()
                    # Short pause between consecutive requests.
                    time.sleep(0.1)

def test_multi_lingual(rounds: int, dist: int, datasets: dict):
    """Run every phrase of each language against the backend and record timings.

    The backend cache is cleared once up front (the response is printed). Then,
    language by language, all phrases are sent `rounds` times to the "phrase"
    endpoint using a copy of `query_template` with "query", "dist" and
    "languages" overridden.

    Args:
        rounds: How many times each language's phrase set is replayed.
        dist: Value sent as the "dist" field of every request.
        datasets: Dict mapping a language name to a dict whose keys are
            2-tuples (the first element is sent as the query) and whose values
            are lists. Each list is mutated in place: one
            `(queryTime, cache_size)` tuple taken from the response is appended
            per request.

    Raises:
        AssertionError: If a response's "err_msg" is not "No error".
        requests.HTTPError: If a response body is not JSON and the HTTP status
            signals an error.
        ValueError: If a response body is not JSON although the status is OK
            (the original JSON decode error is re-raised).
    """
    print(requests.post(backend_url + "clearcache", json={"clear_cache" : True}).json())
    total = sum(len(v) for v in datasets.values()) * rounds
    with tqdm(total=total, ncols=80, leave=False) as pbar:
        for language, dataset in datasets.items():
            for _ in range(rounds):
                # Only the first tuple element is sent; the second is not used here.
                for (query_str, _unused), record in dataset.items():
                    query_json = query_template.copy()
                    query_json["query"] = query_str
                    query_json["dist"] = dist
                    # Assign a new list; the template's own list is shared by the shallow copy.
                    query_json["languages"] = [language]
                    resp = requests.post(backend_url + "phrase", json=query_json)
                    try:
                        resp_json = resp.json()
                    except ValueError:
                        # Non-JSON body (e.g. a gateway error page): report the
                        # HTTP status if it is an error, otherwise re-raise the
                        # decode failure unchanged.
                        resp.raise_for_status()
                        raise
                    assert resp_json["err_msg"] == "No error", f"{query_str}\n"+ resp_json["err_msg"]
                    record.append((resp_json["queryTime"], resp_json["cache_size"]))
                    pbar.update()
                    # Short pause between consecutive requests.
                    time.sleep(0.1)

In [6]:
# Replay the generic and memorable query sets: rounds=10, dist=3.
test_english(10, 3, generic, memorable)

{'err_msg': 'No error', 'index_size': 3}


                                                                                

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Replay every language's phrase set ten times with dist=3.
test_multi_lingual(rounds=10, dist=3, datasets=multi_lingual)

In [None]:
# Persist all three result dictionaries as a single tuple (pickle protocol 5).
with open("test_sets/test_results.pkl", "wb") as f:
    pickle.dump((generic, memorable, multi_lingual), f, protocol=5)

In [None]:
# Same tuple written to a separate file.
# NOTE(review): presumably used to keep whatever was collected when a run is
# interrupted -- confirm the intended workflow.
with open("test_sets/partial_results.pkl", "wb") as f:
    pickle.dump((generic, memorable, multi_lingual), f, protocol=5)

In [7]:
# Show the recorded (queryTime, cache_size) tuples for each generic query.
generic

{'A walk in the park': [(2.281963348388672, 11), (1.9976036548614502, 397)],
 'The sun rose over the hills': [(3.752171516418457, 30),
  (3.5701582431793213, 416)],
 'A quiet, small town': [(4.2121076583862305, 54), (3.9876163005828857, 440)],
 'The bustling city streets': [(2.9834766387939453, 71),
  (2.5177524089813232, 457)],
 'Lost in thought': [(3.7271931171417236, 95), (3.672752857208252, 481)],
 'A moment of realization': [(2.4734902381896973, 110),
  (1.0236351490020752, 484)],
 'The taste of freedom': [(1.9344511032104492, 117),
  (1.7960398197174072, 491)],
 'A cold winter night': [(4.164812326431274, 141), (3.984376907348633, 515)],
 'The warmth of summer': [(1.3845715522766113, 146), (1.377873420715332, 520)],
 'An unexpected visitor': [(1.387603759765625, 150),
  (1.3425376415252686, 524)],
 'A letter arrived': [(3.045915126800537, 167), (2.585296630859375, 541)],
 'The sound of laughter': [(2.0476531982421875, 177),
  (1.946216106414795, 551)],
 'Tears of joy': [(2.346754