In [1]:
import json
import random
from collections import defaultdict
from pathlib import Path

import common

In [2]:
# --- Run configuration ---------------------------------------------------
lang = "en"    # language code; selects the vocabs/<lang>/ folder
diff = "easy"  # difficulty level; selects the <lang>-<diff>.json word list
excl = None    # optional level whose words are removed from exclusive_pool

# Paths are assembled with pathlib so the notebook is not tied to the Windows
# "\" separator.
vocab_dir = Path("vocabs") / lang

# Explicit encoding: do not depend on the platform's default text encoding.
with open(vocab_dir / f"{lang}-{diff}.json", encoding="utf-8") as file:
    json_data = json.load(file)

# The sorted list gives reproducible random.choice() draws; the set gives fast
# membership tests. Duplicates in the file (if any) are kept in the list.
vocabs_pool_list: list[str] = sorted(json_data)
vocabs_pool: set[str] = set(vocabs_pool_list)
exclusive_pool: set[str] = set(vocabs_pool)

# Every distinct character used by any word, collected once and sorted.
alphabets: list[str] = sorted({char for word in vocabs_pool for char in word})

if excl is not None:
    with open(vocab_dir / f"{lang}-{excl}.json", encoding="utf-8") as file:
        # difference_update() ignores words that are not in the pool, whereas
        # set.remove() would raise KeyError on the first such word.
        exclusive_pool.difference_update(json.load(file))

num_vocabs = len(vocabs_pool)
print(len(alphabets), "".join(alphabets))
print(f"{len(vocabs_pool)=}")
print(f"{len(exclusive_pool)=}")
print(vocabs_pool_list[:5])

26 abcdefghijklmnopqrstuvwxyz
len(vocabs_pool)=2575
len(exclusive_pool)=2575
['aback', 'abate', 'abbey', 'abbot', 'abhor']


In [3]:
# Search settings per difficulty level, looked up with the `diff` value chosen
# in the loading cell. `strict` makes the generation cell additionally require
# the distance to exceed common.character_difference() of the two words.
config = {
    level: {"min_distance": shortest, "max_distance": longest, "strict": is_strict}
    for level, shortest, longest, is_strict in (
        ("easy", 3, 5, True),
        ("medium", 4, 6, True),
        ("hard", 5, 8, True),
    )
}

# Number of word pairs the generation cell collects before it stops.
target_num_pairs = 3000

# Unpack the active level's settings into the names used by later cells.
min_distance, max_distance, strict = (
    config[diff][key] for key in ("min_distance", "max_distance", "strict")
)

In [4]:
# Accumulators consumed by the statistics cell.
starter = set()    # first word of every accepted pair
involved = set()   # every word in dist[1] of an accepted pair
length_counters = defaultdict(int)    # len(first word) -> accepted pairs
distance_counters = defaultdict(int)  # dist[0]         -> accepted pairs

pairs: list[tuple[str, str]] = []
# Order-insensitive keys of the accepted pairs. Gives O(1) duplicate checks
# instead of scanning the whole `pairs` list on every draw.
seen_pairs: set[tuple[str, str]] = set()

# Report progress about every 10%; max() keeps the modulus non-zero when
# target_num_pairs is smaller than 10.
progress_step = max(1, target_num_pairs // 10)

random.seed(0)  # fixed seed so the generated pair list is reproducible
# NOTE: the loop exits only once target_num_pairs pairs have been accepted, so
# it does not terminate if the vocabulary cannot supply that many valid pairs.
while len(pairs) < target_num_pairs:
    v1 = random.choice(vocabs_pool_list)
    v2 = random.choice(vocabs_pool_list)

    # (a, b) and (b, a) count as the same pair.
    pair_key = (v1, v2) if v1 <= v2 else (v2, v1)
    if pair_key in seen_pairs:
        continue

    # dist[0] is used as the distance and dist[1] as the words involved
    # (the last cell's output shows it as the word path from v1 to v2).
    dist = common.distance(v1, v2, vocabs_pool, alphabets, max_distance=max_distance)
    if strict:
        # Strict mode: the distance must also exceed the character difference.
        difference = common.character_difference(v1, v2)
        is_valid = dist[0] > difference and dist[0] >= min_distance
    else:
        is_valid = dist[0] >= min_distance
    # At least one word of dist[1] must belong to the exclusive pool.
    is_valid = is_valid and any(v in exclusive_pool for v in dist[1])

    if is_valid:
        pairs.append((v1, v2))
        seen_pairs.add(pair_key)
        length_counters[len(v1)] += 1
        distance_counters[dist[0]] += 1
        starter.add(v1)
        involved.update(dist[1])  # in-place; no new set per accepted pair

        # len(pairs) is already the 1-based count after the append. The former
        # "+ 1" announced each milestone one pair early and never reported the
        # final one.
        num_pair = len(pairs)
        if num_pair % progress_step == 0:
            print(f'checking "{v1}"| {num_pair}/{target_num_pairs}')

checking "pick"| 300/3000


In [None]:
# Summary of the generated pairs: one row per first-word length, one row per
# distance, then the grand total. Keys are sorted so the rows appear in a
# stable order instead of the order in which pairs happened to be found.
print_table = [
    [f"len {length}", f"{length_counters[length]}"]
    for length in sorted(length_counters)
]
print_table += [
    [f"dis {distance}", f"{distance_counters[distance]}"]
    for distance in sorted(distance_counters)
]

# Every accepted pair increments exactly one length counter.
total_pairs = sum(length_counters.values())
print_table.append(["total", f"{total_pairs}"])

print("----- pair & distance -----")
for label, value in print_table:
    print(f"{label:<5}: {value:>5}")
print("-" * 15)

# Share of the vocabulary used as the first word of some pair.
starter_rate = len(starter) / num_vocabs
print(f"starter: {len(starter)}/{num_vocabs} = {starter_rate:.2%}")

# Share of the vocabulary appearing in dist[1] of some accepted pair.
participation_rate = len(involved) / num_vocabs
print(f"participation: {len(involved)}/{num_vocabs} = {participation_rate:.2%}")

----- pair & exclusive -----
len 4:  2772 |  2686
len 5:   327 |   314
total:  3099 |  3000
---------------
starter: 1029/2771 = 37.13%
participation: 1371/2771 = 49.48%


In [None]:
# Persist the generated pairs next to the vocabulary files. The path is built
# with pathlib so it is not tied to the Windows "\" separator.
pairs_path = Path("vocabs") / lang / f"{lang}-pairs-{diff}.json"
with open(pairs_path, "w", encoding="utf-8") as f:
    json.dump(pairs, f)

# Spot-check: recompute and print the distance result for the first few pairs.
# Slicing, unlike indexing 0..4, tolerates a run that produced fewer than 5.
for pair in pairs[:5]:
    dist = common.distance(pair[0], pair[1], vocabs_pool, alphabets)
    print(dist)

(6, ['damn', 'damp', 'lamp', 'lame', 'same', 'some', 'sore'])
(5, ['roar', 'road', 'load', 'lord', 'lore', 'bore'])
(4, ['meal', 'deal', 'dual', 'dull', 'bull'])
(4, ['sure', 'cure', 'care', 'case', 'vase'])
(5, ['will', 'bill', 'bell', 'belt', 'beat', 'bean'])
