In [1]:
import json
import random
import common

In [2]:
lang = "en"
diff = "medium"
excl = "easy"

vocabs_pool: set[str] = set()  # for faster in operation
vocabs_pool_list: list[str] = []  # for consistent results
exclusive_pool: set[str] = set()
alphabets: list[str] = []

with open(rf"vocabs\{lang}\{lang}-{diff}.json") as file:
    json_data = json.load(file)
    for d in json_data:
        vocabs_pool.add(d)
        vocabs_pool_list.append(d)
        exclusive_pool.add(d)
        alphabets = list(set(alphabets).union(set(d)))

if excl is not None:
    with open(rf"vocabs\{lang}\{lang}-{excl}.json") as file:
        json_data = json.load(file)
        for d in json_data:
            exclusive_pool.remove(d)

vocabs_pool_list.sort()
alphabets.sort()
num_vocabs = len(vocabs_pool)
print(len(alphabets), "".join(alphabets))
print(f"{len(vocabs_pool)=}")
print(f"{len(exclusive_pool)=}")
print(vocabs_pool_list[:5])

26 abcdefghijklmnopqrstuvwxyz
len(vocabs_pool)=2771
len(exclusive_pool)=1287
['aback', 'abate', 'abbey', 'abbot', 'abhor']


In [3]:
# medium: {"min_distance": 4, "max_distance": 6, "strict": True}
from collections import defaultdict

target_num_pairs = 3000
min_distance = 4
max_distance = 6
strict = True

starter = set()
involved = set()
counts = defaultdict(lambda: [0, 0])  # vocab, pair, exclusive

pairs: list[tuple[str, str]] = []

random.seed(0)
while len(pairs) < target_num_pairs:
    # random.randint to get consistent random results each time
    v1_idx = random.randint(0, len(vocabs_pool_list) - 1)
    v2_idx = random.randint(0, len(vocabs_pool_list) - 1)
    v1 = vocabs_pool_list[v1_idx]
    v2 = vocabs_pool_list[v2_idx]
    # print(v1_idx, v1, v2_idx, v2)
    if (v1, v2) in pairs or (v2, v1) in pairs:
        continue

    dist = common.distance(v1, v2, vocabs_pool, alphabets, max_distance=max_distance)
    if strict:
        difference = common.character_difference(v1, v2)
        flag = dist[0] > difference and dist[0] >= min_distance
    else:
        flag = dist[0] >= min_distance

    # if the pair has valid path
    if flag:
        counts[len(v1)][0] += 1
        starter.add(v1)
        involved = involved.union(set(dist[1]))
        # print(dist)

        if any(v in exclusive_pool for v in dist[1]):
            counts[len(v1)][1] += 1
            pairs.append((v1, v2))

            num_pair = len(pairs)
            if (num_pair + 1) % (target_num_pairs // 10) == 0:
                print(f'checking "{v1}"| {num_pair+1}/{target_num_pairs}')

checking "loom"| 300/3000
checking "till"| 600/3000
checking "shirk"| 900/3000
checking "fuel"| 1200/3000
checking "sport"| 1500/3000
checking "bend"| 1800/3000
checking "tile"| 2100/3000
checking "lady"| 2400/3000
checking "hawk"| 2700/3000
checking "lick"| 3000/3000


In [4]:
total_pairs = 0
total_exclusive = 0

print_table = []
for k, v in counts.items():
    count_pair = v[0]
    count_exlusive = v[1]

    to_print = [f"len {k}", f"{count_pair}", f"{count_exlusive}"]
    print_table.append(to_print)

    total_pairs += count_pair
    total_exclusive += count_exlusive

to_print = [f"total", f"{total_pairs}", f"{total_exclusive}"]
print_table.append(to_print)

print(f"----- pair & exclusive -----")
for items in print_table:
    print(f"{items[0]:<5}: {items[1]:>5} | {items[2]:>5}")
print("-" * 15)

starter_rate = len(starter) / num_vocabs
print(f"starter: {len(starter)}/{num_vocabs} = {starter_rate:.2%}")

participation_rate = len(involved) / num_vocabs
print(f"participation: {len(involved)}/{num_vocabs} = {participation_rate:.2%}")

----- pair & exclusive -----
len 4:  2772 |  2686
len 5:   327 |   314
total:  3099 |  3000
---------------
starter: 1029/2771 = 37.13%
participation: 1371/2771 = 49.48%


In [5]:
with open(rf"vocabs\{lang}\{lang}-pairs-{diff}.json", "w") as f:
    json.dump(pairs, f)

for i in range(5):
    pair = pairs[i]
    dist = common.distance(pair[0], pair[1], vocabs_pool, alphabets)
    print(dist)

(6, ['damn', 'damp', 'lamp', 'lame', 'same', 'some', 'sore'])
(5, ['roar', 'road', 'load', 'lord', 'lore', 'bore'])
(4, ['meal', 'deal', 'dual', 'dull', 'bull'])
(4, ['sure', 'cure', 'care', 'case', 'vase'])
(5, ['will', 'bill', 'bell', 'belt', 'beat', 'bean'])
