In [1]:
import json
import random
import common

In [2]:
lang = "en"
diff = "easy"
excl = None

vocabs_pool: set[str] = set()  # for faster in operation
vocabs_pool_list: list[str] = []  # for consistent results
exclusive_pool: set[str] = set()
alphabets: list[str] = []

with open(rf"vocabs\{lang}\{lang}-{diff}.json") as file:
    json_data = json.load(file)
    for d in json_data:
        vocabs_pool.add(d)
        vocabs_pool_list.append(d)
        exclusive_pool.add(d)
        alphabets = list(set(alphabets).union(set(d)))

if excl is not None:
    with open(rf"vocabs\{lang}\{lang}-{excl}.json") as file:
        json_data = json.load(file)
        for d in json_data:
            exclusive_pool.remove(d)

vocabs_pool_list.sort()
alphabets.sort()
num_vocabs = len(vocabs_pool)
print(len(alphabets), "".join(alphabets))
print(f"{len(vocabs_pool)=}")
print(f"{len(exclusive_pool)=}")

26 abcdefghijklmnopqrstuvwxyz
len(vocabs_pool)=1484
len(exclusive_pool)=1484


In [3]:
print(common.change_character("abc", alphabets))

['bbc', 'cbc', 'dbc', 'ebc', 'fbc', 'gbc', 'hbc', 'ibc', 'jbc', 'kbc', 'lbc', 'mbc', 'nbc', 'obc', 'pbc', 'qbc', 'rbc', 'sbc', 'tbc', 'ubc', 'vbc', 'wbc', 'xbc', 'ybc', 'zbc', 'aac', 'acc', 'adc', 'aec', 'afc', 'agc', 'ahc', 'aic', 'ajc', 'akc', 'alc', 'amc', 'anc', 'aoc', 'apc', 'aqc', 'arc', 'asc', 'atc', 'auc', 'avc', 'awc', 'axc', 'ayc', 'azc', 'aba', 'abb', 'abd', 'abe', 'abf', 'abg', 'abh', 'abi', 'abj', 'abk', 'abl', 'abm', 'abn', 'abo', 'abp', 'abq', 'abr', 'abs', 'abt', 'abu', 'abv', 'abw', 'abx', 'aby', 'abz']


In [4]:
print(common.distance("admin", "admit", vocabs_pool, alphabets))
print(common.distance("line", "firm", vocabs_pool, alphabets))
print(common.distance("センセイ", "モンダイ", vocabs_pool, alphabets))
print(common.distance("caro", "mano", vocabs_pool, alphabets))

(1, ['admin', 'admit'])
(3, ['line', 'fine', 'fire', 'firm'])
(-1, [])
(-1, [])


In [5]:
# easy: {"min_distance": 3, "max_distance": 4, "strict": False}
from collections import defaultdict

min_distance = 3
max_distance = 4
strict = False

starter = set()
involved = set()
counts = defaultdict(lambda: [0, 0, 0])  # vocab, pair, exclusive

pairs: list[tuple[str, str]] = []

for i, v1 in enumerate(vocabs_pool_list):
    for j, v2 in enumerate(vocabs_pool_list):
        dist = common.distance(
            v1, v2, vocabs_pool, alphabets, max_distance=max_distance
        )
        if strict:
            difference = common.character_difference(v1, v2)
            flag = dist[0] > difference and dist[0] >= min_distance
        else:
            flag = dist[0] >= min_distance

        if flag:
            counts[len(v1)][1] += 1
            starter.add(v1)
            involved = involved.union(set(dist[1]))
            # print(dist)

            if any(v in exclusive_pool for v in dist[1]):
                counts[len(v1)][2] += 1
                pairs.append((v1, v2))

    counts[len(v1)][0] += 1
    if (i + 1) % (num_vocabs // 10) == 0:
        print(f'checking "{v1}"| {i+1}/{num_vocabs}')

checking "book"| 148/1484
checking "crush"| 296/1484
checking "fever"| 444/1484


KeyboardInterrupt: 

In [None]:
total_vocab = 0
total_combinations = 0
total_pairs = 0
total_exclusive = 0

print_table = []
for k, v in counts.items():
    count_vocab = v[0]
    vocab_combinations = count_vocab * (count_vocab - 1)
    count_pair = v[1]
    count_exlusive = v[2]

    cur_pair_rate = count_pair / vocab_combinations
    cur_exclusive_rate = count_exlusive / vocab_combinations
    to_print = [
        f"len {k}",
        f"{count_vocab}",
        f"{count_pair}/{vocab_combinations}",
        f"{cur_pair_rate:.2%}",
        f"{count_exlusive}/{vocab_combinations}",
        f"{cur_exclusive_rate:.2%}",
    ]
    print_table.append(to_print)

    total_vocab += count_vocab
    total_combinations += vocab_combinations
    total_pairs += count_pair
    total_exclusive += count_exlusive

pair_rate = total_pairs / total_combinations
exclusive_rate = total_exclusive / total_combinations
to_print = [
    f"total",
    f"{total_vocab}",
    f"{total_pairs}/{total_combinations}",
    f"{pair_rate:.2%}",
    f"{total_exclusive}/{total_combinations}",
    f"{exclusive_rate:.2%}",
]
print_table.append(to_print)

print(f"----- pair rate & exclusive rate -----")
for items in print_table:
    print(
        f"{items[0]:<5}, #{items[1]:5}:"
        f" {items[2]:>13} = {items[3]:<7} |"
        f" {items[4]:>13} = {items[5]:<7}"
    )
print("-" * 15)

starter_rate = len(starter) / num_vocabs
print(f"starter: {len(starter)}/{num_vocabs} = {starter_rate:.2%}")

participation_rate = len(involved) / num_vocabs
print(f"participation: {len(involved)}/{num_vocabs} = {participation_rate:.2%}")

ZeroDivisionError: division by zero

In [None]:
# use _quick to output json

random.seed(0)
pairs_shuffled = pairs[:3000]
random.shuffle(pairs_shuffled)
# with open(rf"vocabs\{lang}\{lang}-pairs-{diff}.json", "w") as f:
#     json.dump(pairs_shuffled, f)

for i in range(5):
    pair = pairs_shuffled[i]
    dist = common.distance(pair[0], pair[1], vocabs_pool, alphabets)
    print(dist)

(3, ['タイショク', 'タイショウ', 'アイショウ', 'アイジョウ'])
(4, ['ショウコウ', 'リョウコウ', 'リョウヨウ', 'キョウヨウ', 'キョウソウ'])
(4, ['シュウトク', 'シュウカク', 'シュウカン', 'チュウカン', 'チュウシン'])
(4, ['イッショウ', 'ケッショウ', 'ケイショウ', 'タイショウ', 'タイショク'])
(3, ['イッショウ', 'インショウ', 'サンショウ', 'サンチョウ'])


In [None]:
def create_distance_graph():
    distance_graph: dict[tuple[str, str], tuple[int, list[str]]] = {}

    def calculate_distance(from_: str, to: str):
        if (from_, to) in distance_graph:
            return

        min_distance = -1
        min_path = []
        for new_vocab in common.change_character(from_, alphabets):
            if new_vocab in vocabs_pool:
                calculate_distance(new_vocab, to)
                distance, path = distance_graph[(new_vocab, to)]
                if min_distance == -1 or 0 <= distance < min_distance:
                    min_distance = distance + 1
                    min_path = [from_] + path
        distance_graph[(from_, to)] = (min_distance, min_path)

    for v1 in vocabs_pool:
        for v2 in vocabs_pool:
            calculate_distance(v1, v2)
    return distance_graph


# distance_graph = create_distance_graph()
# print(distance_graph)