In [10]:
import json

In [11]:
vocabs_pool: set[str] = set()
exclusive_pool: set[str] = set()
alphabets: set[str] = set()

with open(r"vocabs\es\es-len5-medium.json") as file:
    json_data = json.load(file)
    for d in json_data:
        vocabs_pool.add(d)
        exclusive_pool.add(d)
        alphabets = alphabets.union(set(d))

with open(r"vocabs\es\es-len5-easy.json") as file:
    json_data = json.load(file)
    for d in json_data:
        exclusive_pool.remove(d)

num_vocabs = len(vocabs_pool)
print(len(alphabets), alphabets)
print(f"{len(vocabs_pool)=}")
print(f"{len(exclusive_pool)=}")

32 {'ú', 'i', 'ñ', 'e', 'í', 'á', 'b', 'q', 'c', 'r', 'é', 'p', 'm', 'z', 'w', 'g', 'u', 'ó', 'd', 'o', 'k', 'x', 'f', 'a', 'y', 'v', 'l', 'j', 's', 't', 'n', 'h'}
len(vocabs_pool)=1000
len(exclusive_pool)=300


In [12]:
def change_character(vocab: str) -> set[str]:
    new_vocabs = set()
    for i, c in enumerate(vocab):
        for a in alphabets:
            if a != c:
                new_vocab = vocab[:i] + a + vocab[i + 1 :]
                new_vocabs.add(new_vocab)

    return new_vocabs


print(change_character("abc"))

{'agc', 'axc', 'aúc', 'abf', 'wbc', 'cbc', 'abv', 'abl', 'aby', 'ubc', 'aoc', 'aóc', 'aac', 'atc', 'abg', 'apc', 'abh', 'xbc', 'pbc', 'abn', 'abj', 'hbc', 'ábc', 'arc', 'abp', 'óbc', 'fbc', 'afc', 'anc', 'amc', 'ñbc', 'ebc', 'abi', 'tbc', 'abe', 'añc', 'abu', 'auc', 'abú', 'aíc', 'abm', 'sbc', 'nbc', 'abz', 'abo', 'abs', 'ajc', 'abá', 'abk', 'aqc', 'ayc', 'bbc', 'zbc', 'mbc', 'abd', 'awc', 'aec', 'lbc', 'íbc', 'acc', 'abx', 'obc', 'aác', 'abó', 'asc', 'avc', 'abr', 'azc', 'abb', 'alc', 'rbc', 'abw', 'qbc', 'ébc', 'vbc', 'aéc', 'úbc', 'ibc', 'aic', 'abñ', 'aba', 'ahc', 'ybc', 'jbc', 'kbc', 'adc', 'dbc', 'abé', 'abt', 'akc', 'abí', 'abq', 'gbc'}


In [13]:
def character_difference(from_: str, to: str) -> int:
    count = 0
    for c1, c2 in zip(from_, to):
        if c1 != c2:
            count += 1
    return count

In [14]:
def distance(from_: str, to: str, max_distance=None) -> tuple[int, list[str]]:
    vocab_path = {from_: ""}
    queue = [(from_, 0)]
    while queue:
        (cur_vocab, cur_distance) = queue.pop(0)
        if cur_vocab == to:
            path = [cur_vocab]
            while path[0] != from_:
                path.insert(0, vocab_path[path[0]])
            return (cur_distance, path)

        if cur_distance == max_distance:
            continue

        for new_vocab in change_character(cur_vocab):
            if (new_vocab not in vocab_path) and (new_vocab in vocabs_pool):
                vocab_path[new_vocab] = cur_vocab
                queue.append((new_vocab, cur_distance + 1))

    return (-1, [])

In [20]:
print(distance("admin", "admit"))
print(distance("dicho", "hecha"))

(-1, [])
(4, ['dicho', 'dicha', 'ficha', 'fecha', 'hecha'])


In [16]:
max_distance = 5
count = 0
count_exclusive = 0
starter = set()
involved = set()
pairs: list[tuple[str, str]] = []

for i, v1 in enumerate(vocabs_pool):
    for j, v2 in enumerate(vocabs_pool):
        dist = distance(v1, v2, max_distance=max_distance)
        diff = character_difference(v1, v2)
        if dist[0] > diff:
            count += 1
            starter.add(v1)
            involved = involved.union(set(dist[1]))
            # print(dist)

            if any(v in exclusive_pool for v in dist[1]):
                count_exclusive += 1
                pairs.append((v1, v2))

    if i % (num_vocabs // 10) == 0:
        print(f'checking "{v1}"| {i}/{num_vocabs}')

total_combinations = num_vocabs * (num_vocabs - 1)

pair_rate = count / total_combinations
print(f"pair rate: {count}/{total_combinations} = {pair_rate:.2%}")

exclusive_rate = count_exclusive / total_combinations
print(
    f"exclusive pair rate: {count_exclusive}/{total_combinations} = {exclusive_rate:.2%}"
)

starter_rate = len(starter) / num_vocabs
print(f"starter: {len(starter)}/{num_vocabs} = {starter_rate:.2%}")

participation_rate = len(involved) / num_vocabs
print(f"participation: {len(involved)}/{num_vocabs} = {participation_rate:.2%}")

checking "exige"| 0/1000
checking "salvo"| 100/1000
checking "local"| 200/1000
checking "contó"| 300/1000
checking "vende"| 400/1000
checking "señor"| 500/1000
checking "aviar"| 600/1000
checking "ahmad"| 700/1000
checking "norma"| 800/1000
checking "falta"| 900/1000
pair rate: 1536/999000 = 0.15%
exclusive pair rate: 1282/999000 = 0.13%
starter: 251/1000 = 25.10%
participation: 279/1000 = 27.90%


In [17]:
with open(rf"vocabs\es\es-len5-pairs-medium.json", "w") as f:
    json.dump(pairs, f)

In [18]:
def create_distance_graph():
    distance_graph: dict[tuple[str, str], tuple[int, list[str]]] = {}

    def calculate_distance(from_: str, to: str):
        if (from_, to) in distance_graph:
            return

        min_distance = -1
        min_path = []
        for new_vocab in change_character(from_):
            if new_vocab in vocabs_pool:
                calculate_distance(new_vocab, to)
                distance, path = distance_graph[(new_vocab, to)]
                if min_distance == -1 or 0 <= distance < min_distance:
                    min_distance = distance + 1
                    min_path = [from_] + path
        distance_graph[(from_, to)] = (min_distance, min_path)

    for v1 in vocabs_pool:
        for v2 in vocabs_pool:
            calculate_distance(v1, v2)
    return distance_graph


# distance_graph = create_distance_graph()
# print(distance_graph)