In [None]:
import string
import random
import numpy as np
import matplotlib.pyplot as plt

from string_matching import (
    levenshteinDistance,
    get_llm_ids_and_fullnames,
    get_llm_distances,
    get_matches,
)

In [None]:
def randomString(length, alphabet):
    """Build a random string by sampling characters from ``alphabet``.

    Every character is drawn independently with ``random.choice``, so
    characters may repeat.

    Args:
        length: Number of characters to draw.
        alphabet: Sequence of characters to sample from; it must be
            non-empty whenever ``length`` is positive, because
            ``random.choice`` cannot pick from an empty sequence.

    Returns:
        The drawn characters joined into one ``str``.
    """
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def get_random_distance(n, alphabet=string.ascii_lowercase):
    """Return the distance between two freshly drawn random strings.

    Two strings of length ``n`` are generated with ``randomString`` (first
    one, then the other) and handed to ``levenshteinDistance``.

    Args:
        n: Length of each random string.
        alphabet: Characters to sample from; defaults to the lowercase
            ASCII letters.

    Returns:
        Whatever ``levenshteinDistance`` returns for the two strings.
    """
    first, second = (randomString(n, alphabet) for _ in range(2))
    return levenshteinDistance(first, second)

def get_random_distances(n, m, alphabet=string.ascii_lowercase):
    """Collect ``m`` independent samples of ``get_random_distance``.

    Args:
        n: Length of the random strings used for every sample.
        m: Number of samples to draw.
        alphabet: Characters to sample from; defaults to the lowercase
            ASCII letters.

    Returns:
        A list with ``m`` entries, one per call to ``get_random_distance``.
    """
    samples = []
    for _ in range(m):
        samples.append(get_random_distance(n, alphabet))
    return samples

# Simulation parameters: pairs of random strings made of decimal digits.
alphabet = string.digits
n = 8                  # length of each random string
k = len(alphabet)      # alphabet size
n_samples = 1_000_000  # number of random string pairs to draw

# Seed the stdlib RNG (randomString draws with random.choice) so that a
# fresh "Restart & Run All" reproduces the same sample, statistics and plots.
random_seed = 42
random.seed(random_seed)
distances = get_random_distances(n, n_samples, alphabet)


In [None]:
# Summary statistics of the simulated distances.
distances = np.array(distances)
for stat_name, stat_fn in (('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max)):
    print(f'{stat_name}:', stat_fn(distances))

# Number of samples at exactly distance 3, 2 and 1 (printed in that order).
for exact_distance in (3, 2, 1):
    print(np.count_nonzero(distances == exact_distance))

# Fraction of samples whose distance does not exceed the cutoff.
d_cutoff = 2
n_within_cutoff = np.count_nonzero(distances <= d_cutoff)
frac_lt_cut = n_within_cutoff / n_samples
print(f'percent d <= {d_cutoff}: {frac_lt_cut*100:.5f}%')

In [None]:
# Normalised histogram of the simulated distances, one unit-width bin per
# integer distance value.
fig, ax = plt.subplots()
ax.set_title(f'Levenshtein distance [{n=:}, {k=:}, {n_samples=:.0e}]')

# Two strings of length n are between 0 and n edits apart, so centre one bin
# on every integer 0..n. Starting the edges at 0.5 would silently drop any
# d == 0 sample from the histogram even though frac_lt_cut counts it.
bin_edges = [d - 0.5 for d in range(n + 2)]

# Build the legend from d_cutoff so it always states the cutoff that
# frac_lt_cut was actually computed with.
cutoff_label = r'$\mathrm{Prob}(d\leq' + f'{d_cutoff}' + r') \approx$' + f'{frac_lt_cut:.4%}'

ax.hist(distances, bins=bin_edges, density=True, label=cutoff_label)
ax.set_xlabel('Levenshtein distance')
ax.set_ylabel('Density')
ax.legend()
fig.savefig(f'levenshtein_distance_{n}_{k}_{n_samples}.png')

In [None]:
# Read the LLM results file through get_llm_ids_and_fullnames and preview
# the first rows of the result.
llm_results_path = "tests/output/qwen2-VL-2B-results.json"
df_llm = get_llm_ids_and_fullnames(llm_results_path)
df_llm.head()

In [None]:
# Feed the LLM frame plus the two CSV paths to get_llm_distances, then
# preview the first 30 rows of what it returns.
doc_info_csv = "imgs/q11/doc_info.csv"
test_ids_csv = "tests/data/test_ids.csv"
df_test = get_llm_distances(df_llm, doc_info_csv, test_ids_csv)
df_test.head(30)

In [None]:
# Run the matching step, print the rows flagged found == False, and show the
# rows flagged as found as the cell output.
df_matching = get_matches(df_test)
not_found = df_matching.query("found == False")
print(not_found)
found_mask = df_matching.found
df_matching[found_mask]

In [None]:
# Histogram of the ID distances, with unit-width bins centred on 0..8.
fig, ax = plt.subplots()
ax.set_title('LLM Levenshtein distances from test IDs')

# The legend reports how many rows have an ID distance of at most 2.
n_close_ids = len(df_test[df_test.id_distance <= 2])
id_bin_edges = [edge - 0.5 for edge in range(10)]

ax.hist(
    df_test.id_distance,
    bins=id_bin_edges,
    label=r'# $d_\mathrm{ID} \leq 2$ = ' + f'{n_close_ids}',
)
ax.set_xlabel('Levenshtein distance')
ax.set_ylabel('Count')
ax.legend()
fig.savefig('hist_llm_ID_leven_distances.png')

In [None]:
# Histogram of the last-name distances, with unit-width bins centred on 0..13.
fig, ax = plt.subplots()
ax.set_title('LLM Levenshtein distances from test last names')

# The legend reports how many rows have a last-name distance of at most 2.
n_close_lastnames = len(df_test[df_test.lastname_distance <= 2])
lastname_bin_edges = [edge - 0.5 for edge in range(15)]

ax.hist(
    df_test.lastname_distance,
    bins=lastname_bin_edges,
    label=r'# $d_\mathrm{last} \leq 2$ = ' + f'{n_close_lastnames}',
)
ax.set_xlabel('Levenshtein distance')
ax.set_ylabel('Count')
ax.legend()
fig.savefig('hist_llm_lastname_leven_distances.png')

In [None]:
# Show both histograms as two stacked subplots of a single figure.
fig, (ax_id, ax_last) = plt.subplots(nrows=2, ncols=1, figsize=(6, 8))
fig.suptitle('LLM Levenshtein distances from test IDs and last names')

# Top panel: ID distances, bins centred on 0..8.
n_close_ids = len(df_test[df_test.id_distance <= 2])
ax_id.hist(
    df_test.id_distance,
    bins=[edge - 0.5 for edge in range(10)],
    label=r'# $d_\mathrm{ID} \leq 2$ = ' + f'{n_close_ids}',
)
ax_id.set(xlabel='Levenshtein distance', ylabel='ID Pairs Count')
ax_id.legend()

# Bottom panel: last-name distances, bins centred on 0..13.
n_close_lastnames = len(df_test[df_test.lastname_distance <= 2])
ax_last.hist(
    df_test.lastname_distance,
    bins=[edge - 0.5 for edge in range(15)],
    label=r'# $d_\mathrm{last} \leq 2$ = ' + f'{n_close_lastnames}',
)
ax_last.set(xlabel='Levenshtein distance', ylabel='Name Pairs Count')
ax_last.legend()

fig.tight_layout()
fig.savefig('hist_llm_ID_lastname_leven_distances.png')
plt.show()