In [2]:
# Load the dataset
import json
def read_paragraphs_from_json(json_file):
    """Return the ``"para"`` value of the first element in a JSON file.

    Args:
        json_file: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        Whatever is stored under the ``"para"`` key of element ``0`` of the
        decoded JSON document.
    """
    with open(json_file, encoding="utf-8") as handle:
        # Only the first top-level element is used; any others are ignored.
        first_entry = json.load(handle)[0]
    return first_entry["para"]

In [4]:
import Levenshtein

def filter_out_words(input_text: str):
    """Return the upper-case tokens of *input_text* that are 2-14 chars long.

    The text is split on single space characters only, so other whitespace
    (newlines, tabs) stays inside a token and consecutive spaces yield empty
    tokens, which the length check then drops. Tokens consisting solely of
    digits are discarded. Punctuation attached to a token is kept as-is.

    Args:
        input_text: The text to tokenise.

    Returns:
        The surviving tokens, in their original order (duplicates kept).
    """
    kept = []
    for token in input_text.split(' '):
        if token.isdigit():
            continue
        # str.isupper() is True only when the token has at least one cased
        # character and every cased character is upper-case.
        if 2 <= len(token) <= 14 and token.isupper():
            kept.append(token)
    return kept

def levenstein_distance(paragraph_one, paragraph_two):
    """Collect word pairs across two paragraphs that are one edit apart.

    Both paragraphs are first reduced with ``filter_out_words``. Every word
    from the first paragraph is then compared with every word from the
    second; a pair is kept when the two words have the same length and a
    Levenshtein distance of exactly 1.

    Args:
        paragraph_one: Text of the first paragraph.
        paragraph_two: Text of the second paragraph.

    Returns:
        A list of ``(word_from_one, word_from_two)`` tuples, ordered by the
        position of the first word and then of the second. Repeated words
        produce repeated pairs.
    """
    candidates_one = filter_out_words(paragraph_one)
    candidates_two = filter_out_words(paragraph_two)

    matches: list[tuple[str, str]] = []
    for left in candidates_one:
        for right in candidates_two:
            # Equal length is required in addition to distance 1, which
            # rules out pairs that differ by an inserted or deleted char.
            if len(left) != len(right):
                continue
            if Levenshtein.distance(left, right) == 1:
                matches.append((left, right))

    return matches

# Load the English and German samples.
en_data = read_paragraphs_from_json("../data/test_sample_en_parsed.json")
de_data = read_paragraphs_from_json("../data/test_sample_de_parsed.json")

# Gather the distance-1 word pairs of every paragraph pair. The German
# paragraph is looked up at the same index as the English one.
total = 0
levenstein_1_pairs = []
for i, en_entry in enumerate(en_data):
    en_results = levenstein_distance(en_entry["para"], de_data[i]["para"])
    levenstein_1_pairs.extend(en_results)
    total += len(en_results)

# Count how many times each (w1, w2) pair was found overall.
grouped_count = {}
for pair in levenstein_1_pairs:
    if pair in grouped_count:
        grouped_count[pair] += 1
    else:
        grouped_count[pair] = 1

# Keep only occurrences of pairs seen fewer than 10 times in total.
levenstein_frequency_filtered = [
    pair for pair in levenstein_1_pairs if grouped_count[pair] < 10
]

for w1, w2 in levenstein_frequency_filtered:
    print(f"{w1!r} ↔ {w2!r}")

print(len(levenstein_frequency_filtered))

'AND' ↔ 'UND'
'AND' ↔ 'UND'
'(MFF)' ↔ '(MFR)'
'(ECB)' ↔ '(EZB)'
'ECB' ↔ 'EZB'
'(EC)' ↔ '(EG)'
6
