In [1]:
import os
import json
import re
from difflib import SequenceMatcher
from Levenshtein import ratio  # Levenshtein Distance
from jamo import h2j  # Converts Hangul characters into Jamo units

## Determine Language & Define File Paths

In [7]:
# The language is derived from the reference file's name: a "kor_" prefix means
# Korean, anything else is treated as English.
DATA_ORIGINAL = "eng_abbreviations.json"
filename = os.path.basename(DATA_ORIGINAL)
IS_ENGLISH = not filename.startswith("kor_")
IS_KOREAN = not IS_ENGLISH

# Answer files to process; each one is read, filtered and written back in place.
# NOTE(review): the entries are blank strings here — presumably placeholders that
# must be replaced with real JSON paths before running.
file_paths = [
    " ",
    " ",
    " ",
]

## Load Reference Dataset

In [3]:
# Read the reference dataset (a JSON array of records) from disk
with open(DATA_ORIGINAL, encoding="utf-8") as file:
    abbreviation_data = json.load(file)

# Build the lookup table: record["transformed"] (abbreviation) -> record["original"] (phrase).
# If the same "transformed" value appears more than once, the last record wins.
abbreviation_dict = dict(
    (record["transformed"], record["original"]) for record in abbreviation_data
)

## Preprocess Text Data

In [4]:
# Text normalisation helper
def clean_text(text: str, keep_pipe=False) -> str:
    """
    Normalise a string so that answers and references can be compared.

    Steps, in order: strip the characters [ ] " , \\ ? & *, lowercase the text
    when the module-level IS_ENGLISH flag is set, drop '|' unless `keep_pipe`
    is True, and finally remove every space character.

    Args:
        text (str): The value to clean. Anything that is not a `str` is
            returned unchanged.
        keep_pipe (bool): If True, the '|' separator is kept (used for
            reference phrases that list several alternatives).

    Returns:
        str: The cleaned text (or the untouched input when it is not a string).
    """
    # Non-string values are handed back as they came in
    if not isinstance(text, str):
        return text

    # Strip the special characters [ ] " , \ ? & *
    cleaned = re.sub(r'[\[\]\",\\\?\&\*]', '', text)

    # Case only matters for English data
    if IS_ENGLISH:
        cleaned = cleaned.lower()

    # '|' separates alternative references; drop it unless asked to keep it
    if not keep_pipe:
        cleaned = cleaned.replace("|", "")

    # Finally remove spaces
    return cleaned.replace(" ", "")

## Define Similarity Metrics

In [5]:
# Similarity measurement functions
def sequence_similarity(a, b):
    """Return the difflib SequenceMatcher ratio of `a` and `b` (no junk filter)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def levenshtein_similarity(a, b):
    """Return the Levenshtein `ratio` similarity score of `a` and `b`."""
    score = ratio(a, b)
    return score

# Jamo extraction helper (Jamo: the letter units that make up Hangul syllables)
def get_jamo(word):
    """Return the Jamo decomposition of `word` as produced by h2j()."""
    decomposed = h2j(word)  # Hangul -> Jamo
    return decomposed

# Jamo-level similarity (only used for Korean data)
def jamo_similarity(word1, word2):
    """
    Position-wise similarity of two words after Jamo decomposition.

    Both words are decomposed with get_jamo(); the score is the number of
    positions at which the two Jamo sequences hold the same unit, divided by
    the length of the longer sequence. Returns 0 when both sequences are empty.
    """
    units1 = get_jamo(word1)
    units2 = get_jamo(word2)

    # Count aligned positions that agree (zip stops at the shorter sequence)
    matching = 0
    for left, right in zip(units1, units2):
        if left == right:
            matching += 1

    longest = max(len(units1), len(units2))
    if longest <= 0:
        return 0  # nothing to compare
    return matching / longest

## Answer Selection & Processing

In [None]:
# Split an answer string into its single-quoted items (commas inside quotes survive)
def smart_split(answer_text):
    """
    Return the pieces of `answer_text` that are wrapped in single quotes.

    When no quoted piece is found, the whole text (with leading/trailing
    single quotes stripped) is returned as a one-element list.
    """
    quoted_parts = re.findall(r"'(.*?)'", answer_text)  # shortest '...' spans
    if quoted_parts:
        return quoted_parts
    return [answer_text.strip("'")]

# Pick the candidate answer closest to any of the accepted reference phrases
def get_best_match(answer_list, original_list, is_korean):
    """
    Return the answer that scores highest against any reference phrase.

    Each (answer, reference) pair is scored as SequenceMatcher similarity plus
    Levenshtein similarity, plus Jamo similarity when `is_korean` is true.
    The first answer reaching the highest score wins; None is returned when
    either list is empty.
    """
    def _pair_score(reference, candidate):
        # Sum of the individual similarity metrics for one pair
        total = (sequence_similarity(reference, candidate) +
                 levenshtein_similarity(reference, candidate))
        if is_korean:
            total += jamo_similarity(reference, candidate)
        return total

    winner, top_score = None, -1

    for candidate in answer_list:
        for reference in original_list:
            current = _pair_score(reference, candidate)
            # Strict comparison: ties keep the earlier candidate
            if current > top_score:
                winner, top_score = candidate, current

    return winner

# Handle every answer file on its own: load, keep one answer per entry, write back
for file_path in file_paths:
    with open(file_path, encoding="utf-8") as file:
        data = json.load(file)

    removed_count = 0  # how many candidate answers were dropped in this file
    selection_log = []  # one line per entry that had several candidates

    for entry in data:
        word = entry.get("word")
        original = abbreviation_dict.get(word)

        # Nothing to do without a reference phrase or without an answer
        if not original or "Answer" not in entry:
            continue

        # Reference phrase: cleaned, but '|' is kept so alternatives can be split
        original_cleaned = clean_text(original, keep_pipe=True)
        if "|" in original_cleaned:
            original_list = original_cleaned.split("|")
        else:
            original_list = [original_cleaned]

        # Candidate answers: split on quotes, then cleaned one by one
        answer_list = [clean_text(ans) for ans in smart_split(entry["Answer"])]

        # Entries with a single candidate are left exactly as they are
        if len(answer_list) <= 1:
            continue

        best_match = get_best_match(answer_list, original_list, IS_KOREAN)

        # Record what was kept and what was dropped, for later verification
        removed_answers = [ans for ans in answer_list if ans != best_match]
        selection_log.append(f"Word: {word} | Original: {original_cleaned} | Selected: {best_match} | Removed: {removed_answers}")

        # Replace the Answer field with the winning candidate
        entry["Answer"] = best_match
        removed_count += len(removed_answers)

    # Overwrite the input file with the filtered data
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    # Report what happened for this file
    print(f"Processed {file_path}: Removed {removed_count} incorrect answers")
    print("\n".join(selection_log))  # per-entry selection details