In [1]:
import os
import json
import re
import pandas as pd
import numpy as np
from gensim import models
from Levenshtein import distance as levenshtein

## Load Dataset

In [34]:
def load_json(filepath: str) -> list:
    """
    Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        filepath (str): Location of the JSON file on disk.

    Returns:
        list: Whatever ``json`` parses from the file; the notebook expects a
        list of dictionaries.
    """
    with open(filepath, mode="r", encoding="utf-8") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

# Input locations. These are blank placeholders: fill in the path of each
# JSON file before running this cell.
DATA_ZRS = " "
DATA_COT_ICL = " "
DATA_COT = " "
DATA_ORIGINAL = " "

# Read the four datasets in the same order as the constants above.
data_zrs, data_cot_icl, data_cot, data_original = [
    load_json(path)
    for path in (DATA_ZRS, DATA_COT_ICL, DATA_COT, DATA_ORIGINAL)
]

print(f"Loaded dataset: {len(data_original)} original samples")

Loaded dataset: 727 original samples


## Preprocess Text Data

In [None]:
# Text Preprocessing Function
def clean_text(text: str, keep_pipe=False, lowercase=False) -> str:
    """
    Normalises a string for answer comparison.

    Every character that is not an ASCII letter, a digit, a Hangul jamo or
    syllable, or a space is removed; afterwards all spaces are removed as well.

    Args:
        text (str): The input string to be cleaned. Non-string values
            (e.g. None / NaN) are returned unchanged.
        keep_pipe (bool): If True, retains the '|' character (used for original
            answers that list several references separated by '|').
        lowercase (bool): If True, lower-cases the text (useful for English
            tasks). Defaults to False, which leaves the case untouched.

    Returns:
        str: The cleaned text, or the input itself when it is not a string.
    """
    if not isinstance(text, str):
        # Leave missing / non-text values alone so the caller can handle them.
        return text

    # Same whitelist in both modes; '|' is only allowed when requested.
    if keep_pipe:
        disallowed = r'[^a-zA-Z0-9ㄱ-ㅎㅏ-ㅣ가-힣| ]'
    else:
        disallowed = r'[^a-zA-Z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]'
    text = re.sub(disallowed, '', text)

    if lowercase:
        text = text.lower()

    # This step is always applied: spaces are removed from the text.
    return text.replace(" ", "")

# Convert to DataFrame (Min-Length Matching)
# Truncate every dataset to the shortest one so rows stay aligned by position.
min_length = min(len(data_original), len(data_zrs), len(data_cot_icl), len(data_cot))

# The consonant/vowel combination dataset stores its pair under different keys.
if "kor_consonant_vowel_combination" in DATA_ORIGINAL:
    original_key, transformed_key = "words", "random"
else:
    original_key, transformed_key = "original", "transformed"

df = pd.DataFrame({
    "original": [item[original_key] for item in data_original[:min_length]],
    "transformed": [item[transformed_key] for item in data_original[:min_length]],
    "zrs_answer": [item["Answer"] for item in data_zrs[:min_length]],
    "cot_answer": [item["Answer"] for item in data_cot[:min_length]],
    "cot_icl_answer": [item["Answer"] for item in data_cot_icl[:min_length]]
})

# Replace missing values first so that every cell reaches clean_text as text.
df = df.fillna('')

# Apply Text Preprocessing
# Only the abbreviation task lists several references separated by '|'; the
# separator must survive cleaning because evaluation splits "original" on it.
keep_pipe_in_original = "kor_abbreviations" in DATA_ORIGINAL
df["original"] = df["original"].apply(lambda x: clean_text(x, keep_pipe=keep_pipe_in_original))

# Each column is cleaned exactly once. A later blanket pass over the whole
# frame would strip the '|' again, and column-wise apply avoids the
# deprecated DataFrame.applymap.
for column in ["transformed", "zrs_answer", "cot_answer", "cot_icl_answer"]:
    df[column] = df[column].apply(lambda x: clean_text(x, keep_pipe=False))

print("Text preprocessing completed.")

## Load Evaluation Model

In [None]:
# Pick the FastText binary that matches the dataset language, which is
# encoded in the prefix of the original file's name.
filename = os.path.basename(DATA_ORIGINAL)  # file name without directories

if not filename.startswith(("kor_", "eng_")):
    raise ValueError("Invalid file name! Must start with 'kor_' or 'eng_'.")

model_path = "cc.ko.300.bin" if filename.startswith("kor_") else "cc.en.300.bin"

# NOTE: the variable is called ko_model even when the English model is loaded;
# the evaluation cell refers to it by this name.
ko_model = models.fasttext.load_facebook_model(model_path)
print(f"FastText model loaded: {model_path}")

## Define Vectorization & Similarity Functions

In [37]:
# Define Vectorization & Similarity Functions
def get_sentence_vector(sentence: str, model) -> np.ndarray:
    """
    Builds a sentence embedding by averaging the vectors of its known tokens.

    The sentence is split on whitespace; tokens that are not present in
    ``model.wv.key_to_index`` are skipped.

    Args:
        sentence (str): The input sentence.
        model: Model exposing ``wv`` (keyed vectors) and ``vector_size``.

    Returns:
        np.ndarray: Element-wise mean of the known token vectors, or a zero
        vector of length ``model.vector_size`` when no token is known.
    """
    vocabulary = model.wv.key_to_index
    known_words = [word for word in sentence.split() if word in vocabulary]
    if not known_words:
        return np.zeros(model.vector_size)

    word_vectors = [model.wv[word] for word in known_words]
    return np.mean(word_vectors, axis=0)

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Cosine of the angle between two vectors.

    Args:
        vec1 (np.ndarray): First vector.
        vec2 (np.ndarray): Second vector.

    Returns:
        float: Dot product divided by the product of the norms. When either
        norm is not strictly positive, the sentinel value -1.0 is returned.
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)

    # Guard against division by zero for all-zero (or invalid) vectors.
    if not (norm1 > 0 and norm2 > 0):
        return -1.0

    return np.dot(vec1, vec2) / (norm1 * norm2)


## Evaluate Models and Compute Metrics

In [None]:
# Score every prompting strategy against the reference answers.
overall_results = {}

for model in ["zrs", "cot", "cot_icl"]:
    # Pair each prediction with its acceptable references; the "original"
    # column may hold several alternatives separated by '|'.
    scored_pairs = [
        (answer, reference.split("|"))
        for answer, reference in zip(df[f"{model}_answer"], df["original"])
    ]

    # Exact match against any reference counts as 100, otherwise 0.
    df[f"{model}_accuracy"] = [
        100 if answer in references else 0
        for answer, references in scored_pairs
    ]

    # Smallest Levenshtein distance to any reference.
    df[f"{model}_edit_distance"] = [
        min(levenshtein(answer, gt) for gt in references)
        for answer, references in scored_pairs
    ]

    # Largest embedding cosine similarity to any reference.
    df[f"{model}_cosine_similarity"] = [
        max(
            cosine_similarity(get_sentence_vector(answer, ko_model), get_sentence_vector(gt, ko_model))
            for gt in references
        )
        for answer, references in scored_pairs
    ]

    # Column means become the summary metrics for this strategy.
    overall_results[model.upper()] = {
        "Accuracy": np.mean(df[f"{model}_accuracy"]),
        "Edit Distance": np.mean(df[f"{model}_edit_distance"]),
        "Cosine Similarity": np.mean(df[f"{model}_cosine_similarity"])
    }

print("Evaluation completed.")

## Display Final Performance Results

In [None]:
# One row per strategy, one column per metric; only the accuracy column
# needs a new label (it is expressed as a percentage).
df_performance = (
    pd.DataFrame.from_dict(overall_results, orient='index')
    .rename(columns={'Accuracy': 'Accuracy(%)'})
)

print("Overall Model Performance")
df_performance