In [None]:
import os
import json
import re
import pandas as pd
import numpy as np
from gensim import models
from Levenshtein import distance as levenshtein

## Load Dataset & Utility functions

In [None]:
# Load Dataset Function
def load_json(filepath: str) -> list:
    """Parse the UTF-8 encoded JSON file at ``filepath`` and return its content.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file. The callers in this
        notebook index the result as a list of dictionaries.
    """
    with open(filepath, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

# Text Preprocessing Function
def clean_text(text: str, keep_pipe=False, remove_special=False, lowercase=False) -> str:
    """Normalizes a text value before it is compared during evaluation.

    Spaces are always removed. The two optional steps are off by default, so
    calling ``clean_text(text)`` or ``clean_text(text, keep_pipe=...)`` returns
    exactly what it did before these flags existed.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. ``None`` or
            NaN) is returned unchanged.
        keep_pipe: Only relevant when ``remove_special`` is True; keeps the '|'
            character while other special characters are stripped.
        remove_special: If True, drops every character except ASCII letters,
            digits, Hangul jamo/syllables and spaces (plus '|' when
            ``keep_pipe`` is True).
        lowercase: If True, lower-cases the text (useful for English tasks).

    Returns:
        The cleaned string, or the untouched input when it is not a string.
    """
    if not isinstance(text, str):
        return text

    if remove_special:
        if keep_pipe:
            pattern = r'[^a-zA-Z0-9ㄱ-ㅎㅏ-ㅣ가-힣| ]'
        else:
            pattern = r'[^a-zA-Z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]'
        text = re.sub(pattern, '', text)

    if lowercase:
        text = text.lower()

    # This step is always applied: removes spaces from the text
    return text.replace(" ", "")

# Define Vectorization & Similarity Functions
def get_sentence_vector(sentence: str, model) -> np.ndarray:
    """Averages the word vectors of the in-vocabulary tokens of ``sentence``.

    Args:
        sentence: Text that is split on whitespace into tokens.
        model: Object exposing ``wv`` (with ``key_to_index`` and item lookup)
            and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv.key_to_index``; a zero vector of length
        ``model.vector_size`` when no token is found.
    """
    vocabulary = model.wv.key_to_index
    known_vectors = [model.wv[word] for word in sentence.split() if word in vocabulary]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Returns the cosine of the angle between ``vec1`` and ``vec2``.

    When either vector has zero norm the similarity is undefined; in that
    case the sentinel value -1.0 is returned instead.
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 > 0 and norm2 > 0:
        return np.dot(vec1, vec2) / (norm1 * norm2)
    return -1.0

## Load Pre-trained FastText Model

In [None]:
# Pick the FastText binary from the language prefix of the dataset file name.
# DATA_ORIGINAL is a placeholder: set it to the path of the reference dataset.
# Its base name must start with 'kor_' or 'eng_', otherwise the check below raises.
DATA_ORIGINAL = " "
filename = os.path.basename(DATA_ORIGINAL)

# Reject unsupported file names before choosing a model
if not filename.startswith(("kor_", "eng_")):
    raise ValueError("Invalid file name! Must start with 'kor_' or 'eng_'.")
model_path = "cc.ko.300.bin" if filename.startswith("kor_") else "cc.en.300.bin"

# Despite its name, ko_model holds whichever model was selected above
# (Korean or English); later cells refer to it by this name.
ko_model = models.fasttext.load_facebook_model(model_path)
print(f"FastText model loaded: {model_path}")

## Evaluate All Models for Different Prompt Types

In [None]:
# Define Evaluation Types
EVAL_TYPES = ["zrs", "cot", "icl"]
MODEL_NAMES = ["gpt4o", "gemini", "claude", "gpto3"]
overall_results = {}

# Load original dataset
data_original = load_json(DATA_ORIGINAL)

# Iterate over each EVAL_TYPE and compute performance
for eval_type in EVAL_TYPES:
    print(f"\nEvaluating {eval_type.upper()}...")

    # Load the model outputs for the current eval_type
    gpt4o = load_json(f"gpt4o_{eval_type}.json")
    gemini = load_json(f"gemini_{eval_type}.json")
    claude = load_json(f"claude_{eval_type}.json")
    # NOTE(review): unlike the other three files this name has no "gpto3_" prefix — confirm it is intended.
    gpto3 = load_json(f"{eval_type}.json")
    outputs = {"gpt4o": gpt4o, "gemini": gemini, "claude": claude, "gpto3": gpto3}

    # Rows are matched by position, so every source is truncated to the shortest one.
    # A mismatch is reported instead of being truncated silently.
    source_lengths = {"original": len(data_original), **{name: len(items) for name, items in outputs.items()}}
    min_length = min(source_lengths.values())
    if len(set(source_lengths.values())) > 1:
        print(f"Warning: input lengths differ {source_lengths}; evaluating the first {min_length} items only.")

    # Convert to DataFrame
    df = pd.DataFrame({
        "original": [item["original"] for item in data_original[:min_length]],
        "transformed": [item["transformed"] for item in data_original[:min_length]],
        **{name: [item["Answer"] for item in items[:min_length]] for name, items in outputs.items()},
    })

    # Apply text preprocessing. Per-column Series.map is used because
    # DataFrame.applymap is deprecated in recent pandas releases.
    df["original"] = df["original"].apply(lambda x: clean_text(x, keep_pipe=True))
    for column in ["transformed", *MODEL_NAMES]:
        df[column] = df[column].map(lambda x: clean_text(x, keep_pipe=False))

    # Replace missing values (e.g. null answers) with empty strings so the metrics below can run
    df = df.fillna('')

    # The '|'-separated references and their vectors do not depend on the
    # evaluated model, so they are computed once per eval_type.
    references = [text.split("|") for text in df["original"]]
    reference_vectors = [
        [get_sentence_vector(gt, ko_model) for gt in refs]
        for refs in references
    ]

    # Model Evaluation
    eval_results = {}

    for model in MODEL_NAMES:
        answers = df[model].tolist()

        # Exact match against any of the acceptable references (0 or 100 per row)
        df[f"{model}_accuracy"] = [
            100 if answer in refs else 0
            for answer, refs in zip(answers, references)
        ]
        # Smallest edit distance to any reference
        df[f"{model}_edit_distance"] = [
            min(levenshtein(answer, gt) for gt in refs)
            for answer, refs in zip(answers, references)
        ]
        # Largest cosine similarity to any reference; the answer vector is built once per row
        cosine_scores = []
        for answer, ref_vecs in zip(answers, reference_vectors):
            answer_vector = get_sentence_vector(answer, ko_model)
            cosine_scores.append(max(cosine_similarity(answer_vector, ref_vec) for ref_vec in ref_vecs))
        df[f"{model}_cosine_similarity"] = cosine_scores

        # Store evaluation metrics
        eval_results[model.upper()] = {
            "Accuracy": np.mean(df[f"{model}_accuracy"]),
            "Edit Distance": np.mean(df[f"{model}_edit_distance"]),
            "Cosine Similarity": np.mean(df[f"{model}_cosine_similarity"])
        }

    overall_results[eval_type.upper()] = eval_results

print("\nAll evaluations completed!")


## Display Evaluation Metrics

In [None]:
# Print one summary table per evaluation type (rows: models, columns: metrics)
for eval_type, results in overall_results.items():
    # Only the accuracy column gets a new label; the other metric names stay as they are
    df_performance = (
        pd.DataFrame.from_dict(results, orient='index')
        .rename(columns={'Accuracy': 'Accuracy(%)'})
    )

    print(f"\nPerformance for {eval_type.upper()}:")
    print(df_performance)