In [19]:
import json
import glob
import os

# Reference annotation sentences, keyed by program name. The file is decoded as
# UTF-8 explicitly so non-ASCII text is read the same way on every platform
# instead of depending on the locale's default encoding.
with open("running_man.json", "r", encoding="utf-8") as f:
    running_man = json.load(f)["annotation"]
annotations = {"RunningMan": running_man}

# Input JSON files to score. glob() returns them in arbitrary order, so the
# scoring cells iterate over sorted(files).
files = glob.glob("segmented/*.json")
# Programs to score; each entry is used as a key into `annotations`.
programs = ["RunningMan"]

# BERTScore

In [3]:
from torchmetrics.text.bert import BERTScore

# Checkpoint backing the BERTScore metric; named so it is easy to find and swap.
BERTSCORE_MODEL_NAME = "klue/roberta-base"

# Shared scorer instance, called as metric(preds, targets) by the BERTScore
# matrix cell.
metric = BERTScore(BERTSCORE_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from itertools import product

# For every input file, build a (references x recalled sentences) matrix of
# BERTScore F1 values for each recall round and write it to
# "matrices/<input file name>".
os.makedirs("matrices", exist_ok=True)

for fn in sorted(files):
    # Explicit UTF-8: the data carries non-ASCII text and must not depend on
    # the platform's default encoding.
    with open(fn, "r", encoding="utf-8") as f:
        data = json.load(f)

    user_result = {}
    for program in programs:
        references = annotations[program]
        result = {"references": references}
        for recall in ["Recall1", "Recall2"]:
            recalled = data[recall][program]

            if references and recalled:
                # product() enumerates pairs reference-major, so the flat F1
                # vector reshapes into one row per reference sentence.
                targets, preds = zip(*product(references, recalled))
                scores = metric(list(preds), list(targets))
                matrix = scores["f1"].view(len(references), -1).tolist()
                # forward() also appends this batch to the metric's running
                # state; drop it so state does not pile up across files.
                metric.reset()
            else:
                # Nothing to pair up (e.g. no recalled sentences): emit one
                # empty row per reference instead of failing on the unpack.
                matrix = [[] for _ in references]

            result[recall] = recalled
            result[f"{recall}-Matrix"] = matrix
        user_result[program] = result

    with open(f"matrices/{os.path.basename(fn)}", "w", encoding="utf-8") as f:
        json.dump(user_result, f, indent=2, ensure_ascii=False)

# BLEU

In [23]:
from konlpy.tag import Mecab

# KoNLPy Mecab tagger; its `morphs` method is what the BLEU cell uses to split
# each sentence into tokens before scoring.
mecab = Mecab()

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from itertools import product

# For every input file, build a (references x recalled sentences) matrix of
# sentence-level BLEU scores for each recall round.
#
# Results go to their own directory: the BERTScore cell writes
# "matrices/<input file name>", and writing there too would overwrite its
# output with BLEU scores.
output_dir = os.path.join("BLEU", "matrices")
os.makedirs(output_dir, exist_ok=True)

for fn in sorted(files):
    # Explicit UTF-8: the data carries non-ASCII text and must not depend on
    # the platform's default encoding.
    with open(fn, "r", encoding="utf-8") as f:
        data = json.load(f)

    user_result = {}
    for program in programs:
        references = annotations[program]
        # Tokenize every sentence once up front rather than once per pair.
        reference_tokens = [mecab.morphs(sentence) for sentence in references]
        result = {"references": references}
        for recall in ["Recall1", "Recall2"]:
            recalled = data[recall][program]
            recalled_tokens = [mecab.morphs(sentence) for sentence in recalled]

            # Row i holds the BLEU of each recalled sentence (hypothesis)
            # against reference i. With no recalled sentences every row is
            # simply empty, so no special-casing is needed.
            matrix = [
                [sentence_bleu([ref], hyp) for hyp in recalled_tokens]
                for ref in reference_tokens
            ]

            result[recall] = recalled
            result[f"{recall}-Matrix"] = matrix
        user_result[program] = result

    with open(os.path.join(output_dir, os.path.basename(fn)), "w", encoding="utf-8") as f:
        json.dump(user_result, f, indent=2, ensure_ascii=False)

# STS

In [24]:
from sentence_transformers import SentenceTransformer

# Checkpoint used to embed sentences for the similarity matrices; named so it
# is easy to find and swap.
STS_MODEL_NAME = "snunlp/KR-SBERT-V40K-klueNLI-augSTS"

# Shared encoder instance; the STS matrix cell calls model.encode(...) on it.
model = SentenceTransformer(STS_MODEL_NAME)

modules.json: 100%|██████████| 229/229 [00:00<00:00, 32.7kB/s]
config_sentence_transformers.json: 100%|██████████| 124/124 [00:00<00:00, 43.0kB/s]
README.md: 100%|██████████| 4.02k/4.02k [00:00<00:00, 1.38MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 14.1kB/s]
config.json: 100%|██████████| 707/707 [00:00<00:00, 178kB/s]
pytorch_model.bin: 100%|██████████| 467M/467M [00:06<00:00, 74.9MB/s] 
tokenizer_config.json: 100%|██████████| 394/394 [00:00<00:00, 31.1kB/s]
vocab.txt: 100%|██████████| 336k/336k [00:00<00:00, 604kB/s]
tokenizer.json: 100%|██████████| 967k/967k [00:00<00:00, 1.30MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 31.0kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 13.5kB/s]


In [49]:
from itertools import product
import numpy as np


def _unit_rows(embeddings):
    """Return a copy of `embeddings` with every row scaled to unit L2 norm."""
    embeddings = np.asarray(embeddings)
    return embeddings / np.linalg.norm(embeddings, axis=-1, ord=2, keepdims=True)


# For every input file, build a (references x recalled sentences) matrix of
# cosine similarities between sentence embeddings for each recall round and
# write it to "STS/matrices/<input file name>".
output_dir = os.path.join("STS", "matrices")

os.makedirs(output_dir, exist_ok=True)

for fn in sorted(files):
    # Explicit UTF-8: the data carries non-ASCII text and must not depend on
    # the platform's default encoding.
    with open(fn, "r", encoding="utf-8") as f:
        data = json.load(f)

    user_result = {}
    for program in programs:
        references = annotations[program]
        result = {"references": references}
        # Encode and normalize the references once per program; doing it inside
        # the recall loop would repeat the same work and re-normalize in place.
        reference_embeddings = _unit_rows(model.encode(references)) if references else None

        for recall in ["Recall1", "Recall2"]:
            recalled = data[recall][program]

            if references and recalled:
                recall_embeddings = _unit_rows(model.encode(recalled))
                # Dot product of unit vectors == cosine similarity.
                matrix = (reference_embeddings @ recall_embeddings.T).tolist()
            else:
                # Nothing to compare (e.g. no recalled sentences): emit one
                # empty row per reference instead of failing in the matmul.
                matrix = [[] for _ in references]

            result[recall] = recalled
            result[f"{recall}-Matrix"] = matrix
        user_result[program] = result

    with open(os.path.join(output_dir, os.path.basename(fn)), "w", encoding="utf-8") as f:
        json.dump(user_result, f, indent=2, ensure_ascii=False)