First, run the following command from the root directory of this repository to generate the prediction results:

```shell
# Trained model archive, evaluation data, and destination of the predictions.
# The output path matches the file this notebook loads
# (output/CE-CLCNN/wiki_title/ja/base/prediction_result.jsonl).
export archive_file="output/CE-CLCNN/wiki_title/ja/base/model.tar.gz"
export input_file="https://github.com/frederick0329/Learning-Character-Level/raw/master/data/ja_test.txt"
export output_file="output/CE-CLCNN/wiki_title/ja/base/prediction_result.jsonl"

# Variables are quoted so paths containing spaces or glob characters survive expansion.
CUDA_VISIBLE_DEVICES=0 allennlp predict \
    "$archive_file" \
    "$input_file" \
    --output-file "$output_file" \
    --use-dataset-reader \
    --dataset-reader-choice validation \
    --predictor wiki_title \
    --cuda-device 0
```

In [1]:
import warnings

# ignore warnings for t-SNE
warnings.filterwarnings('ignore')

import json
import pathlib

import numpy as np
import pandas as pd

from collections import defaultdict
from typing import Dict, List
from dataclasses import dataclass
from tqdm import tqdm

## Load the prediction result

- define some paths for loading files

In [2]:
# Repository root: the parent of the directory this notebook is executed from.
ROOT_DIR = pathlib.Path('.').resolve().parent

# JSONL file holding the prediction results that the cells below load.
prediction_result_jsonl_path = ROOT_DIR.joinpath(
    'output', 'CE-CLCNN', 'wiki_title', 'ja', 'base', 'prediction_result.jsonl'
)

# Fail fast if the prediction file has not been generated yet.
if not prediction_result_jsonl_path.exists():
    raise FileNotFoundError(prediction_result_jsonl_path)

- define helper `Result` class

In [3]:
@dataclass
class Result:
    """One record of the prediction-result JSONL file.

    The field names mirror the keys of each JSON object, so a record can be
    built with ``Result(**json.loads(line))``.
    """

    logits: List[float]  # one value per category
    tokens: List[str]  # one entry per element of `embeds`
    embeds: List[List[float]]  # one vector per element of `tokens`
    loss: float
    num_categories: int = 12  # expected length of `logits`

    def __post_init__(self) -> None:
        # Sanity checks: tokens and embeddings must line up one-to-one, and
        # there must be exactly one logit per category.
        n_tokens, n_embeds = len(self.tokens), len(self.embeds)
        assert n_tokens == n_embeds
        n_logits = len(self.logits)
        assert n_logits == self.num_categories

- load prediction result from the jsonl file

In [4]:
# Parse the prediction file: every non-empty line is one JSON object whose
# keys map onto the fields of `Result`.
results: List[Result] = []
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default locale encoding (the records contain Japanese characters).
with prediction_result_jsonl_path.open('r', encoding='utf-8') as rf:
    for line in tqdm(rf):
        # Tolerate blank lines (e.g. a stray trailing newline) instead of
        # crashing in json.loads.
        if not line.strip():
            continue
        result_dict = json.loads(line)
        result = Result(**result_dict)
        results.append(result)

161955it [01:31, 1775.50it/s]


- create dataframe of character and its embeddings

In [5]:
# Collect every embedding observed for each character across all results
# (characters keep the order in which they are first seen).
char_to_embeds: Dict[str, List[List[float]]] = defaultdict(list)

for result in results:
    for char, embed in zip(result.tokens, result.embeds):
        char_to_embeds[char].append(embed)

# Reduce each character's occurrences to a single element-wise mean vector.
char_to_embed: Dict[str, List[float]] = {
    character: np.mean(np.array(vectors), axis=0).tolist()
    for character, vectors in tqdm(char_to_embeds.items())
}

# One row per character: the character itself and its mean embedding.
df_char_embed = pd.DataFrame(list(char_to_embed.items()), columns=['char', 'embed'])
df_char_embed.head()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4164/4164 [00:22<00:00, 183.04it/s]


Unnamed: 0,char,embed
0,梅,"[0.0, 0.09442270647608832, 0.0, 0.0, 0.0, 0.30..."
1,棹,"[0.0, 0.3299435079097748, 0.0, 0.0, 0.0, 0.446..."
2,忠,"[0.0, 0.3318681095811454, 0.0, 0.0, 0.0, 0.0, ..."
3,夫,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,加,"[0.0, 0.3346048396916444, 0.0, 0.0, 0.0, 0.0, ..."


## K-nearest neighbor-based analysis

In [6]:
from sklearn.neighbors import NearestNeighbors

# Stack the per-character mean embeddings into one array for scikit-learn.
char_embeds = np.array(df_char_embed['embed'].tolist())

# Fit a 10-nearest-neighbor index over the character embeddings.
knn = NearestNeighbors(n_jobs=4, n_neighbors=10)
knn.fit(char_embeds)

NearestNeighbors(n_jobs=4, n_neighbors=10)

In [7]:
# Query the index with the very embeddings it was fitted on: row i of `dists` /
# `indices` describes the neighbors of row i of `char_embeds`.
dists, indices = knn.kneighbors(X=char_embeds, return_distance=True)

In [8]:
def get_df_char_dist(
    df_char_embed: pd.DataFrame,
    target_character: str,
    knn_dists: np.ndarray,
    knn_indices: np.ndarray,
) -> pd.DataFrame:
    """Return the nearest neighbors of ``target_character`` with their distances.

    Args:
        df_char_embed: Frame with a ``char`` column. Row *i* (by position) must
            correspond to row *i* of ``knn_dists`` and ``knn_indices``.
        target_character: Character to look up in ``df_char_embed['char']``.
            If it occurs more than once, the first occurrence is used.
        knn_dists: Array whose row *i* holds the neighbor distances of row *i*.
        knn_indices: Array whose row *i* holds the positional row numbers of
            the neighbors of row *i*.

    Returns:
        A new frame with columns ``char`` and ``dist``, one row per neighbor,
        in the order given by ``knn_indices``. Its index carries the labels of
        the selected rows of ``df_char_embed``.

    Raises:
        IndexError: If ``target_character`` is not present in
            ``df_char_embed['char']``.
    """
    # Locate the character by row *position*, not by index label: the kNN
    # arrays are plain NumPy arrays, so labels only happen to work when the
    # frame has a default 0..n-1 index.
    matches = np.flatnonzero((df_char_embed['char'] == target_character).to_numpy())
    if matches.size == 0:
        # IndexError is what the label-based lookup raised for a missing
        # character; keep the type but say what actually went wrong.
        raise IndexError(
            f"character {target_character!r} not found in df_char_embed['char']"
        )
    target_pos = matches[0]

    target_dists = knn_dists[target_pos]
    target_indices = knn_indices[target_pos]

    # Copy so that adding the distance column never touches the caller's frame.
    df_neighbors = df_char_embed.iloc[target_indices][['char']].copy()
    df_neighbors['dist'] = target_dists
    return df_neighbors

- Nearest-neighbor result for `銅` (copper), which contains the radical `金` (gold)

In [9]:
get_df_char_dist(
    df_char_embed=df_char_embed,
    target_character='銅',
    knn_dists=dists,
    knn_indices=indices,
)

Unnamed: 0,char,dist
906,銅,0.0
1643,錫,0.924561
2939,殉,0.991584
3018,鉤,1.009748
2463,銘,1.03426
2781,拉,1.038966
2113,鋸,1.058522
1868,紘,1.062821
3812,鎔,1.0689
3339,鏑,1.084629


- Nearest-neighbor result for `痛` (pain), which contains the radical `疒` (disease)

In [10]:
get_df_char_dist(
    df_char_embed=df_char_embed,
    target_character='痛',
    knn_dists=dists,
    knn_indices=indices,
)

Unnamed: 0,char,dist
2299,痛,0.0
3985,濵,0.595612
3361,臈,0.611274
2735,癌,0.665778
2424,瘤,0.671743
1196,庵,0.697978
2330,碇,0.703681
3536,鮑,0.707066
573,硫,0.727393
2899,癬,0.728537


- Nearest-neighbor result for `い`, which is a Hiragana character

In [11]:
get_df_char_dist(
    df_char_embed=df_char_embed,
    target_character='い',
    knn_dists=dists,
    knn_indices=indices,
)

Unnamed: 0,char,dist
394,い,0.0
1748,ぃ,0.772097
3297,φ,1.061046
888,ゆ,1.078242
414,だ,1.154581
409,し,1.156721
2323,ぁ,1.16477
3602,∀,1.166024
1883,“,1.184222
1209,ね,1.189639
