In [1]:
# from pprint import pprint as print
import random

# 1. Building Knowledge Graph

## 1.1 Extract Data

In [2]:
import csv

In [3]:
def get_data_from_csv(filepath: str):
    data = []
    with open(filepath) as file:
        next(file)
        rows = csv.reader(file, delimiter=",")
        for row in rows:
            num,kanji,radicals,meanings,theme,subtheme = row
            radicals = radicals.split(':')
            meanings = meanings.split(':')
            data.append({
                'kanji': kanji,
                'radicals': radicals,
                'meanings': meanings,
                'theme': theme,
                'subtheme': subtheme,
            })
    return data

In [4]:
data = get_data_from_csv(filepath="s7_kanji_total_clean.csv")
print(data[:2])

[{'kanji': '亜', 'radicals': ['｜', '口'], 'meanings': ['Asia', 'rank next', 'come after', '-ous'], 'theme': '', 'subtheme': ''}, {'kanji': '哀', 'radicals': ['亠', '口', '衣'], 'meanings': ['sadness or pity'], 'theme': 'Emotions_and_Senses', 'subtheme': 'Emotions'}]


In [5]:
# sample data
# data = data[:60]

## 1.2 Get List of Nodes

In [6]:
class KanjigenNode:
    def __init__(self, name, dtype):
        self.name = name
        self.dtype = dtype
        
        mtc = {}
        mtc['kanji']   = 'red'
        mtc['radical'] = 'orange'
        mtc['meaning'] = 'meaning'
        
        self.color = mtc[dtype]

    def __repr__(self):
        return self.name

In [7]:
def get_list_node(data):
    list_kanji, list_radical, list_meaning, list_theme, list_subtheme = set(), set(), set(), set(), set()

    for d in data:
        list_kanji    |= set(d['kanji'])
        list_radical  |= set(d['radicals'])
        list_meaning  |= set(d['meanings'])
        list_theme.add(d['theme'])
        list_subtheme.add(d['subtheme'])

    list_radical.remove(''), list_theme.remove(''), list_subtheme.remove('')

    list_node = []
    # tuple = (0,1,2) = (symbol, type, color)
    list_node += [KanjigenNode(k, "kanji")       for k in list_kanji]
    list_node += [KanjigenNode(r, "radical")     for r in list_radical]
#     list_node += [KanjigenNode(m, "meaning")   for m in list_meaning]
#     list_node += [KanjigenNode(t, "theme")     for t in list_theme]
#     list_node += [KanjigenNode(s, "subtheme")  for s in list_subtheme]

    return list_node

In [8]:
list_node = get_list_node(data)
print(list_node[:10])

[家, 己, 心, 飽, 耕, 助, 出, 妹, 付, 校]


### 1.2.1 Helper

In [9]:
def find_node(list_node, name, dtype):
    list_node_found = list(filter(lambda node: node.name == name and node.dtype == dtype, list_node))
    if len(list_node_found) > 0:
        return list_node_found[0]
    return None

## 1.3 Get List of Edge

In [10]:
def get_list_edge(data, list_node):
    list_edge = []
    
    for d in data:
        node_kanji = find_node(list_node, d['kanji'], 'kanji')
            
        for radical in d['radicals']:
            node_radical = find_node(list_node, radical, 'radical')
            if node_radical != None:
                list_edge += [(node_kanji, node_radical)]
            
#         for meaning in d['meanings']:
#             node_meaning = find_node(list_node, meaning)
#             list_edge += [(node_kanji, node_meaning)]
        

    return list_edge

In [11]:
list_edge = get_list_edge(data, list_node)
print(list_edge)

[(亜, ｜), (亜, 口), (哀, 亠), (哀, 口), (哀, 衣), (挨, 厶), (挨, 矢), (挨, 扌), (愛, 冖), (愛, 夂), (愛, 心), (愛, 爪), (曖, 冖), (曖, 夂), (曖, 心), (曖, 日), (曖, 爪), (悪, ｜), (悪, 口), (悪, 心), (握, 厶), (握, 土), (握, 尸), (握, 至), (握, 扌), (圧, 厂), (圧, 土), (扱, 扌), (扱, 及), (宛, 卩), (宛, 夕), (宛, 宀), (嵐, 山), (嵐, 風), (安, 女), (安, 宀), (案, 女), (案, 宀), (案, 木), (暗, 日), (暗, 立), (暗, 音), (以, ｜), (以, 丶), (以, 人), (衣, 亠), (位, 立), (位, 亻), (囲, 囗), (囲, 井), (医, 匚), (医, 矢), (依, 亠), (依, 衣), (依, 亻), (委, 女), (委, 禾), (威, 丿), (威, 厂), (威, 女), (威, 戈), (為, 丿), (為, 灬), (畏, 田), (畏, 衣), (胃, 月), (胃, 田), (尉, 寸), (尉, 尸), (尉, 示), (異, 八), (異, 田), (異, 井), (移, 夕), (移, 禾), (萎, 女), (萎, 禾), (萎, 艹), (偉, 口), (偉, 韋), (偉, 亻), (椅, 亅), (椅, 口), (椅, 大), (椅, 木), (彙, 冖), (彙, 彐), (彙, 木), (彙, 田), (彙, 彑), (意, 心), (意, 日), (意, 立), (意, 音), (違, 口), (違, 韋), (違, 辶), (維, 糸), (維, 隹), (慰, 寸), (慰, 尸), (慰, 心), (慰, 示), (遺, 口), (遺, 貝), (遺, 辶), (緯, 口), (緯, 糸), (緯, 韋), (域, 口), (域, 土), (域, 戈), (育, 亠), (育, 厶), (育, 月), (壱, 冖), (壱, 匕), (壱, 士), (逸, 丿), (逸, 儿), (逸, 勹), (逸, 辶), (逸, 免), (茨, 冫), (茨, 欠),

## 1.4 Graph

In [12]:
import networkx as nx

In [13]:
G = nx.Graph()

In [14]:
G.add_nodes_from(list_node)
list(G.nodes())[:10]

[家, 己, 心, 飽, 耕, 助, 出, 妹, 付, 校]

In [15]:
G.add_edges_from(list_edge)
list(G.edges())[:10]

[(家, 宀),
 (家, 豕),
 (飽, 勹),
 (飽, 己),
 (飽, 食),
 (耕, ｜),
 (耕, 亅),
 (耕, 八),
 (耕, 土),
 (耕, 木)]

## 1.5 Visualization

In [16]:
import matplotlib
import matplotlib.pyplot as plt

### 1.5.1 Install Font

In [17]:
import matplotlib.font_manager as fm

# Reference: https://albertauyeung.github.io/2020/03/15/matplotlib-cjk-fonts.html
[f for f in fm.fontManager.ttflist if 'CJK JP' in f.name]

[<Font 'Noto Serif CJK JP' (NotoSerifCJK-Regular.ttc) normal normal 400 normal>,
 <Font 'Noto Sans CJK JP' (NotoSansCJK-Bold.ttc) normal normal 700 normal>,
 <Font 'Noto Serif CJK JP' (NotoSerifCJK-Bold.ttc) normal normal 700 normal>,
 <Font 'Noto Sans CJK JP' (NotoSansCJK-Regular.ttc) normal normal 400 normal>]

In [18]:
fname = "Noto Serif CJK JP"

### 1.5.2 Sample Visualization

In [19]:
k = 200
sample_nodes = random.sample(G.nodes, k)
sg = G.subgraph(sample_nodes)

In [None]:
color_map = [n.color for n in G]

plt.figure(1,figsize=(10,10)) 
nx.draw_kamada_kawai(G, node_color=color_map, with_labels=True, node_size=1000, font_size=20,font_family=fname)
plt.show()

In [None]:
import pydot
from networkx.drawing.nx_pydot import graphviz_layout
pos = graphviz_layout(G, prog="twopi")
nx.draw(G, pos)
plt.show()

In [None]:
p = find_node(list_node, '痘', 'kanji')

[n for n in G.neighbors(p)]

# 2. Querying Knowledge Graph