In [1]:
import re
from glob import glob

import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
from PIL import Image
from pycasia.CASIA import CASIA

In [2]:
### Whole Chinese character list
# Every character on the first line of the file becomes one entry of all_chas.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale (assumes the file is UTF-8 -- TODO confirm).
# readline() returns '' for an empty file, so an empty file yields an empty
# list instead of an IndexError.
with open('data/tc_cha_4808.txt', 'r', encoding='utf-8') as fp:
    all_chas = list(fp.readline())

In [3]:
# Read both decomposition tables as raw text lines.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale (assumes the files are UTF-8 -- TODO confirm).
with open('data/ids-cdp-character.csv', 'r', encoding='utf-8') as fp:
    cdp = fp.readlines()

with open('data/ids-cdp-radicals.csv', 'r', encoding='utf-8') as fp:
    rad = fp.readlines()

# Append the radical rows so both tables are parsed together.
cdp.extend(rad)

In [4]:
## Keep only the first decomposition rule of each row.
## Strip the trailing newline and drop empty fields.

# A bracketed tag of 1-5 word characters; everything from the first tag
# onwards is discarded. Raw string: '\[' and '\w' are not valid escapes in a
# normal string literal.
tag_pattern = re.compile(r'\[\w{1,5}\]')

cdp_2 = []
for line in tqdm.notebook.tqdm(cdp):
    matchObj = tag_pattern.search(line)
    if matchObj:
        # Text before the first tag (not stripped, as before).
        kept = line[:matchObj.start()]
    else:
        kept = line.rstrip()
    cdp_2.append([field for field in kept.split(',') if field != ''])

all_cha = pd.DataFrame(cdp_2)
print(all_cha.shape)
all_cha.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=89591.0), HTML(value='')))


(89591, 6)


Unnamed: 0,0,1,2,3,4,5
0,U+2460,①,①,,,
1,U+2461,②,②,,,
2,U+2462,③,③,,,
3,U+2463,④,④,,,
4,U+2464,⑤,⑤,,,


In [5]:
## Keep only the common Chinese characters listed in all_chas; rows for
## Japanese/Korean glyphs, symbols and other code points are dropped.

# .copy() gives chi_cha its own data. Later cells add columns to chi_cha, and
# assigning into a boolean-mask slice of all_cha triggers pandas'
# SettingWithCopyWarning.
chi_cha = all_cha[all_cha[1].isin(all_chas)].copy()
chi_cha.shape

(4808, 6)

In [6]:
# The twelve structure operators that may appear in a decomposition string.
struc_set = {
    '⿰', '⿱', '⿲', '⿳',
    '⿴', '⿵', '⿶', '⿷',
    '⿸', '⿹', '⿺', '⿻',
}

In [7]:
def rad_decompose(string):
    """Split a component string into a list of tokens.

    An entity reference of the form ``&NAME;`` (``NAME`` made of ``A-Z``,
    ``0-9`` and ``-``) is kept together as a single token; every other
    character becomes a token of its own.

    Parameters
    ----------
    string : str
        The text to tokenise.

    Returns
    -------
    list of str
        The tokens in their original order (empty for an empty input).
    """
    entity = re.compile('&[-A-Z0-9]+;')
    tokens = []
    pos, end = 0, len(string)
    while pos < end:
        found = entity.match(string, pos)
        # Consume the whole entity when one starts here, else one character.
        nxt = found.end() if found else pos + 1
        tokens.append(string[pos:nxt])
        pos = nxt
    return tokens

def decompose(series):
    """Recursively expand one row of ``all_cha`` into a nested decomposition.

    Parameters
    ----------
    series : pandas.Series
        A row of ``all_cha``: ``series[1]`` is the character and ``series[2]``
        its decomposition string.

    Returns
    -------
    str or dict
        ``series[2]`` unchanged when it equals ``series[1]`` (a leaf that does
        not decompose further). Otherwise a dict that maps each run of
        structure operators found in the decomposition string to the list of
        components following that run, each component itself decomposed.

    Raises
    ------
    ValueError
        If a component does not match exactly one row of ``all_cha``.
    """
    # Character class holding only the twelve structure operators. It used to
    # be written as a comma-separated list, which also made ',' and ' '
    # members of the class.
    pattern_stc = re.compile('[⿰⿱⿲⿳⿴⿵⿶⿷⿸⿹⿺⿻]+')

    if series[1] == series[2]:
        ## Leaf / radical: nothing left to expand.
        return series[2]

    root = series[2]
    # split() also returns the text before the first operator run as element
    # 0; it is dropped so that split[i] is the text following matchObj[i].
    split = pattern_stc.split(root)[1:]
    matchObj = pattern_stc.findall(root)

    # NOTE: keys are operator runs, so two identical runs inside the same
    # decomposition string overwrite each other.
    root_split = {}
    for i, k in enumerate(split):
        root_split[matchObj[i]] = rad_decompose(k)

    for components in root_split.values():
        for ii, vv in enumerate(components):
            matches = all_cha[all_cha[1] == vv]
            if len(matches) != 1:
                # Zero or several rows used to surface as pandas' opaque
                # "truth value ... is ambiguous" ValueError one level deeper.
                raise ValueError(
                    'component {!r} matches {} rows of all_cha, expected exactly 1'
                    .format(vv, len(matches))
                )
            components[ii] = decompose(matches.iloc[0])
    return root_split

In [8]:
# Spot-check the recursive decomposition on three sample characters.
for sample in ('萬', '里', '雲'):
    print(decompose(all_cha[all_cha[1] == sample].squeeze()))

{'⿱': [{'⿻': ['十', '丨']}, '禺']}
{'⿱': [{'⿻': ['甲', '一']}, '一']}
{'⿱': [{'⿱': ['一', {'⿻': ['冂', {'⿻': ['丨', {'⿱': ['丷', '八']}]}]}]}, {'⿱': [{'⿱': ['一', '一']}, '厶']}]}


In [9]:
%%time
# Decompose every retained character; each row is handed to decompose as a Series.
chi_cha['hierarchical'] = chi_cha.apply(decompose, axis=1)

CPU times: user 3min 11s, sys: 464 ms, total: 3min 11s
Wall time: 3min 12s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
def depth_loc(x, loc=None):
    """Flatten a nested decomposition into ``[token, position-path]`` pairs.

    Parameters
    ----------
    x : dict, list or str
        A decomposition as returned by ``decompose``: a dict maps a structure
        key to its list of components, a list holds sibling components, and a
        str is a leaf.
    loc : list of int, optional
        Position path of ``x``. Defaults to ``[1]`` (the root).

    Returns
    -------
    list
        One ``[token, path]`` entry per dict key and per leaf string, in
        traversal order. The i-th element of a list gets the path of its
        parent extended by ``i + 1``. Any other input type yields ``[]``.
    """
    # A None sentinel replaces the former mutable default ``[1]``. That single
    # list object was stored inside the returned entries, so a caller
    # mutating a result would silently change every later call.
    if loc is None:
        loc = [1]

    tmp = []
    if isinstance(x, dict):
        for k, v in x.items():
            # Store a copy so entries never alias each other's path list.
            tmp.append([k, list(loc)])
            tmp.extend(depth_loc(v, loc))
    elif isinstance(x, list):
        for i, k in enumerate(x):
            tmp.extend(depth_loc(k, loc + [i + 1]))
    elif isinstance(x, str):
        tmp = [[x, list(loc)]]
    return tmp

In [11]:
# Spot-check the flattened [token, path] pairs for three sample characters.
for sample in ('萬', '里', '雲'):
    print(depth_loc(chi_cha.loc[chi_cha[1] == sample, 'hierarchical'].values[0]))

[['⿱', [1]], ['⿻', [1, 1]], ['十', [1, 1, 1]], ['丨', [1, 1, 2]], ['禺', [1, 2]]]
[['⿱', [1]], ['⿻', [1, 1]], ['甲', [1, 1, 1]], ['一', [1, 1, 2]], ['一', [1, 2]]]
[['⿱', [1]], ['⿱', [1, 1]], ['一', [1, 1, 1]], ['⿻', [1, 1, 2]], ['冂', [1, 1, 2, 1]], ['⿻', [1, 1, 2, 2]], ['丨', [1, 1, 2, 2, 1]], ['⿱', [1, 1, 2, 2, 2]], ['丷', [1, 1, 2, 2, 2, 1]], ['八', [1, 1, 2, 2, 2, 2]], ['⿱', [1, 2]], ['⿱', [1, 2, 1]], ['一', [1, 2, 1, 1]], ['一', [1, 2, 1, 2]], ['厶', [1, 2, 2]]]


In [12]:
%%time
# Flatten each nested decomposition into its list of [token, path] pairs.
chi_cha['depth_loc'] = chi_cha['hierarchical'].apply(depth_loc)

CPU times: user 187 ms, sys: 6.7 ms, total: 194 ms
Wall time: 196 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


# Embedding

In [13]:
# Every distinct token that occurs in any character's depth_loc list ...
rad = {entry[0] for entries in chi_cha.depth_loc.values for entry in entries}

# ... minus the single structure operators and the listed two-operator runs,
# leaving only the component (radical) tokens.
rad = rad.difference(
    struc_set,
    {'⿰⿱', '⿰⿳', '⿰⿵', '⿱⿰', '⿱⿱', '⿱⿲', '⿱⿳', '⿱⿴',
     '⿱⿵', '⿱⿶', '⿱⿻', '⿳⿰', '⿳⿲', '⿸⿱', '⿻⿱'},
)

#### config

In [14]:
# Weighting parameters read by the embedding loop and used in the CSV name:
#   alpha - base of the depth decay (a component starts at alpha ** (depth - 1))
#   beta  - scale of the position penalty (-position * beta per path element)
#   la    - extra factor applied to structure-operator entries
alpha, beta, la = 0.5, 0.001, 0.5

In [15]:
# One row per component token followed by one row per structure operator.
# Both sets are sorted: iteration order of a set of strings changes between
# kernel sessions (hash randomisation), which made the row order of the
# saved CSV irreproducible.
Embedding = pd.DataFrame(index=sorted(rad) + sorted(struc_set))

In [16]:
def _component_weight(loc):
    """Return the weight of a component located at position path ``loc``.

    Starts at ``alpha ** (len(loc) - 1)`` and adds, for the i-th path element
    ``c``, the penalty ``alpha ** i * (-c * beta)``, accumulated in path order.
    """
    v = alpha ** (len(loc) - 1)
    for i, c in enumerate(loc):
        v += (alpha ** i) * (-c * beta)
    return v


known_rows = set(Embedding.index)
char_columns = {}  # character -> {row label: accumulated weight}

for cha, components in tqdm.notebook.tqdm(
        zip(chi_cha[1], chi_cha['depth_loc']), total=chi_cha.shape[0]):
    weights = {}
    for token, loc in components:
        v = _component_weight(loc)
        if len(token) == 2:
            ### e.g. '⿰⿳': a two-character token is treated as two structure
            ### operators; each receives the damped weight.
            v *= la
            targets = list(token)
        elif len(token) < 2 and token in struc_set:
            ### e.g. '⿰': a single structure operator, damped by la.
            v *= la
            targets = [token]
        else:
            ### e.g. '土' or an entity such as 'CDP-8A44': full weight.
            targets = [token]

        for row in targets:
            if row not in known_rows:
                # Same failure as the former Embedding.loc[row, col] lookup.
                raise KeyError(row)
            weights[row] = weights.get(row, 0.0) + v
    char_columns[cha] = weights

# Build all columns as floats in one go instead of inserting thousands of
# integer-zero columns one by one and then writing floats into them.
new_columns = pd.DataFrame(char_columns, index=Embedding.index, dtype=float).fillna(0.0)

# Dropping same-named columns first keeps the cell idempotent when re-run.
Embedding = pd.concat(
    [Embedding.drop(columns=new_columns.columns, errors='ignore'), new_columns],
    axis=1,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4808.0), HTML(value='')))




In [17]:
# Non-zero entries of the embedding column for '森'.
Embedding.loc[Embedding['森'] > 0, '森']

木    0.99375
⿱    0.49950
⿰    0.24900
Name: 森, dtype: float64

In [18]:
# Persist the embedding table; the file name records the three weighting parameters.
embedding_path = 'data/Embedding_tc_{}_{}_{}.csv'.format(alpha, beta, la)
Embedding.to_csv(embedding_path)

In [28]:
# Number of columns (characters) whose sum of squared entries exceeds 1.
(Embedding.pow(2).sum().sort_values() > 1).sum()

789

In [37]:
# Number of columns (characters) whose Euclidean norm exceeds 1.
np.sum(np.sort(np.sqrt(Embedding.pow(2).sum())) > 1)

789

In [38]:
# Number of columns (characters) whose Euclidean norm is at most 1.
np.sum(np.sort(np.sqrt(Embedding.pow(2).sum())) <= 1)

4019