# Names in the Bible

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from build_data import *

ot_dict = read_cbol_dict('../data/dhfhl')
nt_dict = read_cbol_dict('../data/dgfhl')
ot_dict_strong = ot_dict[ot_dict.strong.astype(int) < 8675]
cbol_dict = pd.concat([ot_dict_strong, nt_dict]).dropna().reset_index(drop=True)
cbol_dict['def_lines'] = cbol_dict.defs.str.split('\\\\r\\\\n')

Missing entry omitted at the moment.

In [3]:
ot_dict[ot_dict.defs.isna()]

Unnamed: 0,strong,defs
1144,7427,


In [4]:
ntna = nt_dict[nt_dict.defs.isna()]
ntna[ntna.strong.astype(np.int32) < 5625]

Unnamed: 0,strong,defs
1109,4236,
4157,4191,


## Add POS

In [5]:
cbol_dict['pos'] = cbol_dict.def_lines.transform(find_pos)
cbol_dict['jieba_pos'] = cbol_dict.defs.transform(jieba_pos)
cbol_dict.head()

Unnamed: 0,strong,defs,def_lines,pos,jieba_pos
0,6359,06359 patiyr {paw-teer'}\r\n\r\n源自 06362; TWOT...,"[06359 patiyr {paw-teer'}, , 源自 06362; TWOT - ...",形容詞,a
1,4000,04000 mabown {maw-bone'}\r\n\r\n源自 0995; TWOT ...,"[04000 mabown {maw-bone'}, , 源自 0995; TWOT - 2...",實名詞（作名詞,nr
2,2453,02453 Chakmowniy {khak-mo-nee'}\r\n\r\n源自 0244...,"[02453 Chakmowniy {khak-mo-nee'}, , 源自 02449; ...",陽性專有名詞,nr
3,3143,03143 Yowshibyah {yo-shi-yaw'}\r\n\r\n源自 03427...,"[03143 Yowshibyah {yo-shi-yaw'}, , 源自 03427 和 ...",陽性專有名詞,nr
4,3412,03412 Yarmuwth {yar-mooth'}\r\n\r\n源自 07311; 專...,"[03412 Yarmuwth {yar-mooth'}, , 源自 07311; 專有名詞...",專有名詞,ns


## Extra vocabs

In [6]:
name_dict = create_name_dict(cbol_dict)
name_dict

Unnamed: 0,name,1,pos,jieba_pos
2,哈摩尼或哈革摩尼,"""智慧""",陽性專有名詞,nr
3,約示比,"""耶和華使我安居""",陽性專有名詞,nr
4,耶末,"""高地""",專有名詞,ns
10,瑪吉希錄,"""集會之處""",陰性複數專有名詞,ns
16,阿谷或亞谷,"""險惡的""",陽性專有名詞,nr
...,...,...,...,...
14482,彼西底,"""漆黑的""",專有名詞,ns
14489,腓力,"""馬的愛好者""",陽性專有名詞,nr
14490,安提阿,"'與之抗衡""",專有地名,ns
14492,希律,"""英勇的""",陽性專有名詞,nz


In [7]:
def is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like the all of the other languages.
    if (
        (cp >= 0x4E00 and cp <= 0x9FFF)
        or (cp >= 0x3400 and cp <= 0x4DBF)  #
        or (cp >= 0x20000 and cp <= 0x2A6DF)  #
        or (cp >= 0x2A700 and cp <= 0x2B73F)  #
        or (cp >= 0x2B740 and cp <= 0x2B81F)  #
        or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
        or (cp >= 0xF900 and cp <= 0xFAFF)
        or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
    ):  #
        return True

    return False


name_dict['name_list'] = name_dict.name.str.split('[或，,、/]')
non_chinese = name_dict.name_list.explode().apply(lambda name: [char for char in name if not is_chinese_char(ord(char))]).explode().drop_duplicates()
non_chinese

2      NaN
211      ・
Name: name_list, dtype: object

## Generate names file

In [9]:
from hanziconv import HanziConv

names = name_dict[['name_list', 'jieba_pos']].explode('name_list')
names['name_list'] = names.name_list.str.strip('族人的')\
                                    .str.replace('・', '')\
                                    .apply(HanziConv.toSimplified)
names.drop_duplicates('name_list')\
     .to_csv('word_tokens/names.txt', index=False, header=None, sep=' ')

## Check name occurrances

In [10]:
dots = r'[•‧．・\-]'
search_pattern = name_dict.name_list.apply(lambda list: f'(?:{"|".join(list)})').str.replace(dots, '')
search_pattern[search_pattern.str.contains(dots)]

Series([], Name: name_list, dtype: object)

In [11]:
unv = read_bible('../data/dnstrunv')

In [14]:
unv_text_no_dots = unv.text.str.replace(dots, '')
name_count = search_pattern.apply(lambda pat: unv_text_no_dots.str.count(pat).sum())

In [15]:
name_dict['count'] = name_count
name_dict.sort_values('count', ascending=False)[:20]

Unnamed: 0,name,1,pos,jieba_pos,name_list,count
6010,,"""受苦的"" (#耶46:9|)",專有名詞,nz,[],1096104
7838,耶和華,"""自有永有的""",專有名詞,nz,[耶和華],6980
4353,耶和華,是我們的公義,陽性專有名詞,nz,[耶和華],6980
7177,以色,"""離開""",專有名詞,ns,[以色],2706
5207,以色列,"""上帝勝過""",陽性專有名詞,nz,[以色列],2703
4976,以色列,"""上帝勝過""",專有名詞,nr,[以色列],2703
7138,安,"""財富"" 或 ""精力""",陽性專有名詞,nr,[安],1751
14338,耶穌,「耶和華是拯救」,陽性專有名詞,nr,[耶穌],1640
7498,大衛,"""受鍾愛的""",陽性專有名詞,nr,[大衛],1164
6766,愛、亞雅、亞葉或艾城,「荒場」,專有名詞,ns,"[愛, 亞雅, 亞葉, 艾城]",1040


In [None]:
'大馬士革' '低加坡里'