# Full Search

## Load Bible Text

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to imported .py files (e.g. the `data`
# and `search` modules used further down) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from hanziconv import HanziConv
from data import read_bible

# Read the Bible text from `data/dnstrunv.tgz` with the project's `read_bible` helper.
# NOTE(review): `unv` is presumably a pandas DataFrame with one row per verse and a
# `text` column - confirm against `data.read_bible`.
unv = read_bible('data/dnstrunv.tgz')
# Store a Simplified Chinese copy of every verse under `text_s`; the original
# `text` values are left untouched.
unv['text_s'] = unv.text.apply(HanziConv.toSimplified)

## Tokenize

In [3]:
import jieba
import jieba.posseg as pseg

# Extend jieba's dictionary with the entries in `bible_terms.txt`; this runs
# before tokenizing so the extra terms are available to the segmenter.
jieba.load_userdict('bible_terms.txt')

# Tokenize every simplified verse with jieba's part-of-speech segmenter. Each
# entry stored under `text_tk` is the list returned by `pseg.lcut`
# (word / POS-flag pairs).
unv['text_tk'] = unv.text_s.apply(pseg.lcut)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w0/lx0qzxln5p72qnrg392g7nyc0000gn/T/jieba.cache
Loading model cost 2.007 seconds.
Prefix dict has been built successfully.


## Search

In [4]:
from typing import Iterable

def highlight_occurances(text: str, keywords: Iterable[str]) -> str:
    """Return ``text`` with every occurrence of each keyword highlighted.

    Each keyword is wrapped by ``highlight`` using its position in
    ``keywords`` as the colour index, so the first keyword gets colour 0,
    the second colour 1, and so on.

    All keywords are substituted in a single pass over the original text.
    Replacing them one after another (as a chain of ``str.replace`` calls
    would) lets a later keyword match inside the escape codes or the text
    inserted for an earlier keyword, which corrupts the output.

    Args:
        text: The string to decorate.
        keywords: Keywords to highlight. Empty strings are ignored. If a
            keyword is repeated, the index of its first occurrence decides
            the colour.

    Returns:
        The decorated string; ``text`` unchanged when there is nothing to
        highlight.
    """
    import re  # local import keeps this cell self-contained

    # Map each distinct, non-empty keyword to the colour index of its first
    # appearance. An empty keyword would otherwise match between every
    # character of the text.
    color_by_kw = {}
    for i, kw in enumerate(keywords):
        if kw and kw not in color_by_kw:
            color_by_kw[kw] = i
    if not color_by_kw:
        return text

    # Regex alternation takes the first alternative that matches, so list the
    # longest keywords first: '祷告' must win over its prefix '祷'.
    alternatives = sorted(color_by_kw, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(kw) for kw in alternatives))
    return pattern.sub(
        lambda m: highlight(m.group(0), color_by_kw[m.group(0)]),
        text,
    )

def highlight(text: str, color_code: int) -> str:
    """Wrap ``text`` in ANSI escape codes that give it a coloured background.

    The background is one of the seven SGR background colours 41-47, and the
    sequence ends with a reset (``ESC[0m``). ``color_code`` is wrapped modulo 7
    so any integer selects a valid colour: pasting the raw index into the code
    would emit ``48``/``49`` (not plain colours) or three-digit garbage such as
    ``410`` once more than seven colours are requested.

    Args:
        text: The string to decorate.
        color_code: Zero-based colour index; 0-6 map to SGR codes 41-47 and
            larger values cycle through the same seven colours.

    Returns:
        ``text`` surrounded by the start and reset escape sequences.
    """
    # The other SGR parameters (`6;30`, 30 being black foreground) are unchanged.
    background = 41 + color_code % 7
    return f'\x1b[6;30;{background}m{text}\x1b[0m'

# Quick visual check: join the sample words into one line and highlight each
# of them, so every word is printed with its own colour index.
test_data = [
    'first', 'second', 'thrid', 'forth',
    'fifth', 'sixth', 'seventh',
]
print(highlight_occurances(text=', '.join(test_data), keywords=test_data))

[6;30;41mfirst[0m, [6;30;42msecond[0m, [6;30;43mthrid[0m, [6;30;44mforth[0m, [6;30;45mfifth[0m, [6;30;46msixth[0m, [6;30;47mseventh[0m


In [41]:
from search import sentence_similarity, load_word_embeddings

# Load the word embeddings stored in `data/mini.h5`.
# NOTE(review): the return value is discarded, so this presumably populates
# state inside the `search` module that `sentence_similarity` reads - confirm.
load_word_embeddings('data/mini.h5')

In [42]:
# Queries to run; each one is tokenized below with the same `pseg.lcut` call
# that produced the verse tokens.
searches = ['欢乐 祈祷', '挂虑 祈祷', '虑 祈祷', '喜乐 事奉', '求救', '信心 行事']

for search_term in searches:
    search_tk = pseg.lcut(search_term)
    print(f'Search for {search_term} ({search_tk}):')

    # Highlighted verse text -> summed similarity score. Verses that render to
    # the same highlighted string share a single entry.
    match_scores = {}
    for v, vers_tk in zip(unv.text_s, unv.text_tk):
        similarity = sentence_similarity(search_tk, vers_tk)

        # Only keywords with a strictly positive similarity get highlighted.
        match_kw = []
        for kw in similarity.keys():
            if similarity[kw] > 0:
                match_kw.append(kw)

        vers = highlight_occurances(v, match_kw)
        match_scores[vers] = sum(similarity.values())

    # Best score first; the sort is stable, so ties keep insertion order.
    ranked = sorted(match_scores.items(), key=lambda entry: entry[1], reverse=True)
    for top_match, top_score in ranked[:10]:
        print(f'Match: {top_score:7.4f} Verse: {top_match}')
    print()

Search for 欢乐 祈祷 ([pair('欢乐', 'a'), pair(' ', 'x'), pair('祈祷', 'v')]):
Match:  1.8144 Verse: 恶人献祭，为耶和华所憎恶；正直人[6;30;42m祈祷[0m，为他所[6;30;41m喜悦[0m。
Match:  1.7942 Verse: 撒母耳不[6;30;41m喜悦[0m他们说「立一个王治理我们」，他就[6;30;42m祷告[0m耶和华。
Match:  1.7942 Verse: 他[6;30;42m祷告[0m神，神就[6;30;41m喜悦[0m他，使他欢唿朝见神的面；神又看他为义。
Match:  1.7777 Verse: 我必领他们到我的圣山，使他们在[6;30;42m祷告[0m我的殿中[6;30;41m喜乐[0m。他们的燔祭和平安祭，在我坛上必蒙悦纳，因我的殿必称为万民[6;30;42m祷告[0m的殿。
Match:  1.7777 Verse: 你们中间有受苦的呢，他就该[6;30;42m祷告[0m；有[6;30;41m喜乐[0m的呢，他就该歌颂。
Match:  1.7777 Verse: 在指望中要[6;30;41m喜乐[0m；在患难中要忍耐；[6;30;42m祷告[0m要恆切。
Match:  1.7091 Verse: 哈拿[6;30;42m祷告[0m说：我的心因耶和华[6;30;41m快乐[0m；我的角因耶和华高举。我的口向仇敌张开；我因耶和华的救恩欢欣。
Match:  1.4742 Verse: 在那里，耶和华─你们神的面前，你们和你们的家属都可以吃，并且因你手所办的一切事蒙耶和华─你的神[6;30;42m赐福[0m，就都[6;30;41m欢乐[0m。
Match:  1.4742 Verse: 在耶和华所选择的地方，你当向耶和华─你的神守节七日；因为耶和华─你神在你一切的土产上和你手里所办的事上要[6;30;42m赐福[0m与你，你就非常地[6;30;41m欢乐[0m。
Match:  1.4094 Verse: 将要灭亡的为我[6;30;42m祝福[0m；我也使寡妇心中[6;30;41m欢乐[0m。

Search for 挂虑 祈祷 ([pair('挂虑', 