# Full Search

## Load Bible Text

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from hanziconv import HanziConv
from data import read_bible

# Load the Bible text from the archive under data/.
# NOTE(review): read_bible is project-local; `unv` is presumably a pandas
# DataFrame with one verse per row in a `text` column -- confirm.
unv = read_bible('data/dnstrunv.tgz')
# Add a `text_s` column holding each `text` entry converted with
# HanziConv.toSimplified (Simplified Chinese).
unv['text_s'] = unv.text.apply(HanziConv.toSimplified)

## Tokenize

In [3]:
import jieba
import jieba.posseg as pseg

# Register the custom vocabulary in bible_terms.txt with jieba before any
# segmentation happens.
jieba.load_userdict('bible_terms.txt')

# Segment every simplified verse with jieba's part-of-speech tokenizer;
# `text_tk` holds, per verse, the list of pair(word, flag) tokens it returns.
unv['text_tk'] = unv.text_s.apply(pseg.lcut)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w0/lx0qzxln5p72qnrg392g7nyc0000gn/T/jieba.cache
Loading model cost 2.007 seconds.
Prefix dict has been built successfully.


## Search

In [4]:
from typing import Iterable

def highlight_occurances(text: str, keywords: Iterable[str]) -> str:
    """Highlight every occurrence of each keyword in ``text``.

    The keyword at position ``i`` of ``keywords`` is rendered with
    ``highlight(kw, i)``, so each keyword gets its own background colour.

    Keywords are applied in the order given, and a span that has already
    been highlighted is never searched again. A later keyword therefore
    cannot match inside an earlier keyword or inside the escape sequences
    inserted for it, which would otherwise corrupt the output.

    Args:
        text: The string to decorate.
        keywords: Keywords to highlight. Empty strings are skipped, but they
            still occupy their position so the remaining keywords keep the
            colour that matches their index.

    Returns:
        ``text`` with every keyword occurrence replaced by its highlighted
        form; unchanged if nothing matches.
    """
    # Each segment is (string, is_final). Final segments are highlighted
    # output and are emitted verbatim; only non-final ones are still searched.
    segments = [(text, False)]
    for i, kw in enumerate(keywords):
        if not kw:
            # An empty keyword matches between every character; ignore it.
            continue
        marked = highlight(kw, i)
        remaining = []
        for chunk, is_final in segments:
            if is_final:
                remaining.append((chunk, True))
                continue
            # str.split locates the same non-overlapping, left-to-right
            # matches that str.replace would substitute.
            for n, plain in enumerate(chunk.split(kw)):
                if n:
                    remaining.append((marked, True))
                if plain:
                    remaining.append((plain, False))
        segments = remaining
    return ''.join(chunk for chunk, _ in segments)

def highlight(text: str, color_code: int) -> str:
    """Wrap ``text`` in an ANSI SGR escape sequence with a coloured background.

    Args:
        text: The string to decorate.
        color_code: Zero-based colour index. Indices 0-6 select background
            codes 41-47; any other index wraps around modulo 7 so the
            emitted code is always one of those seven.

    Returns:
        ``text`` preceded by ``ESC[6;30;<background>m`` and followed by the
        reset sequence ``ESC[0m``.
    """
    # Only 41-47 are valid coloured backgrounds; wrapping keeps an eighth or
    # later keyword from producing a malformed code such as 48 or 410.
    background = 41 + color_code % 7
    return f'\x1b[6;30;{background}m{text}\x1b[0m'

# Visual smoke test: seven keywords, so indices 0-6 exercise every background
# code from 41 through 47 in the printed line.
test_data = ['first', 'second', 'thrid', 'forth', 'fifth', 'sixth', 'seventh']
print(highlight_occurances(', '.join(test_data), test_data))

[6;30;41mfirst[0m, [6;30;42msecond[0m, [6;30;43mthrid[0m, [6;30;44mforth[0m, [6;30;45mfifth[0m, [6;30;46msixth[0m, [6;30;47mseventh[0m


In [8]:
from search import sentence_similarity

searches = ['欢乐 祈祷', '挂虑 祈祷', '虑 祈祷', '喜乐 事奉', '求救', '信心 行事']
TOP_N = 10  # number of best-scoring verses printed per query

# For each query: tokenize it, score every verse against it, and print the
# TOP_N highest-scoring verses with the matched keywords highlighted.
for search_term in searches:
    # NOTE(review): pseg.lcut keeps the space between query words as its own
    # token (visible in the printed token list) -- confirm that
    # sentence_similarity is meant to receive it.
    search_tk = pseg.lcut(search_term)
    print(f'Search for {search_term} ({search_tk}):')

    # Map plain verse text -> (score, matched keywords). Keying by the plain
    # text rather than the highlighted rendering keeps escape codes out of
    # the keys and defers highlighting to the few verses actually printed.
    scored = {}
    for v, vers_tk in zip(unv.text_s, unv.text_tk):
        similarity = sentence_similarity(search_tk, vers_tk)
        # Only keywords with a positive similarity are highlighted.
        match_kw = [kw for kw, sim in similarity.items() if sim > 0]
        scored[v] = (sum(similarity.values()), match_kw)

    # sorted() is stable, so verses with equal scores keep their corpus order.
    ranked = sorted(scored, key=lambda verse: scored[verse][0], reverse=True)[:TOP_N]
    for top_match in ranked:
        score, match_kw = scored[top_match]
        print(f'Match: {score:7.4f} Verse: {highlight_occurances(top_match, match_kw)}')
    print()

Search for 欢乐 祈祷 ([pair('欢乐', 'a'), pair(' ', 'x'), pair('祈祷', 'v')]):
Match:  1.2500 Verse: 耶何耶大派官看守耶和华的殿，是在祭司利未人手下。这祭司利未人是大卫分派在耶和华殿中、照摩西律法上所[6;30;42m写[0m的，给耶和华献燔祭，又按大卫所定的例，[6;30;41m欢乐[0m歌唱；
Match:  1.2500 Verse: 我向耶和华─我的神[6;30;42m祈祷[0m、认罪，说：「主啊，大而可畏的神，向爱主、守主诫命的人守约施[6;30;41m慈爱[0m。
Match:  1.2500 Verse: 就[6;30;42m祷告[0m耶和华说：「耶和华啊，我在本国的时候岂不是这样说吗？我知道你是有恩典、有怜悯的神，不轻易发怒，有丰盛的[6;30;41m慈爱[0m，并且后悔不降所说的灾，所以我急速逃往他施去。
Match:  1.2500 Verse: 耶稣在一个地方[6;30;42m祷告[0m；[6;30;42m祷告[0m完了，有个门徒对他说：「求主[6;30;41m教导[0m我们[6;30;42m祷告[0m，像约翰[6;30;41m教导[0m他的门徒。」
Match:  1.2500 Verse: 白昼，耶和华必向我施[6;30;41m慈爱[0m；黑夜，我要歌颂[6;30;42m祷告[0m赐我生命的神。
Match:  1.2500 Verse: 神是应当称颂的！他并没有推却我的[6;30;42m祷告[0m，也没有叫他的[6;30;41m慈爱[0m离开我。
Match:  1.2500 Verse: 但我在悦纳的时候向你─耶和华[6;30;42m祈祷[0m。神啊，求你按你丰盛的[6;30;41m慈爱[0m，凭你拯救的诚实应允我！
Match:  1.2000 Verse: 耶和华神啊，求你起来，和你[6;30;42m有[0m能力的约柜同入安息之所。耶和华神啊，愿你的祭司披上救恩；愿你的圣民蒙福[6;30;41m欢乐[0m。
Match:  1.2000 Verse: 看见王站在殿门的柱旁，百夫长和吹号的人侍立在王左右，国民都[6;30;41m欢乐[0m吹号，又[6;30;42m有[0