# Transformer models

## Load data

In [1]:
from hanziconv import HanziConv
from data import read_bible

unv = read_bible('data/dnstrunv.tgz')
unv['text_s'] = unv.text.apply(HanziConv.toSimplified)

## Compute embeddings

In [6]:
import pickle
from sentence_transformers import SentenceTransformer, util

max_seq = unv.text_s.str.len().max()
searches = ['挂虑 祈祷', '喜乐 事奉', '求救', '信心 行事']

def save_embeddings(searches_emb, verse_emb, path: str):
    with open(path, "wb") as fOut:
        pickle.dump({'searches_emb': searches_emb, 'verse_emb': verse_emb}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

def compute_embeddings(model: str):
    model = SentenceTransformer(model)
    model.max_seq_length = max_seq
    searches_emb = model.encode(searches)
    verse_emb = model.encode(unv.text_s, show_progress_bar=True, num_workers=2)
    save_embeddings(f'data/{model}-embeddings.pkl', searches_emb, verse_emb)
    return searches_emb, verse_emb

def search(searches_emb, verse_emb):
    results = util.semantic_search(searches_emb, verse_emb)
    for i, search_results in enumerate(results):
        print(f'Searches for {searches[i]}:')
        for top_k in search_results:
            print(f'Score: {top_k["score"]:7.4f} {unv.text.loc[top_k["corpus_id"]]}')
        print()

In [44]:
du = compute_embeddings('distiluse-base-multilingual-cased-v2')
du[0]

array([[ 0.012742  ,  0.07456179,  0.03821773, ...,  0.01008224,
         0.10611277, -0.06496906],
       [-0.00446616,  0.04929008,  0.00206162, ..., -0.02148391,
        -0.01632529, -0.00246834],
       [-0.01185791,  0.00333456, -0.04894161, ..., -0.00349449,
        -0.01004079,  0.02232193],
       ...,
       [ 0.00756844, -0.0408489 , -0.01786444, ...,  0.02728658,
         0.01209383, -0.02924234],
       [ 0.08686031,  0.06604788, -0.0246829 , ...,  0.02659308,
         0.01361549,  0.02791402],
       [ 0.03772836, -0.01862286,  0.01920143, ...,  0.01515482,
        -0.06437556,  0.02548239]], dtype=float32)

In [66]:
search(*du)

Searches for 挂虑 祈祷:
Score:  0.8784 不住地禱告，
Score:  0.6724 請弟兄們為我們禱告。
Score:  0.6638 你們要恆切禱告，在此警醒感恩。
Score:  0.6236 求你從天上垂聽他們的禱告祈求，使他們得勝。
Score:  0.6216 求你在天上垂聽他們的禱告祈求，使他們得勝。
Score:  0.6107 神啊，求你聽我的禱告，留心聽我口中的言語。
Score:  0.6091 你們禱告，無論求甚麼，只要信，就必得著。」
Score:  0.6088 應當一無掛慮，只要凡事藉著禱告、祈求，和感謝，將你們所要的告訴神。
Score:  0.5979 你們要呼求我，禱告我，我就應允你們。
Score:  0.5791 聽禱告的主啊，凡有血氣的都要來就你。

Searches for 喜乐 事奉:
Score:  0.6796 要常常喜樂，
Score:  0.4753 折腳折手的、
Score:  0.4182 惟有義人必然歡喜，在神面前高興快樂。
Score:  0.4167 猶大人有光榮，歡喜快樂而得尊貴。
Score:  0.4079 說：
Score:  0.4074 卑微的弟兄升高，就該喜樂；
Score:  0.3984 以利戶又說：
Score:  0.3984 以利戶又說：
Score:  0.3937 你們要靠主常常喜樂。我再說，你們要喜樂。
Score:  0.3900 不住地禱告，

Searches for 求救:
Score:  0.4954 說：
Score:  0.4761 不住地禱告，
Score:  0.4265 拯救我的主啊，求你快快幫助我！
Score:  0.4238 亞希雅、哈難、亞難、
Score:  0.4010 折腳折手的、
Score:  0.3940 a
Score:  0.3940 a
Score:  0.3940 a
Score:  0.3940 a
Score:  0.3940 a

Searches for 信心 行事:
Score:  0.4612 可見，信心是與他的行為並行，而且信心因著行為才得成全。
Score:  0.4397 必有人說：「你有信心，我有行為；你將你沒有行為的信心指給我看，我便藉著我的行為，將我的信心指給你看。」
Sco

In [64]:
cb = compute_embeddings('bert-base-chinese')

Exception when trying to download https://sbert.net/models/bert-base-chinese.zip. Response 404
Downloading: 100%|██████████| 624/624 [00:00<00:00, 165kB/s]
Downloading: 100%|██████████| 412M/412M [22:01<00:00, 312kB/s]
Downloading: 100%|██████████| 110k/110k [00:01<00:00, 107kB/s]  
Downloading: 100%|██████████| 269k/269k [00:01<00:00, 171kB/s]
Batches: 100%|██████████| 972/972 [3:09:39<00:00, 11.71s/it]


In [73]:
search(*cb)

Searches for 挂虑 祈祷:
Score:  0.7046 我便禁食，披麻蒙灰，定意向主神祈禱懇求。
Score:  0.6962 我想念神，就煩躁不安；我沉吟悲傷，心便發昏。（細拉）
Score:  0.6863 每逢為你們眾人祈求的時候，常是歡歡喜喜地祈求。
Score:  0.6860 我勸你，第一要為萬人懇求、禱告、代求、祝謝；
Score:  0.6840 從何羅念有喊荒涼大毀滅的哀聲：
Score:  0.6720 「當記念安息日，守為聖日。
Score:  0.6706 我有憂愁，願能自慰；我心在我裡面發昏。
Score:  0.6700 恐懼戰兢歸到我身；驚恐漫過了我。
Score:  0.6697 在思念夜中、異象之間，世人沉睡的時候，
Score:  0.6662 有一宗人（宗：原文是代；下同），咒詛父親，不給母親祝福。

Searches for 喜乐 事奉:
Score:  0.8040 要常常喜樂，
Score:  0.7612 常在殿裡稱頌神。
Score:  0.7516 我靈以神我的救主為樂；
Score:  0.7488 總要察驗何為主所喜悅的事。
Score:  0.7441 卑微的弟兄升高，就該喜樂；
Score:  0.7429 不住地禱告，
Score:  0.7420 猶大人有光榮，歡喜快樂而得尊貴。
Score:  0.7405 散布亮光是為義人；預備喜樂是為正直人。
Score:  0.7403 所羅門因為求這事，就蒙主喜悅。
Score:  0.7373 在那裡傳福音。

Searches for 求救:
Score:  0.7817 a
Score:  0.7817 a
Score:  0.7817 a
Score:  0.7817 a
Score:  0.7817 a
Score:  0.7817 a
Score:  0.7817 a
Score:  0.7817 a
Score:  0.7817 a
Score:  0.7817 a

Searches for 信心 行事:
Score:  0.7413 但要凡事察驗；善美的要持守，
Score:  0.7335 我知道世人，莫強如終身喜樂行善；
Score:  0.7248 你所命定的法度是憑公義和至誠。
Score:  0.7236 若是能行，總要

In [74]:
sts = compute_embeddings('stsb-xlm-r-multilingual')

 62%|██████▏   | 627M/1.01G [21:36<13:19, 484kB/s]


FileNotFoundError: [Errno 2] No such file or directory: '/Users/Ken/.cache/torch/sentence_transformers/sbert.net_models_stsb-xlm-r-multilingual'

In [4]:
db = compute_embeddings('quora-distilbert-multilingual')

100%|██████████| 501M/501M [17:07<00:00, 487kB/s]
Batches: 100%|██████████| 972/972 [1:17:18<00:00,  4.77s/it]


OSError: [Errno 63] File name too long: 'data/SentenceTransformer(\n  (0): Transformer(\n    (auto_model): DistilBertModel(\n      (embeddings): Embeddings(\n        (word_embeddings): Embedding(119547, 768, padding_idx=0)\n        (position_embeddings): Embedding(512, 768)\n        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n        (dropout): Dropout(p=0.1, inplace=False)\n      )\n      (transformer): Transformer(\n        (layer): ModuleList(\n          (0): TransformerBlock(\n            (attention): MultiHeadSelfAttention(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (q_lin): Linear(in_features=768, out_features=768, bias=True)\n              (k_lin): Linear(in_features=768, out_features=768, bias=True)\n              (v_lin): Linear(in_features=768, out_features=768, bias=True)\n              (out_lin): Linear(in_features=768, out_features=768, bias=True)\n            )\n            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n            (ffn): FFN(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (lin1): Linear(in_features=768, out_features=3072, bias=True)\n              (lin2): Linear(in_features=3072, out_features=768, bias=True)\n            )\n            (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n          )\n          (1): TransformerBlock(\n            (attention): MultiHeadSelfAttention(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (q_lin): Linear(in_features=768, out_features=768, bias=True)\n              (k_lin): Linear(in_features=768, out_features=768, bias=True)\n              (v_lin): Linear(in_features=768, out_features=768, bias=True)\n              (out_lin): Linear(in_features=768, out_features=768, bias=True)\n            )\n            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n            (ffn): FFN(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (lin1): Linear(in_features=768, out_features=3072, bias=True)\n              (lin2): Linear(in_features=3072, out_features=768, bias=True)\n            )\n            (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n          )\n          (2): TransformerBlock(\n            (attention): MultiHeadSelfAttention(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (q_lin): Linear(in_features=768, out_features=768, bias=True)\n              (k_lin): Linear(in_features=768, out_features=768, bias=True)\n              (v_lin): Linear(in_features=768, out_features=768, bias=True)\n              (out_lin): Linear(in_features=768, out_features=768, bias=True)\n            )\n            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n            (ffn): FFN(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (lin1): Linear(in_features=768, out_features=3072, bias=True)\n              (lin2): Linear(in_features=3072, out_features=768, bias=True)\n            )\n            (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n          )\n          (3): TransformerBlock(\n            (attention): MultiHeadSelfAttention(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (q_lin): Linear(in_features=768, out_features=768, bias=True)\n              (k_lin): Linear(in_features=768, out_features=768, bias=True)\n              (v_lin): Linear(in_features=768, out_features=768, bias=True)\n              (out_lin): Linear(in_features=768, out_features=768, bias=True)\n            )\n            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n            (ffn): FFN(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (lin1): Linear(in_features=768, out_features=3072, bias=True)\n              (lin2): Linear(in_features=3072, out_features=768, bias=True)\n            )\n            (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n          )\n          (4): TransformerBlock(\n            (attention): MultiHeadSelfAttention(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (q_lin): Linear(in_features=768, out_features=768, bias=True)\n              (k_lin): Linear(in_features=768, out_features=768, bias=True)\n              (v_lin): Linear(in_features=768, out_features=768, bias=True)\n              (out_lin): Linear(in_features=768, out_features=768, bias=True)\n            )\n            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n            (ffn): FFN(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (lin1): Linear(in_features=768, out_features=3072, bias=True)\n              (lin2): Linear(in_features=3072, out_features=768, bias=True)\n            )\n            (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n          )\n          (5): TransformerBlock(\n            (attention): MultiHeadSelfAttention(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (q_lin): Linear(in_features=768, out_features=768, bias=True)\n              (k_lin): Linear(in_features=768, out_features=768, bias=True)\n              (v_lin): Linear(in_features=768, out_features=768, bias=True)\n              (out_lin): Linear(in_features=768, out_features=768, bias=True)\n            )\n            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n            (ffn): FFN(\n              (dropout): Dropout(p=0.1, inplace=False)\n              (lin1): Linear(in_features=768, out_features=3072, bias=True)\n              (lin2): Linear(in_features=3072, out_features=768, bias=True)\n            )\n            (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n          )\n        )\n      )\n    )\n  )\n  (1): Pooling()\n)-embeddings.pkl'

In [5]:
search(db)

NameError: name 'db' is not defined

In [None]:
xlm = compute_verse_embeddings('paraphrase-xlm-r-multilingual-v1')