In [None]:
# Mount Google Drive at /content/drive; later cells read the corpus XML, the
# evaluation JSON and the saved DPR encoders from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q farm-haystack[colab,inference,metrics,elasticsearch,preprocessing]
!pip install -q datasets

In [None]:
%%bash

# Download the Elasticsearch 7.9.2 Linux tarball (quiet mode) and unpack it here.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# Give the unpacked tree to the `daemon` user, which the launch cell runs the server as.
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Start the unpacked Elasticsearch as the `daemon` user; --bg runs this cell in
# the background so the notebook can continue while the server is up.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
from haystack.utils import launch_es
import time

# NOTE(review): Elasticsearch is already started from the unpacked tarball by the
# preceding `%%bash --bg` cell — confirm whether this additional launch_es() call
# is still needed in this environment.
launch_es()
# Fixed 30-second pause before continuing — presumably to give Elasticsearch time
# to come up; TODO confirm it is long enough (polling the server would be sturdier).
time.sleep(30)



In [None]:
from bs4 import BeautifulSoup
import re

# Read the verse XML from Drive and turn each <s> element into an
# {"id", "text"} record with parenthesised / bracketed spans removed.
with open('/content/drive/MyDrive/makarem.xml', 'r', encoding="utf8") as f:
  content = f.read()
soup = BeautifulSoup(content, 'xml')

verses = soup.find_all("s")

# Raw string: "\(" and "\[" are invalid escape sequences in a normal string
# literal (a DeprecationWarning, and a SyntaxWarning on newer Pythons).
# The match is non-greedy so each bracketed span is removed on its own.
bracketed_span = re.compile(r"[\(\[].*?[\)\]]")

dict_data_list = []

# Only the first 100 <s> elements are converted.
for verse in verses[:100]:
  # An empty element has no children; skip it instead of raising IndexError.
  if not verse.contents:
    continue
  # The text is taken from the element's first child only.
  context = bracketed_span.sub("", verse.contents[0])
  dict_data_list.append({'id': verse.get('id'), 'text': context})

In [None]:
from datasets import Dataset

# Wrap the list of {"id", "text"} verse records in a `datasets.Dataset`.
dataset = Dataset.from_list(dict_data_list)

In [None]:
from haystack.schema import Document

# One Haystack Document per dataset row: the row text is used both as the
# content and as the "abstract" meta field, the title is left empty, and the
# row id is stored under the "pmid" meta key.
documents = [
    Document(
        content=row["text"],
        meta={"title": '', "abstract": row["text"], "pmid": row["id"]},
    )
    for row in dataset
]


In [None]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Index names: one for the searchable documents, one for the evaluation labels.
doc_index = "hybrid_docs"
label_index = "hybrid_labels"

# Use ELASTICSEARCH_HOST when it is set, otherwise fall back to localhost.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Open the document store without credentials. Embeddings are stored in the
# "emb" field with 768 dimensions, and "emb" is listed in excluded_meta_data.
document_store = ElasticsearchDocumentStore(
    host=host, username="", password="",
    index=doc_index, label_index=label_index,
    embedding_field="emb", embedding_dim=768,
    excluded_meta_data=["emb"],
)

In [None]:
from haystack.nodes import PreProcessor

# Split documents into chunks of 200 words with no overlap; sentence boundaries
# are not respected and no empty-line / whitespace cleaning is applied.
preprocessor = PreProcessor(
    split_by="word",
    language = 'fa',
    split_length=200,
    split_overlap=0,
    split_respect_sentence_boundary=False,
    clean_empty_lines=False,
    clean_whitespace=False,
)
# Empty both indices before (re)indexing — presumably so earlier runs do not carry over.
document_store.delete_documents(index=doc_index)
document_store.delete_documents(index=label_index)

# Run the preprocessor over the in-memory documents; the resulting chunks are written to the store in a later cell. (The evaluation file is loaded separately with add_eval_data(), which per the original note accepts any dataset in SQuAD format.)

docs_to_index = preprocessor.process(documents)

Preprocessing: 100%|██████████| 100/100 [00:00<00:00, 2594.67docs/s]


In [None]:
from haystack.nodes import DensePassageRetriever, BM25Retriever

# Sparse (BM25) retriever over the Elasticsearch document store.
sparse_retriever = BM25Retriever(document_store=document_store)
# Dense retriever using the query / passage encoders saved on Drive under dpr_mbert.
# NOTE(review): embed_title=True is set although the documents are created with an
# empty title — confirm this matches how the encoders were trained.
dense_retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="/content/drive/MyDrive/saved_models/dpr_mbert/query_encoder",
                                  passage_embedding_model="/content/drive/MyDrive/saved_models/dpr_mbert/passage_encoder",
                                  use_gpu=True,
                                  max_seq_len_passage=256,
                                  embed_title=True)


  return self.fget.__get__(instance, owner)()


In [None]:
# Clear documents (called without an index argument), then write the preprocessed chunks.
document_store.delete_documents()
document_store.write_documents(docs_to_index)
# Load the evaluation file from Drive into doc_index / label_index, passing the
# same preprocessor that was used for the corpus.
document_store.add_eval_data(
    filename="/content/drive/MyDrive/retrieval_test.json",
    doc_index=doc_index,
    label_index=label_index,
    preprocessor=preprocessor,
)

Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 192.50docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1580.97docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1164.76docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1269.46docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1372.03docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 599.87docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1119.08docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 761.22docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 980.66docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1686.49docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1242.76docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 918.59docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1073.81docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1210.83docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 1258.79docs/s]
Preprocessing: 100%|██████████

In [None]:
# Compute embeddings for the documents in doc_index using the dense (DPR) retriever.
document_store.update_embeddings(retriever=dense_retriever, index=doc_index)

Updating embeddings:   0%|          | 0/190 [00:00<?, ? Docs/s]
Create embeddings:   0%|          | 0/192 [00:00<?, ? Docs/s][A
Create embeddings:   8%|▊         | 16/192 [00:13<02:28,  1.18 Docs/s][A
Create embeddings:  17%|█▋        | 32/192 [00:27<02:15,  1.18 Docs/s][A
Create embeddings:  25%|██▌       | 48/192 [00:39<01:57,  1.22 Docs/s][A
Create embeddings:  33%|███▎      | 64/192 [00:51<01:41,  1.26 Docs/s][A
Create embeddings:  42%|████▏     | 80/192 [01:05<01:32,  1.22 Docs/s][A
Create embeddings:  50%|█████     | 96/192 [01:21<01:25,  1.13 Docs/s][A
Create embeddings:  58%|█████▊    | 112/192 [01:34<01:08,  1.16 Docs/s][A
Create embeddings:  67%|██████▋   | 128/192 [01:46<00:52,  1.22 Docs/s][A
Create embeddings:  75%|███████▌  | 144/192 [02:04<00:44,  1.08 Docs/s][A
Create embeddings:  83%|████████▎ | 160/192 [02:18<00:28,  1.11 Docs/s][A
Create embeddings:  92%|█████████▏| 176/192 [02:31<00:14,  1.14 Docs/s][A
Create embeddings: 100%|██████████| 192/192 [02:41<0

In [None]:
# Embeddings are only needed for doc_index. label_index holds evaluation labels
# rather than documents, and calling update_embeddings() on it raised a
# ValueError, so that call is dropped. As a sanity check, show how many
# documents in doc_index now carry an embedding.
document_store.get_embedding_count(index=doc_index)

Updating embeddings:   0%|          | 0/80 [00:00<?, ? Docs/s]


ValueError: ignored

In [None]:
from haystack.nodes import JoinDocuments, SentenceTransformersRanker

# Merge the outputs of both retrievers by concatenation.
join_documents = JoinDocuments(join_mode="concatenate")
# Cross-encoder ranker applied to the merged candidates.
# NOTE(review): the model name indicates MS MARCO training, while the queries and
# passages in this notebook are Persian — confirm the model handles Persian, or
# evaluate a multilingual cross-encoder instead.
rerank = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2")

In [None]:
from haystack.pipelines import Pipeline

# Hybrid retrieval graph: the query goes to both retrievers, their outputs are
# joined, and the joined list is passed to the re-ranker.
pipeline = Pipeline()
pipeline.add_node(
    component=sparse_retriever,
    name="SparseRetriever",
    inputs=["Query"],
)
pipeline.add_node(
    component=dense_retriever,
    name="DenseRetriever",
    inputs=["Query"],
)
pipeline.add_node(
    component=join_documents,
    name="JoinDocuments",
    inputs=["SparseRetriever", "DenseRetriever"],
)
pipeline.add_node(
    component=rerank,
    name="ReRanker",
    inputs=["JoinDocuments"],
)


In [None]:
from haystack.schema import EvaluationResult, MultiLabel

# Fetch the aggregated evaluation labels from the label index, leaving out
# negative labels and no-answer samples.
eval_labels = document_store.get_all_labels_aggregated(
    index=label_index,
    drop_negative_labels=True,
    drop_no_answers=True,
)

# Evaluate the pipeline with top_k=5 on the ReRanker node.
eval_result = pipeline.eval(
    labels=eval_labels,
    params={"ReRanker": {"top_k": 5}},
)

# Show the first rows of the results recorded for the ReRanker node.
retriever_result = eval_result["ReRanker"]
retriever_result.head()



Unnamed: 0,multilabel_id,query,filters,gold_answers,context,gold_contexts,gold_id_match,context_match,answer_match,gold_id_or_answer_match,...,rank,document_id,gold_document_ids,gold_documents_id_match,gold_contexts_similarity,gold_answers_match,type,node,eval_mode,index
0,1b1215113a85fe0065d50b104320a9ba,روح الهی در چه شکلی بر مریم ظاهر شد؟,b'null',[انسانی بی‌عیب و نقص],,[و در این کتاب ، مریم را یاد کن، آن هنگام که ا...,0.0,0.0,0.0,0.0,...,1.0,,[479193f847928d2d0432d033dcb7539d-0],[0.0],[0.0],[0.0],document,ReRanker,integrated,0
1,54889a0b99cb72a5291c400f0875e277,اعمال چه کسانی برایشان زینت داده شده است؟,b'null',[اسرافکاران],,[هنگامی که به انسان زیان رسد، ما را در حالی ...,0.0,0.0,0.0,0.0,...,1.0,,[55fe664a20ebefc258fc2bf356c99fe5-0],[0.0],[0.0],[0.0],document,ReRanker,integrated,0
2,1af6aab287280a498563f46078dd9273,دو مورد از کتاب‌های آسمانی که بعد ار ابراهیم ن...,b'null',[تورات و انجیل],,[ای اهل کتاب! چرا درباره ابراهیم، گفتگو و نزاع...,0.0,0.0,0.0,0.0,...,1.0,,[126a66e228b0a98f3c07044c655daa7b-0],[0.0],[0.0],[0.0],document,ReRanker,integrated,0
3,631c92294f9f601085b8c5a9e0e1b707,نهرهای بهشتی مملو از چه آبی هستند؟,b'null',[آب صاف و خالص که بدبو نشده],,[توصیف بهشتی که به پرهیزگاران وعده داده شده، چ...,0.0,0.0,0.0,0.0,...,1.0,,[79b1a3b8eb325076c0c1f3e8ed5af552-0],[0.0],[0.0],[0.0],document,ReRanker,integrated,0
4,fb3166c9be7f2deafcbd9cac37ef3d2b,چه کسانی متذکّر می‌شوند؟,b'null',[خردمندان],,[ یا کسی که در ساعات شب به عبادت مشغول است و د...,0.0,0.0,0.0,0.0,...,1.0,,[cfadbe1b2002e9be38231fd6d82e5f79-0],[0.0],[0.0],[0.0],document,ReRanker,integrated,0


In [None]:
# Number of labels currently stored in the label index.
document_store.get_label_count(index=label_index)

160

In [None]:
# Display the aggregated evaluation labels.
eval_labels

In [None]:
# Save the evaluation result to the parent of the current working directory.
eval_result.save("../")

In [None]:
# Load the saved evaluation result back and report the SparseRetriever metrics.
saved_eval_result = EvaluationResult.load("../")
metrics = saved_eval_result.calculate_metrics()
print("\n".join(
    f'SparseRetriever - {title}: {metrics["SparseRetriever"][key]}'
    for title, key in (
        ("Recall (single relevant document)", "recall_single_hit"),
        ("Recall (multiple relevant documents)", "recall_multi_hit"),
        ("Mean Reciprocal Rank", "mrr"),
        ("Precision", "precision"),
        ("Mean Average Precision", "map"),
    )
))

SparseRetriever - Recall (single relevant document): 0.0
SparseRetriever - Recall (multiple relevant documents): 0.0
SparseRetriever - Mean Reciprocal Rank: 0.0
SparseRetriever - Precision: 0.0
SparseRetriever - Mean Average Precision: 0.0


In [None]:
# NOTE(review): this cell repeats the preceding metrics cell verbatim.
# Load the saved evaluation result back and report the SparseRetriever metrics.
saved_eval_result = EvaluationResult.load("../")
metrics = saved_eval_result.calculate_metrics()
print("\n".join(
    f'SparseRetriever - {title}: {metrics["SparseRetriever"][key]}'
    for title, key in (
        ("Recall (single relevant document)", "recall_single_hit"),
        ("Recall (multiple relevant documents)", "recall_multi_hit"),
        ("Mean Reciprocal Rank", "mrr"),
        ("Precision", "precision"),
        ("Mean Average Precision", "map"),
    )
))

In [None]:
# Per-node retrieval limits, keyed by pipeline node name.
# Defined as a plain dict: the assignment previously ended with a trailing
# comma, which made `params` a one-element tuple wrapping the dict.
params = {
    "SparseRetriever": {"top_k": 100},
    "DenseRetriever": {"top_k": 100},
    "JoinDocuments": {"top_k_join": 200},
    "ReRanker": {"top_k": 100},
}

In [None]:
from haystack.schema import EvaluationResult, MultiLabel

# Fetch the aggregated evaluation labels from the label index, leaving out
# negative labels and no-answer samples.
eval_labels = document_store.get_all_labels_aggregated(
    index=label_index,
    drop_negative_labels=True,
    drop_no_answers=True,
)

# Evaluate the pipeline with explicit per-node limits: 10 candidates from each
# retriever, 15 kept after joining, 10 kept after re-ranking.
eval_result = pipeline.eval(
    labels=eval_labels,
    params={
        "SparseRetriever": {"top_k": 10},
        "DenseRetriever": {"top_k": 10},
        "JoinDocuments": {"top_k_join": 15},
        "ReRanker": {"top_k": 10},
    },
)

# Show the first rows of the results recorded for the ReRanker node.
retriever_result = eval_result["ReRanker"]
retriever_result.head()

Unnamed: 0,multilabel_id,query,filters,gold_answers,context,gold_contexts,gold_id_match,context_match,answer_match,gold_id_or_answer_match,...,rank,document_id,gold_document_ids,gold_documents_id_match,gold_contexts_similarity,gold_answers_match,type,node,eval_mode,index
0,1b1215113a85fe0065d50b104320a9ba,روح الهی در چه شکلی بر مریم ظاهر شد؟,b'null',[انسانی بی‌عیب و نقص],هنگامی را که خداوند به عیسی بن مریم گفت: «یاد...,[و در این کتاب ، مریم را یاد کن، آن هنگام که ا...,0.0,0.0,0.0,0.0,...,1.0,fdfeb89addc5d7621da5a553bbdcf31f-0,[479193f847928d2d0432d033dcb7539d-0],[0.0],[44.98093181528009],[0.0],document,ReRanker,integrated,0
1,1b1215113a85fe0065d50b104320a9ba,روح الهی در چه شکلی بر مریم ظاهر شد؟,b'null',[انسانی بی‌عیب و نقص],و زمانی را که گفتید: «ای موسی! هرگز حاضر نیست...,[و در این کتاب ، مریم را یاد کن، آن هنگام که ا...,0.0,0.0,0.0,0.0,...,2.0,af7a2c4afd92bbc91a7b9035a2d148a4,[479193f847928d2d0432d033dcb7539d-0],[0.0],[45.070422535211264],[0.0],document,ReRanker,integrated,1
2,1b1215113a85fe0065d50b104320a9ba,روح الهی در چه شکلی بر مریم ظاهر شد؟,b'null',[انسانی بی‌عیب و نقص],اگر تو را تکذیب کنند، پیش از آنها قوم نوح و ع...,[و در این کتاب ، مریم را یاد کن، آن هنگام که ا...,0.0,0.0,0.0,0.0,...,3.0,e585378a61acfe7df4037e812678f784-0,[479193f847928d2d0432d033dcb7539d-0],[0.0],[43.89438943894389],[0.0],document,ReRanker,integrated,2
3,1b1215113a85fe0065d50b104320a9ba,روح الهی در چه شکلی بر مریم ظاهر شد؟,b'null',[انسانی بی‌عیب و نقص],هنگامی که قرآن می‌خوانی، از شرّ شیطان مطرود، ب...,[و در این کتاب ، مریم را یاد کن، آن هنگام که ا...,0.0,0.0,0.0,0.0,...,4.0,85f7bf4730beccd59ecaca869f142bf0-0,[479193f847928d2d0432d033dcb7539d-0],[0.0],[45.366795366795365],[0.0],document,ReRanker,integrated,3
4,1b1215113a85fe0065d50b104320a9ba,روح الهی در چه شکلی بر مریم ظاهر شد؟,b'null',[انسانی بی‌عیب و نقص],ما به موسی کتاب دادیم؛ و بعد از او، پیامبرانی...,[و در این کتاب ، مریم را یاد کن، آن هنگام که ا...,0.0,0.0,0.0,0.0,...,5.0,962ba5cf3c0ed02e75e92a9a8159b780,[479193f847928d2d0432d033dcb7539d-0],[0.0],[46.83098591549296],[0.0],document,ReRanker,integrated,4


In [None]:
# Save the evaluation result, load it back, and report the ReRanker metrics.
eval_result.save("../")
saved_eval_result = EvaluationResult.load("../")
metrics = saved_eval_result.calculate_metrics()
print("\n".join(
    f'ReRanker - {title}: {metrics["ReRanker"][key]}'
    for title, key in (
        ("Recall (single relevant document)", "recall_single_hit"),
        ("Recall (multiple relevant documents)", "recall_multi_hit"),
        ("Mean Reciprocal Rank", "mrr"),
        ("Precision", "precision"),
        ("Mean Average Precision", "map"),
    )
))

ReRanker - Recall (single relevant document): 0.8987341772151899
ReRanker - Recall (multiple relevant documents): 0.8987341772151899
ReRanker - Mean Reciprocal Rank: 0.3781182600802854
ReRanker - Precision: 0.09535864978902953
ReRanker - Mean Average Precision: 0.3575702944110117


In [None]:
# Report the metrics computed for the JoinDocuments node.
print("\n".join(
    f'JoinDocuments - {title}: {metrics["JoinDocuments"][key]}'
    for title, key in (
        ("Recall (single relevant document)", "recall_single_hit"),
        ("Recall (multiple relevant documents)", "recall_multi_hit"),
        ("Mean Reciprocal Rank", "mrr"),
        ("Precision", "precision"),
        ("Mean Average Precision", "map"),
    )
))

JoinDocuments - Recall (single relevant document): 0.8987341772151899
JoinDocuments - Recall (multiple relevant documents): 0.8987341772151899
JoinDocuments - Mean Reciprocal Rank: 0.7728350411894715
JoinDocuments - Precision: 0.09535864978902953
JoinDocuments - Mean Average Precision: 0.7168606992710368


In [None]:
# Report the metrics computed for the SparseRetriever node.
print("\n".join(
    f'SparseRetriever - {title}: {metrics["SparseRetriever"][key]}'
    for title, key in (
        ("Recall (single relevant document)", "recall_single_hit"),
        ("Recall (multiple relevant documents)", "recall_multi_hit"),
        ("Mean Reciprocal Rank", "mrr"),
        ("Precision", "precision"),
        ("Mean Average Precision", "map"),
    )
))

SparseRetriever - Recall (single relevant document): 0.8987341772151899
SparseRetriever - Recall (multiple relevant documents): 0.8987341772151899
SparseRetriever - Mean Reciprocal Rank: 0.7728350411894715
SparseRetriever - Precision: 0.13291139240506325
SparseRetriever - Mean Average Precision: 0.7279403510114782


In [None]:
# Report the metrics computed for the DenseRetriever node.
print("\n".join(
    f'DenseRetriever - {title}: {metrics["DenseRetriever"][key]}'
    for title, key in (
        ("Recall (single relevant document)", "recall_single_hit"),
        ("Recall (multiple relevant documents)", "recall_multi_hit"),
        ("Mean Reciprocal Rank", "mrr"),
        ("Precision", "precision"),
        ("Mean Average Precision", "map"),
    )
))

DenseRetriever - Recall (single relevant document): 0.26582278481012656
DenseRetriever - Recall (multiple relevant documents): 0.25949367088607594
DenseRetriever - Mean Reciprocal Rank: 0.10811231665662044
DenseRetriever - Precision: 0.040506329113924044
DenseRetriever - Mean Average Precision: 0.09395218002812941
