In [1]:
# Mount Google Drive at /content/drive; later cells read the corpus, the chapter
# table and the saved DPR encoders from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q parsivar
!pip install -q farm-haystack[colab,inference,elasticsearch,preprocessing]
!pip install -q datasets sentencepiece
!pip install -q transformers==4.44.2
!pip install --upgrade Pillow

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.2/152.2 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m114.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.0/386.0 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [38]:
import json
import torch
import re
from parsivar import Normalizer
from bs4 import BeautifulSoup
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import pipeline
from haystack.nodes import FARMReader
from haystack.pipelines import Pipeline

# Index Construction

In [12]:
%%bash

# Download the Elasticsearch 7.9.2 Linux tarball quietly (-q), unpack it into the
# current working directory, and give the unpacked tree to the `daemon` user,
# which is the account the server is started under in the next cell.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [13]:
%%bash --bg

# Start the unpacked Elasticsearch server as the `daemon` user; the `--bg` flag on
# the cell magic runs this script in the background so the notebook can continue.
# NOTE(review): presumably run as a non-root user because Elasticsearch refuses to start as root - confirm.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [14]:
from haystack.utils import launch_es
import os
import time
import urllib.request


def wait_for_elasticsearch(url, timeout=120.0, poll_interval=2.0):
    """Block until the Elasticsearch HTTP endpoint at ``url`` answers with 200.

    Replaces a fixed ``time.sleep(30)``: a fixed sleep is either too short (the
    following cells then fail with connection errors) or needlessly long.

    Args:
        url: Base URL of the server, e.g. ``"http://localhost:9200"``.
        timeout: Maximum number of seconds to wait overall.
        poll_interval: Seconds to pause between two connection attempts.

    Raises:
        TimeoutError: If the server did not answer within ``timeout`` seconds.
    """
    deadline = time.monotonic() + timeout
    while True:
        try:
            with urllib.request.urlopen(url, timeout=5) as response:
                if response.status == 200:
                    return
        except OSError:
            # URLError / connection refused / socket timeout: server not up yet.
            pass
        if time.monotonic() >= deadline:
            raise TimeoutError(
                "Elasticsearch at {} did not become ready within {} seconds".format(url, timeout)
            )
        time.sleep(poll_interval)


# NOTE(review): a server is already started from the unpacked tarball in the previous
# cell; launch_es() is kept from the original notebook - confirm it is still needed.
launch_es()
wait_for_elasticsearch(
    "http://{}:9200".format(os.environ.get("ELASTICSEARCH_HOST", "localhost"))
)



In [5]:
from bs4 import BeautifulSoup
import re

# Read the whole corpus file from the mounted Drive and parse it as XML.
with open('/content/drive/MyDrive/corpus/makarem.xml', 'r', encoding="utf8") as f:
    content = f.read()
soup = BeautifulSoup(content, 'xml')

# Each <s> element is treated as one verse.
verses = soup.find_all("s")

# Matches a parenthesised or square-bracketed span (non-greedy, single line).
# Written as a raw string: in a normal string literal "\(" and "\[" are invalid
# escape sequences and trigger a warning on current Python versions.
bracketed_span = re.compile(r"[\(\[].*?[\)\]]")

# One record per verse: its `id` attribute plus the first child node of the
# element with bracketed spans removed.
dict_data_list = [
    {'id': verse.get('id'), 'text': bracketed_span.sub("", verse.contents[0])}
    for verse in verses
]

In [6]:
from datasets import Dataset

# Wrap the list of {'id', 'text'} verse records in a `datasets.Dataset`.
dataset = Dataset.from_list(dict_data_list)

In [7]:
from haystack.schema import Document

# Build one haystack Document per verse record. The verse text is stored both as
# the document content and under the "abstract" meta key, the verse id goes under
# "pmid", and the title is left empty.
documents = [
    Document(
        content=record["text"],
        meta={"title": '', "abstract": record["text"], "pmid": record["id"]},
    )
    for record in dataset
]

In [8]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Names of the Elasticsearch indices used for documents and for labels.
doc_index = "hybrid_docs"
label_index = "hybrid_labels"

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Connect to Elasticsearch (empty username/password: no credentials are sent).
# NOTE(review): embedding_dim=768 presumably has to match the output size of the
# DPR encoders loaded further down - confirm against the saved models.
document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index=doc_index,
    label_index=label_index,
    embedding_field="emb",
    embedding_dim=768,
    excluded_meta_data=["emb"],
)

In [9]:
from haystack.nodes import PreProcessor

# Chunking configuration: split by words into pieces of length 200 with no overlap,
# without respecting sentence boundaries; both cleaning options are switched off.
preprocessor = PreProcessor(
    split_by="word",
    language = 'fa',
    split_length=200,
    split_overlap=0,
    split_respect_sentence_boundary=False,
    clean_empty_lines=False,
    clean_whitespace=False,
)
# Empty both indices before (re)indexing.
# NOTE(review): presumably so that re-running the notebook does not accumulate duplicates - confirm.
document_store.delete_documents(index=doc_index)
document_store.delete_documents(index=label_index)

# Run the verse documents through the preprocessor; the result is written to the
# document store in a later cell.
docs_to_index = preprocessor.process(documents)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Preprocessing: 100%|██████████| 6236/6236 [00:00<00:00, 7571.05docs/s]


In [10]:
from haystack.nodes import DensePassageRetriever, BM25Retriever

# Sparse (BM25) retriever over the Elasticsearch document store.
sparse_retriever = BM25Retriever(document_store=document_store)
# Dense passage retriever over the same store; its query and passage encoders are
# loaded from the mounted Drive and it is asked to run on the GPU.
# NOTE(review): the directory name suggests ParsBERT-based DPR encoders - confirm.
dense_retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="/content/drive/MyDrive/saved_models/dpr_parsbert/query_encoder",
                                  passage_embedding_model="/content/drive/MyDrive/saved_models/dpr_parsbert/passage_encoder",
                                  use_gpu=True,
                                  max_seq_len_passage=256,
                                  embed_title=True)


In [None]:
# Write the preprocessed documents to the store, then have the dense retriever
# compute embeddings for the documents in the `doc_index` index.
document_store.write_documents(docs_to_index)
document_store.update_embeddings(retriever=dense_retriever, index=doc_index)

Updating embeddings:   0%|          | 0/6168 [00:00<?, ? Docs/s]
Create embeddings:   0%|          | 0/6176 [00:00<?, ? Docs/s][A
Create embeddings:   0%|          | 16/6176 [00:01<09:06, 11.26 Docs/s][A
Create embeddings:   1%|          | 32/6176 [00:01<04:24, 23.23 Docs/s][A
Create embeddings:   1%|          | 48/6176 [00:01<02:55, 34.96 Docs/s][A
Create embeddings:   1%|          | 64/6176 [00:01<02:13, 45.85 Docs/s][A
Create embeddings:   1%|▏         | 80/6176 [00:02<01:50, 55.36 Docs/s][A
Create embeddings:   2%|▏         | 96/6176 [00:02<01:36, 63.03 Docs/s][A
Create embeddings:   2%|▏         | 112/6176 [00:02<01:28, 68.17 Docs/s][A
Create embeddings:   2%|▏         | 128/6176 [00:02<01:22, 73.35 Docs/s][A
Create embeddings:   2%|▏         | 144/6176 [00:02<01:17, 77.58 Docs/s][A
Create embeddings:   3%|▎         | 160/6176 [00:03<01:15, 80.13 Docs/s][A
Create embeddings:   3%|▎         | 176/6176 [00:03<01:13, 82.16 Docs/s][A
Create embeddings:   3%|▎         | 192

In [15]:
from haystack.nodes import JoinDocuments, SentenceTransformersRanker

# Node that merges the outputs of several retrievers in "concatenate" mode; the QA
# pipeline below feeds it the sparse and the dense retriever.
join_documents = JoinDocuments(join_mode="concatenate")
# Ranker node loaded from the "NeginShams/cross_encoder_v2" checkpoint; the QA
# pipeline below places it after the join node.
rerank = SentenceTransformersRanker(model_name_or_path="NeginShams/cross_encoder_v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/853 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

# Question classification

In [16]:
def Question_classifier(question):
    """Label a question as 'referential' or 'non_referential'.

    A question is 'referential' when it contains both the word for "chapter"
    ('سوره') and the word for "verse" ('آیه'); every other question is
    'non_referential'.
    """
    mentions_chapter = 'سوره' in question
    mentions_verse = 'آیه' in question
    return 'referential' if (mentions_chapter and mentions_verse) else 'non_referential'

# Find verse address

In [17]:
# Chapter spellings that may appear in a question, mapped to the spelling used as
# key in the chapter table. '\u200c' is the zero-width non-joiner; it is written
# as an escape so the invisible character stays visible in the source.
_CHAPTER_ALIASES = {
    'صاد': 'ص',
    'قاف': 'ق',
    'آل\u200cعمران': 'آل عمران',
    'آلعمران': 'آل عمران',
    'انبیا': 'انبیاء',
    'شرح': 'انشراح',
}

# Persian ordinal words accepted in place of a verse number.
_VERSE_ORDINALS = {
    'اول': '1',
    'دوم': '2',
    'سوم': '3',
    'چهارم': '4',
    'پنجم': '5',
    'ششم': '6',
    'هفتم': '7',
    'هشتم': '8',
    'نهم': '9',
    'دهم': '10',
}

# The ezafe suffix that ends up as its own token in spellings like "سوره<ZWNJ>ی X"
# (Persian and Arabic yeh), compared after removing the zero-width non-joiner.
_EZAFE_TOKENS = ('ی', 'ي')


def _first_word_after(question, keyword):
    """Return the first word following the first ``keyword`` in ``question``, skipping an ezafe token."""
    position = question.find(keyword)
    if position < 0:
        raise ValueError("question does not contain {!r}".format(keyword))
    tokens = question[position + len(keyword):].split()
    if tokens and tokens[0].replace('\u200c', '') in _EZAFE_TOKENS:
        tokens = tokens[1:]
    if not tokens:
        raise ValueError("nothing follows {!r} in the question".format(keyword))
    return tokens[0]


def find_verse_address(question, quran_info_path='/content/drive/MyDrive/QuranInfo.json'):
    """Turn a referential question into the id of the verse it mentions.

    The word after 'آیه' is read as the verse number (digits or an ordinal word)
    and the word after 'سوره' as the chapter name. The chapter name is mapped to
    its number through the chapter table stored as JSON at ``quran_info_path``.

    Args:
        question: Question text containing both 'آیه' and 'سوره'.
        quran_info_path: JSON file holding a list of objects with the keys
            'chapter_name' and 'chapter_number'.

    Returns:
        The verse id as ``'s<chapter_number>.<verse_number>'``, e.g. ``'s74.45'``.

    Raises:
        ValueError: If a keyword is missing, nothing follows it, or the verse
            token is neither a number nor a known ordinal word.
        KeyError: If the chapter name is not in the chapter table, even after
            normalisation.
    """
    # utf_8_sig also accepts files that start with a byte-order mark.
    with open(quran_info_path, 'r', encoding='utf_8_sig') as json_file:
        quran_data_list = json.load(json_file)
    quran_info = {each['chapter_name']: each['chapter_number'] for each in quran_data_list}

    verse_token = _first_word_after(question, 'آیه')
    chapter_name = _first_word_after(question, 'سوره').replace("،", "")
    chapter_name = _CHAPTER_ALIASES.get(chapter_name, chapter_name)

    try:
        chapter_number = quran_info[chapter_name]
    except KeyError:
        # Second attempt with the spelling produced by parsivar's Normalizer; a
        # KeyError from this lookup is deliberately left to propagate.
        chapter_name = Normalizer().normalize(chapter_name)
        chapter_number = quran_info[chapter_name]

    # Drop a glued-on 'سوره' and surrounding punctuation before interpreting the token.
    verse_token = verse_token.replace("سوره", "").strip('،,.؟?:؛')
    verse_token = _VERSE_ORDINALS.get(verse_token, verse_token)
    try:
        # int() also accepts Persian/Arabic digits; str() renders them as ASCII.
        verse_number = str(int(verse_token))
    except ValueError:
        raise ValueError("cannot interpret {!r} as a verse number".format(verse_token)) from None

    # Formatting (rather than '+') also works when the table stores numbers as ints.
    verse_id = 's{}.{}'.format(chapter_number, verse_number)
    return verse_id

    # if verse_id == 's48.پایانی':
    #     verse_id = 's48.29'

    # if verse_id == 's103.اولیه':
    #     verse_id = 's103.2'

    # if verse_id == 's89.آخر':
    #     verse_id = 's89.30'

    # if verse_id == 's96.ابتدایی':
    #     verse_id = 's96.1'


# Reader

In [36]:
def answer_ensemble(context, question):
    """Answer ``question`` from ``context`` with several QA models and keep the best answer.

    Each checkpoint is loaded as a transformers "question-answering" pipeline and
    run on the same (question, context) pair; the answer with the highest score
    wins (the first one on ties).

    Args:
        context: Text the answer is extracted from.
        question: Question text.

    Returns:
        The answer string of the highest-scoring model, or ``''`` when no model
        reports a score above 0.
    """
    checkpoint_list = ['NeginShams/albert-Quran_QA',
                       'NeginShams/xlm-roberta-Quran_QA',
                       'NeginShams/mbert-Quran_QA',
                       'NeginShams/parsbert-Quran_QA']
    device = 0 if torch.cuda.is_available() else -1  # -1 for CPU

    results = []
    # NOTE: every call loads all checkpoints again; cache the pipelines if this
    # function is called for many questions.
    for checkpoint in checkpoint_list:
        question_answerer = pipeline("question-answering", model=checkpoint, device=device)
        result = question_answerer(question=question, context=context)
        result['model_name'] = checkpoint
        results.append(result)

    # Single pass instead of the former nested loop, which repeated the same
    # maximum search once per result.
    best = max(results, key=lambda result: result['score'], default=None)
    if best is None or not best['score'] > 0:
        return ''
    return best['answer']

# **QA** (non-referential)
***Retriever-Reader***

In [41]:
def qa_pipeline(question):
    """Answer a non-referential question with the retriever-reader pipeline.

    For each reader checkpoint a pipeline is assembled from the notebook-level
    nodes: sparse and dense retriever -> join -> re-ranker -> reader. Every
    pipeline is run on ``question`` and the top answer of the highest-scoring
    reader is kept (the first one on ties).

    Args:
        question: Question text.

    Returns:
        The best answer string, or ``''`` when no reader returned an answer with
        a score above 0.
    """
    checkpoint_list = ['NeginShams/albert-Quran_QA',
                       'NeginShams/xlm-roberta-Quran_QA',
                       'NeginShams/mbert-Quran_QA',
                       'NeginShams/parsbert-Quran_QA']
    k = 1

    results = []
    # NOTE: retrieval and re-ranking are repeated for every reader although only
    # the reader differs; hoist them if latency matters.
    for checkpoint in checkpoint_list:
        reader = FARMReader(model_name_or_path=checkpoint, top_k=4, use_gpu=True)
        # Named `qa_graph` rather than `pipeline` so the transformers `pipeline`
        # factory imported at the top of the notebook is not shadowed here.
        qa_graph = Pipeline()
        qa_graph.add_node(component=sparse_retriever, name="SparseRetriever", inputs=["Query"])
        qa_graph.add_node(component=dense_retriever, name="DenseRetriever", inputs=["Query"])
        qa_graph.add_node(component=join_documents, name="JoinDocuments", inputs=["SparseRetriever", "DenseRetriever"])
        qa_graph.add_node(component=rerank, name="ReRanker", inputs=["JoinDocuments"])
        qa_graph.add_node(component=reader, name="Reader", inputs=["ReRanker"])

        prediction = qa_graph.run(
            query=question, params={"SparseRetriever": {"top_k": k},
                                    "DenseRetriever": {"top_k": k},
                                    "JoinDocuments": {"top_k_join": 2*k},
                                    "ReRanker": {"top_k": k},
                                    "Reader": {"top_k": 1}}
        )

        answers = prediction['answers']
        if not answers:
            # This reader found nothing; skip it instead of failing with IndexError.
            continue
        top_answer = answers[0]
        results.append({'answer': top_answer.answer,
                        'score': top_answer.score,
                        'model_name': checkpoint})

    # Single pass instead of the former nested loop, which repeated the same
    # maximum search once per result.
    best = max(results, key=lambda result: result['score'], default=None)
    if best is None or not best['score'] > 0:
        return ''
    return best['answer']

# Question Answering

In [34]:
def find_answer(question):
    """Answer ``question`` and print the result.

    Referential questions (those naming a verse and a chapter) are answered from
    the text of that verse: the verse is looked up by id in the corpus XML,
    bracketed spans are removed, and the reader ensemble extracts the answer.
    All other questions go through the retriever-reader pipeline.

    Args:
        question: Question text.

    Returns:
        None. The answer is printed.

    Raises:
        ValueError: If the referenced verse id is not present in the corpus (or
            the element has no content).
    """
    if Question_classifier(question) == 'referential':
        verse_id = find_verse_address(question)
        file_path = '/content/drive/MyDrive/corpus/makarem.xml'
        # NOTE: the corpus is read and parsed on every call.
        with open(file_path, 'r', encoding="utf8") as f:
            content = f.read()
        soup = BeautifulSoup(content, 'xml')

        verse = soup.find(id=verse_id)
        if verse is None or not verse.contents:
            # Without this check a wrong verse number surfaced as an opaque
            # AttributeError on None.
            raise ValueError("verse {!r} was not found in {}".format(verse_id, file_path))

        # Raw string: "\(" and "\[" are invalid escapes in a normal string literal.
        context = re.sub(r"[\(\[].*?[\)\]]", "", verse.contents[0])

        final_answer = answer_ensemble(context, question)
        print(final_answer)

    else:
        final_answer = qa_pipeline(question)

        print('\n'+ final_answer)



In [43]:
# Referential example: the question names both a verse and a chapter, so the
# answer is extracted from that verse's text.
find_answer("آیه 45 سوره‌ی مدثر به کدام خطر همرنگی و همنشینی با چه کسانی اشاره دارد؟")

اهل باطل


In [44]:
# Non-referential example: no verse/chapter is named, so the retriever-reader
# pipeline is used.
find_answer("بر اساس قرآن کسی که به مکر و نیرنگ دیگران متمایل شود در زمره چه افرادی خواهد بود؟")

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.92 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 30.58 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 36.32 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 36.63 Batches/s]


جاهلان



