In [None]:
#importing required libraries
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader
from haystack.nodes import TextConverter, PDFToTextConverter, PreProcessor
import os

# Silence every Python warning so the notebook output stays readable.
# NOTE(review): the blanket "ignore" filter also hides deprecation warnings —
# consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [None]:
from haystack.document_stores import InMemoryDocumentStore,SQLDocumentStore

# Document store that holds the corpus for the retrievers used below.
# The in-memory variant is used here; presumably its contents do not survive
# a kernel restart — TODO confirm.
document_store = InMemoryDocumentStore()

# Alternative that was tried: a SQLite-backed store.
# document_store = SQLDocumentStore(url="sqlite:///qa.db")

In [None]:
# filename = "Employee Manual_India.pdf"
# converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
# doc_pdf = converter.convert(file_path=str("E:\\AA_TE_SEM_VI\\persistant\\learning material\\contextual search\\data\\" + filename), meta=None)[0]
# doc_pdf
# Directory holding the source files, resolved relative to the current working
# directory. os.path.join picks the separator of the host OS (the previous
# hard-coded "\\data\\" suffix only produced a valid path on Windows); the
# trailing "" keeps the trailing separator, so the value is unchanged on Windows.
data_dir = os.path.join(os.getcwd(), "data", "")

# Iterate over the files in data_dir and convert them to Haystack documents.
all_docs = convert_files_to_docs(dir_path=data_dir)

In [None]:
# Display the first converted document as a quick sanity check.
all_docs[0]

In [None]:
# Cleaning and splitting options applied to every converted document.
preprocessing_options = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,  # original author's note: this used to be False
    "split_by": "word",
    "split_length": 100,
    "split_respect_sentence_boundary": True,
}
preprocessor = PreProcessor(**preprocessing_options)

# Run the preprocessor over the converted files and report how many
# documents went in and how many came out.
docs = preprocessor.process(all_docs)
print(f"\n n_files_input: {len(all_docs)}\n n_docs_output: {len(docs)}")

# Store the processed documents so the retrievers can query them.
document_store.write_documents(docs)

In [None]:
# Number of documents produced by the preprocessing step.
len(docs)

In [None]:
from haystack.nodes import BM25Retriever,TfidfRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TransformersReader
from haystack.nodes import EmbeddingRetriever

In [None]:
# Build the first question-answering pipeline: a TF-IDF retriever over the
# document store, followed by an extractive reader.

# Retriever backed by the document store populated above.
retriever = TfidfRetriever(document_store=document_store)
# Alternative that was tried: BM25, which (per the original note) worked with
# Elasticsearch as the document store on Colab.
# retriever = BM25Retriever(document_store=document_store) #worked with elasticsearch as document store on colab

# Reader loaded from the "deepset/roberta-base-squad2" model.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# Combine reader and retriever into one pipeline object.
pipe = ExtractiveQAPipeline(reader, retriever)

In [None]:
# Question put to the TF-IDF pipeline. An earlier run asked
# "What is the frequency of project party?" with the same parameters.
budget_query = "What is the budget for project party?"

# Per-node settings: top_k of 10 for the retriever and 5 for the reader.
qa_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}

prediction = pipe.run(query=budget_query, params=qa_params)

In [None]:
from pprint import pprint
# print(len(prediction))
# Pretty-print the full result returned by the first pipeline.
pprint(prediction)


# The commented block below appears to be one answer entry saved from an
# earlier run, kept for reference — TODO confirm which query produced it.
# {'answer': 'Once in six months', 
#  'type': 'extractive',
#  'score': 0.5211937427520752, 
#  'context': 'es are held outside normal working hours. 2. Budget:\n• Frequency: Once in six months (i.e. 2 times in year)\n• Amount: Rs. 750/- per head per duration\n',
#  'offsets_in_document': [{'start': 470, 'end': 488}],
#  'offsets_in_context': [{'start': 66, 'end': 84}],
#  'document_id': '6d7afbcd7156492026b208c99c427021', 
#  'meta': {'name': 'Employee Manual_India.pdf', '_split_id': 236}
# }

In [None]:
# Print the first pipeline's answers at the "medium" detail level.
print_answers(prediction, details="medium")

In [None]:
# prediction

In [None]:
# Second pipeline: same reader model, but with an embedding-based retriever
# in place of TF-IDF, so the two retrievers can be compared on the same query.
retriever2 = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers"
)

# Update the embeddings held in the document store using retriever2.
document_store.update_embeddings(retriever2)

# Separate reader instance loaded from the same model as the first pipeline.
reader2 = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

pipe2 = ExtractiveQAPipeline(reader2, retriever2)

# Same query and top_k settings as the first pipeline.
prediction2 = pipe2.run(query="What is the budget for project party?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})

# print(len(prediction))
pprint(prediction2)
# Original author's note: this cell takes roughly 7-8 minutes to run. Nothing is
# trained here; the cell only updates embeddings and runs the query.

In [None]:
# Print the embedding-retriever pipeline's answers at the "medium" detail level.
print_answers(prediction2, details="medium")

In [None]:
#Trial1 : using elasticsearch

# import os
# import time
# from subprocess import Popen, PIPE, STDOUT
# from haystack.document_stores import ElasticsearchDocumentStore
# from haystack.utils import launch_es
# curr_dir = os.getcwd()

# launch_es()
# es_server = Popen([curr_dir + "\\elasticsearch-7.9.2\\bin\\elasticsearch.bat"], stdout=PIPE, stderr=STDOUT)

# print("Waiting for ES to start...")
# time.sleep(30) # Waiting until ES has started

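# # NOTE: a credential-like string was redacted from this line; supply Elasticsearch credentials via environment variables, never in the notebook.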
# document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")