<a href="https://colab.research.google.com/github/JonathanCecil01/OfficePlacementM7/blob/main/Haystack_Based_Context_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash

# Environment setup: upgrade pip, then install farm-haystack with the extras this notebook relies on.
# The requirement is quoted so the shell passes it to pip verbatim; unquoted, the
# [...] extras list is a glob pattern that bash may try to expand against local files.
pip install --upgrade pip
pip install "farm-haystack[colab,faiss,inference,ocr,preprocessing,file-conversion,pdf]"

In [2]:
import logging

# Root logging configuration: WARNING threshold, "LEVEL - logger -  message" layout.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
# The "haystack" logger gets its own, lower threshold (INFO).
logging.getLogger("haystack").setLevel(logging.INFO)

In [None]:
from haystack.document_stores import FAISSDocumentStore

# FAISS-backed document store built with the "Flat" index factory string.
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
)

In [None]:
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http
from haystack.nodes import PreProcessor

# Directory containing the source files to index (a Colab-style runtime path).
doc_dir = "/content/data"

# Convert every file found in doc_dir into Haystack documents, cleaning the text
# with clean_wiki_text and splitting it into paragraphs.
docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Normalise whitespace / empty lines and re-split the documents by word count
# (split_length=100), with split_respect_sentence_boundary enabled.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
preprocessed_docs = preprocessor.process(docs)

# Report the actual number of converted input documents next to the number of
# documents produced by the preprocessor.
print(f"n_docs_input: {len(docs)}\nn_docs_output: {len(preprocessed_docs)}")

# Persist the preprocessed documents in the document store.
document_store.write_documents(preprocessed_docs)

In [None]:
from haystack.nodes import EmbeddingRetriever

# Embedding-based retriever bound to the document store created earlier.
retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    document_store=document_store,
)

# The documents were indexed before this retriever existed, so their embedding
# representations still have to be computed: update_embeddings() walks over all
# previously indexed documents and updates them using the retriever.
# This can be slow for a large corpus, but it is a one-off cost. At query time
# only the query itself is embedded and compared against the stored document
# embeddings, which is very fast.
document_store.update_embeddings(retriever)

In [None]:
from haystack.nodes import FARMReader

# Reader node loaded from the "deepset/roberta-base-squad2" model, with GPU use requested.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

In [7]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader and the retriever into a single extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
from haystack.utils import print_answers

# Run one question through the pipeline, with top_k=10 for the Retriever node
# and top_k=5 for the Reader node.
prediction = pipe.run(
    query="How much is the Main Fund?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

# Print the source file name of the top-ranked answer. Guarded so that an empty
# answer list (IndexError) or metadata without a "name" entry (KeyError) does
# not abort the cell before the answers are printed.
if prediction["answers"]:
    print((prediction["answers"][0].meta or {}).get("name"))
else:
    print("No answers returned.")
print_answers(prediction, details="minimum")

In [None]:
# Show the full metadata of the top-ranked answer; fall back to a short message
# instead of raising IndexError when the query produced no answers.
print(prediction["answers"][0].meta if prediction["answers"] else "No answers returned.")

In [None]:
# Questions to run against the indexed documents. Each query string is also the
# key under which its answers are stored in `predictions`.
# NOTE(review): "secition" below is a typo for "section". It is kept verbatim
# because the query text doubles as the key in the saved results; fix it
# together with any consumer of those keys.
queries = [
    "what is the Fund Name?",
    "When is the Start Date?",
    "Which Section has Carried Interest?",
    "Who is the General Partner?",
    "Which secition is about Initial Closing Date?",
    "When is the Final Closing Date?",
    "Which is the Management Company?",
    "What are the Investment Limitations?",
    "What is the Purpose?",
    "How long is the Partnership Term?",
    "How much is the Main Fund?",
    "How Much is the Transaction Fees?",
    "How much is the Makeup Contribution?",
]

# Maps each query to a list of records, one per returned answer, with the keys
# "answer", "context", "filename" and "score".
predictions = {}
for query in queries:
    # A loop-local name is used so the `prediction` object inspected by the
    # earlier cells is not silently overwritten by the last query of this loop.
    query_result = pipe.run(query=query, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
    predictions[query] = [
        {
            "answer": answer.answer,
            "context": answer.context,
            # An answer without a source file name is recorded as None instead
            # of aborting the whole batch with a KeyError.
            "filename": (answer.meta or {}).get("name"),
            "score": answer.score,
        }
        for answer in query_result["answers"]
    ]
print(predictions)

In [24]:
import json
# Serialise the collected predictions as pretty-printed JSON (2-space indent)
# and write the text to context_search_results.json in the working directory.
json_string = json.dumps(predictions, indent=2)
with open("context_search_results.json", "w") as results_file:
    results_file.write(json_string)