In [8]:
import logging

from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import convert_files_to_docs
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline

In [9]:
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [10]:
document_store = InMemoryDocumentStore(use_bm25=True)

doc_dir = "./api/data/build_your_first_question_answering_system/"

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [11]:
all_docs = convert_files_to_docs(dir_path=doc_dir)

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
docs = preprocessor.process(all_docs)

print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

pdftotext version 22.02.0
Copyright 2005-2022 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011 Glyph & Cog, LLC
INFO:haystack.utils.preprocessing:Converting api/data/build_your_first_question_answering_system/politique_de_confidentialite_donnees_personnelles_bdf_mars_2020.pdf


Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]



n_files_input: 1
n_docs_output: 29


In [12]:
document_store.write_documents(docs)

Updating BM25 representation...:   0%|          | 0/29 [00:00<?, ? docs/s]

In [13]:
retriever = BM25Retriever(document_store=document_store)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

pipe = ExtractiveQAPipeline(reader, retriever)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
  return self.fget.__get__(instance, owner)()
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [14]:
prediction = pipe.run(
    query="Quelle est la banque?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)

prediction



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

{'query': 'Quelle est la banque?',
 'no_ans_gap': 2.915030598640442,
 'answers': [<Answer {'answer': 'confiée', 'type': 'extractive', 'score': 0.5874723196029663, 'context': ' obligation\nlégale, d’exécuter une mission d’intérêt public qui lui est confiée ou un contrat (par exemple : personnes\nhabilitées à consulter les fich', 'offsets_in_document': [{'start': 426, 'end': 433}], 'offsets_in_context': [{'start': 72, 'end': 79}], 'document_ids': ['5a05a44c9c48514d0cc494a65e1b7026'], 'meta': {'name': 'politique_de_confidentialite_donnees_personnelles_bdf_mars_2020.pdf', '_split_id': 15}}>,
  <Answer {'answer': 'conduite', 'type': 'extractive', 'score': 0.29515087604522705, 'context': ' Pour mener à bien l’ensemble de ses missions, la Banque de France est\nconduite à collecter et à traiter différentes catégories de données à caractère', 'offsets_in_document': [{'start': 209, 'end': 217}], 'offsets_in_context': [{'start': 71, 'end': 79}], 'document_ids': ['30c832b416df4bb6900469895ffb36d5'