In [1]:
import logging

from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import convert_files_to_docs
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline

In [2]:
# Configure logging for the notebook: WARNING and above from every library,
# but INFO from Haystack so that pipeline progress stays visible.
#
# force=True (Python 3.8+) removes any handler already attached to the root
# logger before installing the new one. Without it, basicConfig() is a silent
# no-op whenever a handler already exists, and the format below is never
# applied.
logging.basicConfig(
    format="%(levelname)s - %(name)s -  %(message)s",
    level=logging.WARNING,
    force=True,
)
logging.getLogger("haystack").setLevel(logging.INFO)

In [3]:
# Directory holding the source files to index (relative to the working directory).
doc_dir = "./api/data/build_your_first_question_answering_system/"

# In-memory store with BM25 enabled, as required by the BM25Retriever used later.
document_store = InMemoryDocumentStore(
    use_bm25=True,
)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [4]:
# Convert every file found under doc_dir into Haystack documents.
all_docs = convert_files_to_docs(dir_path=doc_dir)

# Clean the raw text, then split it into passages of roughly 100 words
# without cutting sentences in half.
preprocessor = PreProcessor(
    # cleaning options
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=False,
    # splitting options
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
docs = preprocessor.process(documents=all_docs)

# Report how many input files produced how many passages.
print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

pdftotext version 22.02.0
Copyright 2005-2022 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011 Glyph & Cog, LLC
INFO:haystack.utils.preprocessing:Converting api/data/build_your_first_question_answering_system/politique_de_confidentialite_donnees_personnelles_bdf_mars_2020.pdf


Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]



n_files_input: 1
n_docs_output: 29


In [5]:
# Index the preprocessed passages in the document store.
document_store.write_documents(documents=docs)

Updating BM25 representation...:   0%|          | 0/29 [00:00<?, ? docs/s]

In [6]:
# Sparse retriever that fetches candidate passages from the document store.
retriever = BM25Retriever(document_store=document_store)

# Reader that extracts answer spans from the retrieved passages.
reader = FARMReader(
    model_name_or_path="etalab-ia/camembert-base-squadFR-fquad-piaf",
    use_gpu=True,
)

# Retriever -> Reader extractive question-answering pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


Downloading (…)lve/main/config.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'etalab-ia/camembert-base-squadFR-fquad-piaf' (Camembert)


Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
INFO:haystack.modeling.model.language_model:Auto-detected model language: french
INFO:haystack.modeling.model.language_model:Loaded 'etalab-ia/camembert-base-squadFR-fquad-piaf' (Camembert model) from model hub.


Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [7]:
# Per-node settings: the retriever passes 10 candidate passages to the reader,
# and the reader returns its 5 best answers.
node_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}

prediction = pipe.run(query="Quelle est la banque?", params=node_params)



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [8]:
# Display the full pipeline result, including the query, `no_ans_gap` and the answers.
prediction

{'query': 'Quelle est la banque?',
 'no_ans_gap': 28.806551933288574,
 'answers': [<Answer {'answer': 'Banque de France', 'type': 'extractive', 'score': 0.9490039348602295, 'context': 'ans les\ndocuments de collecte, la(les) convention(s) la liant à la Banque de France, responsable de traitement, ou encore\nles « Infos légales » située', 'offsets_in_document': [{'start': 271, 'end': 287}], 'offsets_in_context': [{'start': 67, 'end': 83}], 'document_ids': ['af03ed4d30d5944406b34bd1e4e5d6bc'], 'meta': {'name': 'politique_de_confidentialite_donnees_personnelles_bdf_mars_2020.pdf', '_split_id': 24}}>,
  <Answer {'answer': 'Banque de France', 'type': 'extractive', 'score': 0.9353502988815308, 'context': 'ces à l’économie. Pour mener à bien l’ensemble de ses missions, la Banque de France est\nconduite à collecter et à traiter différentes catégories de do', 'offsets_in_document': [{'start': 188, 'end': 204}], 'offsets_in_context': [{'start': 67, 'end': 83}], 'document_ids': ['30c832b416df4bb6

In [9]:
# Display only the list of Answer objects from the pipeline result.
prediction['answers']

[<Answer {'answer': 'Banque de France', 'type': 'extractive', 'score': 0.9490039348602295, 'context': 'ans les\ndocuments de collecte, la(les) convention(s) la liant à la Banque de France, responsable de traitement, ou encore\nles « Infos légales » située', 'offsets_in_document': [{'start': 271, 'end': 287}], 'offsets_in_context': [{'start': 67, 'end': 83}], 'document_ids': ['af03ed4d30d5944406b34bd1e4e5d6bc'], 'meta': {'name': 'politique_de_confidentialite_donnees_personnelles_bdf_mars_2020.pdf', '_split_id': 24}}>,
 <Answer {'answer': 'Banque de France', 'type': 'extractive', 'score': 0.9353502988815308, 'context': 'ces à l’économie. Pour mener à bien l’ensemble de ses missions, la Banque de France est\nconduite à collecter et à traiter différentes catégories de do', 'offsets_in_document': [{'start': 188, 'end': 204}], 'offsets_in_context': [{'start': 67, 'end': 83}], 'document_ids': ['30c832b416df4bb6900469895ffb36d5'], 'meta': {'name': 'politique_de_confidentialite_donnees_personne