In [1]:
import time

# Start the wall-clock timer before the remaining imports, so the elapsed time
# computed in the final cell (end - start) also covers them.
start = time.time()

import logging

from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import convert_files_to_docs
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline

2023-03-24 13:59:40.617374: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Configure the root logger: WARNING and above, custom "LEVEL - name -  message" layout.
# force=True (Python 3.8+) replaces any handler already attached to the root
# logger; without it basicConfig() silently does nothing when a handler exists,
# and the format/level requested here are ignored.
logging.basicConfig(
    format="%(levelname)s - %(name)s -  %(message)s",
    level=logging.WARNING,
    force=True,
)
# Let the haystack loggers emit INFO messages despite the WARNING root level.
logging.getLogger("haystack").setLevel(logging.INFO)

In [3]:
# Folder containing the source files to index.
doc_dir = "./api/data/build_your_first_question_answering_system/"

# In-memory document store with BM25 enabled (a BM25 retriever is built on it later).
document_store = InMemoryDocumentStore(use_bm25=True)


INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [4]:
# Convert the files found under doc_dir into haystack documents.
all_docs = convert_files_to_docs(dir_path=doc_dir)

# Cleaning / splitting options: French text, split by word with
# split_length=100 while respecting sentence boundaries.
preprocessing_options = dict(
    language='fr',
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
preprocessor = PreProcessor(**preprocessing_options)
docs = preprocessor.process(all_docs)

# Replace the remaining line breaks with spaces, in place, on every split document.
for split_doc in docs:
    split_doc.content = split_doc.content.replace('\n', ' ')

# Report how many input files produced how many split documents.
print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

pdftotext version 22.02.0
Copyright 2005-2022 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011 Glyph & Cog, LLC
INFO:haystack.utils.preprocessing:Converting api/data/build_your_first_question_answering_system/politique_de_confidentialite_donnees_personnelles_bdf_mars_2020.pdf


Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]



n_files_input: 1
n_docs_output: 29


In [5]:
# Write the preprocessed documents into the document store.
document_store.write_documents(documents=docs)

Updating BM25 representation...:   0%|          | 0/29 [00:00<?, ? docs/s]

In [6]:
# BM25 retriever reading from the document store populated above.
retriever = BM25Retriever(document_store)


In [7]:
# English reader: fetch from the HuggingFace model hub, save it locally in
# FARM format, then reload it from that local copy.
# NOTE(review): reader1 / reader_local1 are not referenced by any later cell
# visible in this notebook - confirm whether this cell is still needed.
EN_MODEL_HUB_NAME = "deepset/roberta-base-squad2"
EN_MODEL_DIR = "./api/data/model_en"

reader1 = FARMReader(model_name_or_path=EN_MODEL_HUB_NAME)
reader1.save(EN_MODEL_DIR)
reader_local1 = FARMReader(model_name_or_path=EN_MODEL_DIR)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
  return self.fget.__get__(instance, owner)()
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.nodes.reader.farm:Saving reader model to ./api/data/model_en
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.model.language_model: * LOADING MODEL: './api/data/model_en' (Roberta)
INFO:haystack.modeling.model.language_model:Loaded './api/data/model_en' (Roberta model) from local file system.
INFO:haystack.modeling.model.adapti

In [8]:
# French reader: fetch from the HuggingFace model hub, save it locally in
# FARM format, then reload it from that local copy. The locally reloaded
# reader (reader_local) is the one used to build the QA pipeline.
FR_MODEL_HUB_NAME = "etalab-ia/camembert-base-squadFR-fquad-piaf"
FR_MODEL_DIR = "./api/data/model_fr"

reader = FARMReader(model_name_or_path=FR_MODEL_HUB_NAME)
reader.save(FR_MODEL_DIR)
reader_local = FARMReader(model_name_or_path=FR_MODEL_DIR)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'etalab-ia/camembert-base-squadFR-fquad-piaf' (Camembert)
INFO:haystack.modeling.model.language_model:Auto-detected model language: french
INFO:haystack.modeling.model.language_model:Loaded 'etalab-ia/camembert-base-squadFR-fquad-piaf' (Camembert model) from model hub.
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.nodes.reader.farm:Saving reader model to ./api/data/model_fr
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.model.language_model: * LOADING MODEL: './api/data/model_fr' (Camembert)
INFO:haystack.modeling.model.language_model:Loaded './api/data/model_fr' (Camembert model) from local file system.
INFO:haystack.modeling.model.adaptive_mode

In [9]:
# Extractive QA pipeline combining the BM25 retriever with the local French reader.
pipe = ExtractiveQAPipeline(reader=reader_local, retriever=retriever)

In [10]:
# Question to ask, and the per-node top_k settings passed to the pipeline.
query_text = "De quel système fait parti intégrante la banque de france?"
run_params = {
    "Retriever": {"top_k": 10},
    "Reader": {"top_k": 5},
}

prediction = pipe.run(query=query_text, params=run_params)



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [11]:
# Display the raw result returned by pipe.run (query, no_ans_gap, answers, ...).
prediction

{'query': 'De quel système fait parti intégrante la banque de france?',
 'no_ans_gap': 28.93003273010254,
 'answers': [<Answer {'answer': 'paiement', 'type': 'extractive', 'score': 0.9637909531593323, 'context': 'les fichiers tenus par la Banque de France, autorités de supervision, prestataires de services de paiement et gestionnaires de systèmes de paiement). ', 'offsets_in_document': [{'start': 594, 'end': 602}], 'offsets_in_context': [{'start': 98, 'end': 106}], 'document_ids': ['5a05a44c9c48514d0cc494a65e1b7026'], 'meta': {'name': 'politique_de_confidentialite_donnees_personnelles_bdf_mars_2020.pdf', '_split_id': 15}}>,
  <Answer {'answer': 'Banque de France', 'type': 'extractive', 'score': 0.85355544090271, 'context': 'fidentialité est effective dès sa publication sur les sites Internet de la Banque de France. Seule la version en vigueur est accessible sur ces sites.', 'offsets_in_document': [{'start': 125, 'end': 141}], 'offsets_in_context': [{'start': 75, 'end': 91}], 'document_

In [12]:
# Print each extracted answer with its score rounded to two decimals.
# The loop variable is deliberately not named `prediction`: reusing that name
# would rebind the result dict to the last Answer object, so re-running this
# cell (or any cell reading `prediction`) would fail or show stale data.
for answer in prediction['answers']:
    print(answer.answer, round(answer.score, 2))

paiement 0.96
Banque de France 0.85
Vos demandes en ligne 0.74
Système européen de banques centrales 0.66
Banque de France a désigné un délégué à la protection des données 0.56


In [13]:
# Wall-clock seconds elapsed since `start` was recorded in the first cell.
end = time.time()
time_out = end - start

time_out

24.14249897003174