In [1]:
from haystack.file_converter.pdf import PDFToTextConverter
# PDF converter used to extract the book's text; configured with
# remove_numeric_tables=True and valid_languages=["en"].
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    valid_languages=["en"],
)

In [2]:
# Locate the input PDF under the current user's home directory instead of
# hardcoding one user's absolute path, so the notebook can run on other machines
# that keep the data in ~/Downloads/data.
from pathlib import Path

DATA_DIR = Path.home() / "Downloads" / "data"
BOOK_PDF = DATA_DIR / "9781839217579-THE_DEEP_LEARNING_WITH_KERAS_WORKSHOP_SECOND_EDITION.pdf"

# Convert the PDF; no extra metadata is attached (meta=None).
book = converter.convert(file_path=BOOK_PDF, meta=None)

In [3]:
# Sanity-check the converter output: container type, number of keys, the key
# names, the length of the extracted text, and the type of the metadata entry.
type(book), len(book), book.keys(), len(book["text"]), type(book["meta"])

(dict, 2, dict_keys(['text', 'meta']), 579763, NoneType)

In [4]:
from haystack.preprocessor.preprocessor import PreProcessor

# We can use PreProcessor to split by passage, sentence, or word:
# - "passage" will split the data (i.e. the book) into two dicts: one with meta _split_id 0 and the other with _split_id 1
# - "sentence" with split length 260 will split the book into 14 dicts; the meta key numbers those dicts (from 0 to 13)
# - "word" with split length 260 will split the book into 329 dicts

In [5]:
# Word-level splitter: split_length=260 words per chunk, with
# split_respect_sentence_boundary=True; all three cleaning options are enabled.
processor_word = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=260,
    split_respect_sentence_boundary=True,
)

[nltk_data] Downloading package punkt to /home/elena/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Run the word-level splitter over the converted book. The recorded warning
# below reports a sentence whose word count exceeds the split length.
word = processor_word.process(book)

A sentence found with word count higher than the split length.


In [7]:
# Confirm the splitter returned a list and show how many chunks it produced.
type(word), len(word)

(list, 329)

In [8]:
from haystack.reader.farm import FARMReader
# Extractive QA reader backed by the deepset/roberta-base-squad2 model.
# NOTE(review): the recorded log below reports `device: cpu n_gpu: 0`, so on
# that machine the reader ran on CPU even though use_gpu=True was requested.
farm_reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

12/02/2020 11:03:12 - INFO - faiss -   Loading faiss with AVX2 support.
12/02/2020 11:03:12 - INFO - faiss -   Loading faiss.
12/02/2020 11:03:13 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
12/02/2020 11:03:13 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
12/02/2020 11:03:25 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
12/02/2020 11:03:25 - INFO - farm.infer -   Got ya 3 p

In [9]:
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
# Connect to the local Elasticsearch instance (no credentials) and use the
# "document" index for everything written and searched in this notebook.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

12/02/2020 11:03:31 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.008s]
12/02/2020 11:03:31 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.005s]
12/02/2020 11:03:31 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.015s]
12/02/2020 11:03:31 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.003s]


In [10]:
# Start from an empty index so that re-running the notebook does not mix these
# word-level chunks with documents left over from an earlier run (the
# paragraph-level documents written further down go into the same index).
document_store.delete_all_documents(index="document")

# Index the word-level chunks of the book.
document_store.write_documents(word)

12/02/2020 11:04:07 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.826s]


In [13]:
from haystack.retriever.sparse import ElasticsearchRetriever
# Sparse retriever that fetches candidate documents from the Elasticsearch store.
retriever = ElasticsearchRetriever(document_store=document_store)

In [14]:
from haystack import Finder

In [15]:
# Combine the reader and the retriever into a single question-answering pipeline.
finder = Finder(farm_reader, retriever)

In [16]:
# Query the word-level index for "layer": a single candidate is retrieved
# (top_k_retriever=1) and the reader is asked for up to five answers from it.
prediction = finder.get_answers(
    question="layer",
    top_k_retriever=1,
    top_k_reader=5,
)

12/02/2020 11:04:59 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.012s]
12/02/2020 11:04:59 - INFO - haystack.finder -   Got 1 candidates from retriever
12/02/2020 11:04:59 - INFO - haystack.finder -   Reader is looking for detailed answer in 1459 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.70s/ Batches]


In [17]:
from haystack.utils import print_answers

In [18]:
# Show each answer together with its surrounding context and score.
print_answers(prediction, details="medium")

[   {   'answer': 'layer 1',
        'context': 'se, W2 has\n'
                   'three rows and three columns because the input to layer 2 '
                   'is layer 1, which has two\n'
                   'nodes, and layer 2 has five nodes.The bias, however, is',
        'score': 1.060227632522583},
    {   'answer': '2',
        'context': 'tivation of layer 1.The output\n'
                   'of layer 1 is, in fact, the input for layer 2.Next, the '
                   'activation of layer 1 is the matrix\n'
                   'multiplied by the weight ma',
        'score': -2.83345890045166}]


In [19]:
from haystack.preprocessor.utils import convert_files_to_dicts
# Second experiment: convert every file in the data directory into
# paragraph-level dicts. The directory is resolved from the current user's home
# instead of a hardcoded absolute path so the notebook runs on other machines.
from pathlib import Path

DATA_DIR = Path.home() / "Downloads" / "data"
dicts = convert_files_to_dicts(str(DATA_DIR), split_paragraphs=True)  # no cleaning function applied

12/02/2020 11:05:45 - INFO - haystack.preprocessor.utils -   Converting /home/elena/Downloads/data/9781839217579-THE_DEEP_LEARNING_WITH_KERAS_WORKSHOP_SECOND_EDITION.pdf


In [20]:
# Empty the "document" index so the word-level chunks written earlier are not
# searched alongside the paragraph-level documents written next.
document_store.delete_all_documents(index="document")

12/02/2020 11:06:08 - INFO - elasticsearch -   POST http://localhost:9200/document/_delete_by_query [status:200 request:0.328s]


In [21]:
# Index the paragraph-level dicts produced by convert_files_to_dicts.
document_store.write_documents(dicts)

12/02/2020 11:06:19 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.981s]
12/02/2020 11:06:20 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.112s]
12/02/2020 11:06:21 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.085s]


In [22]:
# Re-create the retriever after re-indexing; it is constructed exactly as in the
# earlier cell and points at the same document store.
retriever = ElasticsearchRetriever(document_store=document_store)

In [23]:
# Rebuild the question-answering pipeline with the same reader and the new retriever.
finder = Finder(farm_reader, retriever)

In [26]:
# Same "layer" query against the paragraph-level index, this time retrieving
# ten candidates (top_k_retriever=10) and asking for up to five answers.
prediction = finder.get_answers(
    question="layer",
    top_k_retriever=10,
    top_k_reader=5,
)

12/02/2020 11:07:15 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.008s]
12/02/2020 11:07:15 - INFO - haystack.finder -   Got 10 candidates from retriever
12/02/2020 11:07:15 - INFO - haystack.finder -   Reader is looking for detailed answer in 7625 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.10 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.44 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.42 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.46 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.84 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.30 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.10s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.26 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.48 Batches/s]
Inferencing Samples: 100%|██████████|

In [27]:
# Show each answer from the paragraph-level run with its context and score.
print_answers(prediction, details="medium")

[   {   'answer': 'two-layer',
        'context': 'ample.\n'
                   "Let's go through forward propagation equations one by one "
                   'for a two-layer\n'
                   'neural network (shown in the following image) where the '
                   'input data is',
        'score': 1.6360710859298706},
    {   'answer': '1',
        'context': '2. Next, the layer 1 output is computed by applying an '
                   'activation function to z1, which\n'
                   'is the output of the previous step:\n'
                   'a1 = tanh(z1)\n'
                   '3. a1 is the',
        'score': 0.4360466003417969},
    {   'answer': 'four',
        'context': '2. Import all the necessary dependencies. Build a '
                   'four-layer Keras sequential model\n'
                   'without dropout regularization. Build the network with 16 '
                   'units in',
        'score': 0.2117246687412262},
    {   'answer': 'two hidden layers',
