In [1]:
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

In [3]:
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.document_store.faiss import FAISSDocumentStore

# document_store_faiss = FAISSDocumentStore(sql_url = "sqlite:///",        # SQL DB for text + meta data
#                                    vector_size = 768)
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")


09/26/2020 16:11:17 - INFO - faiss -   Loading faiss.
09/26/2020 16:11:17 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.026s]
09/26/2020 16:11:17 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.002s]


In [4]:
# Let's first fetch some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "data/10k-txt"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert files to dicts
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# We now have a list of dictionaries that we can write to our document store.
# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself.
# The default format here is:
# {
#    'text': "<DOCUMENT_TEXT_HERE>",
#    'meta': {'name': "<DOCUMENT_NAME_HERE>", ...}
#}
# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and
# can be accessed later for filtering or shown in the responses of the Finder)

# Let's have a look at the first 3 entries:
print(dicts[:3])

# Now, let's write the dicts containing documents to our DB.
document_store.write_documents(dicts)

09/26/2020 16:11:34 - INFO - haystack.preprocessor.utils -   Found data stored in `data/article_txt_got`. Delete this first if you really want to fetch new data.


[{'text': "Linda Antonsson and Elio García at Archipelacon on June 28, 2015.\n'''Elio Miguel García Jr.''' (born May 6, 1978) and '''Linda Maria Antonsson''' (born November 18, 1974) are authors known for their contributions and expertise in the ''A Song of Ice and Fire'' series by George R. R. Martin, co-writing in 2014 with Martin ''The World of Ice & Fire'', a companion book for the series. They are also the founders of the fansite Westeros.org, one of the earliest fan websites for ''A Song of Ice and Fire''.", 'meta': {'name': '145_Elio_M._García_Jr._and_Linda_Antonsson.txt'}}, {'text': '\n==Career==\nElio García was attending the University of Miami, while his partner Linda Antonsson was living in Sweden. At that time, in 1996, Antonsson introduced García to the \'\'A Song of Ice and Fire\'\' book series when it came out on paperback. After the second book, \'\'A Clash of Kings\'\', was released, they decided to create a forum for discussion of the series, creating an early iterat

In [9]:
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.retriever.dense import DensePassageRetriever
from haystack.retriever.base import BaseRetriever

retriever = ElasticsearchRetriever(document_store=document_store)

In [10]:
reader = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2", use_gpu=False)
finder = Finder(reader, retriever)

09/26/2020 16:18:39 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
09/26/2020 16:18:39 - INFO - farm.infer -   Could not find `bert-large-uncased-whole-word-masking-finetuned-squad` locally. Try to download from model hub ...
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
09/26/2020 16:18:56 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
09/26/2020 16:18:56 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
09/26/2020 16:18:56 - INFO - farm.infer -    0    0    0    0    0    0    0 
09/26/2020 16:18:56 - INFO - farm.infer -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
09/26/2020 16:18:56 - INFO - farm.infer -   /'\  / \  /'\  /'\  / \  / \  /'\
09/26/2020 16:18:56 - INFO - farm.infer -               


In [14]:
prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=1, top_k_reader=1)

09/26/2020 16:19:35 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.009s]
09/26/2020 16:19:35 - INFO - haystack.retriever.sparse -   Got 1 candidates from retriever
09/26/2020 16:19:35 - INFO - haystack.finder -   Reader is looking for detailed answer in 868 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.17 Batches/s]


In [15]:
prediction

{'question': 'Who is the father of Arya Stark?',
 'no_ans_gap': 13.233774662017822,
 'answers': [{'answer': 'Robb',
   'score': 7.357494831085205,
   'probability': 0.7149782957253283,
   'context': 'allow the army to cross the river and to commit his troops in return for Robb and Arya Stark marrying two of his children.\nTyrion Lannister suspects h',
   'offset_start': 73,
   'offset_end': 77,
   'offset_start_in_doc': 193,
   'offset_end_in_doc': 197,
   'document_id': '111ec714-331e-46d6-9421-705d1a1bf3c4',
   'meta': {'name': '450_Baelor.txt'}}]}

In [13]:
print_answers(prediction, details="minimal")

[   {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Ned',
        'context': 'rya to reveal her true identity, and is surprised to learn '
                   "she is in fact Ned Stark's daughter. After the Goldcloaks "
                   'get help from Ser Amory Lorch and'},
    {   'answer': 'Ned',
        'context': 'rya to reveal her true identity, and is surprised to learn '
                   "she is in fact Ned Stark's daughter. After the Goldcloaks "
           

In [10]:
import requests

In [11]:
res = requests.get('https://www.sec.gov/Archives/edgar/data/0000320193/000032019319000119/a10-k20199282019.htm')

In [14]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text, 'html.parser')

import re

clean_text = re.sub('\s+', ' ', soup.text)


In [15]:
dicts = [
    {
        'text': clean_text,
        'meta': {'name': 'doc1'}
    }
]

In [16]:
document_store.write_documents(dicts)

09/25/2020 20:10:42 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.037s]


In [17]:
prediction = finder.get_answers(question="What is the federal income tax rate?", top_k_retriever=10, top_k_reader=5)

09/25/2020 20:12:37 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.019s]
09/25/2020 20:12:37 - INFO - haystack.retriever.sparse -   Got 10 candidates from retriever
09/25/2020 20:12:37 - INFO - haystack.finder -   Reader is looking for detailed answer in 274487 chars ...
Inferencing Samples: 100%|██████████| 10/10 [08:12<00:00, 49.23s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.08 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.11 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.06 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.05 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:04<00:00,  4.47s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:04<00:00,  4.42s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.12s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.04s/ Batches]
Inferencing Samples: 10

In [18]:
print_answers(prediction, details="minimal")

[   {   'answer': '24.5% 35.0%',
        'context': 'ctive tax rate15.9% 18.3% 24.6%Statutory federal income '
                   'tax rate21.0% 24.5% 35.0%On December 22, 2017, the U.S. '
                   'enacted the Tax Cuts and Jobs Act (the'},
    {   'answer': '21%',
        'context': ' lowered the Company’s U.S. statutory federal income tax '
                   'rate from 35% to 21% effective January 1, 2018, while also '
                   'imposing a deemed repatriation tax'},
    {   'answer': '21.0%',
        'context': '5,738Effective tax rate15.9% 18.3% 24.6%Statutory federal '
                   'income tax rate21.0% 24.5% 35.0%On December 22, 2017, the '
                   'U.S. enacted the Tax Cuts and Jobs'},
    {   'answer': 'about a page of art for each page of text',
        'context': 'tory and atmosphere of the novel closely, at a rate of '
                   'about a page of art for each page of text, and was '
                   'projected to cover 24 issues

In [20]:
from haystack.document_store