In [1]:
# Make sure you have a GPU running: list the NVIDIA devices visible to this runtime
# (the reader created further down is initialised with use_gpu=True)
!nvidia-smi

Wed Jul 27 07:44:18 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest master of Haystack
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

In [3]:
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader

In [4]:
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
# wait until ES has started
! sleep 30

In [5]:
# Open a document store on the Elasticsearch node at localhost;
# documents are kept in the "document" index and no credentials are supplied.
from haystack.document_stores import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

In [6]:
# Directory holding the custom files to index
doc_dir = "/content/custom_data"

# Convert the files under doc_dir with `convert_files_to_docs`
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Peek at the first three converted entries
print(docs[:3])

# Now, let's write the converted documents to our DB.
document_store.write_documents(docs)



## Initialize Retriever, Reader & Pipeline

### Retriever

Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered.
They use simple but fast algorithms.

**Here:** We use Elasticsearch's default BM25 algorithm

**Alternatives:**

- Customize the `BM25Retriever` with custom queries (e.g. boosting) and filters
- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging
- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)
- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6)

In [7]:
from haystack.nodes import BM25Retriever

# BM25 retriever that reads from the Elasticsearch-backed document store created earlier
retriever = BM25Retriever(document_store=document_store)

In [8]:
# Load a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)
# Here a hub identifier is given; use_gpu=True requests GPU inference.

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

#### TransformersReader

In [None]:
# Alternative (left disabled): use a TransformersReader in place of the FARMReader.
# NOTE(review): `use_gpu=-1` looks like an integer-style device flag, whereas the FARMReader
# in this notebook is given `use_gpu=True` -- confirm the expected type before enabling this line.
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

In [9]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader and the retriever into a single question-answering pipeline
pipe = ExtractiveQAPipeline(reader, retriever)

In [10]:
# You can configure how many candidates the Reader and Retriever shall return
# through the per-node "top_k" entries of `params`.
# The higher the Retriever's top_k, the better (but also the slower) your answers.
prediction = pipe.run(
    query="what is abilify?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

Inferencing Samples: 100%|██████████| 12/12 [00:10<00:00,  1.13 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.50 Batches/s]


In [11]:
# Now you can either print the object directly...
from pprint import pprint

# Pretty-print the whole prediction (a dict whose 'answers' entry lists Answer objects, see output)
pprint(prediction)



{'answers': [<Answer {'answer': 'ABILIFY solution', 'type': 'extractive', 'score': 0.6105697453022003, 'context': 'iprazole to affect other medicinal products \nThe administration of ABILIFY solution for injection had no effect on the pharmacokinetics of \nlorazepam ', 'offsets_in_document': [{'start': 177659, 'end': 177675}], 'offsets_in_context': [{'start': 67, 'end': 83}], 'document_id': 'da0b4a155105d3e95536247852a4f32e', 'meta': {'name': 'abilify-epar-product-information_en.txt'}}>,
             <Answer {'answer': 'Maintena', 'type': 'extractive', 'score': 0.5581095367670059, 'context': ' eye movement disorder, of which 22 were serious. The SmPC for Abilify Maintena \nprolonged-release suspension for injection lists oculogyric crisis in', 'offsets_in_document': [{'start': 290237, 'end': 290245}], 'offsets_in_context': [{'start': 71, 'end': 79}], 'document_id': 'da0b4a155105d3e95536247852a4f32e', 'meta': {'name': 'abilify-epar-product-information_en.txt'}}>,
             <Answer {'

In [12]:
# ...or use the `print_answers` util to simplify the output
# (at the `minimum` level only the answer text and its context are shown, see output).
# Change `minimum` to `medium` or `all` to raise the level of detail
print_answers(prediction, details="minimum")


Query: what is abilify?
Answers:
[   {   'answer': 'ABILIFY solution',
        'context': 'iprazole to affect other medicinal products \n'
                   'The administration of ABILIFY solution for injection had '
                   'no effect on the pharmacokinetics of \n'
                   'lorazepam '},
    {   'answer': 'Maintena',
        'context': ' eye movement disorder, of which 22 were serious. The SmPC '
                   'for Abilify Maintena \n'
                   'prolonged-release suspension for injection lists '
                   'oculogyric crisis in'},
    {   'answer': 'an antipsychotic medicine for patients with schizophrenia '
                  'and bipolar I disorder',
        'context': 'd what is it used for? \n'
                   'Abilify is an antipsychotic medicine for patients with '
                   'schizophrenia and bipolar I disorder. \n'
                   'In schizophrenia, a mental illne'},
    {   'answer': '30 mg tablets',
        'contex

In [13]:
# Second question: same candidate limits (10 retrieved documents, 5 answers).
prediction = pipe.run(
    query="how to use abilify?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)



Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.34 Batches/s]
Inferencing Samples: 100%|██████████| 12/12 [00:07<00:00,  1.58 Batches/s]


In [14]:
# Full prediction for the latest query, including scores and offsets
pprint(prediction)

{'answers': [<Answer {'answer': 'by mouth', 'type': 'extractive', 'score': 0.9430475533008575, 'context': ' Abilify? \nIn adults, the most common side effects when taking Abilify by mouth (occurring in up to 10 patients \nin 100) are restlessness, difficulty ', 'offsets_in_document': [{'start': 3940, 'end': 3948}], 'offsets_in_context': [{'start': 71, 'end': 79}], 'document_id': '6789808e8e1e54c9a08af20fff96df7d', 'meta': {'name': 'abilify-epar-summary-public_en.txt'}}>,
             <Answer {'answer': 'injection', 'type': 'extractive', 'score': 0.5656982064247131, 'context': 's on their metabolic capacity. \nAdministration of ABILIFY solution for injection was well tolerated and produced no direct target \norgan toxicity in r', 'offsets_in_document': [{'start': 199513, 'end': 199522}], 'offsets_in_context': [{'start': 71, 'end': 80}], 'document_id': 'da0b4a155105d3e95536247852a4f32e', 'meta': {'name': 'abilify-epar-product-information_en.txt'}}>,
             <Answer {'answer': 'you s

In [15]:
# Condensed view of the same prediction (answer text and context only)
print_answers(prediction, details="minimum")


Query: how to use abilify?
Answers:
[   {   'answer': 'by mouth',
        'context': ' Abilify? \n'
                   'In adults, the most common side effects when taking '
                   'Abilify by mouth (occurring in up to 10 patients \n'
                   'in 100) are restlessness, difficulty '},
    {   'answer': 'injection',
        'context': 's on their metabolic capacity. \n'
                   'Administration of ABILIFY solution for injection was well '
                   'tolerated and produced no direct target \n'
                   'organ toxicity in r'},
    {   'answer': 'you should see your doctor',
        'context': 'l symptom taking any of these medicines together with '
                   'ABILIFY you should see your doctor. \n'
                   'Medicines that increase the level of serotonin are '
                   'typicall'},
    {   'answer': 'double-blind aripiprazole or placebo',
        'context': 'en randomised to continue the \n'
                 

In [16]:
# Third question: same candidate limits (10 retrieved documents, 5 answers).
prediction = pipe.run(
    query="what are the side effects of abilify?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

Inferencing Samples: 100%|██████████| 12/12 [00:07<00:00,  1.57 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.46 Batches/s]


In [17]:
# Full prediction for the latest query, including scores and offsets
pprint(prediction)

{'answers': [<Answer {'answer': 'Dizziness and vision problems', 'type': 'extractive', 'score': 0.8417303562164307, 'context': 'about the best way to feed your baby if you are taking this \nDizziness and vision problems may occur during treatment with this medicine (see section ', 'offsets_in_document': [{'start': 225561, 'end': 225590}], 'offsets_in_context': [{'start': 61, 'end': 90}], 'document_id': 'da0b4a155105d3e95536247852a4f32e', 'meta': {'name': 'abilify-epar-product-information_en.txt'}}>,
             <Answer {'answer': 'sleepiness, \ndizziness and nausea', 'type': 'extractive', 'score': 0.8033950328826904, 'context': ' the injection (occurring in up to 10 patients in 100) are sleepiness, \ndizziness and nausea.  For the full list of all side effects and restrictions,', 'offsets_in_document': [{'start': 4631, 'end': 4664}], 'offsets_in_context': [{'start': 59, 'end': 92}], 'document_id': '6789808e8e1e54c9a08af20fff96df7d', 'meta': {'name': 'abilify-epar-summary-public_en.txt

In [18]:
# Condensed view of the same prediction (answer text and context only)
print_answers(prediction, details="minimum")


Query: what are the side effects of abilify?
Answers:
[   {   'answer': 'Dizziness and vision problems',
        'context': 'about the best way to feed your baby if you are taking '
                   'this \n'
                   'Dizziness and vision problems may occur during treatment '
                   'with this medicine (see section '},
    {   'answer': 'sleepiness, \ndizziness and nausea',
        'context': ' the injection (occurring in up to 10 patients in 100) are '
                   'sleepiness, \n'
                   'dizziness and nausea.  For the full list of all side '
                   'effects and restrictions,'},
    {   'answer': 'too strong or too weak',
        'context': ' day. \n'
                   'If you have the impression that the effect of ABILIFY is '
                   'too strong or too weak, talk to your doctor or \n'
                   'Try to take ABILIFY at the same time e'},
    {   'answer': 'weight gain and changes in prolactin levels',
     

In [19]:
# Fourth question: same candidate limits (10 retrieved documents, 5 answers).
prediction = pipe.run(
    query="what are the advantages of abilify?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

Inferencing Samples: 100%|██████████| 12/12 [00:07<00:00,  1.55 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.40 Batches/s]


In [20]:
# Full prediction for the latest query, including scores and offsets
pprint(prediction)

{'answers': [<Answer {'answer': 'rapid control of agitation and disturbed behaviours', 'type': 'extractive', 'score': 0.8019429445266724, 'context': 'ILIFY solution for injection is indicated for the rapid control of agitation and disturbed behaviours \nin adult patients with schizophrenia or with ma', 'offsets_in_document': [{'start': 159517, 'end': 159568}], 'offsets_in_context': [{'start': 50, 'end': 101}], 'document_id': 'da0b4a155105d3e95536247852a4f32e', 'meta': {'name': 'abilify-epar-product-information_en.txt'}}>,
             <Answer {'answer': 'greater than its risks', 'type': 'extractive', 'score': 0.7609768807888031, 'context': ' Use (CHMP) therefore decided that the benefits \nof Abilify are greater than its risks and recommended its approval in the EU. \nWhat measures are bein', 'offsets_in_document': [{'start': 5381, 'end': 5403}], 'offsets_in_context': [{'start': 64, 'end': 86}], 'document_id': '6789808e8e1e54c9a08af20fff96df7d', 'meta': {'name': 'abilify-epar-summary-p

In [21]:
# Condensed view of the same prediction (answer text and context only)
print_answers(prediction, details="minimum")


Query: what are the advantages of abilify?
Answers:
[   {   'answer': 'rapid control of agitation and disturbed behaviours',
        'context': 'ILIFY solution for injection is indicated for the rapid '
                   'control of agitation and disturbed behaviours \n'
                   'in adult patients with schizophrenia or with ma'},
    {   'answer': 'greater than its risks',
        'context': ' Use (CHMP) therefore decided that the benefits \n'
                   'of Abilify are greater than its risks and recommended its '
                   'approval in the EU. \n'
                   'What measures are bein'},
    {   'answer': 'Some patients may benefit from a higher dose',
        'context': 'notherapy or combination \n'
                   'therapy (see section 5.1). Some patients may benefit from '
                   'a higher dose. The maximum daily dose \n'
                   'Recurrence prevention of ma'},
    {   'answer': 'QUALITATIVE AND QUANTITATIVE COMPOSITION',