In [1]:
# Index names used for the evaluation documents and for their labels in the document store.
doc_index, label_index = "evaluation_docs", "evaluation_labels"

In [2]:
# Path of the evaluation data file, relative to the notebook's working directory.
filename = "../data/nq/nq_dev_subset_v2.json"

In [3]:
from haystack.preprocessor.utils import eval_data_from_file

# Parse the evaluation file; the helper returns (list of Documents, list of Labels).
docs, labels = eval_data_from_file(filename=filename)

12/03/2020 14:03:29 - INFO - faiss -   Loading faiss with AVX2 support.
12/03/2020 14:03:29 - INFO - faiss -   Loading faiss.


In [4]:
# Sanity check of the loaded documents: container type, number of items, and the type of one element.
type(docs), len(docs), type(docs[5])

(list, 50, haystack.schema.Document)

In [5]:
# Sanity check of the loaded labels: container type, number of items, and the type of one element.
type(labels), len(labels), type(labels[4])

(list, 96, haystack.schema.Label)

In [4]:
from haystack.document_store.faiss import FAISSDocumentStore

In [5]:
# Create the FAISS document store with its default arguments.
document_store = FAISSDocumentStore()

In [6]:
# Write the evaluation documents to the store under the `doc_index` index.
document_store.write_documents(docs, index=doc_index)

In [7]:
from haystack.retriever.dense import DensePassageRetriever

# Dense Passage Retriever backed by the document store above, using separate
# query and passage encoder checkpoints and running without a GPU.
dpr = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=False,
    embed_title=True,
    use_fast_tokenizers=True,
)

	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.


In [8]:
# Compute embeddings with the DPR retriever for the documents in `doc_index` and update the store.
document_store.update_embeddings(dpr, index=doc_index)

12/03/2020 14:04:02 - INFO - haystack.document_store.faiss -   Updating embeddings for 50 docs...
Inferencing Samples: 100%|██████████| 4/4 [00:22<00:00,  5.62s/ Batches]
12/03/2020 14:04:25 - INFO - haystack.document_store.faiss -   Indexing embeddings and updating vectors_ids...
100%|██████████| 1/1 [00:00<00:00, 98.46it/s]


In [9]:
# Write the evaluation labels to the store under the `label_index` index.
document_store.write_labels(labels, index=label_index)

In [10]:
# Evaluate the retriever in isolation with top_k=20, reading the labels from
# `label_index` and the documents from `doc_index`.
dpr_eval_results = dpr.eval(
    top_k=20,
    label_index=label_index,
    doc_index=doc_index,
)

12/03/2020 14:04:41 - INFO - haystack.retriever.base -   Performing eval queries...
  0%|          | 0/54 [00:00<?, ?it/s]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  3.95 Batches/s]
  2%|▏         | 1/54 [00:00<00:14,  3.64it/s]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.06 Batches/s]
  4%|▎         | 2/54 [00:00<00:14,  3.68it/s]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.96 Batches/s]
  6%|▌         | 3/54 [00:00<00:12,  4.04it/s]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.86 Batches/s]
  7%|▋         | 4/54 [00:00<00:12,  4.16it/s]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  

In [11]:
## Retriever Recall is the proportion of questions for which the correct document containing the answer is
## among the retrieved documents
print("Retriever Recall:", dpr_eval_results["recall"])
## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
print("Retriever Mean Avg Precision:", dpr_eval_results["map"])

Retriever Recall: 1.0
Retriever Mean Avg Precision: 0.9573045267489712


In [12]:
from haystack.reader.farm import FARMReader

# Reader loaded from the given model name (or local path), with GPU usage disabled.
farm_reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
)

12/03/2020 14:06:27 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
12/03/2020 14:06:27 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
12/03/2020 14:06:41 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
12/03/2020 14:06:41 - INFO - farm.infer -   Got ya 3 parallel workers to do inference ...
12/03/2020 14:06:41 - INFO - farm.infer -    0    0    0 
12/03/2020 14:06:41 - INFO - far

In [13]:
from haystack import Finder
from haystack.utils import print_answers

In [15]:
# Combine the FARM reader and the DPR retriever in a Finder so both can be evaluated together.
finder = Finder(farm_reader, dpr)

In [16]:
from farm.utils import initialize_device_settings

# Request CPU-only device settings; `device` is passed to the reader evaluation,
# while `n_gpu` is not used elsewhere in this notebook.
device, n_gpu = initialize_device_settings(use_cuda=False)

12/03/2020 14:07:32 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None


In [17]:
# Evaluate Reader on its own, using the labels and documents already stored in the document store
reader_eval_results = farm_reader.eval(document_store=document_store, device=device, label_index=label_index, doc_index=doc_index)
# Evaluation of Reader can also be done directly on a SQuAD-formatted file without writing the data to a document store first
#reader_eval_results = farm_reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)

## Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer
print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
print("Reader Exact Match:", reader_eval_results["EM"])
## Reader F1-Score is the average overlap between the predicted answers and the correct answers
print("Reader F1-Score:", reader_eval_results["f1"])

12/03/2020 14:07:42 - INFO - haystack.reader.farm -   Performing Evaluation using top_k_per_candidate = 3 
and consequently, QuestionAnsweringPredictionHead.n_best = 4. 
This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5
Evaluating: 100%|██████████| 73/73 [29:52<00:00, 24.55s/it]Reader Top-N-Accuracy: 0.6111111111111112
Reader Exact Match: 0.2777777777777778
Reader F1-Score: 0.30750487329434695



In [18]:
# Evaluate the Finder (retriever and reader combined) with top_k_retriever=1 and
# top_k_reader=10, then print the resulting metrics.
finder_eval_results = finder.eval(
    top_k_retriever=1,
    top_k_reader=10,
    label_index=label_index,
    doc_index=doc_index,
)
finder.print_eval_results(finder_eval_results)

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.50 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.24 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.65 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.95 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.51 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.51 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.30 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.01 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.88 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.92 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.90 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.16 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.29 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00