# Installing Haystack

In [2]:
!pip install farm-haystack[colab,faiss]==1.17.2

Collecting farm-haystack==1.17.2 (from farm-haystack[colab,faiss]==1.17.2)
  Using cached farm_haystack-1.17.2-py3-none-any.whl.metadata (24 kB)
Collecting azure-ai-formrecognizer>=3.2.0b2 (from farm-haystack==1.17.2->farm-haystack[colab,faiss]==1.17.2)
  Using cached azure_ai_formrecognizer-3.3.3-py3-none-any.whl.metadata (64 kB)
Collecting boilerpy3 (from farm-haystack==1.17.2->farm-haystack[colab,faiss]==1.17.2)
  Using cached boilerpy3-1.0.7-py3-none-any.whl.metadata (5.8 kB)
Collecting canals==0.2.2 (from farm-haystack==1.17.2->farm-haystack[colab,faiss]==1.17.2)
  Using cached canals-0.2.2-py3-none-any.whl.metadata (4.3 kB)
Collecting dill (from farm-haystack==1.17.2->farm-haystack[colab,faiss]==1.17.2)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting lazy-imports==0.3.1 (from farm-haystack==1.17.2->farm-haystack[colab,faiss]==1.17.2)
  Using cached lazy_imports-0.3.1-py3-none-any.whl.metadata (10 kB)
Collecting posthog (from farm-haystack==1.17.2->farm-hays

In [3]:
from haystack.telemetry import tutorial_running

# Identifier of this tutorial, handed to Haystack's telemetry helper.
TUTORIAL_ID = 7

tutorial_running(TUTORIAL_ID)

# Logging

In [4]:
import logging

# Layout applied to every record handled by the root logger.
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"

# `force=True` replaces any handlers already attached to the root logger.
# Hosted notebook environments usually pre-install one, in which case a plain
# basicConfig() call is silently ignored and neither the format nor the
# WARNING level takes effect.
logging.basicConfig(format=LOG_FORMAT, level=logging.WARNING, force=True)

# Haystack's own loggers are more verbose than the WARNING default above.
logging.getLogger("haystack").setLevel(logging.INFO)

# Fetching and Cleaning Documents

In [5]:
from pathlib import Path

import pandas as pd
from haystack.utils import fetch_archive_from_http

# Download sample
doc_dir = "data/tutorial7/"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Join with pathlib: `doc_dir` already ends with "/", so formatting it into a
# string with another "/" yields "data/tutorial7//small_generator_dataset.csv".
csv_path = Path(doc_dir) / "small_generator_dataset.csv"

# Create dataframe with columns "title" and "text".
# Minimal cleaning: missing cells become empty strings. Chaining instead of
# `inplace=True` keeps the cell safe to re-run.
df = pd.read_csv(csv_path, sep=",").fillna(value="")

print(df.head())

INFO:haystack.utils.import_utils:Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip to 'data/tutorial7/'


               title                                               text
0  "Albert Einstein"  to Einstein in 1922. Footnotes Citations Alber...
1  "Albert Einstein"  Albert Einstein Albert Einstein (; ; 14 March ...
2  "Albert Einstein"  observations were published in the internation...
3  "Albert Einstein"  model for depictions of mad scientists and abs...
4     "Alfred Nobel"  was adopted as the standard technology for min...


We can cast our data into Haystack Document objects. Alternatively, we can also just use dictionaries with "content" and "meta" fields.

In [6]:
from haystack import Document

# Pull both columns out of the dataframe as plain Python lists.
titles = df["title"].values.tolist()
texts = df["text"].values.tolist()

# One Document per row: the text is the content, the title is stored under
# the "name" meta key (falling back to "" when the title is falsy).
documents = [
    Document(content=row_text, meta={"name": row_title or ""})
    for row_title, row_text in zip(titles, texts)
]

# Initializing the DocumentStore

In [8]:
from haystack.document_stores import FAISSDocumentStore

# FAISS-backed store built from the "Flat" index factory string; documents are
# returned together with their embeddings.
document_store = FAISSDocumentStore(
    return_embedding=True,
    faiss_index_factory_str="Flat",
)

# Initializing the Retriever

In [9]:
from haystack.nodes import DensePassageRetriever, RAGenerator

# Hugging Face model ids for the two DPR encoders (questions and passages).
QUERY_ENCODER = "facebook/dpr-question_encoder-single-nq-base"
PASSAGE_ENCODER = "facebook/dpr-ctx_encoder-single-nq-base"

# Dense retriever bound to the document store created earlier.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model=QUERY_ENCODER,
    passage_embedding_model=PASSAGE_ENCODER,
    embed_title=True,
    use_gpu=True,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
INFO:haystack.modeling.model.language_model:Auto-detected model language: english


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model:Auto-detected model language: english


# Initializing the Generator

In [10]:
# Settings for the RAG generator, gathered in one place so they are easy to tweak.
rag_settings = {
    "model_name_or_path": "facebook/rag-token-nq",
    "use_gpu": True,
    "top_k": 1,
    "max_length": 200,
    "min_length": 2,
    "embed_title": True,
    "num_beams": 2,
}

generator = RAGenerator(**rag_settings)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]



(…)_encoder_tokenizer/tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

question_encoder_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)ncoder_tokenizer/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


(…)enerator_tokenizer/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

generator_tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

generator_tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)erator_tokenizer/special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.


pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-token-nq were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Writing Documents

In [11]:
# Start from an empty store: remove any documents it already holds, so that
# re-running this cell does not pile up copies from a previous run.
document_store.delete_documents()

# Write the Document objects built earlier into the document store.
document_store.write_documents(documents)

# Compute embeddings for the stored documents with the retriever and add them
# to the index. Order matters: this must run after write_documents().
# NOTE(review): the recorded run writes 75 documents but updates 68 embeddings —
# presumably duplicates are skipped on write; confirm.
document_store.update_embeddings(retriever=retriever)

Writing Documents:   0%|          | 0/75 [00:00<?, ?it/s]

INFO:haystack.document_stores.faiss:Updating embeddings for 68 docs...


Updating Embedding:   0%|          | 0/68 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/80 [00:00<?, ? Docs/s]

# Initializing the Pipeline

In [12]:
from haystack.pipelines import GenerativeQAPipeline

# Combine the retriever and the generator into a single question-answering pipeline.
pipe = GenerativeQAPipeline(
    retriever=retriever,
    generator=generator,
)

# **Asking a Question**

In [13]:
from haystack.utils import print_answers

# Sample questions to put to the pipeline.
QUESTIONS = [
    "who got the first nobel prize in physics",
    "when is the next deadpool movie being released",
    "which mode is used for short wave broadcast service",
    "who is the owner of reading football club",
    "when is the next scandal episode coming out",
    "when is the last time the philadelphia won the superbowl",
    "what is the most current adobe flash player version",
    "how many episodes are there in dragon ball z",
    "what is the first step in the evolution of the eye",
    "where is gall bladder situated in human body",
    "what is the main mineral in lithium batteries",
    "who is the president of usa right now",
    "where do the greasers live in the outsiders",
    "panda is a national animal of which country",
    "what is the name of manchester united stadium",
]


def run_query(query):
    """Run one query through `pipe` and return the pipeline result.

    The Retriever node is asked for 5 documents and the Generator node for
    1 answer; the params dict is built fresh on every call.
    """
    node_params = {"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}}
    return pipe.run(query=query, params=node_params)


# Ask every question in turn and print the answers with minimal detail.
for question in QUESTIONS:
    res = run_query(question)
    print_answers(res, details="minimum")

  torch.sparse.LongTensor(banned_mask.t(), indices, scores.size())


'Query: who got the first nobel prize in physics'
'Answers:'
[{'answer': ' albert einstein'}]
'Query: when is the next deadpool movie being released'
'Answers:'
[{'answer': ' september 22, 2017'}]
'Query: which mode is used for short wave broadcast service'
'Answers:'
[{'answer': ' amplitude modulation'}]
'Query: who is the owner of reading football club'
'Answers:'
[{'answer': ' stefan persson'}]
'Query: when is the next scandal episode coming out'
'Answers:'
[{'answer': ' april 20, 2018'}]
'Query: when is the last time the philadelphia won the superbowl'
'Answers:'
[{'answer': ' the 1970s'}]
'Query: what is the most current adobe flash player version'
'Answers:'
[{'answer': ' 7.1. 2'}]
'Query: how many episodes are there in dragon ball z'
'Answers:'
[{'answer': ' 13'}]
'Query: what is the first step in the evolution of the eye'
'Answers:'
[{'answer': ' step by step'}]
'Query: where is gall bladder situated in human body'
'Answers:'
[{'answer': ' stomach'}]
'Query: what is the main mi