In [20]:
%%bash

pip install --upgrade pip
pip install farm-haystack[colab]
pip install xlrd
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 250.0/250.0 kB 3.4 MB/s eta 0:00:00
[?25hCollecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


In [2]:
from haystack.telemetry import tutorial_running

tutorial_running(4)
'''
Knowing you’re using this tutorial helps us decide where to invest our efforts to build a better product but you can always opt out by commenting the following line. See Telemetry for more details.
'''

  from .autonotebook import tqdm as notebook_tqdm


'\nKnowing you’re using this tutorial helps us decide where to invest our efforts to build a better product but you can always opt out by commenting the following line. See Telemetry for more details.\n'

In [3]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [4]:
#Create a simple DocumentStore
#The InMemoryDocumentStore is good for quick development and prototyping
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [5]:
#Create a Retriever using embeddings
#Instead of retrieving via Elasticsearch’s plain BM25, we want to use vector 
# similarity of the questions (user question vs. FAQ ones). We can use the EmbeddingRetriever
# for this purpose and specify a model that we use for the embeddings.

from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    use_gpu=True,
    scale_score=False,
)


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/all-MiniLM-L6-v2
  return self.fget.__get__(instance, owner)()


In [6]:
#Prepare & Index FAQ data
#We create a pandas dataframe containing some FAQ data (i.e curated pairs of question + answer)
# and index those in our documentstore. Here: We download some question-answer pairs related to COVID-19

import pandas as pd

from haystack.utils import fetch_archive_from_http

# Download
doc_dir = "data/tutorial4"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Get dataframe with columns "question", "answer" and some custom metadata
df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")
df



INFO - haystack.utils.import_utils -  Found data stored in 'data/tutorial4'. Delete this first if you really want to fetch new data.


Unnamed: 0,question,answer,answer_html,link,name,source,category,country,region,city,lang,last_update
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,<p>A novel coronavirus is a new coronavirus th...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...","<p>On February 11, 2020 the World Health Organ...",\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,<p>People in the U.S. may be worried or anxiou...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17
3,How can people help stop stigma related to COV...,"People can fight stigma and help, not hurt, ot...","<p>People can fight stigma and help, not hurt,...",\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),How It Spreads,USA,,,en,2020/03/17
4,What is the source of the virus?,Coronaviruses are a large family of viruses. S...,<p>Coronaviruses are a large family of viruses...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),How It Spreads,USA,,,en,2020/03/17
...,...,...,...,...,...,...,...,...,...,...,...,...
208,Is water a possible source of infection in the...,SARS-CoV-2 is similar to other coronaviruses f...,<p>SARS-CoV-2 is similar to other coronaviruse...,https://www.bundesgesundheitsministerium.de/en...,Frequently asked questions,Bundesministerium für Gesundheit,,Germany,,,en,2020/03/18
209,Where can doctors and clinics obtain additiona...,The Robert Koch Institute posts information fo...,"<p>The Robert Koch Institute posts <a href=""ht...",https://www.bundesgesundheitsministerium.de/en...,Frequently asked questions,Bundesministerium für Gesundheit,,Germany,,,en,2020/03/18
210,When was the first information about the outbr...,"On 31 December 2019, China’s WHO country offic...","<p>On 31 December 2019, China’s WHO country of...",https://www.bundesgesundheitsministerium.de/en...,Frequently asked questions,Bundesministerium für Gesundheit,,Germany,,,en,2020/03/18
211,Where did the outbreak start?\n,According to information from the Chinese auth...,"<p>According to information from <span lang=""...",https://www.bundesgesundheitsministerium.de/en...,Frequently asked questions,Bundesministerium für Gesundheit,,Germany,,,en,2020/03/18


In [7]:
# Minimal cleaning
df.fillna(value="", inplace=True)
df["question"] = df["question"].apply(lambda x: x.strip())
print(df.head())

                                            question  \
0                       What is a novel coronavirus?   
1  Why is the disease being called coronavirus di...   
2  Why might someone blame or avoid individuals a...   
3  How can people help stop stigma related to COV...   
4                   What is the source of the virus?   

                                              answer  \
0  A novel coronavirus is a new coronavirus that ...   
1  On February 11, 2020 the World Health Organiza...   
2  People in the U.S. may be worried or anxious a...   
3  People can fight stigma and help, not hurt, ot...   
4  Coronaviruses are a large family of viruses. S...   

                                         answer_html  \
0  <p>A novel coronavirus is a new coronavirus th...   
1  <p>On February 11, 2020 the World Health Organ...   
2  <p>People in the U.S. may be worried or anxiou...   
3  <p>People can fight stigma and help, not hurt,...   
4  <p>Coronaviruses are a large family of viru

In [8]:
# Create embeddings for our questions from the FAQs
# In contrast to most other search use cases, we don't create the embeddings here from the content of our documents,
# but rather from the additional text field "question" as we want to match "incoming question" <-> "stored question".
questions = list(df["question"].values)
df["embedding"] = retriever.embed_queries(queries=questions).tolist()
df = df.rename(columns={"question": "content"})

Batches: 100%|██████████| 7/7 [00:01<00:00,  4.73it/s]


In [9]:
df

Unnamed: 0,content,answer,answer_html,link,name,source,category,country,region,city,lang,last_update,embedding
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,<p>A novel coronavirus is a new coronavirus th...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17,"[-0.039143458008766174, 0.05274822935461998, -..."
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...","<p>On February 11, 2020 the World Health Organ...",\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17,"[-0.01713435910642147, 0.04649306833744049, -0..."
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,<p>People in the U.S. may be worried or anxiou...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17,"[0.043992456048727036, 0.04200296476483345, 0...."
3,How can people help stop stigma related to COV...,"People can fight stigma and help, not hurt, ot...","<p>People can fight stigma and help, not hurt,...",\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),How It Spreads,USA,,,en,2020/03/17,"[0.038570791482925415, 0.07700609415769577, -0..."
4,What is the source of the virus?,Coronaviruses are a large family of viruses. S...,<p>Coronaviruses are a large family of viruses...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),How It Spreads,USA,,,en,2020/03/17,"[-0.04010911285877228, 0.07110288739204407, -0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,Is water a possible source of infection in the...,SARS-CoV-2 is similar to other coronaviruses f...,<p>SARS-CoV-2 is similar to other coronaviruse...,https://www.bundesgesundheitsministerium.de/en...,Frequently asked questions,Bundesministerium für Gesundheit,,Germany,,,en,2020/03/18,"[-0.04632820188999176, 0.03541133552789688, -0..."
209,Where can doctors and clinics obtain additiona...,The Robert Koch Institute posts information fo...,"<p>The Robert Koch Institute posts <a href=""ht...",https://www.bundesgesundheitsministerium.de/en...,Frequently asked questions,Bundesministerium für Gesundheit,,Germany,,,en,2020/03/18,"[0.04052278771996498, 0.029574502259492874, -0..."
210,When was the first information about the outbr...,"On 31 December 2019, China’s WHO country offic...","<p>On 31 December 2019, China’s WHO country of...",https://www.bundesgesundheitsministerium.de/en...,Frequently asked questions,Bundesministerium für Gesundheit,,Germany,,,en,2020/03/18,"[-0.029741434380412102, 0.06130392104387283, -..."
211,Where did the outbreak start?,According to information from the Chinese auth...,"<p>According to information from <span lang=""...",https://www.bundesgesundheitsministerium.de/en...,Frequently asked questions,Bundesministerium für Gesundheit,,Germany,,,en,2020/03/18,"[-0.02439279295504093, 0.0250445194542408, -0...."


In [10]:
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)

INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'a8d4ddffcab67801c7a1a13d85fbe84a' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'e4fae6647538bfddae6c8d8771fd613' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'f6dd87c6e090d56685b37554befe602' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '719668a041cff08136aad7f4e2876a3a' already exists in index 'document'


In [11]:
#Initialize a Pipeline (this time without a reader) and ask questions
from haystack.pipelines import FAQPipeline

pipe = FAQPipeline(retriever=retriever)

In [12]:
from haystack.utils import print_answers

# Run any question and change top_k to see more or less answers
prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 1}})

print_answers(prediction, details="medium")

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.14it/s]

'Query: How is the virus spreading?'
'Answers:'
[   {   'answer': 'This virus was first detected in Wuhan City, Hubei '
                  'Province, China. The first infections were linked to a live '
                  'animal market, but the virus is now spreading from '
                  'person-to-person. It’s important to note that '
                  'person-to-person spread can happen on a continuum. Some '
                  'viruses are highly contagious (like measles), while other '
                  'viruses are less so.\n'
                  '\n'
                  'The virus that causes COVID-19 seems to be spreading easily '
                  'and sustainably in the community (“community spread”) in '
                  'some affected geographic areas. Community spread means '
                  'people have been infected with the virus in an area, '
                  'including some who are not sure how or where they became '
                  'infected.\n'
                  '




NOW WITH DATA FROM CSV

SyntaxError: invalid syntax (2041880870.py, line 2)

In [21]:
#import csv from directory raw_data
import pandas as pd
import os


# Specify the directory path where the Excel file is located
directory_path = '../raw_data/'

# Specify the file name
file_name = 'War_and_peace_ORIGINAL.xlsx'

# Construct the complete file path
excel_file_path = os.path.join(directory_path, file_name)

# Load the Excel data into a DataFrame
data = pd.read_excel(excel_file_path)


# Now you can work with the 'data' DataFrame and use the data from the Excel file
# For example, you can access columns using data['column_name'] or perform data manipulation and analysis

In [22]:
data

Unnamed: 0,Id,id_tag,question,largo (borrar),answer
0,1.0,1.0,What is the main theme of War and Peace?,121,"The main theme of ""War and Peace"" revolves aro..."
1,2.0,1.0,What are the central themes explored in Leo To...,121,"The main theme of ""War and Peace"" revolves aro..."
2,3.0,1.0,Could you provide an overview of the primary t...,121,"The main theme of ""War and Peace"" revolves aro..."
3,4.0,1.0,What are the underlying messages or ideas conv...,121,"The main theme of ""War and Peace"" revolves aro..."
4,5.0,1.0,What is the overarching theme that Tolstoy exp...,121,"The main theme of ""War and Peace"" revolves aro..."
...,...,...,...,...,...
139,140.0,120.0,What consequences can be derived from it?,0,
140,141.0,121.0,What do people think about the topic?,0,
141,142.0,122.0,What other issues that you know of can be rela...,0,
142,,,What alternative title would you give it?,0,


In [24]:
# Minimal cleaning
data.fillna(value="", inplace=True)
data["question"] = data["question"].apply(lambda x: x.strip())
print(data.head())

    Id id_tag                                           question  \
0  1.0    1.0           What is the main theme of War and Peace?   
1  2.0    1.0  What are the central themes explored in Leo To...   
2  3.0    1.0  Could you provide an overview of the primary t...   
3  4.0    1.0  What are the underlying messages or ideas conv...   
4  5.0    1.0  What is the overarching theme that Tolstoy exp...   

   largo (borrar)                                             answer  
0             121  The main theme of "War and Peace" revolves aro...  
1             121  The main theme of "War and Peace" revolves aro...  
2             121  The main theme of "War and Peace" revolves aro...  
3             121  The main theme of "War and Peace" revolves aro...  
4             121  The main theme of "War and Peace" revolves aro...  


In [25]:
# Create embeddings for our questions from the FAQs
# In contrast to most other search use cases, we don't create the embeddings here from the content of our documents,
# but rather from the additional text field "question" as we want to match "incoming question" <-> "stored question".
questions_real = list(data["question"].values)
data["embedding"] = retriever.embed_queries(queries=questions_real).tolist()
data = data.rename(columns={"question": "content"})

Batches: 100%|██████████| 5/5 [00:00<00:00,  9.88it/s]


In [26]:
data

Unnamed: 0,Id,id_tag,content,largo (borrar),answer,embedding
0,1.0,1.0,What is the main theme of War and Peace?,121,"The main theme of ""War and Peace"" revolves aro...","[0.017468363046646118, 0.11221697181463242, -0..."
1,2.0,1.0,What are the central themes explored in Leo To...,121,"The main theme of ""War and Peace"" revolves aro...","[-0.03387292101979256, 0.06380553543567657, -0..."
2,3.0,1.0,Could you provide an overview of the primary t...,121,"The main theme of ""War and Peace"" revolves aro...","[0.03855377063155174, 0.0803561583161354, -0.0..."
3,4.0,1.0,What are the underlying messages or ideas conv...,121,"The main theme of ""War and Peace"" revolves aro...","[0.06016567721962929, 0.0465988926589489, 0.01..."
4,5.0,1.0,What is the overarching theme that Tolstoy exp...,121,"The main theme of ""War and Peace"" revolves aro...","[-0.03796044737100601, 0.0634809285402298, -0...."
...,...,...,...,...,...,...
139,140.0,120.0,What consequences can be derived from it?,0,,"[-0.06275063753128052, 0.10002031922340393, -0..."
140,141.0,121.0,What do people think about the topic?,0,,"[-0.05018947273492813, 0.09918671101331711, -0..."
141,142.0,122.0,What other issues that you know of can be rela...,0,,"[0.03791387006640434, 0.09207935631275177, -0...."
142,,,What alternative title would you give it?,0,,"[-0.09728950262069702, 0.08584637194871902, 0...."


In [27]:
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = data.to_dict(orient="records")
document_store.write_documents(docs_to_index)

INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '515c32152d9360582c5b89f773a7949c' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '91e4f8a19a66be01041d4aef31e3b3eb' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'ca5c30fa7314be2c40ed8fc989f019d6' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'f62f37ec05b7e0b37749064cbab1e062' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'd9ed9d798b2cee6fb16cc78e9c1aa48a' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '7169e56aee99f30c4225b4321aa5858' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'd6bc502d32bcfa45566bd9f1635b0496' 

In [28]:
#Initialize a Pipeline (this time without a reader) and ask questions
from haystack.pipelines import FAQPipeline

pipe = FAQPipeline(retriever=retriever)

In [29]:
from haystack.utils import print_answers

# Run any question and change top_k to see more or less answers
prediction = pipe.run(query="What is the theme of War and Peace?", params={"Retriever": {"top_k": 1}})

print_answers(prediction, details="medium")

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.11it/s]

'Query: What is the theme of War and Peace?'
'Answers:'
[   {   'answer': 'The main theme of "War and Peace" revolves around the '
                  'impact of war on society, individuals, and personal '
                  'transformation.',
        'context': 'The main theme of "War and Peace" revolves around the '
                   'impact of war on society, individuals, and personal '
                   'transformation.',
        'score': 0.9781467914581299}]



