**Installs and Imports**

In [None]:
from google.colab import drive
drive.mount('/content/mnt')
%cd ./mnt/MyDrive/Erasmus/Transformer_Pretraining/Bounwiki

In [None]:
# Normal installs
## RESTART after and continue with cells below
!pip install --upgrade pip
!apt install libgraphviz-dev
!pip install -r requirements.txt

In [3]:
# imports
import time
import logging
import os
import torch
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes.retriever import EmbeddingRetriever
from haystack.nodes import TableReader, FARMReader, RouteDocuments, JoinAnswers
from haystack.utils import print_answers
from haystack import Document
from haystack import Pipeline
import requests
from bs4 import BeautifulSoup as soup
import torch_scatter
import pandas as pd


Mounted at /content/mnt
/content/mnt/MyDrive/Erasmus/Transformer_Pretraining


**Setup Elasticsearch Server for storing data**

In [None]:
# Logging setup: warnings and above for everything, but keep Haystack's own
# progress messages (INFO) visible.
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

In [None]:
%%bash
# NOTE: the %%bash cell magic has to be the very first line of the cell;
# a comment placed above it prevents IPython from recognising the magic.
# Download Elasticsearch 7.9.2 into the current working directory and unpack it.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# Hand the unpacked tree to the "daemon" user, which is the account used to
# launch the server in the next cell.
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg
# NOTE: the %%bash cell magic has to be the very first line of the cell;
# a comment placed above it prevents IPython from recognising the magic.
# Start the Elasticsearch server in the background (--bg) as the "daemon" user.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
document_index = "document"

# Wait for the background Elasticsearch server to answer HTTP requests instead
# of sleeping for a fixed 30 seconds: this continues as soon as the server is
# up and tolerates slower start-ups (up to ES_STARTUP_TIMEOUT_S seconds).
ES_STARTUP_TIMEOUT_S = 120
ES_POLL_INTERVAL_S = 2
startup_deadline = time.monotonic() + ES_STARTUP_TIMEOUT_S
while time.monotonic() < startup_deadline:
    try:
        # 9200 is the port the document store below connects to by default.
        if requests.get(f"http://{host}:9200/_cluster/health", timeout=5).ok:
            break
    except requests.exceptions.RequestException:
        # Server is not accepting connections yet; retry until the deadline.
        pass
    time.sleep(ES_POLL_INTERVAL_S)

# If the server never came up, the constructor below reports the connection
# failure itself. The index name is taken from document_index so it stays in
# sync with the write_documents calls later in the notebook.
document_store = ElasticsearchDocumentStore(host=host, username="", password="", index=document_index)

INFO:haystack.telemetry:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry


**Load scraped data and write to database**

In [None]:
%cd ./mnt/MyDrive/Erasmus/Transformer_Pretraining/Bounwiki/
import pickle
with open("./data/website_data/processed_website_tables", "rb") as fp:
    processed_website_tables = pickle.load(fp)
with open("./data/website_data/processed_website_text", "rb") as fp:
    processed_website_text = pickle.load(fp)
with open("./data/website_data/processed_schedule_tables", "rb") as fp:
    processed_schedule_tables = pickle.load(fp)

# write website tables to database
document_store.write_documents(processed_website_tables, index=document_index)
# write website text to database
document_store.write_documents(processed_website_text, index=document_index)
# write schedule tables to database
document_store.write_documents(processed_schedule_tables, index=document_index)

**Create Pipeline for document retrieving and answer finding**

In [None]:
# Embedding-based retriever; the model name indicates it was trained for tables.
TABLE_EMBEDDING_MODEL = "deepset/all-mpnet-base-v2-table"
retriever = EmbeddingRetriever(
    embedding_model=TABLE_EMBEDDING_MODEL,
    document_store=document_store,
)
# Have the document store (re)compute its stored embeddings with this retriever.
document_store.update_embeddings(retriever=retriever)

## Optional retriever smoke test (uncomment to run):
#retrieved_tables = retriever.retrieve("Where is Murat Gülsoy instructor?", top_k=3)
## Show the highest scored result
#print(retrieved_tables[0].content)

In [None]:
# Reader models: one extractive reader for plain text, one for tables.
TEXT_READER_MODEL = "deepset/roberta-base-squad2"
# Per the original author's note: "deepset/tapas-large-nq-hn-reader" or
# "deepset/tapas-large-nq-reader" give meaningful TableReader scores, at the
# price of not supporting aggregations over multiple table cells.
TABLE_READER_MODEL = "deepset/tapas-large-nq-hn-reader"

text_reader = FARMReader(TEXT_READER_MODEL)
table_reader = TableReader(TABLE_READER_MODEL)

# Helper nodes used when the pipeline is assembled: one routes retrieved
# documents, the other joins the answers of both readers.
route_documents = RouteDocuments()
join_answers = JoinAnswers()

## Optional reader smoke test (uncomment to run).
## NOTE(review): the original snippet called `reader.predict`, but no `reader`
## variable exists in this notebook; `table_reader` is presumably what was meant.
#table_doc = document_store.get_document_by_id(0)
#print(table_doc.content)
#prediction = table_reader.predict(query="Who teaches readings on asian economics?", documents=[table_doc])
#print_answers(prediction, details="all")
#print(f"Predicted answer: {prediction['answers'][0].answer}")
#print(f"Meta field: {prediction['answers'][0].meta}")

In [None]:
# Assemble the QA pipeline:
#   Query -> EmbeddingRetriever -> RouteDocuments -> {TextReader, TableReader} -> JoinAnswers
text_table_qa_pipeline = Pipeline()

# (component, node name, inputs), listed in the order the nodes must be added.
pipeline_nodes = [
    (retriever, "EmbeddingRetriever", ["Query"]),
    (route_documents, "RouteDocuments", ["EmbeddingRetriever"]),
    (text_reader, "TextReader", ["RouteDocuments.output_1"]),
    (table_reader, "TableReader", ["RouteDocuments.output_2"]),
    (join_answers, "JoinAnswers", ["TextReader", "TableReader"]),
]
for node_component, node_name, node_inputs in pipeline_nodes:
    text_table_qa_pipeline.add_node(component=node_component, name=node_name, inputs=node_inputs)

## Optional pipeline smoke test (uncomment to run).
## NOTE(review): the original snippet referenced `table_qa_pipeline`, which is
## not defined in this notebook; the pipeline built above is used instead.
#prediction = text_table_qa_pipeline.run("Who teaches readings on asian economics?", params={"top_k": 2})
#print(prediction['answers'][0].answer)

**Examples and tests**

In [None]:
# Run one example question through the combined text + table pipeline.
example_query = "Who teaches readings on asian economics?"
example_params = {"top_k": 2}
predictions = text_table_qa_pipeline.run(example_query, params=example_params)
# Answers may come with either a text passage or a table as their context.
print_answers(predictions, details="minimum")

**Evaluation**