**BounWiki - Experiments**

- Tutorial for running the experiments from the report and evaluating them

## **Installs and Imports**

In [None]:
from google.colab import drive
drive.mount('/content/mnt')
%cd ./mnt/MyDrive/Erasmus/Transformer_Pretraining/Bounwiki

Mounted at /content/mnt
/content/mnt/MyDrive/Erasmus/Transformer_Pretraining/Bounwiki


In [None]:
import torch
!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html

In [None]:
# Install Haystack (package "farm-haystack" with the "colab" extra) directly from its GitHub repository.
# NOTE(review): no tag or commit is specified, so this installs whatever the default branch
# currently contains — consider pinning a revision for reproducibility.
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

In [None]:
## RESTART KERNEL and continue with cells below

In [None]:
# check if all imports work correctly
import time
import logging
import os
import torch
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes.retriever import EmbeddingRetriever
from haystack.nodes import TableReader, FARMReader, RouteDocuments, JoinAnswers
from haystack.utils import print_answers
from haystack import Document
from haystack import Pipeline
import requests
from bs4 import BeautifulSoup as soup
import torch_scatter
import pandas as pd


In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before each
# cell is executed, so edits to the project's .py files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

## **Run experiments**

In [None]:
## If desired, you can use Elasticsearch instead of the default in-memory document store.
## If you do so, please change line 11 in "setup_database.py" to ElasticsearchDocumentStore()

## download elasticsearch
# %%bash
# wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
# tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
## start elasticsearch server
# %%bash --bg
# sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
## wait until the Elasticsearch server has started
#time.sleep(30)

In [None]:
# Experiment configuration.
#
# `contexts` lists the standard context sets; each inner list names the data sources
# that are combined for one run. For gpt-3, the reduced context sets have the following names:
#   processed_website_text    -> processed_website_text_longexcluded
#   processed_schedule_tables -> processed_schedule_tables_textified_small
contexts = [["processed_website_tables","processed_website_text","processed_schedule_tables"],
            ["processed_website_tables","processed_website_text"],
            ["processed_website_tables"],
            ["processed_website_text"],
            ["processed_schedule_tables"]]

# Available text readers; pick one per run by index.
text_readers = ["minilm", "distilroberta", "electra-base", "deberta-large", "bert-base", "gpt3"]
table_reader = "tapas"
# Spelling kept as is: it matches the --seperate_evaluation flag passed to run.py.
seperate_evaluation = True
# Read the key from the environment so it never has to be typed into (and saved with)
# the notebook; falls back to the placeholder when the variable is not set.
# NOTE(review): presumably an OpenAI key (gpt3 reader / ada embeddings) — confirm the variable name.
api_key = os.environ.get("OPENAI_API_KEY", "<insert api_key>")
top_ks = [1,3,10]
use_ada_embeddings = False


In [None]:
%cd ./mnt/MyDrive/Erasmus/Transformer_Pretraining/Bounwiki/

In [None]:
from run import main

In [None]:
# Example run: first entry of each option list defined in the configuration cell
main(
    contexts[0],          # first context set
    text_readers[0],      # first text reader
    table_reader,
    seperate_evaluation,
    api_key,
    top_ks[0],            # first top-k value
    use_ada_embeddings,
)

## **Look at Results**

In [None]:
# Example of reading the results: first load the results CSV into a DataFrame
import pandas as pd

results_csv = './output/results.csv'
df = pd.read_csv(results_csv)

In [None]:
# Filter the results as desired, round the numbers and save them to a separate CSV.
# Each mask selects one criterion; they are combined with a logical AND below.
is_top3 = df["Topk"]==3.0
is_all_eval = df["Label type"]=="all_eval"
is_full_context = df["Context"]=="processed_website_tables_processed_website_text_processed_schedule_tables"

selected_rows = df[is_top3 & is_all_eval & is_full_context]
selected_rows.round(3).to_csv("./output/all_eval_top_3__all_data.csv")

In [None]:
# Start an experiment from the command line by invoking run.py with explicit options
# (text-only context, "bert-base" text reader, "tapas" table reader, separate evaluation enabled).
# NOTE(review): presumably the CLI counterpart of calling main() above — confirm against run.py.
!python run.py --context "processed_website_text" --text_reader "bert-base" --table_reader "tapas" --seperate_evaluation

## **More Ideas**

In [None]:
# Install tabulate to convert tables into plain-text tables, which could remove the need for a table reader
!pip install tabulate

In [None]:
import pickle
from tabulate import tabulate

# Location of the pickled schedule tables on the mounted drive
schedule_tables_path = "/content/mnt/MyDrive/Erasmus/Transformer_Pretraining/Bounwiki/data/website_data/processed_schedule_tables"

# Load the schedule tables.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open(schedule_tables_path, "rb") as pickle_file:
  res = pickle.load(pickle_file)

# Format the first entry's content as a psql-style text table; save the result if desired
first_table_text = tabulate(res[0].content, headers = 'keys', tablefmt = 'psql')
print(first_table_text)

+----+------------+------------------------------------------+--------+--------------------------+------------------+--------------------+-------------------+
|    | Code       | Name                                     |   Ects | Instructor               | Days             | Hours              | Rooms             |
|----+------------+------------------------------------------+--------+--------------------------+------------------+--------------------+-------------------|
|  0 | ASIA502.02 | READINGS ON ASIAN ECONOMIES              |      8 | ALTAY ATLI               | MMM              | 101112             | No Room specified |
|  1 | ASIA518.02 | HISTORY OF MODERN JAPAN                  |      8 | SELÇUK ESENBEL           | TTT              | 101112             | No Room specified |
|  2 | ASIA520.02 | JAPANESE SOCIETY THROUGH MODERN LITERATU |      8 | OĞUZ BAYKARA             | TTT              | 101112             | No Room specified |
|  3 | ASIA521.02 | JAPANESE SOCIETY THROUGH A