# Math Indexer and Searcher at ARQMath
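Download the ARQMath 2020 dataset with `rclone` from a pre-configured remote (named `mygoogledrive` below) and unpack the formula representations.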

In [1]:
%%bash
set -e

# Create the directory layout. `-p` creates missing parents and succeeds when
# a directory already exists, so the cell can be re-run under `set -e`.
mkdir -p ARQMath_CLEF2020/Formulas
mkdir -p ARQMath_CLEF2020/Task1/Topics
mkdir -p ARQMath_CLEF2020-output

# Name of the rclone remote that holds the ARQMath_CLEF2020 folder.
DRIVE_NAME=mygoogledrive
rclone copy -v "$DRIVE_NAME":ARQMath_CLEF2020/Collection ARQMath_CLEF2020/Collection
rclone copy -v "$DRIVE_NAME":ARQMath_CLEF2020/Formulas/opt_representation_V1.0.zip ARQMath_CLEF2020/Formulas
rclone copy -v "$DRIVE_NAME":ARQMath_CLEF2020/Task1/Topics/Formula_topics_opt_V2.0.tsv ARQMath_CLEF2020/Task1/Topics
rclone copy -v "$DRIVE_NAME":ARQMath_CLEF2020/Task1/Topics/Topics_V2.0.xml ARQMath_CLEF2020/Task1/Topics

# Unpack the formula archive in place. `-o` overwrites files from a previous
# run instead of prompting, which a non-interactive cell cannot answer.
pushd ARQMath_CLEF2020/Formulas
unzip -o opt_representation_V1.0.zip
popd

Install Python packages.

In [5]:
%%bash
set -e

# Install the given requirement(s) and print only pip's "Successfully installed"
# line. The pip exit status decides success: on failure the full pip log is
# printed to stderr and the cell aborts. A run in which nothing had to be
# installed (so there is no "Successfully installed" line) is not an error.
pip_install() {
    local log
    if ! log=$(pip install "$@" --use-feature=2020-resolver 2>&1); then
        printf '%s\n' "$log" >&2
        return 1
    fi
    grep 'Successfully installed' <<< "$log" || true
}

pip_install wheel~=0.35.1
pip_install git+https://github.com/MIR-MU/ARQMathCode@30063ed
pip_install git+https://github.com/MIR-MU/ARQMath-eval@0.0.18
pip_install tqdm~=4.46.0 lxml~=4.5.2 requests~=2.24.0 gensim~=3.8.3

Successfully installed wheel-0.35.1
Successfully installed arqmathcode-0.0.1
Successfully installed arqmath-eval-0.0.18 tqdm-4.46.1
Successfully installed gensim-3.8.3 lxml-4.5.2 requests-2.24.0


Convert the ARQMath 2020 dataset to XHTML.

In [6]:
from arqmathcode.post_reader_record import DataReaderRecord
from arqmathcode.Visualization.generate_html_file import HtmlGenerator
import csv
import ctypes
from glob import glob
from io import TextIOWrapper
from lxml import etree
from multiprocessing import Pool
from tqdm import tqdm

# Dialect used for every TSV file read in this cell: tab-separated, double-quote quoting.
csv_parameters = {'delimiter': '\t', 'quotechar': '"', 'quoting': csv.QUOTE_MINIMAL}
# Raise the csv field size limit to half of the largest unsigned long value
# (c_ulong(-1) wraps around to that maximum); presumably needed because single
# formula fields exceed the csv module's default limit.
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
# lxml parsers with huge_tree=True; shared by the functions defined below.
html_parser = etree.HTMLParser(huge_tree=True)
xml_parser = etree.XMLParser(huge_tree=True)
# Keyword arguments for etree.tostring(): UTF-8 bytes with an XML declaration, pretty-printed.
tostring_parameters = {'xml_declaration': True, 'encoding': 'UTF-8', 'pretty_print': True}

# Reader over the downloaded collection; write_answer() below uses its
# post_parser and user_parser attributes.
reader = DataReaderRecord('ARQMath_CLEF2020/Collection')

def read_formulae(filename):
    """Read one formula TSV file into a ``{formula_id: formula}`` dictionary.

    The first row is treated as a header and skipped. In every following row
    the first column is the formula identifier and the last column is the
    formula, which is stored UTF-8 encoded as ``bytes``. Blank rows are
    skipped, and an empty file yields an empty dictionary.

    Parameters
    ----------
    filename : str
        Path to a TSV file in the dialect given by ``csv_parameters``.

    Returns
    -------
    dict
        Maps formula identifiers (``str``) to formulae (``bytes``).
    """
    formulae = dict()
    # The csv module expects files opened with newline=''; the encoding is
    # explicit so that parsing does not depend on the locale of the process.
    with open(filename, 'rt', encoding='UTF-8', newline='') as f:
        formula_rows = csv.reader(f, **csv_parameters)
        next(formula_rows, None)  # skip the header row; tolerate an empty file
        for formula_row in formula_rows:
            if not formula_row:  # csv.reader yields [] for blank lines
                continue
            formula_id = formula_row[0]
            formula = formula_row[-1].encode('UTF-8')
            formulae[formula_id] = formula
    return formulae

# Read every formula TSV file in parallel and merge the per-file dictionaries
# into all_formulae; if an identifier occurs in several files, the file that
# comes later in `filenames` wins (dict.update semantics).
all_formulae = dict()
filenames = glob('ARQMath_CLEF2020/Formulas/opt_representation_V1.0/*.tsv')
with Pool(processes=48) as pool:
    # imap yields results in the order of `filenames`, so tqdm advances one step per file.
    for formulae in tqdm(pool.imap(read_formulae, filenames), total=len(filenames)):
        all_formulae.update(formulae)

def write_answer(answer_id):
    """Render one answer as XHTML and save it as ``ARQMath_CLEF2020-output/<answer_id>.html``.

    The answer is looked up in ``reader.post_parser.map_just_answers`` and
    rendered with ``HtmlGenerator.generate_answer``. Every
    ``<span class="math-container">`` in the rendered HTML is then replaced:

    * by the parsed formula stored under the span's ``id`` in
      ``all_formulae``, when that formula exists and parses as XML;
    * by an empty-attribute ``<span>`` carrying the original tail text
      otherwise (no ``id``, unknown ``id``, or malformed formula).

    Answers whose HTML cannot be parsed or serialized are skipped silently,
    so no file is written for them.

    Parameters
    ----------
    answer_id
        Key into ``reader.post_parser.map_just_answers``.
    """
    answer = reader.post_parser.map_just_answers[answer_id]
    is_selected = False
    user = None
    if answer.owner_user_id in reader.user_parser.map_of_user:
        user = reader.user_parser.map_of_user[answer.owner_user_id]
    answer_html = HtmlGenerator.generate_answer(
        is_selected,
        answer.post_id,
        answer.score,
        answer.body,
        HtmlGenerator.process_user(user, answer.creation_date),
        HtmlGenerator.process_comments(answer.comments, answer_id),
    )
    try:
        answer_document = etree.XML(answer_html, html_parser)
        for span in answer_document.xpath('//span[@class="math-container"]'):
            replacement = None
            formula_id = span.attrib.get('id')
            if formula_id is not None and formula_id in all_formulae:
                try:
                    replacement = etree.XML(all_formulae[formula_id], xml_parser)
                    replacement.tail = span.tail
                except (etree.Error, UnicodeDecodeError):
                    replacement = None  # malformed formula: use the fallback below
            if replacement is None:
                # Single fallback for containers without an id, without a
                # matching formula, or with a malformed formula: keep only
                # the text that followed the container.
                replacement = etree.Element("span")
                replacement.text = span.tail
            span.getparent().replace(span, replacement)
        answer_xhtml = etree.tostring(answer_document, **tostring_parameters)
    except (etree.Error, UnicodeDecodeError):  # skip malformed answers
        return
    # Write the serialized bytes as-is so that the file content always matches
    # the encoding announced in its XML declaration, whatever the locale is.
    with open('ARQMath_CLEF2020-output/{}.html'.format(answer_id), 'wb') as f:
        f.write(answer_xhtml)
        f.write(b'\n')

# Convert every answer in parallel; write_answer returns nothing, so the loop
# body only drives the progress bar.
answer_ids = reader.post_parser.map_just_answers.keys()
with Pool(processes=48) as pool:
    for _ in tqdm(pool.imap(write_answer, answer_ids), total=len(answer_ids)):
        pass

# Report how many answers were converted. The count is taken from the *.html
# files present in the output directory, so files left over from an earlier
# run are counted as well.
converted_filenames = glob('ARQMath_CLEF2020-output/*.html')
print(
    'Converted {} / {} ({:.2f}%) answers.'.format(
        len(converted_filenames),
        len(answer_ids),
        100.0 * len(converted_filenames) / len(answer_ids),
    )
)

100%|██████████| 90/90 [02:31<00:00,  1.69s/it] 
100%|██████████| 1445495/1445495 [5:14:46<00:00, 76.53it/s]   


Converted 1445495 / 1445495 (100.00%) answers.


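Install MIaS together with its build dependencies (OpenJDK 8, Maven, MathMLCan, MathMLUnificator, and MIaSMath patched to disable structural and operator unification).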

In [8]:
%%bash
set -e

# Download and unpack a JDK 8 build into the working directory and put it on
# PATH; the exports below expect the archive to unpack to ./openjdk-8u262-b10.
curl --location https://github.com/AdoptOpenJDK/openjdk8-upstream-binaries/releases/download/jdk8u262-b10/OpenJDK8U-jdk-jfr_x64_linux_8u262b10.tar.gz | tar xzv
export JAVA_HOME="$PWD"/openjdk-8u262-b10
export PATH="$PWD"/openjdk-8u262-b10/bin:"$PATH"

# Download and unpack Maven 3.6.3 and put it on PATH as well.
# NOTE(review): `rm -rf ~/.m2` deletes the user's whole local Maven repository,
# not only artifacts built by this notebook.
curl https://mirror.dkm.cz/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz | tar xz
rm -rf ~/.m2  # erase previously-built mvn projects
PATH="$PWD"/apache-maven-3.6.3/bin/:"$PATH"

# Build and install MathMLCan at a pinned commit.
git clone https://github.com/MIR-MU/MathMLCan.git
pushd MathMLCan
git checkout 38063b9
mvn clean install
popd

# Build and install MathMLUnificator at a pinned commit.
git clone https://github.com/MIR-MU/MathMLUnificator.git
pushd MathMLUnificator
git checkout 0473904
mvn clean install
popd

# Build and install MIaSMath at a pinned commit, after applying an inline patch
# that comments out the loadUnifiedNodes() and unifyOperators() calls in
# MathTokenizer.java. The heredoc delimiter is quoted, so the patch text is
# passed to `git apply` verbatim, without shell expansion.
git clone https://github.com/MIR-MU/MIaSMath.git
pushd MIaSMath
git checkout 0961efb
git apply << 'EOF'  # patch out structural unification
diff --git a/src/main/java/cz/muni/fi/mias/math/MathTokenizer.java b/src/main/java/cz/muni/fi/mias/math/MathTokenizer.java
index 53c7380..f2a0b44 100644
--- a/src/main/java/cz/muni/fi/mias/math/MathTokenizer.java
+++ b/src/main/java/cz/muni/fi/mias/math/MathTokenizer.java
@@ -418,7 +418,8 @@ public class MathTokenizer extends Tokenizer {
                 }
                 if (store && !MathMLConf.ignoreNode(name)) {
                     addFormula(position, new Formula(n, rank, originalRank));
-                    loadUnifiedNodes(n, rank, originalRank, position);
+                    // FIXME: structural unification disabled
+                    // loadUnifiedNodes(n, rank, originalRank, position);
                 }
             }
         }
@@ -696,7 +697,8 @@ public class MathTokenizer extends Tokenizer {
     private void modify() {
         unifyVariables(vCoef);
         unifyConst(cCoef);
-        unifyOperators(oCoef);
+        // FIXME: operator unification disabled
+        // unifyOperators(oCoef);
         processAttributes(aCoef);
     }

EOF
mvn clean install
popd

# Build MIaS itself at a pinned commit.
git clone https://github.com/MIR-MU/MIaS.git
pushd MIaS
git checkout 3dbaa02
mvn clean install
popd

# Create the index directory and the MIaS configuration next to the built jar.
# The heredoc delimiter is unquoted, so $PWD is expanded when the file is
# written and INDEXDIR becomes the absolute path of MIaS/target/index.
pushd MIaS/target
mkdir index
cat > mias.properties <<EOF
INDEXDIR=$PWD/index
UPDATE=false
THREADS=48
MAXRESULTS=1000
DOCLIMIT=-1
FORMULA_DOCUMENTS=false
EOF
popd

Index the ARQMath 2020 dataset in MIaS.

In [5]:
# Index the converted answers with the MIaS jar built above.
# The command is a single backslash-continued shell line: it puts the local
# JDK and Maven on PATH, changes into MIaS/target, runs the indexer on
# ARQMath_CLEF2020-output with the settings from mias.properties, and pipes the
# combined stdout/stderr through tqdm (one progress tick per log line) into
# output.log. Afterwards it prints the last "Documents indexed" and
# "Indexed formulae" log lines.
# NOTE(review): the output directory is passed twice; presumably once as the
# path to index and once as the root directory -- confirm against the MIaS CLI.
# NOTE(review): `pushd` and `|&` are bash features, so this relies on the
# shell used by `!` being bash.
! set -e; \
  export JAVA_HOME="$PWD"/openjdk-8u262-b10; \
  export PATH="$PWD"/openjdk-8u262-b10/bin:"$PATH"; \
  PATH="$PWD"/apache-maven-3.6.3/bin/:"$PATH"; \
  pushd MIaS/target; \
  time java -jar MIaS-1.6.6-4.10.4-SNAPSHOT.jar -conf mias.properties -stats -overwrite \
  "$OLDPWD"/ARQMath_CLEF2020-output "$OLDPWD"/ARQMath_CLEF2020-output |& \
  python -m tqdm > output.log; \
  grep 'Documents indexed' < output.log | tail -n 1; \
  grep 'Indexed formulae'  < output.log | tail -n 1; \
  popd

4358173it [2:47:57, 432.47it/s]

2020-08-28 22:52:58,745 [main] INFO  cz.muni.fi.mias.indexing.Indexing - Documents indexed: 1445494
2020-08-28 22:52:58,747 [main] INFO  cz.muni.fi.mias.math.MathTokenizer - Indexed formulae: 155132667

real	167m57,884s
user	281m59,032s
sys	21m54,002s


Install WebMIaS.

In [11]:
%%bash
set -e

# Reuse the JDK and Maven that the MIaS installation cell unpacked into the
# working directory.
export JAVA_HOME="$PWD"/openjdk-8u262-b10
export PATH="$PWD"/openjdk-8u262-b10/bin:"$PATH"
PATH="$PWD"/apache-maven-3.6.3/bin/:"$PATH"
    
# Build WebMIaS at a pinned commit. Before building, overwrite its index
# configuration so that it points at the MIaS index and at the converted
# answers. The heredoc delimiter is unquoted and the heredoc is written after
# `pushd WebMIaS`, so $OLDPWD expands to the directory the cell started in.
git clone https://github.com/MIR-MU/WebMIaS.git
pushd WebMIaS
git checkout b2d9222
cat > src/main/resources/cz/muni/fi/webmias/indexes.properties << EOF
INDEX_NAMES=arqmath-clef2020
PATHS=$OLDPWD/MIaS/target/index
STORAGES=$OLDPWD/ARQMath_CLEF2020-output
MAXRESULTS=1000
EOF
mvn clean install
popd

# Download Tomcat 8.5.56, deploy the WebMIaS WAR as webapps/WebMIaS.war and
# start the server.
# NOTE(review): the download URL has no scheme, so curl starts with plain HTTP
# and only follows whatever redirect the server sends; consider https://.
curl --location downloads.apache.org/tomcat/tomcat-8/v8.5.56/bin/apache-tomcat-8.5.56.tar.gz | tar xz
pushd apache-tomcat-8.5.56
cp "$OLDPWD"/WebMIaS/target/WebMIaS-1.6.6-4.10.4-SNAPSHOT.war webapps/WebMIaS.war
bin/startup.sh
popd

Tomcat started.


Produce the task 1 results.

In [1]:
from collections import deque
import csv
import ctypes
from gensim.utils import simple_preprocess as gensim_simple_preprocess
from itertools import chain, cycle
from lxml import etree
import re
import requests
from tqdm import tqdm

# Dialect shared by the TSV input (topic formulae) and the TSV output (results).
csv_parameters = {'delimiter': '\t', 'quotechar': '"', 'quoting': csv.QUOTE_MINIMAL}
# Raise the csv field size limit to half of the largest unsigned long value
# (c_ulong(-1) wraps around to that maximum).
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
# lxml parsers with huge_tree=True.
html_parser = etree.HTMLParser(huge_tree=True)
xml_parser = etree.XMLParser(huge_tree=True)
# NOTE(review): tostring_parameters is not used anywhere in this cell.
tostring_parameters = {'xml_declaration': True, 'encoding': 'UTF-8', 'pretty_print': True}
# Search endpoint of the WebMIaS instance deployed to the local Tomcat.
webmias_url = 'http://localhost:8080/WebMIaS/ws/search'
# Length of every merged result list; also sent to WebMIaS as the 'limit' of each subquery.
num_results = 1000
# NOTE(review): 'index': 0 presumably selects the first index configured in
# indexes.properties -- confirm against the WebMIaS web service.
webmias_parameters = {'limit': num_results, 'index': 0}
# Captures the number that follows "score =" at the start of a result's info text.
score_regex = r'\s*score\s*=\s*(?P<score>[-0-9.]*)'

# Load the topic formulae into a {formula_id: formula} dictionary. The first
# row is a header; in every other row the first column is the formula
# identifier and the last column is the formula itself (kept as str).
formulae = dict()
# The csv module expects files opened with newline=''; the encoding is
# explicit so that parsing does not depend on the locale of the kernel.
with open('ARQMath_CLEF2020/Task1/Topics/Formula_topics_opt_V2.0.tsv', 'rt', encoding='UTF-8', newline='') as f:
    formula_rows = csv.reader(f, **csv_parameters)
    next(formula_rows, None)  # skip the header row; tolerate an empty file
    for formula_row in formula_rows:
        if not formula_row:  # csv.reader yields [] for blank lines
            continue
        formula_id = formula_row[0]
        formula = formula_row[-1]
        formulae[formula_id] = formula

def simple_preprocess(text):
    """Tokenize ``text`` with gensim's ``simple_preprocess``, passing an infinite ``max_len``."""
    no_length_limit = float('inf')
    tokens = gensim_simple_preprocess(text, max_len=no_length_limit)
    return tokens

def parse_topic_element(topic_id, topic_element):
    """Extract text and formula keywords from one topic element.

    ``topic_element`` must have exactly three children: title, question and
    tags. Formulae are collected from the ``math-container`` spans of the
    title whose ``id`` is a key of ``formulae``; an ``id`` that is missing
    from ``formulae`` is reported on standard output. Each container is then
    replaced by a plain span holding its tail text, and the remaining text is
    tokenized with ``simple_preprocess``. The tokenized tags are appended last.

    Returns
    -------
    tuple
        ``(text_keywords, formula_keywords)``, two lists.
    """
    # The unpacking enforces the three-children layout; the question body is
    # not used afterwards.
    title_element, _question_element, tags_element = topic_element
    title_document = etree.XML(title_element.text, html_parser)
    text_keywords = []
    formula_keywords = []
    # NOTE(review): the title document is traversed twice and the question body
    # never. The second pass finds no math containers (the first pass replaced
    # them), so its only effect is to add the title's text keywords a second
    # time. Confirm whether this is an intentional "title only" setup.
    for document in (title_document, title_document):
        math_spans = document.xpath('//span[@class="math-container"]')
        for math_span in math_spans:
            formula_id = math_span.attrib.get('id')
            if formula_id is not None:
                if formula_id in formulae:
                    formula_keywords.append(formulae[formula_id])
                else:
                    print('Undefined formula {} in topic {}'.format(formula_id, topic_id))
            placeholder = etree.Element("span")
            placeholder.text = math_span.tail
            math_span.getparent().replace(math_span, placeholder)
        plain_text = ' '.join(document.itertext())
        text_keywords += simple_preprocess(plain_text)
    text_keywords += simple_preprocess(tags_element.text)
    return (text_keywords, formula_keywords)

def leave_rightmost_out(text_keywords, formula_keywords):
    """Generate progressively smaller subqueries together with their stripe widths.

    Yields ``(text_keywords, formula_keywords, stripe_width)`` triples:

    1. first with all formula keywords and a text keyword list that loses one
       more keyword from its front on every step (down to no text keywords);
    2. then with all text keywords and a formula keyword list that loses one
       more keyword from its end on every step (down to no formula keywords).

    The stripe width starts at ``len(text_keywords) + len(formula_keywords) + 1``
    and decreases by one with every triple, so the last triple has width 1.
    """
    num_text_keywords = len(text_keywords)
    num_subqueries = num_text_keywords + len(formula_keywords) + 1
    # Phase 1: drop text keywords from the front; widths run from
    # num_subqueries down to len(formula_keywords) + 1.
    for num_dropped in range(num_text_keywords + 1):
        yield (text_keywords[num_dropped:], formula_keywords, num_subqueries - num_dropped)
    # Phase 2: drop formula keywords from the end; widths continue from
    # len(formula_keywords) down to 1, i.e. one more than the number kept.
    for num_kept in reversed(range(len(formula_keywords))):
        yield (text_keywords, formula_keywords[:num_kept], num_kept + 1)

def merge_result_lists(result_lists):
    """Interleave several ranked result lists into one list of ``num_results`` results.

    ``result_lists`` is a non-empty sequence of ``(stripe_width, results)``
    pairs, where ``results`` is a sequence of ``(answer_id, score)`` pairs in
    rank order. The lists are visited round-robin; on each visit up to
    ``stripe_width`` not-yet-seen answers are taken from the front of the list.
    Answers already taken from another list are discarded.

    Each merged score is
    ``(maximum_score * (num_results - position) + score) / (maximum_score * (num_results + 1))``,
    where ``maximum_score`` is the largest first-place score over all lists
    and ``position`` is the number of results merged so far.

    Returns
    -------
    list
        Exactly ``num_results`` pairs ``(answer_id, merged_score)``.

    Raises
    ------
    AssertionError
        If every list is empty, or if the lists contain fewer than
        ``num_results`` distinct answers.
    """
    stripe_widths, result_deques = zip(*(
        (width, deque(hits)) for width, hits in result_lists
    ))

    # The best score overall is the best of the first-place scores.
    maximum_score = float('-inf')
    for queue in result_deques:
        if queue:
            maximum_score = max(maximum_score, queue[0][1])
    assert maximum_score > float('-inf')

    denominator = maximum_score * (num_results + 1)
    merged = []
    seen_answer_ids = set()
    for stripe_width, queue in cycle(zip(stripe_widths, result_deques)):
        if not any(result_deques):
            break  # all deques are empty
        if len(merged) == num_results:
            break  # result list is full
        if not queue:
            continue  # current deque is empty
        try:
            for _ in range(stripe_width):
                # Skip answers that another list has already contributed;
                # popleft() raises IndexError once this deque runs dry.
                while True:
                    answer_id, result_score = queue.popleft()
                    if answer_id not in seen_answer_ids:
                        break
                numerator = maximum_score * (num_results - len(merged)) + result_score
                merged.append((answer_id, numerator / denominator))
                seen_answer_ids.add(answer_id)
                if len(merged) == num_results:
                    break
        except IndexError:
            continue  # deque exhausted in the middle of a stripe
    assert len(merged) == num_results
    return merged

# For every topic: build the subqueries, send each one to WebMIaS, merge the
# per-subquery result lists, and write the merged ranking to the run file as
# rows of (topic id, answer id, rank, score, run name).
# The csv module expects output files opened with newline=''.
with open('MIRMU-task1-MIaS-auto-both-P.tsv', 'wt', newline='') as results_f:
    results_csv_writer = csv.writer(results_f, **csv_parameters)
    with open('ARQMath_CLEF2020/Task1/Topics/Topics_V2.0.xml', 'rt') as topics_f:
        topics_document = etree.parse(topics_f, xml_parser)
        topic_elements = tqdm(topics_document.xpath('/Topics/Topic'), desc='reading queries')
        for topic_element in topic_elements:
            topic_id = topic_element.attrib['number']
            text_keywords, formula_keywords = parse_topic_element(topic_id, topic_element)
            # Materialize the subqueries first: the loop below rebinds
            # text_keywords and formula_keywords.
            subqueries = list(leave_rightmost_out(text_keywords, formula_keywords))
            subquery_result_lists = []
            for text_keywords, formula_keywords, stripe_width in subqueries:
                subquery_payload = ' '.join(chain(text_keywords, formula_keywords)).encode('UTF-8')
                subquery_data = {'query': subquery_payload, **webmias_parameters}
                subquery_response = requests.post(webmias_url, data=subquery_data)
                # Fail with the HTTP error itself instead of an XML syntax
                # error caused by parsing an error page.
                subquery_response.raise_for_status()
                subquery_response_document = etree.XML(subquery_response.content, xml_parser)
                subquery_results = []
                for subquery_result_element in subquery_response_document.xpath('//result'):
                    # The first two children of a result hold the document
                    # name and the info text that carries the score.
                    id_element, info_element, *_ = subquery_result_element
                    answer_id = re.sub(r'\.html$', '', id_element.text)
                    score_match = re.match(score_regex, info_element.text)
                    if score_match is None:
                        raise ValueError(
                            'No score in result {!r} of topic {}'.format(id_element.text, topic_id)
                        )
                    score = float(score_match.group('score'))
                    subquery_results.append((answer_id, score))
                # Best score first; ties are broken by descending answer id.
                subquery_results.sort(key=lambda x: (x[1], x[0]), reverse=True)
                subquery_result_lists.append((stripe_width, subquery_results))
            query_results = merge_result_lists(subquery_result_lists)
            for rank, (answer_id, score) in enumerate(query_results):
                row = (topic_id, answer_id, rank + 1, score, 'Run_MIaS_0')
                results_csv_writer.writerow(row)

reading queries: 100%|██████████| 98/98 [05:14<00:00,  3.21s/it]


Evaluate the task 1 results.

In [2]:
! python -m arqmath_eval.evaluate MIRMU-task1-MIaS-auto-both-P.tsv

0.155, 95% CI: [0.121; 0.192]
