# Preparing datasets for external evaluation

This notebook prepares the datasets that are going to be used for external evaluation. Each dataset is stored in its own dataframe. All dataframes share a common set of columns; each one may also have additional columns that are specific to its dataset.

Columns in common in all dataframes are:
- `source` (url or identifier of the document)
- `text` (full text of the document)
- `toponym` (each toponym as it appears in the text)
- `startCh` (start character of the toponym in the text)
- `endCh` (end character of the toponym in the text)
- `lat` (latitude of the resolved location)
- `lon` (longitude of the resolved location)
- `reference` (whether the main reference is a Wikipedia identifier or coordinates. If the reference is a Wikipedia identifier, we have still derived its coordinates when possible)

## Obtaining the external datasets

Download the following datasets and store them in `processing/resources/`:
- [x] [The War of the Rebellion dataset](https://github.com/utcompling/WarOfTheRebellion/archive/master.zip) (unzip it)
- [x] [La Argentina Manuscrita](https://recogito.pelagios.org/document/wzqxhk0h3vpikm/downloads) (Download the .csv file and rename it as `argentina_manuscrita.csv`)

In [None]:
import xml.etree.ElementTree as ET
import urllib.parse
import pandas as pd
import glob
import json
import ast
import csv
import re

### Process the War of the Rebellion test set

In [None]:
def parse_wotr_coords(coords_string):
    """Pull the latitude and longitude out of a point-geometry string.

    The string is matched against a pattern that expects the word ``Point``
    followed by a ``coordinates":[<first>,<second>]}`` section. The first
    captured value is treated as the longitude and the second as the
    latitude.

    Args:
        coords_string: Geometry description to parse.

    Returns:
        A ``(latitude, longitude)`` tuple of strings with tabs removed and
        surrounding whitespace stripped, or ``(None, None)`` when the string
        does not match the expected pattern.
    """
    point_pattern = r'.*Point.*coordinates\"\:\[\s?(.*)\t?\,(.*)\]\}'
    found = re.match(point_pattern, coords_string)
    if not found:
        return None, None

    def tidy(value):
        # Captured values may carry stray tabs and padding around the number.
        return value.replace("\t", "").strip()

    # The pattern captures longitude first; the function returns latitude first.
    raw_longitude, raw_latitude = found.groups()
    return tidy(raw_latitude), tidy(raw_longitude)

In [None]:
def process_wotr_split(input_directory, output_directory, json_split, split):
    """Flatten one War of the Rebellion split into a dataframe and pickle it.

    Every named entity whose ``geo`` value yields both a latitude and a
    longitude (via ``parse_wotr_coords``) becomes one row. Entities without
    parseable coordinates are skipped.

    Args:
        input_directory: Not used by this function; kept so the signature
            stays compatible with existing callers.
        output_directory: Path prefix for the output file. It is concatenated
            directly with the file name, so it must end with a path separator.
        json_split: Iterable of document dicts providing the keys ``vol``,
            ``text``, ``docid``, ``vol_charrange`` and ``named_entities``.
        split: Name of the split; the output file is ``wotr_<split>.pkl``.

    Returns:
        None. The dataframe is written to disk with ``DataFrame.to_pickle``.
    """
    cols = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'reference', 'volume', 'vol_charrange']
    rows = []
    for item in json_split:
        volume = item['vol']
        # One-for-one character substitution, so the text length is unchanged.
        text = item['text'].replace("\n", " ")
        docid = item['docid']
        vol_charrange = item['vol_charrange']
        for ne in item['named_entities']:
            # startCh must hold the entity's start offset and endCh its end
            # offset; these were previously assigned the other way round.
            startCh = ne['char_start']
            endCh = ne['char_end']
            toponym = ne['entity_string']
            lat, lon = parse_wotr_coords(ne['geo'])
            if lat and lon:
                rows.append([docid, text, toponym, startCh, endCh, lat, lon, 'coordinates', volume, vol_charrange])
    wotrcorpusdf = pd.DataFrame(rows, columns = cols)
    wotrcorpusdf.to_pickle(output_directory + "wotr_" + split + '.pkl')

In [None]:
def process_wotr_corpus(input_directory, output_directory, corpus):
    """Load each configured War of the Rebellion split and process it.

    Only the ``test`` split is handled. Its JSON file is read from
    ``<input_directory><corpus>Toponym/json/wotr-topo-<split>.json`` and
    handed to ``process_wotr_split``.

    Args:
        input_directory: Path prefix where the corpus folder lives; it is
            concatenated directly, so it must end with a path separator.
        output_directory: Path prefix forwarded to ``process_wotr_split``.
        corpus: Corpus folder name, including its trailing path separator.
    """
    json_directory = input_directory + corpus + 'Toponym/json/'
    for split_name in ('test',):
        split_path = json_directory + 'wotr-topo-' + split_name + '.json'
        with open(split_path) as handle:
            documents = json.load(handle)
            process_wotr_split(input_directory, output_directory, documents, split_name)

In [None]:
# Locations are relative to the notebook's working directory. Both end with a
# slash because the processing functions build paths by plain concatenation.
input_directory = "../resources/"
output_directory = "../../datasets/candidate_ranking_datasets/"
# Process the unzipped War of the Rebellion corpus folder and write its pickle.
process_wotr_corpus(input_directory, output_directory, "WarOfTheRebellion-master/")

### Process Argentina Manuscrita

In [None]:
def process_recogito(input_directory, output_directory, corpus):
    """Convert a Recogito CSV export into a toponym dataframe and pickle it.

    Only annotations whose ``TYPE`` is ``PLACE`` and whose
    ``VERIFICATION_STATUS`` is ``VERIFIED`` are kept. The ``text`` column is
    left as an empty string for every row.

    Args:
        input_directory: Path prefix of the CSV file; it is concatenated
            directly, so it must end with a path separator.
        output_directory: Path prefix of the output pickle, same convention.
        corpus: CSV file name. The part before its first ``.`` names the
            output file, ``<name>.pkl``.

    Returns:
        None. The dataframe is written to disk with ``DataFrame.to_pickle``.
    """
    annotations = pd.read_csv(input_directory + corpus)
    is_place = annotations["TYPE"] == "PLACE"
    is_verified = annotations["VERIFICATION_STATUS"] == "VERIFIED"

    records = []
    for _, annotation in annotations[is_place & is_verified].iterrows():
        quote = annotation["QUOTE_TRANSCRIPTION"]
        # The start offset is whatever follows the last ':' in the anchor; it
        # is stored as a string, and so is the derived end offset.
        start_offset = annotation["ANCHOR"].split(":")[-1]
        end_offset = str(int(start_offset) + len(quote))
        records.append([
            annotation["FILE"],
            "",
            quote,
            start_offset,
            end_offset,
            annotation["LAT"],
            annotation["LNG"],
            "coordinates",
            annotation["URI"],
            annotation["VOCAB_LABEL"],
            annotation["COMMENTS"],
        ])

    column_names = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'reference', "uri", "vocab", "comments"]
    corpus_stem = corpus.split(".")[0]
    pd.DataFrame(records, columns=column_names).to_pickle(output_directory + corpus_stem + '.pkl')

In [None]:
# Locations are relative to the notebook's working directory. Both end with a
# slash because process_recogito builds paths by plain concatenation.
input_directory = "../resources/"
output_directory = "../../datasets/candidate_ranking_datasets/"

# Convert the renamed Recogito CSV download into its pickle.
process_recogito(input_directory, output_directory, "argentina_manuscrita.csv")