# Preparing datasets for external evaluation

This notebook prepares the datasets used for external evaluation. Each dataset is stored in its own dataframe. All dataframes share a common set of columns, and each one may also have additional columns that are specific to that dataset.

The columns common to all dataframes are:
- `source` (url or identifier of the document)
- `text` (full text of the document)
- `toponym` (each toponym as appears in the text)
- `startCh` (start character of the toponym in the text)
- `endCh` (end character of the toponym in the text)
- `lat` (latitude of the resolved location)
- `lon` (longitude of the resolved location)
- `reference` (whether the main reference is a Wikipedia identifier or a pair of coordinates. If the reference is Wikipedia, we have still derived its coordinates when possible)

## Obtaining the external datasets

Download the following datasets and store them in the `datasets` folder:
- [x] [The War of the Rebellion dataset](https://github.com/utcompling/WarOfTheRebellion/archive/master.zip) (unzip it)
- [x] [GeoVirus](https://github.com/milangritta/Pragmatic-Guide-to-Geoparsing-Evaluation/blob/master/data/Corpora/GeoVirus.xml)
- [x] [TR-News](https://github.com/milangritta/Pragmatic-Guide-to-Geoparsing-Evaluation/blob/master/data/Corpora/TR-News.xml)
- [x] [Local Global Lexicon](https://github.com/milangritta/Pragmatic-Guide-to-Geoparsing-Evaluation/blob/master/data/Corpora/lgl.xml)
- [x] [GeoWebNews](https://github.com/milangritta/Pragmatic-Guide-to-Geoparsing-Evaluation/blob/master/data/GWN.xml)
- [x] [La Argentina Manuscrita](https://recogito.pelagios.org/document/wzqxhk0h3vpikm/downloads) (Download the .csv file and rename it as `argentina_manuscrita.csv`)
- [x] [Pausanias: Periegesis](https://recogito.pelagios.org/document/35zv4zm4iqp4aw/downloads) (Download the .csv file and rename it as `pausanias_periegesis.csv`)

In [43]:
import xml.etree.ElementTree as ET
from conllu import parse
from conllu import parse_tree
import urllib.parse
import pandas as pd
import glob
import json
import ast
import csv
import re

import warnings
warnings.filterwarnings('ignore')

### Process the external datasets

In [2]:
def process_geovirus_corpus(corpus):
    """Convert the GeoVirus XML corpus into a pickled dataframe.

    Every ``locations/location`` element of every article becomes one row
    holding the article source and text, the toponym with its character
    offsets, its coordinates and the Wikipedia title (the last path segment
    of the ``page`` URL). All values are kept as the strings found in the XML.

    Args:
        corpus: Path to the GeoVirus XML file.

    The dataframe is written to ``../../datasets/extrinsic/<name>.pkl``,
    where ``<name>`` is the file name of ``corpus`` without ``.xml``.
    """
    column_names = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'reference', 'wikititle']
    records = []
    for article in ET.parse(corpus).getroot():
        source = article.find('./source').text
        text = article.find('./text').text
        records.extend(
            [
                source,
                text,
                loc.find('name').text,
                loc.find('start').text,
                loc.find('end').text,
                loc.find('lat').text,
                loc.find('lon').text,
                'wikipedia',
                loc.find('page').text.split("/")[-1],
            ]
            for loc in article.findall('./locations/location')
        )
    corpus_name = corpus.split('.xml')[0].split("/")[-1]
    geovirus_df = pd.DataFrame(records, columns=column_names)
    geovirus_df.to_pickle("../../datasets/extrinsic/" + corpus_name + '.pkl')

In [3]:
def process_lgl_corpus(corpus):
    """Convert the Local Global Lexicon (LGL) XML corpus into a pickled dataframe.

    Every ``toponyms/toponym`` element of every article becomes one row. The
    gazetteer fields (``lat``, ``lon``, ``geoname``, ``geonameid``, ``fclass``,
    ``fcode``) are read from the toponym's ``gaztag`` element. Any of them
    that is absent is stored as an empty string, so unresolved toponyms are
    kept with empty coordinates.

    Args:
        corpus: Path to the LGL XML file.

    The number of rows and the dataframe shape are printed, and the dataframe
    is written to ``../../datasets/extrinsic/<name>.pkl``, where ``<name>``
    is the file name of ``corpus`` without ``.xml``.
    """
    cols = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'reference', 'geoname', 'geonameid', 'fclass', 'fcode', 'feedid', 'title', 'dltime', 'domain']
    rows = []
    root = ET.parse(corpus).getroot()
    for article in root:
        text = article.find('./text').text
        feedid = article.find('./feedid').text
        url = article.find('./url').text
        title = article.find('./title').text
        domain = article.find('./domain').text
        dltime = article.find('./dltime').text
        for location in article.findall('./toponyms/toponym'):
            toponym = location.find('phrase').text
            startCh = location.find('start').text
            endCh = location.find('end').text
            gaztag = location.find('gaztag')
            geonameid = locname = fclass = fcode = lat = lon = ""
            # Compare with None explicitly: the truth value of an Element
            # reflects whether it has children (and is deprecated), so
            # `if gaztag:` is not a reliable presence test.
            if gaztag is not None:
                geonameid = gaztag.get('geonameid', "")
                # findtext() falls back to "" for a missing sub-element
                # instead of failing on `None.text`.
                locname = gaztag.findtext('./name', default="")
                fclass = gaztag.findtext('./fclass', default="")
                fcode = gaztag.findtext('./fcode', default="")
                lat = gaztag.findtext('./lat', default="")
                lon = gaztag.findtext('./lon', default="")
            rows.append([url, text, toponym, startCh, endCh, lat, lon, 'coordinates', locname, geonameid, fclass, fcode, feedid, title, dltime, domain])
    print(len(rows))
    xmlcorpusdf = pd.DataFrame(rows, columns=cols)
    print(xmlcorpusdf.shape)
    xmlcorpusdf.to_pickle("../../datasets/extrinsic/" + corpus.split('.xml')[0].split("/")[-1] + '.pkl')

In [4]:
def process_trnews_corpus(corpus):
    """Convert the TR-News XML corpus into a pickled dataframe.

    Every ``toponyms/toponym`` element of every article becomes one row. The
    gazetteer fields (``lat``, ``lon``, ``geoname``, ``geonameid``, ``fclass``,
    ``fcode``) are read from the toponym's ``gaztag`` element. Any of them
    that is absent is stored as an empty string.

    Args:
        corpus: Path to the TR-News XML file.

    The dataframe is written to ``../../datasets/extrinsic/<name>.pkl``,
    where ``<name>`` is the file name of ``corpus`` without ``.xml``.
    """
    cols = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'reference', 'geoname', 'geonameid', 'fclass', 'fcode', 'title', 'dltime', 'domain']
    rows = []
    root = ET.parse(corpus).getroot()
    for article in root:
        text = article.find('./text').text
        url = article.find('./url').text
        title = article.find('./title').text
        domain = article.find('./domain').text
        dltime = article.find('./dltime').text
        for location in article.findall('./toponyms/toponym'):
            toponym = location.find('phrase').text
            startCh = location.find('start').text
            endCh = location.find('end').text
            gaztag = location.find('gaztag')
            geonameid = locname = fclass = fcode = lat = lon = ""
            # Compare with None explicitly: the truth value of an Element
            # reflects whether it has children (and is deprecated), so a
            # childless <gaztag geonameid="..."/> would otherwise be ignored.
            if gaztag is not None:
                geonameid = gaztag.get('geonameid', "")
                # findtext() falls back to "" for a missing sub-element.
                locname = gaztag.findtext('./name', default="")
                fclass = gaztag.findtext('./fclass', default="")
                fcode = gaztag.findtext('./fcode', default="")
                lat = gaztag.findtext('./lat', default="")
                lon = gaztag.findtext('./lon', default="")
            rows.append([url, text, toponym, startCh, endCh, lat, lon, 'coordinates', locname, geonameid, fclass, fcode, title, dltime, domain])
    xmlcorpusdf = pd.DataFrame(rows, columns=cols)
    xmlcorpusdf.to_pickle("../../datasets/extrinsic/" + corpus.split('.xml')[0].split("/")[-1] + '.pkl')

In [5]:
def process_gwn_corpus(corpus):
    """Convert the GeoWebNews XML corpus into a pickled dataframe.

    Every ``toponyms/toponym`` element that has both a non-empty latitude and
    a non-empty longitude becomes one row; toponyms without coordinates are
    skipped. An absent ``normalisedName`` or ``geonamesID`` is stored as an
    empty string.

    Args:
        corpus: Path to the GeoWebNews XML file.

    The dataframe is written to ``../../datasets/extrinsic/<name>.pkl``,
    where ``<name>`` is the file name of ``corpus`` without ``.xml``.
    """
    def optional_text(parent, tag):
        # Text of an optional child element, or "" when the child is absent.
        child = parent.find(tag)
        return "" if child is None else child.text

    column_names = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'reference', 'geoname', 'geonameid', 'title']
    records = []
    for article in ET.parse(corpus).getroot():
        text = article.find('./text').text
        url = article.find('./link').text
        title = article.find('./title').text
        for toponym_el in article.findall('./toponyms/toponym'):
            toponym = toponym_el.find('extractedName').text
            start = toponym_el.find('start').text
            end = toponym_el.find('end').text
            geoname = optional_text(toponym_el, 'normalisedName')
            geonameid = optional_text(toponym_el, 'geonamesID')
            lat = optional_text(toponym_el, 'latitude')
            lon = optional_text(toponym_el, 'longitude')
            # Only toponyms that carry coordinates are kept.
            if not (lat and lon):
                continue
            records.append([url, text, toponym, start, end, lat, lon, 'coordinates', geoname, geonameid, title])
    corpus_name = corpus.split('.xml')[0].split("/")[-1]
    gwn_df = pd.DataFrame(records, columns=column_names)
    gwn_df.to_pickle("../../datasets/extrinsic/" + corpus_name + '.pkl')

In [6]:
def parse_wotr_coords(coords_string):
    """Extract latitude and longitude from a ``Point`` geometry string.

    The string is expected to contain ``Point`` followed by
    ``coordinates":[<lon>,<lat>]}``; the first captured value is the
    longitude and the second the latitude. Tabs and surrounding whitespace
    are removed from both values.

    Args:
        coords_string: Serialized geometry of a named entity.

    Returns:
        A ``(latitude, longitude)`` tuple of strings, or ``(None, None)``
        when the string does not match the expected pattern.
    """
    point_match = re.match(r'.*Point.*coordinates\"\:\[\s?(.*)\t?\,(.*)\]\}', coords_string)
    if point_match is None:
        return None, None
    raw_lon, raw_lat = point_match.groups()
    return raw_lat.replace("\t", "").strip(), raw_lon.replace("\t", "").strip()

In [7]:
def process_wotr_split(directory, json_split, split):
    """Convert one split of the War of the Rebellion corpus into a pickled dataframe.

    Every named entity whose ``geo`` field yields both a latitude and a
    longitude (see ``parse_wotr_coords``) becomes one row; entities without
    coordinates are skipped.

    Args:
        directory: Unused; kept so existing callers keep working.
        json_split: Parsed JSON content of the split: a list of documents
            with ``vol``, ``text``, ``docid``, ``vol_charrange`` and
            ``named_entities`` entries.
        split: Name of the split, used in the output file name.

    The dataframe is written to ``../../datasets/extrinsic/wotr_<split>.pkl``.
    """
    cols = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'reference', 'volume', 'vol_charrange']
    rows = []
    for item in json_split:
        volume = item['vol']
        # Newlines are replaced one-for-one by spaces, so the text length
        # (and therefore any character offset into it) is unchanged.
        text = item['text'].replace("\n", " ")
        docid = item['docid']
        vol_charrange = item['vol_charrange']
        for ne in item['named_entities']:
            # startCh is the entity's first character and endCh its end;
            # the two were previously assigned the wrong way round.
            startCh = ne['char_start']
            endCh = ne['char_end']
            toponym = ne['entity_string']
            lat, lon = parse_wotr_coords(ne['geo'])
            if lat and lon:
                rows.append([docid, text, toponym, startCh, endCh, lat, lon, 'coordinates', volume, vol_charrange])
    wotrcorpusdf = pd.DataFrame(rows, columns=cols)
    wotrcorpusdf.to_pickle("../../datasets/extrinsic/wotr_" + split + '.pkl')

In [8]:
def process_wotr_corpus(directory, corpus):
    """Process the ``train`` and ``test`` splits of the War of the Rebellion corpus.

    For each split, the file
    ``<directory><corpus>Toponym/json/wotr-topo-<split>.json`` is loaded and
    handed to ``process_wotr_split``.

    Args:
        directory: Folder that contains the corpus (with trailing slash).
        corpus: Name of the corpus folder (with trailing slash).
    """
    for split in ('train', 'test'):
        split_path = directory + corpus + 'Toponym/json/wotr-topo-' + split + '.json'
        with open(split_path) as json_file:
            json_split = json.load(json_file)
        process_wotr_split(directory, json_split, split)

In [9]:
# Folder that holds the downloaded external datasets.
directory = "ext_datasets/"

# Each XML corpus has its own converter; run them in order.
for process_corpus, xml_name in (
    (process_geovirus_corpus, "GeoVirus.xml"),
    (process_lgl_corpus, "lgl.xml"),
    (process_trnews_corpus, "TR-News.xml"),
    (process_gwn_corpus, "GWN.xml"),
):
    process_corpus(directory + xml_name)

# The War of the Rebellion corpus is a folder of JSON splits, not a single XML file.
process_wotr_corpus(directory, "WarOfTheRebellion-master/")

5088
(5088, 16)


In [78]:
# source (url or identifier of the document)
# text (full text of the document)
# toponym (each toponym as appears in the text)
# startCh (start character of the toponym in the text)
# endCh (end character of the toponym in the text)
# lat (latitude of the resolved location)
# lon (longitude of the resolved location)
# reference (whether the main reference is Wikipedia identifier or coordinates. If the reference is Wikipedia, we've in any case derived its coordinates when possible)

def process_recogito(directory, corpus):
    """Convert a Recogito CSV export into a pickled dataframe.

    Only annotations whose ``TYPE`` is ``PLACE`` and whose
    ``VERIFICATION_STATUS`` is ``VERIFIED`` are kept. The start offset is the
    part of ``ANCHOR`` after the last colon; the end offset is the start plus
    the length of ``QUOTE_TRANSCRIPTION``. Both offsets are stored as
    strings, and the ``text`` column is left empty.

    Args:
        directory: Folder that contains the CSV file (with trailing slash).
        corpus: Name of the CSV file.

    The dataframe is written to ``../../datasets/extrinsic/<name>.pkl``,
    where ``<name>`` is ``corpus`` up to its first dot.
    """
    column_names = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'reference', "uri", "vocab", "comments"]

    annotations = pd.read_csv(directory + corpus)
    is_place = annotations["TYPE"] == "PLACE"
    is_verified = annotations["VERIFICATION_STATUS"] == "VERIFIED"

    records = []
    for _, annotation in annotations[is_place & is_verified].iterrows():
        source = annotation["FILE"]
        quote = annotation["QUOTE_TRANSCRIPTION"]
        start = annotation["ANCHOR"].split(":")[-1]
        end = str(int(start) + len(quote))
        records.append([
            source,
            "",
            quote,
            start,
            end,
            annotation["LAT"],
            annotation["LNG"],
            "coordinates",
            annotation["URI"],
            annotation["VOCAB_LABEL"],
            annotation["COMMENTS"],
        ])

    output_name = corpus.split(".")[0]
    recogito_df = pd.DataFrame(records, columns=column_names)
    recogito_df.to_pickle("../../datasets/extrinsic/" + output_name + '.pkl')

In [79]:
# Convert both Recogito CSV exports, read from `directory`, into pickled dataframes.
process_recogito(directory, "argentina_manuscrita.csv")
process_recogito(directory, "pausanias_periegesis.csv")

## Process the FMP dataset annotated by LwM

From this point on, to run this notebook you will need the annotations created by the LwM project and a local version of WikiGazetteer. To build a WikiGazetteer (into a MySQL database) for a specific Wikipedia language and version follow [these instructions](https://github.com/Living-with-machines/lwm_GIR19_resolving_places/tree/master/gazetteer_construction). Make sure you change your credentials in the next cell.

In [12]:
import mysql.connector
from mysql.connector import Error

# Gazetteer backend used by this notebook. The lookup and clean-up cells
# below compare against this name, so it has to be defined before they run.
gazetteerDB_server = "mysqlGaz"

# Placeholders that remain in place if the connection cannot be established.
gazDB = ""
cursorGaz = ""
try:
    # Change these credentials to match your local WikiGazetteer database.
    gazDB = mysql.connector.connect(
            host='localhost',
            database='gazetteer',
            user='testGazetteer',
            password='1234')
    if gazDB.is_connected():
        # dictionary=True makes every fetched row a dict keyed by column name.
        cursorGaz = gazDB.cursor(dictionary=True)
except Error as e:
    print("Error while connecting to MySQL", e)

In [13]:
def get_wiki_coords(wikititle):
    """Look up the coordinates of a Wikipedia title in the gazetteer.

    The query runs on the module-level ``cursorGaz`` cursor. It no longer
    depends on the ``gazetteerDB_server`` global, which this notebook never
    defines. If that condition had been false, ``fetchall()`` would have been
    called without a preceding query.

    Args:
        wikititle: Wikipedia page title, matched against ``location.wiki_title``.

    Returns:
        ``(lat, lon)`` of the first matching row, or ``("", "")`` when the
        title is not in the gazetteer.
    """
    cursorGaz.execute("""
            SELECT lat, lon FROM location
            where wiki_title=%s
        """, (wikititle,))
    results = cursorGaz.fetchall()
    if results:
        first_match = results[0]
        return first_match["lat"], first_match["lon"]
    return "", ""

In [14]:
def process_fmp_corpus(directory, corpus):
    """Convert the FMP annotations into a pickled dataframe.

    Scans ``<directory><corpus>*/annotation/*/*`` and processes only the files
    named ``mariona.tsv``. For each of them the corresponding source text is
    read, the tab-separated annotation lines are parsed, and every annotation
    that links to a Wikipedia page with the label ``locwiki`` (compared
    case-insensitively) becomes one row. Coordinates come from
    ``get_wiki_coords``.

    Args:
        directory: Folder that contains the corpus (with trailing slash).
        corpus: Name of the corpus folder (with trailing slash).

    The dataframe is written to ``../../datasets/extrinsic/fmp.pkl``.
    """
    # Annotation line: "<n>-<n>" TAB "<start>-<end>" TAB token TAB url TAB label
    # (the label group captures the rest of the line).
    retoken = r'^[0-9]+\-[0-9]+\t([0-9]+)\-([0-9]+)\t([^\t]+)\t([^\t]+)\t(.*)$'
    # A value followed by a numeric index in square brackets, e.g. "value[3]".
    reIndex = r'(.*)\[([0-9]+)\]'
    rows = []
    cols = ['source', 'text', 'toponym', 'startCh', 'endCh', 'lat', 'lon', 'wikititle', 'reference', 'loctype', 'publplace', 'publdate']
    for i in glob.glob(directory + corpus + "*/annotation/*/*"):
        if i.split("/")[-1] == "mariona.tsv":
            # The parent folder of the annotation file is named after the source file.
            fileid = i.split("/")[-2]
            # The second "_"-separated part of the file id (without ".txt") ends
            # with four characters kept as publdate; what precedes them is publplace.
            publplace = fileid.split("_")[1].split(".txt")[0][:-4]
            publdate = fileid.split("_")[1].split(".txt")[0][-4:]
            
            # The source text lives in the sibling "source" folder, under the same file id.
            text = ""
            with open(directory + corpus + "/" + i.split("/")[-4] + "/source/" + i.split("/")[-2]) as fr:
                text = fr.read()
            
            # Tokens of multi-token entities, grouped by their bracketed index.
            dMultigrams = dict()
            with open(i) as fr:
                lines = fr.readlines()
                for line in lines:
                    line = line.strip()
                    if re.match(retoken, line):
                        startCh, endCh, toponym, wikiurl, loctype = re.match(retoken, line).groups()
                        # Decode percent-escapes and drop backslashes from the URL.
                        wikiurl = urllib.parse.unquote(wikiurl).replace("\\", "")
                        # Keep only the part before an em dash and recompute the end offset.
                        if "—" in toponym:
                            toponym = toponym.split("—")[0]
                            endCh = str(int(startCh) + len(toponym))
                        if 'wiki' in wikiurl and loctype.lower() == 'locwiki':
                            wikiurl = wikiurl.split("/")[-1]
                            # NOTE(review): loctype equals 'locwiki' here (ignoring case), so it
                            # cannot contain the "[n]" suffix that reIndex requires; this
                            # multi-token branch looks unreachable and dMultigrams stays empty.
                            # Confirm whether the condition above should also accept indexed
                            # labels such as "LOCWIKI[3]".
                            if re.match(reIndex, loctype):
                                wikiurl, entindex = re.match(reIndex, wikiurl).groups()
                                if entindex in dMultigrams:
                                    dMultigrams[entindex].append((toponym, wikiurl, startCh, endCh, loctype))
                                else:
                                    dMultigrams[entindex] = [(toponym, wikiurl, startCh, endCh, loctype)]
                            # If the entity is just one token:        
                            else:
                                lat, lon = get_wiki_coords(wikiurl)
                                rows.append((fileid, text, toponym, startCh, endCh, lat, lon, wikiurl, "wikipedia", loctype, publplace, publdate))
            
            # Merge the tokens of each multi-token entity into a single row: the span runs
            # from the first token's start to the last token's end, and the toponym is
            # sliced from the source text. These offsets are ints, whereas single-token
            # rows keep the offsets as the strings parsed from the line.
            for entindex in dMultigrams:
                multitoken_startCh = int(dMultigrams[entindex][0][2])
                multitoken_endCh = int(dMultigrams[entindex][-1][3])
                multitoken_toponym = text[multitoken_startCh:multitoken_endCh]
                multitoken_url = dMultigrams[entindex][0][1].split("/")[-1]
                # Strip the bracketed index from the label.
                multitoken_loctype = dMultigrams[entindex][0][-1].split("[")[0]
                lat, lon = get_wiki_coords(multitoken_url)
                rows.append((fileid, text, multitoken_toponym, multitoken_startCh, multitoken_endCh, lat, lon, multitoken_url, "wikipedia", multitoken_loctype, publplace, publdate))
    fmpcorpusdf = pd.DataFrame(rows, columns = cols)
    fmpcorpusdf.to_pickle("../../datasets/extrinsic/fmp.pkl")

In [15]:
# Folder that holds the external datasets (the trailing slash is required
# because paths are built by string concatenation).
directory = "ext_datasets/"

In [16]:
# Build the FMP dataframe; this calls get_wiki_coords, which queries the gazetteer cursor.
process_fmp_corpus(directory, "ToponymResolutionGoldStandard/")

In [17]:
# Release the gazetteer connection. `gazDB` and `cursorGaz` are still the
# placeholder values when connecting failed, so check that they are real
# connection/cursor objects before calling into them. The previous check on
# `gazetteerDB_server` is dropped because that name is never defined in this
# notebook.
if hasattr(gazDB, "is_connected") and gazDB.is_connected():
    if hasattr(cursorGaz, "close"):
        cursorGaz.close()
    gazDB.close()