In [None]:
import pandas as pd
import numpy as np
import time
import math
import os
import re
from pathlib import Path
import glob
from pandarallel import pandarallel
import unicodedata

In [None]:
def format_for_candranker(gazname, unique_placenames_array):
    """
    Write the unique alternate names of a gazetteer in the format required
    by the DeezyMatch candidate ranker.

    Each retained name is written on its own line, followed by the
    tab-separated fields ``0`` and ``false``, to
    ``../../datasets/candidate_mentions_sets/<gazname>.txt``.

    A name is skipped when it is not a string (e.g. a missing value), is
    empty after stripping whitespace, contains the substring "wikipedia",
    contains any digit, or contains a double quote.

    Parameters
    ----------
    gazname : str
        Name of the gazetteer; used as the stem of the output file name.
    unique_placenames_array : iterable
        Candidate alternate names.

    Returns
    -------
    None
    """
    output_path = "../../datasets/candidate_mentions_sets/" + gazname + ".txt"
    # Explicit UTF-8 so non-ASCII toponyms (Greek, accented Spanish) are
    # written identically regardless of the platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as fw:
        for pl in unique_placenames_array:
            # Missing values (NaN/None) have no .strip(); skip them.
            if not isinstance(pl, str):
                continue
            pl = pl.strip()
            if not pl:
                continue
            # Remove altnames that are wikiURLs (from geonames)
            if "wikipedia" in pl:
                continue
            if any(char.isdigit() for char in pl):
                continue
            if '"' in pl:
                continue
            fw.write(pl + "\t0\tfalse\n")

## 1. Processing WikiGazetteer

This step assumes the user has already run [this notebook](https://github.com/Living-with-machines/LwM_SIGSPATIAL2020_ToponymMatching/blob/master/processing/gazetteers/generate_wikigazetteers.ipynb).

In [None]:
# Language codes of the WikiGazetteers to process.
gaz_languages = ["en", "es", "el"]

for language in gaz_languages:
    basic_pickle = "../resources/wikigaz_" + language + "_basic.pkl"
    gazetteer_pickle = "../../datasets/gazetteers/wikigaz_" + language + ".pkl"

    # Store the gazetteer without its 'source' column (duplicates are kept
    # in the stored pickle; they are only removed for the name list below).
    wiki_lang = pd.read_pickle(basic_pickle).drop(columns=['source'])
    wiki_lang.to_pickle(gazetteer_pickle)

    # Collapse rows sharing name and coordinates, then collect the distinct
    # alternate names and export them for the candidate ranker.
    wiki_lang = wiki_lang.drop_duplicates(subset=['altname', 'lat', 'lon'])
    unique_placenames_array = list(set(np.array(wiki_lang["altname"])))
    format_for_candranker("wikigaz_" + language, unique_placenames_array)

In [None]:
# Sources whose alternate names are kept in the filtered gazetteer.
keep_sources = ['wikimain', 'geonamesmain', 'geonamesascii', 'geonamesalt', 'wikiredirect']

for language in gaz_languages:
    wgdf = pd.read_pickle("../resources/wikigaz_" + language + "_basic.pkl")
    wgdf = wgdf[wgdf["source"].isin(keep_sources)]
    # Keep only names shorter than 30 characters.
    wgdf = wgdf[wgdf["altname"].str.len() < 30]
    # Cast coordinates with .assign (returns a new frame) instead of
    # assigning columns on a filtered slice, which raises
    # SettingWithCopyWarning.
    wgdf = wgdf.assign(
        lat=wgdf['lat'].astype(float),
        lon=wgdf['lon'].astype(float),
    )
    # Drop rows missing either coordinate.
    wgdf = wgdf.dropna(subset=['lat', 'lon'])
    wgdf = wgdf.rename(columns={"altname": "name", "pid": "wikititle", "lat": "latitude", "lon": "longitude"})
    # NOTE(review): this file name uses "wikiGaz" (capital G), unlike the
    # other outputs; left unchanged in case other code reads this path.
    wgdf.to_pickle("../resources/wikiGaz_" + language + "_filtered.pkl")

## 2. Processing Pleiades

Download Pleiades gazetteer [from here](http://atlantides.org/downloads/pleiades/dumps/pleiades-names-latest.csv.gz), unzip it, and store it in `toponym_matching/processing/resources/`.

In [None]:
# Load the unzipped Pleiades names dump into a dataframe.
df = pd.read_csv("../resources/pleiades-names-latest.csv")

In [None]:
# Preview the first rows of the Pleiades names table.
df.head()

Keep only the entries whose name language is Ancient Greek (`grc`) or Modern Greek (`el`), and format them as required as input for DeezyMatch:

In [None]:
# Parallel lists, one entry per retained Pleiades name: the attested name,
# the Pleiades id, and the representative latitude/longitude.
alternatename = []
pid = []
lat = []
lon = []

for i, row in df.iterrows():
    # Keep only names whose language is tagged "grc" or "el".
    if row["nameLanguage"] not in ("grc", "el"):
        continue
    # Require a string name and float coordinates. The original condition
    # ended in a bare `type(row["reprLong"])`, which is always truthy, so
    # the longitude was never actually checked.
    if (
        isinstance(row["nameAttested"], str)
        and isinstance(row["reprLat"], float)
        and isinstance(row["reprLong"], float)
    ):
        alternatename.append(row["nameAttested"])
        pid.append(row["pid"])
        lat.append(row["reprLat"])
        lon.append(row["reprLong"])

In [None]:
# Assemble the collected lists into one dataframe, one column per list.
pleiades_gaz = pd.DataFrame({
    'altname': alternatename,
    'pid': pid,
    'lat': lat,
    'lon': lon,
})

# Coerce coordinates to numbers (unparseable values become NaN), then drop
# every row that has a missing value in any column.
for coord in ('lat', 'lon'):
    pleiades_gaz[coord] = pd.to_numeric(pleiades_gaz[coord], errors = 'coerce')
pleiades_gaz = pleiades_gaz.dropna()

In [None]:
# Persist the Pleiades gazetteer.
pleiades_gaz.to_pickle("../../datasets/gazetteers/pleiades.pkl")

# Export its distinct alternate names for the candidate ranker.
pleiades_altnames = np.array(pleiades_gaz["altname"])
unique_placenames_array = list(set(pleiades_altnames))
format_for_candranker("pleiades", unique_placenames_array)

### 2.1 Combine Pleiades and WikiGazEL

In [None]:
# Stack the Pleiades entries on top of the Greek WikiGazetteer and keep one
# row per (altname, lat, lon) combination.
pleiades_el = pd.read_pickle("../../datasets/gazetteers/pleiades.pkl")
wikigaz_el = pd.read_pickle("../../datasets/gazetteers/wikigaz_el.pkl")
greek_wgaz_pleiades = (
    pd.concat([pleiades_el, wikigaz_el])
    .drop_duplicates(subset = ['altname', 'lat', 'lon'])
)
greek_wgaz_pleiades.to_pickle("../../datasets/gazetteers/wikigaz_pleiades_el.pkl")

# Export the distinct alternate names of the combined gazetteer.
unique_placenames_array = list(set(greek_wgaz_pleiades["altname"].unique()))
format_for_candranker("wikigaz_pleiades_el", unique_placenames_array)

## 3. Processing HGIS de las Indias

Do the following four steps only once:
1. Download gazetteer from https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/FUSJD3/DK27GE&version=2.0
2. Unzip it and store it in `toponym_matching/processing/resources/`.
3. Convert the `.dbf` file to a dataframe (uncomment and run the cell below).
4. Store dataframe in `toponym_matching/processing/resources/`.

In [None]:
## Uncomment and run this only once. Change filename accordingly:
# import simpledbf
# dbf = simpledbf.Dbf5('../resources/gazetteer-2019-03-28/gazetteer-2019-03-28.dbf')
# df = dbf.to_dataframe()
# df.to_pickle("../resources/hgis_de_las_indias.pkl")

In [None]:
# Load the HGIS de las Indias dataframe from its pickle and preview it.
# Note: this rebinds `df`, which earlier in the notebook held the Pleiades table.
df = pd.read_pickle("../resources/hgis_de_las_indias.pkl")
df.head()

In [None]:
# Keep the four gazetteer columns under the shared column names
# (altname, pid, lat, lon), coerce coordinates to numbers (unparseable
# values become NaN), and drop rows with any missing value.
df_indias = (
    df[["label", "gz_id", "lat", "lon"]]
    .rename(columns={"label": "altname", "gz_id": "pid"})
    .assign(
        lat=lambda frame: pd.to_numeric(frame['lat'], errors = 'coerce'),
        lon=lambda frame: pd.to_numeric(frame['lon'], errors = 'coerce'),
    )
    .dropna()
)

In [None]:
# Keep one row per (altname, pid, lat, lon) combination.
df_indias = df_indias.drop_duplicates(subset=["altname", "pid", "lat", "lon"])

In [None]:
# Renumber the rows from 0 after the drops above, discarding the old index.
df_indias = df_indias.reset_index(drop=True)

In [None]:
# Preview the first rows of the formatted HGIS de las Indias gazetteer.
df_indias.head()

In [None]:
# Persist the HGIS de las Indias gazetteer.
df_indias.to_pickle("../../datasets/gazetteers/hgisindias.pkl")

# Export its distinct alternate names for the candidate ranker.
indias_altnames = np.array(df_indias["altname"])
unique_placenames_array = list(set(indias_altnames))
format_for_candranker("hgisindias", unique_placenames_array)

### 3.1. Combine HGISindias and WikiGazES

In [None]:
# Load the Spanish WikiGazetteer, renaming a "wikititle" column to "pid"
# if one is present so the column names match the HGIS frame.
wges = pd.read_pickle("../../datasets/gazetteers/wikigaz_es.pkl").rename(
    columns={"wikititle": "pid"}
)
hgis_indias = pd.read_pickle("../../datasets/gazetteers/hgisindias.pkl")

# Stack HGIS de las Indias on top of the Spanish WikiGazetteer and keep one
# row per (altname, lat, lon) combination.
es_wgaz_hgisindias = (
    pd.concat([hgis_indias, wges])
    .drop_duplicates(subset = ['altname', 'lat', 'lon'])
)
es_wgaz_hgisindias.to_pickle("../../datasets/gazetteers/wikigaz_hgisindias_es.pkl")

# Export the distinct alternate names of the combined gazetteer.
combined_altnames = np.array(es_wgaz_hgisindias["altname"])
unique_placenames_array = list(set(combined_altnames))
format_for_candranker("wikigaz_hgisindias_es", unique_placenames_array)