In [None]:
import csv
import json
import random
import re
import string

import pandas as pd
from thefuzz import fuzz

# The Heritage Gazetteer of Libya

[Intro by Valeria]

Source: https://slsgazetteer.org/

## Datasets

In order to train and use DeezyMatch to find candidates for toponyms in the HGL, we need to prepare the following datasets:
* **String matching dataset:** dataset of toponym pairs built from Geonames alternate names belonging to places in current-day Libya.
* **Candidates dataset:** list of toponyms (and alternate names) belonging to places in current-day Libya, from Geonames.
* **Queries dataset:** list of toponyms obtained from the Heritage Gazetteer of Libya.

We provide the [vocabulary file](https://github.com/Living-with-machines/DeezyMatch/examples/libyan_gazetteer/inputs/characters_v001.vocab) and the [input file](https://github.com/Living-with-machines/DeezyMatch/examples/libyan_gazetteer/inputs/input_dfm.yaml).

### Obtaining the data from Geonames

We focus on **toponyms of places in modern-day Libya** in this case study, so we will just download `LY`, but you can use any other country (see a mapping between countries and codes here: https://www.geonames.org/countries/).

Download the following files from [Geonames](https://download.geonames.org/export/dump/):
* Download `LY.zip`: https://download.geonames.org/export/dump/LY.zip (depends on the country you're interested in)
* Download `alternateNamesV2.zip`: https://download.geonames.org/export/dump/alternateNamesV2.zip (for all countries)

Unzip the files and store `LY.txt` and `alternateNamesV2.txt` in `data/`.

### Obtaining the data from the HGL

The Heritage Gazetteer of Libya (HGL) data can be downloaded as a JSON file [here](http://slsgazetteer.org/data/downloads/json/dump.json). Download it and store it in `data/`, renaming it to `hgl_data.json`.

### Directory structure

The `libyan_gazetteer` directory should now look like this:
```
libyan_gazetteer
   ├── prepare_dataset.ipynb
   ├── tutorial_hgl.ipynb
   ├── data
   │   ├── hgl_data.json
   │   ├── LY.txt
   │   └── alternateNamesV2.txt
   └── inputs
       ├── characters_v001.vocab
       └── input_dfm.yaml
```

## Prepare the string pairs dataset

In [None]:
# Country of interest, given as its Geonames country code:
country = "LY"

# Language codes of the alternate names to keep for this country:
toponym_languages = [
    "ar",     # Arabic
    "ar-LY",  # Libyan Arabic
    "ber",    # Berber
    "rmt",    # Domari
    "taq",    # Tamasheq
    "tuq",    # Teda
    "arz",    # Egyptian Spoken Arabic
    "arb",    # Standard Arabic
    "auj",    # Awjila
    "it",     # Italian
    "fr",     # French
    "en",     # English
    "ayl",    # Libyan Spoken Arabic
]

In [None]:
# Load the country-specific gazetteer. The file has no header row, so the
# column names are given explicitly:
df_country = pd.read_csv(
    "data/" + country + ".txt",
    sep="\t",
    names=[
        "geonameid",
        "name",
        "asciiname",
        "alternatenames",
        "latitude",
        "longitude",
        "feature class",
        "feature code",
        "country code",
        "cc2",
        "admin1 code",
        "admin2 code",
        "admin3 code",
        "admin4 code",
        "population",
        "elevation",
        "dem",
        "timezone",
        "modification date",
    ],
    # The Geonames dump is plain tab-separated text without any quoting, so a
    # double quote inside a name must be read as a literal character rather
    # than as the start of a quoted field (which could swallow tabs/newlines):
    quoting=csv.QUOTE_NONE,
    # Only empty fields count as missing. With pandas' default NA markers,
    # names or codes such as "NA", "None" or "null" would be turned into NaN:
    keep_default_na=False,
    na_values=[""],
)

In [None]:
# Geonames feature classes of the places we want to keep.
# The available classes are:
#   A: country, state, region, ...
#   H: stream, lake, ...
#   L: parks, area, ...
#   P: city, village, ...
#   R: road, railroad
#   S: spot, building, farm
#   T: mountain, hill, rock, ...
#   U: undersea
#   V: forest, heath, ...
# Classes "S" and "U" are not part of the selection below.
fclasses = [
    "A",
    "H",
    "L",
    "P",
    "R",
    "T",
    "V",
]

In [None]:
# Keep only the locations whose feature class is one of the selected classes:
in_selected_class = df_country["feature class"].isin(fclasses)
df_country = df_country[in_selected_class]

In [None]:
# Discard the columns that are not used in the rest of the notebook:
unused_columns = [
    "alternatenames",
    "feature class",
    "feature code",
    "country code",
    "cc2",
    "admin1 code",
    "admin2 code",
    "admin3 code",
    "admin4 code",
    "population",
    "elevation",
    "dem",
    "timezone",
    "modification date",
]
df_country = df_country.drop(columns=unused_columns)

In [None]:
# Load the alternate names file. The file has no header row, so the column
# names are given explicitly; only the three columns used below are read:
altnames_df = pd.read_csv(
    "data/alternateNamesV2.txt",
    sep="\t",
    low_memory=False,
    names=[
        "alternateNameId",
        "geonameid",
        "isolanguage",
        "alternateName",
        "isPreferredName",
        "isShortName",
        "isColloquial",
        "isHistoric",
        "from",
        "to",
    ],
    usecols=[
        "geonameid",
        "isolanguage",
        "alternateName",
    ],
    # The Geonames dump is plain tab-separated text without any quoting, so a
    # double quote inside a name must be read as a literal character rather
    # than as the start of a quoted field (which could swallow tabs/newlines):
    quoting=csv.QUOTE_NONE,
    # Only empty fields count as missing. With pandas' default NA markers,
    # names such as "NA", "None" or "null" would be turned into NaN:
    keep_default_na=False,
    na_values=[""],
)

# Drop rows that have no alternate name at all, so that the aggregated lists
# below only ever contain strings:
altnames_df = altnames_df.dropna(subset=["alternateName"])

# Filter the alternate names to keep those in the languages we are interested in
# (rows with no language code are dropped as well, since NaN is not in the list):
altnames_df = altnames_df.loc[altnames_df["isolanguage"].isin(toponym_languages)]

# Aggregate alternate names into one list per geonames id:
altnames_df = altnames_df.groupby("geonameid", as_index=False).agg(
    {"alternateName": lambda names: names.tolist()}
)

In [None]:
# Preview the aggregated alternate names (first five rows of `altnames_df`):
altnames_df.head(5)

In [None]:
# Left-join the alternate names onto the country dataframe by geonames id.
# Only the ids present in the country dataframe are kept, so alternate names
# of places outside the country of interest are discarded:
dataset_df = df_country.merge(altnames_df, how="left", on="geonameid")

In [None]:
# Consolidate the name, asciiname and alternate-name columns into one list of
# toponyms per geonames id:
def _collect_toponyms(row):
    """Return the sorted, de-duplicated toponyms found in one row of `dataset_df`."""
    names = [row["name"], row["asciiname"]]
    # Places without any entry in the alternate names table hold NaN here
    # (result of the left merge), so only extend with an actual list:
    if isinstance(row["alternateName"], list):
        names += row["alternateName"]
    # Missing names are NaN (a float), not strings; keeping them would break
    # the string operations applied to every toponym further down. Sorting
    # makes the order of each list reproducible across runs.
    return sorted({name for name in names if isinstance(name, str) and name.strip()})


altnames = [_collect_toponyms(row) for _, row in dataset_df.iterrows()]
dataset_df = dataset_df.drop(columns=["name", "asciiname", "alternateName"])
dataset_df["altnames"] = altnames

In [None]:
# Preview the consolidated dataset (first five rows of `dataset_df`):
dataset_df.head(5)

In [None]:
# Map each geonames id to the list of toponyms (name and alternate names)
# of that location:
location_to_toponyms = dict(zip(dataset_df["geonameid"], dataset_df["altnames"]))

In [None]:
# Invert the previous mapping: for each toponym variation, collect the
# geonames ids of all the locations that carry it:
toponym_to_locations = {}
for geonames_id, names in location_to_toponyms.items():
    for name in names:
        toponym_to_locations.setdefault(name, []).append(geonames_id)

In [None]:
# Get all distinct toponyms in the country. The result is sorted because the
# iteration order of a set of strings changes from one Python process to the
# next, which would make any later random sampling from this list
# irreproducible. Non-string entries (e.g. NaN from a missing name) are left out.
all_toponyms = sorted(
    {
        toponym
        for toponyms in location_to_toponyms.values()
        if isinstance(toponyms, list)
        for toponym in toponyms
        if isinstance(toponym, str)
    }
)

In [None]:
# Utils: map punctuation to white spaces, for the token-based Jaccard
# similarity needed below. The typographic apostrophe "’" is not part of
# `string.punctuation`, so it is added explicitly; the translation table must
# be built from the extended `punctuation` string for it to take effect.
punctuation = string.punctuation + "’"
translator = str.maketrans(punctuation, " " * len(punctuation))

In [None]:
# Get positive matches from Geonames: pairs of toponyms that belong to the
# same location and are similar enough as strings. A pair is kept if its
# character-based similarity (`fuzz.ratio`, compared against 60) is above the
# threshold or, failing that, if the Jaccard similarity of its tokens is at
# least 0.5. All ordered pairs of a location's toponyms are considered, so a
# toponym is also compared with itself and distinct toponyms are compared in
# both directions.
positive_matches = []
for k in location_to_toponyms:
    if not isinstance(location_to_toponyms[k], list):
        continue
    # Ignore anything that is not a string (e.g. NaN from a missing name),
    # which would otherwise break the string operations below:
    toponyms = [t for t in location_to_toponyms[k] if isinstance(t, str)]
    for toponym1 in toponyms:
        for toponym2 in toponyms:
            # Character-based string similarity:
            if fuzz.ratio(toponym1, toponym2) > 60:
                positive_matches.append(toponym1 + "\t" + toponym2 + "\t" + "TRUE")
                continue
            # Token-based string similarity. `split()` without arguments
            # discards the empty tokens produced by consecutive, leading or
            # trailing spaces (left behind when punctuation is blanked out);
            # counting "" as a shared token would distort the score.
            tokens1 = set(toponym1.translate(translator).split())
            tokens2 = set(toponym2.translate(translator).split())
            shared_tokens = tokens1 & tokens2
            # No shared token means a Jaccard similarity of 0 (and also
            # covers the case where both token sets are empty):
            if shared_tokens and len(shared_tokens) / len(tokens1 | tokens2) >= 0.5:
                positive_matches.append(toponym1 + "\t" + toponym2 + "\t" + "TRUE")

In [None]:
# Get negative matches (the same number as positive matches) from random
# pairs of Geonames toponyms. A pair is kept if its character-based
# similarity (`fuzz.ratio`, compared against 40) is below the threshold or,
# failing that, if the Jaccard similarity of its tokens is less than 0.2.

# Seeded generator and sorted pool, so that the sampled pairs are the same
# on every run:
negative_rng = random.Random(42)
negative_pool = sorted({t for t in all_toponyms if isinstance(t, str)})

# Upper bound on the number of draws, so that the loop cannot run forever
# when too few dissimilar pairs exist:
negative_target = len(positive_matches)
negative_max_attempts = 100 * negative_target + 1000

negative_matches = []
negative_attempts = 0
while (
    len(negative_matches) < negative_target
    and len(negative_pool) >= 2
    and negative_attempts < negative_max_attempts
):
    negative_attempts += 1
    # Two distinct toponyms (a toponym paired with itself can never be a
    # negative match):
    toponym1, toponym2 = negative_rng.sample(negative_pool, 2)
    # Character-based string similarity:
    if fuzz.ratio(toponym1, toponym2) < 40:
        negative_matches.append(toponym1 + "\t" + toponym2 + "\t" + "FALSE")
        continue
    # Token-based string similarity. `split()` without arguments discards the
    # empty tokens produced by consecutive, leading or trailing spaces (left
    # behind when punctuation is blanked out); counting "" as a shared token
    # would distort the score.
    tokens1 = set(toponym1.translate(translator).split())
    tokens2 = set(toponym2.translate(translator).split())
    all_tokens = tokens1 | tokens2
    # Pairs without any token at all are skipped (nothing to compare):
    if all_tokens and len(tokens1 & tokens2) / len(all_tokens) < 0.2:
        negative_matches.append(toponym1 + "\t" + toponym2 + "\t" + "FALSE")

if len(negative_matches) < negative_target:
    print(
        "Warning: only %d negative matches found for %d positive matches."
        % (len(negative_matches), negative_target)
    )

In [None]:
# Write string pairs into a file (this is the string matching dataset),
# negative matches first, then positive matches. The encoding is set
# explicitly because the toponyms contain non-ASCII characters and the
# default encoding of `open` depends on the platform:
with open("data/libyan_pairs.txt", "w", encoding="utf-8") as fw:
    for nm in negative_matches:
        fw.write(nm + "\n")
    for pm in positive_matches:
        fw.write(pm + "\n")

## Prepare the candidates dataset

In [None]:
# The candidates are all the toponyms and variations of the country of
# interest, i.e. the keys of the toponym-to-locations mapping:
candidates = list(toponym_to_locations)

In [None]:
# Store the candidates dataset, with one toponym per line. The toponyms are
# de-duplicated and sorted so that the file is identical on every run (the
# iteration order of a set of strings is not stable across Python processes);
# non-string entries (e.g. NaN from a missing name) are left out. The
# encoding is set explicitly because the toponyms contain non-ASCII
# characters and the default encoding of `open` depends on the platform:
with open("data/candidates.txt", "w", encoding="utf-8") as fw:
    for c in sorted(c for c in set(candidates) if isinstance(c, str)):
        fw.write(c + "\n")

## Prepare the queries dataset

In [None]:
# Load the HGL data json file. The encoding is set explicitly because the
# default encoding of `open` depends on the platform, and a non-UTF-8 default
# would fail on (or garble) non-ASCII characters in the data:
with open("data/hgl_data.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [None]:
# Store the queries dataset, with one toponym per line. The toponym of an
# entry is the part of its title before the first comma. The encoding is set
# explicitly because the titles may contain non-ASCII characters and the
# default encoding of `open` depends on the platform:
with open("data/queries.txt", "w", encoding="utf-8") as fw:
    for entry in data["features"]:
        fw.write(entry["title"].split(",")[0] + "\n")