In [None]:
import pickle
import io
import pandas as pd
import pdfplumber
import re
import requests
import spacy
import zipfile

# One spaCy pipeline per language code, loaded once here and reused by the
# location-extraction helpers further down.
NLPS = {
    lang_code: spacy.load(model_name)
    for lang_code, model_name in (
        ('en', "en_core_web_sm"),
        ('fr', "fr_core_news_sm"),
    )
}


# NOTES
* species_translations were originally sourced from the following URL:
    * https://www.dfo-mpo.gc.ca/species-especes/identify-eng.html
    * the data is very messy: it contains many errors and covers only a small number of species
    * this source of species translations has therefore been abandoned
        * the initial commit demonstrates these data quality issues

# Location Translations 

In [None]:
"""
https://natural-resources.canada.ca/maps-tools-publications/maps/geographical-names-canada/translating-geographical-names

Translating Geographical Names
Generally in Canada, place names have one official language form: the name approved by a provincial or territorial naming authority. Exceptions to the rule are:

Geographical names of ‘pan-Canadian’ significance
Names of pan-Canadian significance have customary and well-known forms in both English and French. The list was established by the Treasury Board of Canada, and is maintained by the Geographical Names Board of Canada. The geographical names on this list are shown in both English and French on maps and in documents published by the Government of Canada. See the list of geographical names of pan-Canadian significance.

Names of national parks and national historic sites
National parks and national historic sites established by Parks Canada have an official name in English and in French.

Names of certain towns and cities
Most municipalities have only one official name adopted by the province or territory where the place is located, and this name should not be translated. However, certain municipalities have official names in both French and English, for example: Grand Falls and Grand-Sault in New Brunswick, and Greater Sudbury and Grand Sudbury in Ontario.

Names of undersea features
Some undersea features have an English and a French name approved by the Advisory Committee on Undersea Feature Names, one of the Geographical Names Board of Canada’s advisory committees.

Use of geographical names in Government of Canada documents
The principle governing the use of geographical names on maps and in documents published by the Government of Canada is to use the official form(s) of geographical names as adopted by the federal, provincial or territorial authorities of the Geographical Names Board of Canada.

Names of inhabited places retain their official form in English and French texts.

The use of names of pan-Canadian significance must be respected on both maps and in texts.

In text documents, it is permissible to translate the generic portion of name of a geographical feature. The generic portion of the name indicates the nature of the entity (for example the word “River” in the name “Bow River”). However, the specific portion of name does not get translated (for example, in the name of “Bow River”, the specific is the word “Bow”). In this example, the name “Bow River” could be translated as “rivière Bow” in a text.

You will find the list of English generics and their French equivalents in the document “Glossary of generic terms in Canada’s geographical names” published by the Translation Bureau of Public Services and Procurement Canada.

For more information concerning the rules for translating geographical names, please visit the Language Portal of Canada, a Translation Bureau initiative.

"""


## build the dictionaries
(or load them if they exist)

In [None]:
# Download links for the three bilingual place-name sources.
#
# FIXME: PARKS_GJ and CGN_ZIP still contain a literal "..." placeholder in the
# URL (presumably what this FIXME is about); replace them with the real links
# before running the build cell.

PAN_PDF = "https://publications.gc.ca/collections/collection_2017/rncan-nrcan/M86-23-2012-eng.pdf"
PARKS_GJ = "https://.../national_parks.geojson"  # see open portal
CGN_ZIP = "https://natural-resources.canada.ca/.../cgn_canada_csv.zip"

In [None]:
def build_pan_canadian(url):
    """Build an English -> French dictionary from a PDF of "English / French" lines.

    The PDF at ``url`` is downloaded, the text of every page is extracted, and
    each line of the form ``<english> / <french>`` becomes one dictionary entry.

    Parameters
    ----------
    url : str
        Address of the PDF to download.

    Returns
    -------
    dict[str, str]
        English name -> French name, both stripped of surrounding whitespace.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=60)
    # Fail here rather than trying to parse an HTML error page as a PDF.
    response.raise_for_status()

    # The PDF parser needs random access to the file, and ``response.raw`` is a
    # forward-only network stream, so buffer the whole download in memory.
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        # extract_text() can return None for pages without a text layer.
        txt = "\n".join(page.extract_text() or "" for page in pdf.pages)

    pairs = {}
    for en, fr in re.findall(r'^(.+?)\s*/\s*(.+?)\s*$', txt, flags=re.MULTILINE):
        en, fr = en.strip(), fr.strip()
        if en and fr:
            pairs[en] = fr
    return pairs

def build_from_geojson(url, en_key="NAME_E", fr_key="NAME_F"):
    """Build an English -> French dictionary from a GeoJSON (or flat JSON) source.

    In a GeoJSON FeatureCollection the attributes of each feature live under
    ``features[i]["properties"]``, so they are not top-level columns and cannot
    be read with a plain ``pd.read_json(...)[en_key]``. This function reads the
    JSON itself and pulls the two name fields out of each feature's properties.
    A flat list of records, or a column-oriented JSON object, is still accepted.

    Parameters
    ----------
    url : str or file-like
        ``http(s)://`` URL, local file path, or an open text file object.
    en_key, fr_key : str
        Property names holding the English and French names.
        NOTE(review): the defaults are unverified against the real parks
        dataset -- confirm the actual property names.

    Returns
    -------
    dict[str, str]
        English name -> French name. Records where either name is missing or
        blank are skipped.

    Raises
    ------
    KeyError
        If the source has records but none of them carries both keys.
    requests.HTTPError
        If a URL download answers with an error status.
    """
    import json  # local import so this cell does not depend on the imports cell changing

    if hasattr(url, "read"):
        payload = json.load(url)
    elif re.match(r"^https?://", str(url), flags=re.IGNORECASE):
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        payload = response.json()
    else:
        with open(url, encoding="utf-8") as handle:
            payload = json.load(handle)

    if isinstance(payload, dict) and "features" in payload:
        # GeoJSON FeatureCollection: attributes are nested under "properties".
        records = [
            (feature or {}).get("properties") or {}
            for feature in (payload["features"] or [])
        ]
    elif isinstance(payload, list):
        records = [record for record in payload if isinstance(record, dict)]
    else:
        # Column-oriented JSON, the shape the original pd.read_json call expected.
        records = pd.DataFrame(payload).to_dict("records")

    # Wrong key names should fail loudly instead of yielding an empty dictionary.
    if records and not any(en_key in r and fr_key in r for r in records):
        raise KeyError(f"no record contains both {en_key!r} and {fr_key!r}")

    pairs = {}
    for record in records:
        en, fr = record.get(en_key), record.get(fr_key)
        # Skip null / NaN / blank names so lookups never return a non-string.
        if isinstance(en, str) and isinstance(fr, str) and en.strip() and fr.strip():
            pairs[en] = fr
    return pairs

def build_cgndb_pairs(zip_url):
    """Build an English -> French dictionary from a zipped set of CSV files.

    Every ``.csv`` member of the archive is read as strings and concatenated.
    Rows whose ``NAMETYPE`` is ``English`` are joined to rows whose ``NAMETYPE``
    is ``French`` on ``CGNDB_UID``, and the two ``NAME`` values form one entry.

    NOTE(review): the column names ``NAMETYPE``, ``CGNDB_UID`` and ``NAME`` are
    taken from the original code -- confirm them against the real export.

    Parameters
    ----------
    zip_url : str
        Address of the zip archive to download.

    Returns
    -------
    dict[str, str]
        English name -> French name for every UID that has both.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the archive contains no ``.csv`` member.
    """
    response = requests.get(zip_url, timeout=300)
    response.raise_for_status()

    frames = []
    # Context managers make sure the archive and each member handle are closed.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        for member in archive.namelist():
            if member.endswith(".csv"):
                with archive.open(member) as handle:
                    frames.append(pd.read_csv(handle, dtype=str))

    if not frames:
        raise ValueError(f"no .csv files found in archive downloaded from {zip_url!r}")

    df = pd.concat(frames, ignore_index=True)
    english = df[df["NAMETYPE"] == "English"]
    french = df[df["NAMETYPE"] == "French"]
    merged = english.merge(french, on="CGNDB_UID", suffixes=("_en", "_fr"))
    # Drop pairs with a missing name so the dictionary holds only strings.
    merged = merged.dropna(subset=["NAME_en", "NAME_fr"])
    return dict(zip(merged["NAME_en"], merged["NAME_fr"]))


In [None]:
# Load the merged place-name dictionary from its on-disk cache; on the first
# run (no cache file yet) build it from the three sources and write the cache.

file_path = 'BILINGUAL.pickle'

try:
    with open(file_path, 'rb') as file:
        BILINGUAL = pickle.load(file)
except FileNotFoundError:
    # No cache yet: fall through to the build step below.
    BILINGUAL = None

if BILINGUAL is None:
    pan_dict = build_pan_canadian(PAN_PDF)
    parks_dict = build_from_geojson(PARKS_GJ)
    cgndb_dict = build_cgndb_pairs(CGN_ZIP)

    # Merge order matters: on a duplicate English name, a later source
    # overwrites an earlier one (pan-Canadian < parks < CGNDB).
    BILINGUAL = dict(pan_dict)
    BILINGUAL.update(parks_dict)
    BILINGUAL.update(cgndb_dict)

    with open(file_path, 'wb') as file:
        pickle.dump(BILINGUAL, file)


In [None]:
# glossary parse

def parse_glossary_en(path):
    """Extract an English -> French dictionary of generic terms from a glossary PDF.

    The text of every page is read line by line. An entry is recognised by a
    line that is exactly ``DES``; the line just above it is taken as the English
    headword (letters, spaces, apostrophes and hyphens only). The first
    following line that starts with ``EQ`` supplies the French equivalent:
    square brackets are removed and everything from the first parenthesised
    note onward is dropped.

    Parameters
    ----------
    path : str or file-like
        Passed straight to ``pdfplumber.open``.

    Returns
    -------
    dict[str, str]
        Lower-cased English generic -> lower-cased French generic.
    """
    lines = []
    with pdfplumber.open(path) as pdf:
        for pg in pdf.pages:
            # extract_text() can return None for a page without a text layer.
            lines += (pg.extract_text() or "").splitlines()

    out = {}
    total = len(lines)
    # Start at 1: a "DES" on the very first line has no headword above it, and
    # lines[-1] would silently pick up the LAST line of the document instead.
    for idx in range(1, total):
        if lines[idx].strip() != "DES":
            continue
        word = lines[idx - 1].strip()
        if not re.fullmatch(r"[A-Za-z' -]+", word):
            continue

        for nxt in range(idx + 1, total):
            candidate = lines[nxt].lstrip()
            if candidate.strip() == "DES":
                # Reached the next entry without finding an EQ line: this
                # headword has no equivalent and must not borrow the next one's.
                break
            if candidate.startswith("EQ"):
                fr = re.sub(r"\[|\]", "", candidate.split("EQ", 1)[1]).strip()
                fr = re.split(r"\s*\(.*?\)", fr)[0].strip()
                if fr:
                    out[word.lower()] = fr.lower()
                break
    return out


In [None]:
# Load the generic-term glossary from its on-disk cache; on the first run
# (no cache file yet) parse it from the glossary PDF and write the cache.

file_path = 'GEN_EN_FR.pickle'

try:
    with open(file_path, 'rb') as file:
        GEN_EN_FR = pickle.load(file)
except FileNotFoundError:
    # No cache yet: fall through to the parse step below.
    GEN_EN_FR = None

if GEN_EN_FR is None:
    glossary_pdf = "Glossary of Generic Terms in Canada's Geographical Names S52-2-176-2012.pdf"
    GEN_EN_FR = parse_glossary_en(glossary_pdf)

    with open(file_path, 'wb') as file:
        pickle.dump(GEN_EN_FR, file)

# Reverse lookup (French -> English). When several English generics share one
# French equivalent, the one that comes last in GEN_EN_FR is kept.
GEN_FR_EN = dict((french, english) for english, french in GEN_EN_FR.items())

## translation and verification

In [None]:
# Named-entity labels that are treated as locations.
# NOTE(review): label sets differ between spaCy models; confirm which of these
# the French pipeline actually emits.
LABELS = (
    "GPE",  # geo-political entity (jurisdictions with governments)
    "LOC",  # other physical locations (not GPEs)
    "FAC",  # man-made facilities or infrastructure
)

def _split(name, mapping, prefix=False):
    """Split a place name into ``(specific, generic)`` using a glossary of generics.

    Parameters
    ----------
    name : str
        Place name, e.g. ``"Bow River"`` or ``"rivière Bow"``.
    mapping : dict
        Glossary whose keys are generic terms.
    prefix : bool, default False
        ``False`` looks for the generic at the END of the name (English order);
        ``True`` looks for it at the START (French order).

    Returns
    -------
    tuple[str, str]
        ``(specific, generic)`` where ``generic`` is the matching key of
        ``mapping``; ``(name, "")`` when no generic is found.
    """
    parts = name.split()
    # Longest candidate first, so a multi-word generic such as
    # "provincial park" wins over its last word "park".
    for size in range(len(parts), 0, -1):
        if prefix:
            chunk, rest = parts[:size], parts[size:]
        else:
            chunk, rest = parts[-size:], parts[:-size]
        gen = " ".join(chunk)
        # The glossary keys are lower-cased while names in running text are
        # capitalised, so try the lower-cased form as well as the exact one.
        for key in (gen, gen.lower()):
            if key in mapping:
                return " ".join(rest), key
    return name, ""

def translate(name, target):
    """Translate a place name into ``target`` (``"fr"`` or ``"en"``).

    An official pair from ``BILINGUAL`` is used when one exists. Otherwise only
    the generic part of the name is translated and the specific part is kept,
    e.g. "Bow River" <-> "rivière Bow". A name with no recognised generic is
    returned unchanged.

    Limitations: English names whose generic comes first ("Lake Superior") and
    French linking words ("lac des Bois") are not handled by the generic rule.

    Parameters
    ----------
    name : str
        Place name in the source language.
    target : str
        ``"fr"`` for English -> French; any other value follows the
        French -> English generic rule, as before.

    Returns
    -------
    str
        The translated name.
    """
    if target == "fr":
        if name in BILINGUAL:
            return BILINGUAL[name]
        spec, gen = _split(name, GEN_EN_FR)
        return f"{GEN_EN_FR.get(gen, gen)} {spec}".strip()

    if target == "en":
        # Single pass instead of building the inverted dictionary twice per
        # call. The last match is kept, which is what dict inversion gives.
        found, official = False, None
        for en_name, fr_name in BILINGUAL.items():
            if fr_name == name:
                found, official = True, en_name
        if found:
            return official

    # French puts the generic first, so split on the leading words.
    spec, gen = _split(name, GEN_FR_EN, prefix=True)
    english_generic = GEN_FR_EN.get(gen, gen)
    # English capitalises the generic ("Bow River"); the glossary is lower-case.
    english_generic = " ".join(word.capitalize() for word in english_generic.split())
    return f"{spec} {english_generic}".strip()

def verify(src, dst, src_lang):
    """Check that ``dst`` is exactly the expected translation of ``src``.

    ``src`` is translated into French when ``src_lang`` is ``"en"`` and into
    English for any other ``src_lang``; the result is compared to ``dst`` with
    a case-sensitive equality test.
    """
    if src_lang == "en":
        tgt_lang = "fr"
    else:
        tgt_lang = "en"
    expected = translate(src, tgt_lang)
    return expected == dst

def find_locations(text, lang):
    """Return the text of every entity in ``text`` whose label is in ``LABELS``.

    ``text`` is run through the pipeline stored under ``NLPS[lang]``; entities
    are returned in document order.
    """
    pipeline = NLPS[lang]
    found = []
    for entity in pipeline(text).ents:
        if entity.label_ in LABELS:
            found.append(entity.text)
    return found

def extract_locations(text, lang="en"):
    """List location entities together with where they occur in ``text``.

    ``text`` is split into paragraphs on blank lines (``"\\n\\n"``) and each
    paragraph is processed separately with ``NLPS[lang]``. Paragraphs, and
    sentences within a paragraph, are numbered from 1.

    Returns a list of ``(paragraph_index, sentence_index, entity_text)`` tuples
    for every entity whose label is in ``LABELS``, in reading order.
    """
    return [
        (para_no, sent_no, entity.text)
        for para_no, paragraph in enumerate(text.split("\n\n"), 1)
        for sent_no, sentence in enumerate(NLPS[lang](paragraph).sents, 1)
        for entity in sentence.ents
        if entity.label_ in LABELS
    ]

def match_locations(en_text, fr_text):
    """Check each English location against the parallel French text.

    Locations are extracted from both texts with their (paragraph, sentence)
    position. For every English location the expected French form is computed
    with ``translate`` and looked for, case-insensitively, among the French
    locations found at the same position.

    Parameters
    ----------
    en_text, fr_text : str
        The English text and its French translation.

    Returns
    -------
    tuple[list, list]
        ``pairs``: ``((p, s, english), expected_french)`` for each location
        whose expected form was found.
        ``missing``: ``(p, s, english, expected_french)`` for each one that
        was not.
    """
    en_locs = extract_locations(en_text, "en")
    fr_locs = extract_locations(fr_text, "fr")

    # Keep ALL French locations per sentence. A plain {(p, s): text} dict keeps
    # only the last one, so sentences naming several places would report the
    # others as missing.
    fr_by_sentence = {}
    for p, s, fr_t in fr_locs:
        fr_by_sentence.setdefault((p, s), set()).add(fr_t.lower())

    pairs, missing = [], []
    for p, s, en_t in en_locs:
        fr_expected = translate(en_t, "fr")
        if fr_expected.lower() in fr_by_sentence.get((p, s), ()):
            pairs.append(((p, s, en_t), fr_expected))
        else:
            missing.append((p, s, en_t, fr_expected))
    return pairs, missing



# Species Translations

In [None]:
# English -> French species names; no entries have been added yet.
species_translations = {}

# Preferential Translations

In [None]:
# Preferred translations for specific terms; no entries have been added yet.
preferential_translations = {}

# Translation Quality Checker

In [None]:
# isolate any pre-translated instances from above lists 
# match to translated version 
# compare vs expected translation
# [optional] find-and-replace to clean up translation

