In [4]:
import pickle
import io
import json
import pandas as pd
import pdfplumber
import re
import requests
import spacy
import zipfile

# spaCy pipelines used for entity extraction, keyed by two-letter language code.
_SPACY_MODEL_NAMES = (("en", "en_core_web_sm"), ("fr", "fr_core_news_sm"))
NLPS = {lang_code: spacy.load(model_name) for lang_code, model_name in _SPACY_MODEL_NAMES}


# NOTES
* species_translations were found at the following url:
    * https://www.dfo-mpo.gc.ca/species-especes/identify-eng.html
    * this data is super messy, with many errors and a small number of species
    * this source of species translations has been abandoned
        * the initial commit demonstrates these data quality issues  

# Location Translations
https://natural-resources.canada.ca/maps-tools-publications/maps/geographical-names-canada/translating-geographical-names

Translating Geographical Names
Generally in Canada, place names have one official language form: the name approved by a provincial or territorial naming authority. Exceptions to the rule are:

Geographical names of ‘pan-Canadian’ significance
Names of pan-Canadian significance have customary and well-known forms in both English and French. The list was established by the Treasury Board of Canada, and is maintained by the Geographical Names Board of Canada. The geographical names on this list are shown in both English and French on maps and in documents published by the Government of Canada. See the list of geographical names of pan-Canadian significance.

Names of national parks and national historic sites
National parks and national historic sites established by Parks Canada have an official name in English and in French.

Names of certain towns and cities
Most municipalities have only one official name adopted by the province or territory where the place is located, and this name should not be translated. However, certain municipalities have official names in both French and English, for example: Grand Falls and Grand-Sault in New Brunswick, and Greater Sudbury and Grand Sudbury in Ontario.

Names of undersea features
Some undersea features have an English and a French name approved by the Advisory Committee on Undersea Feature Names, one of the Geographical Names Board of Canada’s advisory committees.

Use of geographical names in Government of Canada documents
The principle governing the use of geographical names on maps and in documents published by the Government of Canada is to use the official form(s) of geographical names as adopted by the federal, provincial or territorial authorities of the Geographical Names Board of Canada.

Names of inhabited places retain their official form in English and French texts.

The use of names of pan-Canadian significance must be respected on both maps and in texts.

In text documents, it is permissible to translate the generic portion of the name of a geographical feature. The generic portion indicates the nature of the entity (for example, the word “River” in the name “Bow River”). However, the specific portion of the name is not translated (in “Bow River”, the specific is the word “Bow”). In this example, the name “Bow River” could be translated as “rivière Bow” in a text.

You will find the list of English generics and their French equivalents in the document “Glossary of generic terms in Canada’s geographical names” published by the Translation Bureau of Public Services and Procurement Canada.

For more information concerning the rules for translating geographical names, please visit the Language Portal of Canada, a Translation Bureau initiative.
        

# Import Data
## build the dictionaries
(or load them if they exist)

In [33]:
# pan canadian names
with open("pan_canadian_names.json", encoding="utf-8") as f:
    temp_names = json.load(f)

# Iterate over the names as loaded while the swapped variants are added to a separate copy.
pan_canadian_names = dict(temp_names)

for k, v in temp_names.items():
    k_lst, v_lst = k.split(","), v.split(",")

    # More than one comma on either side: only report the entry, add nothing.
    if len(k_lst) > 2 or len(v_lst) > 2:
        print(f"ERROR: {k} or {v} have too many commas")
        continue

    # A swapped variant is only built when BOTH names have exactly two comma-separated parts.
    if not (len(k_lst) == 2 and len(v_lst) == 2):
        continue

    # "A, B" -> "B A"
    k_swap = " ".join((k_lst[1], k_lst[0])).strip()
    if k_swap in pan_canadian_names:
        print(f"ERROR: {k_swap} already in pan_canadian_names")
        continue

    v_swap = " ".join((v_lst[1], v_lst[0])).strip()
    pan_canadian_names[k_swap] = v_swap
        
# parks canada names
# Keep only the rows that carry both an English and a French name, then map English -> French.
_parks_rows = pd.read_csv("vw_Place_Names_Noms_Lieux_APCA_V2_FGP.csv").dropna(subset=["Name_e", "Nom_f"])
parks_canada_place_names = dict(zip(_parks_rows["Name_e"], _parks_rows["Nom_f"]))

# natural resources canada geographical names



In [39]:
# Load the geographical names CSV export (its columns include 'CGNDB ID' and 'Geographical Name').
# low_memory=False makes pandas parse the file in one pass rather than in chunks, which avoids
# mixed-dtype inference on large files.
nrc_geo_names = pd.read_csv("cgn_canada_csv_eng.csv", low_memory=False)


In [41]:
# List the available columns before choosing the id / feature / name / language fields.
nrc_geo_names.columns

Index(['CGNDB ID', 'Geographical Name', 'ISO Language Code', 'Language',
       'Syllabic Form', 'Generic Term', 'Generic Category', 'Concise Code',
       'Toponymic Feature ID', 'Latitude', 'Longitude', 'Location',
       'Province - Territory', 'Relevance at Scale', 'Decision Date',
       'Source'],
      dtype='object')

In [81]:
# Source column -> short working name; one table drives both the selection and the relabelling.
_NRC_COLUMN_NAMES = {
    'CGNDB ID': "id",
    'Toponymic Feature ID': "feature_id",
    'Geographical Name': "name",
    'ISO Language Code': "lang",
}
df = nrc_geo_names[list(_NRC_COLUMN_NAMES)]
df.columns = list(_NRC_COLUMN_NAMES.values())

In [75]:
# Sanity check: count rows per CGNDB id (a max of 1, as in the recorded output, means ids are unique).
df.groupby("id").count().describe()

Unnamed: 0,feature_id,name,lang
count,359360.0,359360.0,359360.0
mean,1.0,1.0,1.0
std,0.0,0.0,0.0
min,1.0,1.0,1.0
25%,1.0,1.0,1.0
50%,1.0,1.0,1.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [76]:
# Count rows per feature_id: a feature can carry several name records (max 10 in the recorded output).
df.groupby("feature_id").count().describe()

Unnamed: 0,id,name,lang
count,356746.0,356746.0,356746.0
mean,1.007327,1.007327,1.007327
std,0.091716,0.091716,0.091716
min,1.0,1.0,1.0
25%,1.0,1.0,1.0
50%,1.0,1.0,1.0
75%,1.0,1.0,1.0
max,10.0,10.0,10.0


In [86]:
# Keep undetermined / English / French records, drop repeated (feature, name) pairs, and record
# how many distinct names each feature still has.
# Written as one chain ending in .assign so the new column is created on a fresh frame; adding it
# with `.loc[:, col] = ...` on a frame derived by boolean indexing can raise SettingWithCopyWarning
# on pandas versions without copy-on-write.
df = (
    df.loc[df["lang"].isin(['und', 'eng', 'fra', 'fre'])]
    .sort_values('feature_id')
    .drop_duplicates(subset=["feature_id", "name"], keep="first")
    .assign(n_feature_id=lambda kept: kept['feature_id'].map(kept['feature_id'].value_counts()))
)

In [89]:
# Features that still have more than one distinct name after filtering - the candidates for
# English/French name pairs.
df[df.n_feature_id > 1]

Unnamed: 0,id,feature_id,name,lang,n_feature_id
199703,ERFGA,0238e957ba2511d892e2080020a0f4c9,Lieu historique national du Canada du Canal-de...,und,2
44857,ERFFZ,0238e957ba2511d892e2080020a0f4c9,Chambly Canal National Historic Site of Canada,und,2
199770,FEZYI,03efd245ba2e11d892e2080020a0f4c9,Lieu historique national du Canada du Phare-de...,und,2
223630,FEZYH,03efd245ba2e11d892e2080020a0f4c9,Mississauga Point Lighthouse National Historic...,und,2
35453,JCQPY,04a2b850d05511d892e2080020a0f4c9,Burges and James Gadsden Park,und,2
...,...,...,...,...,...
255824,FEZYF,fbf0dad4ba2d11d892e2080020a0f4c9,Peterborough Lift Lock National Historic Site ...,und,2
199557,DBEDA,ff972ee0ba1f11d892e2080020a0f4c9,Lieu Oxbow,und,2
247040,DBECX,ff972ee0ba1f11d892e2080020a0f4c9,Oxbow Site,und,2
39239,ESFQU,ffeddd36ba2311d892e2080020a0f4c9,Canadian Forces Base/Area Support Unit Valcartier,und,2


In [92]:
# TODO: how can i match languages? do i need to classify them?
# In the recorded output both names of this feature are tagged 'und' (Undetermined), so the
# language code alone cannot tell the English name from the French one.
# NOTE: iloc is positional; the numbers below are the index labels shown in the previous output
# and only line up while nrc_geo_names keeps the default integer index from read_csv.
nrc_geo_names.iloc[[199703, 44857]].T

Unnamed: 0,199703,44857
CGNDB ID,ERFGA,ERFFZ
Geographical Name,Lieu historique national du Canada du Canal-de...,Chambly Canal National Historic Site of Canada
ISO Language Code,und,und
Language,Undetermined,Undetermined
Syllabic Form,,
Generic Term,Canal,Canal
Generic Category,Constructed Feature,Constructed Feature
Concise Code,PARK,PARK
Toponymic Feature ID,0238e957ba2511d892e2080020a0f4c9,0238e957ba2511d892e2080020a0f4c9
Latitude,45.377222,45.377222


In [31]:
# save dictionaries first run
# BILINGUAL maps an English place name to its French form. It is loaded from a pickle cache when
# one exists; otherwise it is built from the source dictionaries and cached for the next run.

file_path = 'BILINGUAL.pickle'

try:
    # NOTE: pickle.load can execute arbitrary code - only load a cache file this notebook wrote.
    with open(file_path, 'rb') as file:
        BILINGUAL = pickle.load(file)
except FileNotFoundError:
    BILINGUAL = None

if BILINGUAL is None:
    BILINGUAL = {**pan_canadian_names, **parks_canada_place_names}

    # nrc_geo_names is merged only once it is an English -> French dict. While it is still the raw
    # DataFrame from read_csv, unpacking it with ** would add column-label -> Series entries;
    # those are not translations and, being unhashable, break inverting the dictionary later on.
    if isinstance(nrc_geo_names, dict):
        BILINGUAL.update(nrc_geo_names)

    with open(file_path, 'wb') as file:
        pickle.dump(BILINGUAL, file)

In [35]:
# Spot check of one entry; in the recorded output this name is identical in both languages.
parks_canada_place_names['Sable Island']

'Sable Island'

In [None]:
# glossary parse

def parse_glossary_en(path):
    """Build an English-generic -> French-generic dictionary from the glossary PDF.

    The text of every page is scanned for lines that consist solely of the marker
    ``DES``. The line directly above such a marker is taken as the English term,
    and is accepted only if it is made of ASCII letters, apostrophes, spaces and
    hyphens. The first following line that starts with ``EQ`` supplies the French
    equivalent: square brackets are removed and everything from the first
    parenthesised remark onwards is dropped.

    Args:
        path: Path of the glossary PDF, passed to ``pdfplumber.open``.

    Returns:
        dict mapping each lowercased English term to its lowercased French equivalent.
    """
    lines = []
    with pdfplumber.open(path) as pdf:
        for pg in pdf.pages:
            # Depending on the pdfplumber version, a page without text may yield None
            # instead of "" - treat both as "no lines".
            lines += (pg.extract_text() or "").splitlines()

    out, n, i = {}, len(lines), 0
    while i < n:
        # i > 0: a marker on the very first line has no term above it, and lines[-1]
        # would silently wrap around to the last line of the document.
        if i > 0 and lines[i].strip() == "DES":
            word = lines[i - 1].strip()
            if re.fullmatch(r"[A-Za-z' -]+", word):
                # Look ahead for the line holding the French equivalent.
                j = i + 1
                while j < n and not lines[j].lstrip().startswith("EQ"):
                    j += 1
                if j < n:
                    fr = re.sub(r"\[|\]", "", lines[j].split("EQ", 1)[1]).strip()
                    fr = re.split(r"\s*\(.*?\)", fr)[0].strip()
                    if fr:
                        out[word.lower()] = fr.lower()
                # Resume scanning after the EQ line (or stop if none was found).
                i = j
        i += 1
    return out


In [None]:
# save glossary dict first run
# Load the English -> French generic-term glossary from its pickle cache, or parse the PDF and
# write the cache when no cache file exists yet.

file_path = 'GEN_EN_FR.pickle'

try:
    file = open(file_path, 'rb')
except FileNotFoundError:
    GEN_EN_FR = None
else:
    with file:
        GEN_EN_FR = pickle.load(file)

if GEN_EN_FR is None:
    glossary_pdf = "Glossary of Generic Terms in Canada's Geographical Names S52-2-176-2012.pdf"
    GEN_EN_FR = parse_glossary_en(glossary_pdf)

    with open(file_path, 'wb') as file:
        pickle.dump(GEN_EN_FR, file)

# French -> English view of the same glossary (a later English term wins on duplicate French terms).
GEN_FR_EN = dict(zip(GEN_EN_FR.values(), GEN_EN_FR.keys()))

## translation and verification

In [None]:
# Entity labels that are treated as place names by the extraction functions below.
LABELS = ("GPE", "LOC", "FAC")
# GPE - Geo-political entity (jurisdictions with governments)
# LOC - Other physical locations (not GPEs)
# FAC - Man-made facilities or infrastructure
# NOTE(review): the French model (fr_core_news_sm) appears to emit only LOC/MISC/ORG/PER, in which
# case GPE and FAC would never match for French text - confirm against the model's label set.

def _split(name, mapping, generic_first=False):
    """Split a place name into its specific part and a generic term found in ``mapping``.

    Candidate generics are runs of whole words taken from the end of ``name`` (or from
    the start when ``generic_first`` is true), tried shortest first. A candidate matches
    when it is a key of ``mapping`` as written or, failing that, in lowercase.

    Args:
        name: Place name to split on whitespace.
        mapping: Container of known generic terms (only membership is tested).
        generic_first: Look for the generic at the start of the name instead of the end.

    Returns:
        ``(specific, generic_key)`` where ``generic_key`` is the matching key of
        ``mapping``, or ``(name, "")`` when no generic is found.
    """
    parts = name.split()
    for i in range(1, len(parts) + 1):
        if generic_first:
            gen_parts, spec_parts = parts[:i], parts[i:]
        else:
            gen_parts, spec_parts = parts[-i:], parts[:-i]
        gen = " ".join(gen_parts)
        # The exact form is tried first, as before. The lowercase retry is needed because
        # parse_glossary_en stores every glossary term in lowercase, so "River" would
        # otherwise never match "river".
        for key in (gen, gen.lower()):
            if key in mapping:
                return " ".join(spec_parts), key
    return name, ""


# French -> English inverse of BILINGUAL, rebuilt only when BILINGUAL changes.
_FR_EN_CACHE = {"source": None, "size": -1, "lookup": {}}


def _bilingual_fr_en():
    """Return the French -> English inverse of ``BILINGUAL``, cached between calls.

    The cache is refreshed when ``BILINGUAL`` is rebound to another object or changes
    size; replacing a value in place without changing the size is not detected.
    """
    if _FR_EN_CACHE["source"] is not BILINGUAL or _FR_EN_CACHE["size"] != len(BILINGUAL):
        _FR_EN_CACHE["lookup"] = {fr: en for en, fr in BILINGUAL.items()}
        _FR_EN_CACHE["source"] = BILINGUAL
        _FR_EN_CACHE["size"] = len(BILINGUAL)
    return _FR_EN_CACHE["lookup"]


def translate(name, target):
    """Translate a place name into ``target`` ("fr" or "en").

    An exact entry in ``BILINGUAL`` (English -> French, or its inverse for "en") wins.
    Otherwise only the generic term is translated through the glossary and placed
    before the specific for French ("Bow River" -> "rivière Bow") or after it for
    English. Names without a known generic are returned unchanged.

    Glossary terms are stored in lowercase, so a translated generic comes back in
    lowercase; compare results case-insensitively where that matters.

    Args:
        name: Place name in the source language.
        target: "fr" to translate English -> French; any other value is handled as
            French -> English for the glossary step.

    Returns:
        The translated name as a string.
    """
    if target == "fr" and name in BILINGUAL:
        return BILINGUAL[name]
    if target == "en":
        # Cached inverse: the original rebuilt the whole inverse dictionary twice per call.
        fr_to_en = _bilingual_fr_en()
        if name in fr_to_en:
            return fr_to_en[name]
    if target == "fr":
        spec, gen = _split(name, GEN_EN_FR)
        return f"{GEN_EN_FR.get(gen, gen)} {spec}".strip()
    # French names lead with the generic ("rivière Bow"); fall back to the trailing
    # position so that every name that matched before still matches.
    spec, gen = _split(name, GEN_FR_EN, generic_first=True)
    if not gen:
        spec, gen = _split(name, GEN_FR_EN)
    return f"{spec} {GEN_FR_EN.get(gen, gen)}".strip()

def verify(src, dst, src_lang):
    """Check that ``dst`` is exactly what ``translate`` produces for ``src``.

    English sources are translated to French; any other ``src_lang`` is translated
    to English. Returns True on an exact string match, False otherwise.
    """
    if src_lang == "en":
        return translate(src, "fr") == dst
    return translate(src, "en") == dst

def find_locations(text, lang):
    """Return the text of every entity in ``text`` whose label is listed in ``LABELS``.

    ``lang`` selects the pipeline from ``NLPS``; entities are returned in document order.
    """
    found = []
    for entity in NLPS[lang](text).ents:
        if entity.label_ in LABELS:
            found.append(entity.text)
    return found

def extract_locations(text, lang="en"):
    """List the location entities of ``text`` together with where they occur.

    The text is cut into paragraphs on blank lines ("\\n\\n") and each paragraph is run
    through the ``lang`` pipeline from ``NLPS``. Returns ``(paragraph, sentence, text)``
    tuples, both positions 1-based, for every entity whose label is in ``LABELS``.
    """
    nlp = NLPS[lang]
    return [
        (para_no, sent_no, entity.text)
        for para_no, paragraph in enumerate(text.split("\n\n"), 1)
        for sent_no, sentence in enumerate(nlp(paragraph).sents, 1)
        for entity in sentence.ents
        if entity.label_ in LABELS
    ]

def match_locations(en_text, fr_text):
    """Check that each English location has its expected French form in the French text.

    Locations are extracted from both texts with ``extract_locations``. Every English
    location is translated with ``translate`` and looked up, case-insensitively, among
    the French locations found at the same (paragraph, sentence) position.

    NOTE: this relies on both texts splitting into the same paragraphs and sentences.

    Args:
        en_text: English text.
        fr_text: French text to check against it.

    Returns:
        ``(pairs, missing)`` where ``pairs`` holds ``((p, s, en_name), fr_expected)`` for
        every location that was found and ``missing`` holds ``(p, s, en_name, fr_expected)``
        for every location that was not.
    """
    en_locs = extract_locations(en_text, "en")
    fr_locs = extract_locations(fr_text, "fr")

    # Collect ALL French locations per sentence. A plain {(p, s): text} dict keeps only the
    # last one, so sentences naming several places reported the others as missing.
    fr_by_sentence = {}
    for p, s, fr_t in fr_locs:
        fr_by_sentence.setdefault((p, s), set()).add(fr_t.lower())

    pairs, missing = [], []
    for p, s, en_t in en_locs:
        fr_expected = translate(en_t, "fr")
        if fr_expected.lower() in fr_by_sentence.get((p, s), ()):
            pairs.append(((p, s, en_t), fr_expected))
        else:
            missing.append((p, s, en_t, fr_expected))
    return pairs, missing



# Species Translations

In [None]:
# Placeholder: no species translations have been collected yet.
species_translations = {}

# Preferential Translations

In [None]:
# Placeholder: no preferential translations have been collected yet.
preferential_translations = {}

# Translation Quality Checker

In [None]:
# isolate any pre-translated instances from above lists 
# match to translated version 
# compare vs expected translation
# [optional] find-and-replace to clean up translation

