In [36]:
import json
import re
import pandas as pd
import numpy as np

# Allow up to 200 rows to be displayed before pandas truncates the output.
pd.set_option(
    "display.max_rows",
    200,
)


In [None]:
# location_translations: reference notes (quoted below) on when Canadian geographical names may be translated

"""
https://natural-resources.canada.ca/maps-tools-publications/maps/geographical-names-canada/translating-geographical-names

Translating Geographical Names
Generally in Canada, place names have one official language form: the name approved by a provincial or territorial naming authority. Exceptions to the rule are:

Geographical names of ‘pan-Canadian’ significance
Names of pan-Canadian significance have customary and well-known forms in both English and French. The list was established by the Treasury Board of Canada, and is maintained by the Geographical Names Board of Canada. The geographical names on this list are shown in both English and French on maps and in documents published by the Government of Canada. See the list of geographical names of pan-Canadian significance.

Names of national parks and national historic sites
National parks and national historic sites established by Parks Canada have an official name in English and in French.

Names of certain towns and cities
Most municipalities have only one official name adopted by the province or territory where the place is located, and this name should not be translated. However, certain municipalities have official names in both French and English, for example: Grand Falls and Grand-Sault in New Brunswick, and Greater Sudbury and Grand Sudbury in Ontario.

Names of undersea features
Some undersea features have an English and a French name approved by the Advisory Committee on Undersea Feature Names, one of the Geographical Names Board of Canada’s advisory committees.

Use of geographical names in Government of Canada documents
The principle governing the use of geographical names on maps and in documents published by the Government of Canada is to use the official form(s) of geographical names as adopted by the federal, provincial or territorial authorities of the Geographical Names Board of Canada.

Names of inhabited places retain their official form in English and French texts.

The use of names of pan-Canadian significance must be respected on both maps and in texts.

In text documents, it is permissible to translate the generic portion of name of a geographical feature. The generic portion of the name indicates the nature of the entity (for example the word “River” in the name “Bow River”). However, the specific portion of name does not get translated (for example, in the name of “Bow River”, the specific is the word “Bow”). In this example, the name “Bow River” could be translated as “rivière Bow” in a text.

You will find the list of English generics and their French equivalents in the document “Glossary of generic terms in Canada’s geographical names” published by the Translation Bureau of Public Services and Procurement Canada.

For more information concerning the rules for translating geographical names, please visit the Language Portal of Canada, a Translation Bureau initiative.

"""


In [None]:
# Placeholder: no entries have been added yet.
# NOTE(review): the intended key/value schema is not shown here — confirm before populating.
preferential_translations = {}

In [24]:
# species_translations — note that the source data is very messy.
# Source: https://www.dfo-mpo.gc.ca/species-especes/identify-eng.html

# Read the raw records from the local JSON file (decoded as UTF-8).
with open("all_aquatic_species.json", encoding="utf-8") as f:
    data = json.loads(f.read())

def clean(name):
    """Normalise a name string for comparison.

    The text is lower-cased, then everything from the first "(" to the end
    of that line is dropped (along with any whitespace immediately before
    the parenthesis), and finally surrounding whitespace is trimmed.
    """
    return re.sub(r"\s*\(.*", "", name.lower()).strip()

# One row per record in `data`, with the english / french / latin names each
# passed through clean().
species_translations = pd.DataFrame(
    [
        {lang: clean(record["name"][lang]) for lang in ("english", "french", "latin")}
        for record in data
    ]
)

In [25]:
# Rows whose English name contains "shark", ordered by latin name.
(
    species_translations
    .loc[lambda frame: frame["english"].str.lower().str.contains("shark")]
    .sort_values("latin")
)

Unnamed: 0,english,french,latin
383,common thresher shark,renard de mer,alopias vulpinus
396,thresher shark,renard marin,alopias vulpinus
393,brown cat shark,holbiche brune,apristurus brunneus
405,deepsea cat shark,roussette de profondeur,apristurus profundorum
403,oceanic whitetip shark,rameur,carcharhinus longimanus
404,dusky shark,requin obscur,carcharhinus obscurus
391,great white shark,grand requin blanc,carcharodon carcharias
261,white shark,grand requin blanc,carcharodon carcharias
402,portuguese shark,pailona,centroscymnus coelolepis
407,basking shark,requin pèlerin,cetorhinus maximus


In [28]:
# Sanity check on a single latin name: the rows returned include several seals
# alongside the Alaskan brook lamprey, so the latin column in the source data
# is clearly wrong for some records.
species_translations.loc[
    lambda frame: frame["latin"].str.lower().str.contains("lethenteron alaskense")
]

Unnamed: 0,english,french,latin
378,hooded seal,phoque à capuchon,lethenteron alaskense
379,ringed seal,phoque annelé,lethenteron alaskense
380,harbour seal,phoque commun,lethenteron alaskense
381,bearded seal,phoque barbu,lethenteron alaskense
586,alaskan brook lamprey,lamproie d'alaska,lethenteron alaskense


In [42]:
# Latin names that occur on more than one row AND whose rows disagree on the
# English or French name, ordered by latin name.
# Rows with a blank latin name all land in one group, so they show up here too.
(
    species_translations
    .loc[lambda frame: frame.duplicated("latin", keep=False)]
    .groupby("latin")
    .filter(
        lambda group: group["english"].nunique() > 1
        or group["french"].nunique() > 1
    )
    .sort_values("latin")
)

Unnamed: 0,english,french,latin
427,invasive tunicates,les tuniciers envahissants,
426,asian carp,carpe asiatique,
383,common thresher shark,renard de mer,alopias vulpinus
396,thresher shark,renard marin,alopias vulpinus
391,great white shark,grand requin blanc,carcharodon carcharias
261,white shark,grand requin blanc,carcharodon carcharias
192,rocky mountain sculpin,chabot des montagnes rocheuses,cottus bairdii
65,eastslope sculpin,chabot du versant est,cottus bairdii
191,rocky mountain sculpin,chabot des montagnes rocheuses,cottus bairdii
451,atlantic cod,morue franche,gadus morhua


In [43]:
# Count each distinct (english, french, latin) row among latin names whose rows
# disagree on the English or French name; sort on the latin index level (2).
(
    species_translations
    .loc[lambda frame: frame.duplicated("latin", keep=False)]
    .groupby("latin")
    .filter(
        lambda group: group["english"].nunique() > 1
        or group["french"].nunique() > 1
    )
    .value_counts()
    .sort_index(level=2)
)

english                                             french                                                   latin                      
asian carp                                          carpe asiatique                                                                          1
invasive tunicates                                  les tuniciers envahissants                                                               1
common thresher shark                               renard de mer                                            alopias vulpinus                1
thresher shark                                      renard marin                                             alopias vulpinus                1
great white shark                                   grand requin blanc                                       carcharodon carcharias          1
white shark                                         grand requin blanc                                       carcharodon carcharias          1
easts

In [57]:
# Grouping by "latin" before value_counts() makes the group key the outermost
# index level (latin, english, french), whereas calling value_counts() directly
# on the frame keeps the column order (english, french, latin).
vc = (
    species_translations
    .groupby("latin")
    .value_counts()
    .sort_index(level=0)
)
# Keep only the name combinations that occur more than once.
vc.loc[vc > 1]

latin                         english                                        french                                                   
acipenser fulvescens          lake sturgeon                                  esturgeon jaune                                               9
acipenser oxyrinchus          atlantic sturgeon                              esturgeon noir                                                2
acipenser transmontanus       white sturgeon                                 esturgeon blanc                                              11
acroloxus coloradensis        rocky mountain capshell                        patelle d'eau douce pointue                                   2
amblyraja radiata             thorny skate                                   raie épineuse                                                 2
ammocrypta pellucida          eastern sand darter                            dard de sable                                                 3
balaena mysticetus 

In [19]:
# Read the raw records from the local JSON file (decoded as UTF-8).
with open("all_aquatic_species.json", encoding="utf-8") as f:
    data = json.loads(f.read())

# One row per record in `data`, keeping the english / french / latin names
# exactly as they appear in the source (no cleaning applied).
df = pd.DataFrame(
    [
        {lang: record["name"][lang] for lang in ("english", "french", "latin")}
        for record in data
    ]
)

In [21]:
# Latin names that occur on more than one row of the uncleaned frame AND whose
# rows disagree on the English or French name, ordered by latin name.
(
    df
    .loc[lambda frame: frame.duplicated("latin", keep=False)]
    .groupby("latin")
    .filter(
        lambda group: group["english"].nunique() > 1
        or group["french"].nunique() > 1
    )
    .sort_values("latin")
)

Unnamed: 0,english,french,latin
426,Asian Carp,Carpe asiatique,
427,Invasive Tunicates,Les Tuniciers Envahissants,
105,Lake Sturgeon (Great Lakes – Upper St. Lawrenc...,Esturgeon jaune (populations des Grands Lacs e...,Acipenser fulvescens
107,Lake Sturgeon (Nelson River populations),Esturgeon jaune (populations de la rivière Nel...,Acipenser fulvescens
108,Lake Sturgeon (Red-Assiniboine River - Lake Wi...,Esturgeon jaune (populations de la rivière Rou...,Acipenser fulvescens
...,...,...,...
39,Bluefin Tuna,Thon rouge,Thunnus thynnus
441,Atlantic Bluefin Tuna,Thon rouge de l'Atlantique,Thunnus thynnus
260,White Hake (Southern Gulf of St. Lawrence),Merluche blanche (sud du golfe du Saint-Laurent),Urophycis tenuis
258,White Hake,Merluche blanche,Urophycis tenuis
