### import required libraries

In [None]:
!pip install googletrans==3.1.0a0
!pip install epitran
!pip install jellyfish

from tqdm.auto import tqdm
from difflib import SequenceMatcher
import random
import urllib.request
import gzip
import shutil

import nltk
# Fetch the two NLTK corpora used below: the word list and the stopword lists.
nltk.download(['words', 'stopwords'])
from nltk.corpus import stopwords
# Pool of candidate words that the random search at the end of the notebook samples from.
en_words = nltk.corpus.words.words()
# NOTE(review): no language is passed here, so this presumably is not limited to
# English stopwords - confirm. In the visible cells it is only referenced by the
# commented-out filter below.
stop_words = stopwords.words()
#en_words = [word for word in tqdm(en_words) if word not in stop_words]

import googletrans
from googletrans import Translator
# Mapping of googletrans language code -> language name (printed further down).
languages = googletrans.LANGUAGES
# Single translator client reused by the translation calls in the visible cells.
translator = Translator()

import epitran
import jellyfish

### install files required by epitran

Epitran needs the CC-CEDICT dictionary file to transliterate Chinese (`cmn-Hans`), so download and unpack it here.

In [None]:
# Download the gzipped CEDICT export into the current working directory.
url = "https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz"
fname = "cedict_1_0_ts_utf-8_mdbg.txt.gz"
urllib.request.urlretrieve(url, fname)

# Stream-decompress the archive into the plain-text dictionary file.
with gzip.open(fname, 'rb') as f_in, open("cedict_1_0_ts_utf-8_mdbg.txt", 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

Epitran's English (`eng-Latn`) transliteration relies on the `lex_lookup` binary from Flite, so build and install it here.

In [None]:
%%shell
# Build Flite from source and install its lex_lookup tool.
# Stop at the first failing step so later steps never run in the wrong
# directory or against a broken build.
set -e

# Reuse an existing checkout so the cell can be re-run.
[ -d flite ] || git clone https://github.com/festvox/flite
cd flite/
# Kept as separate commands: `set -e` does not fire for a failure on the
# left-hand side of an `&&` list.
./configure
make
sudo make install
cd testsuite
make lex_lookup
sudo cp lex_lookup /usr/local/bin

### test translator

In [None]:
# Translate one sample word and look at what comes back besides the text.
result = translator.translate('door', src='en', dest='hi')
print(result, '\n')

# Keys available in the extra metadata attached to the result.
for key in result.extra_data:
  print(key)

possible_translations = result.extra_data["possible-translations"]
print(possible_translations, '\n')

# Candidates listed under the first entry, one per line.
for candidate in possible_translations[0][2]:
  print(candidate)

Translated(src=en, dest=hi, text=द्वार, pronunciation=dvaar, extra_data="{'translat...") 

translation
all-translations
original-language
possible-translations
confidence
possible-mistakes
language
synonyms
definitions
examples
see-also
[['door', None, [['द्वार', 1000, True, False, [6]], ['दरवाजा', 1000, True, False, [6, 3]]], [[0, 4]], 'door', 0, 0]] 

['द्वार', 1000, True, False, [6]]
['दरवाजा', 1000, True, False, [6, 3]]


### define phonetic codes dict

Epitran must be given a language code (e.g. `eng-Latn`) for every language it transliterates; the supported codes are listed at https://pypi.org/project/epitran/.

In [None]:
# Epitran language codes, keyed by the googletrans language code.
# Code list: https://pypi.org/project/epitran/
phonetic_codes = dict([
    ('en', 'eng-Latn'),
    ('de', 'deu-Latn'),
    # ('fr', 'fra-Latn'),  # French is currently left out
    ('es', 'spa-Latn'),
    ('hi', 'hin-Deva'),
    ('zh-cn', 'cmn-Hans'),
    ('ru', 'rus-Cyrl'),
])

# List every googletrans language as "<code> <name>".
for lang_code, lang_name in languages.items():
  print(lang_code, lang_name)

af afrikaans
sq albanian
am amharic
ar arabic
hy armenian
az azerbaijani
eu basque
be belarusian
bn bengali
bs bosnian
bg bulgarian
ca catalan
ceb cebuano
ny chichewa
zh-cn chinese (simplified)
zh-tw chinese (traditional)
co corsican
hr croatian
cs czech
da danish
nl dutch
en english
eo esperanto
et estonian
tl filipino
fi finnish
fr french
fy frisian
gl galician
ka georgian
de german
el greek
gu gujarati
ht haitian creole
ha hausa
haw hawaiian
iw hebrew
he hebrew
hi hindi
hmn hmong
hu hungarian
is icelandic
ig igbo
id indonesian
ga irish
it italian
ja japanese
jw javanese
kn kannada
kk kazakh
km khmer
ko korean
ku kurdish (kurmanji)
ky kyrgyz
lo lao
la latin
lv latvian
lt lithuanian
lb luxembourgish
mk macedonian
mg malagasy
ms malay
ml malayalam
mt maltese
mi maori
mr marathi
mn mongolian
my myanmar (burmese)
ne nepali
no norwegian
or odia
ps pashto
fa persian
pl polish
pt portuguese
pa punjabi
ro romanian
ru russian
sm samoan
gd scots gaelic
sr serbian
st sesotho
sn shona
sd sindhi


### test epitran

In [None]:
# Smoke-test Epitran on one English and one Chinese word.
# The Chinese transliterator is additionally given the CEDICT file unpacked earlier.
samples = (
    ('eng-Latn', 'hello', {}),
    ('cmn-Hans', '姓名', {'cedict_file': 'cedict_1_0_ts_utf-8_mdbg.txt'}),
)
for epi_code, sample_word, epi_kwargs in samples:
  epi = epitran.Epitran(epi_code, **epi_kwargs)
  print(epi.transliterate(sample_word))

həlow
ɕiŋmiŋ


### define similarity scoring function

In [None]:
# Transliterators are kept here so they are reused across calls instead of
# being rebuilt for every word and every language.
_EPITRAN_CACHE = {}


def _get_epitran(code):
  """Return the Epitran transliterator for `code`, creating it on first use."""
  if code not in _EPITRAN_CACHE:
    #cedict file arg is only needed for chinese ('zh-cn' : 'cmn-Hans')
    _EPITRAN_CACHE[code] = epitran.Epitran(code, cedict_file='cedict_1_0_ts_utf-8_mdbg.txt')
  return _EPITRAN_CACHE[code]


def _string_similarity(a, b, similarity):
  """Score two strings with the chosen measure; a higher value means more alike."""
  if similarity == 'SequenceMatcher':
    return SequenceMatcher(None, a, b).ratio()
  # 'levenshtein_distance': an edit distance of 0, 1, 2, ... becomes 1, 1/2, 1/3, ...
  dist = jellyfish.levenshtein_distance(a, b)
  return 1/(1+dist)


def similarity_score(en_word, type_, languages, similarity, print_=False):
  """Sum how similar an English word is to its translation in each language.

  The word is translated from English into every entry of `languages` and a
  per-language similarity is added to the total.

  Args:
    en_word: the English word to translate and compare.
    type_: what is compared.
      'phonetic' compares the Epitran transliteration of `en_word` with the
      Epitran transliteration of the translated text; every language must
      have an entry in `phonetic_codes`.
      'pronounciation' (the spelling 'pronunciation' is accepted as well)
      compares `en_word` itself with the pronunciation reported by the
      translator, falling back to the translated text when it is None.
    languages: iterable of language codes accepted by the translator as
      `dest`. Iterating a dict yields its keys.
    similarity: 'SequenceMatcher' (difflib ratio) or 'levenshtein_distance'
      (1 / (1 + edit distance)).
    print_: when true, print one line per language with the language code,
      translated text, compared string and score.

  Returns:
    The sum of the per-language scores (0 when `languages` is empty).

  Raises:
    ValueError: if `type_` or `similarity` is not one of the supported values.
    KeyError: for type_='phonetic' when a language is missing from
      `phonetic_codes`.
  """
  if type_ == 'pronunciation':
    type_ = 'pronounciation'
  if type_ not in ('phonetic', 'pronounciation'):
    raise ValueError("type_ must be 'phonetic' or 'pronounciation', got %r" % (type_,))
  if similarity not in ('SequenceMatcher', 'levenshtein_distance'):
    raise ValueError("similarity must be 'SequenceMatcher' or 'levenshtein_distance', got %r" % (similarity,))

  score = 0
  # The English transliteration is only needed on the phonetic path, so the
  # pronunciation path never touches Epitran at all.
  if type_ == 'phonetic':
    en_word_phon = _get_epitran('eng-Latn').transliterate(en_word)

  for lang in tqdm(languages, leave=False):
    translation = translator.translate(en_word, src='en', dest=lang)
    word_text = translation.text

    if type_ == 'phonetic':
      reference = en_word_phon
      compared = _get_epitran(phonetic_codes[lang]).transliterate(word_text)
    else:
      reference = en_word
      word_pron = translation.pronunciation
      compared = word_text if word_pron is None else word_pron

    lang_score = _string_similarity(reference, compared, similarity)
    # Honour the print_ flag on both paths (the builtin `print` is always truthy).
    if print_:
      print(lang, word_text, compared, lang_score)

    score += lang_score
  return score

In [None]:
# Phonetic comparison across the languages that have an Epitran code,
# printing the per-language details.
similarity_score(
    'name',
    type_='phonetic',
    languages=phonetic_codes,
    similarity='levenshtein_distance',
    print_=True,
)

  0%|          | 0/6 [00:00<?, ?it/s]

en name nejm 1.0
de Name nɑːmə 0.25
es nombre nombɾe 0.16666666666666666
hi नाम naːm 0.3333333333333333
zh-cn 姓名 ɕiŋmiŋ 0.16666666666666666
ru имя imʲa 0.2


2.1166666666666667

In [None]:
# Pronunciation comparison across every googletrans language,
# printing the per-language details.
similarity_score(
    'name',
    type_='pronounciation',
    languages=languages,
    similarity='SequenceMatcher',
    print_=True,
)

  0%|          | 0/107 [00:00<?, ?it/s]

af naam naam 0.75
sq emri emri 0.25
am ስም simi 0.25
ar اسم asm 0.5714285714285714
hy Անուն Anun 0.25
az adı adı 0.2857142857142857
eu izena izena 0.4444444444444444
be імя imia 0.25
bn নাম Nāma 0.25
bs ime ime 0.5714285714285714
bg име ime 0.5714285714285714
ca nom nom 0.5714285714285714
ceb ngalan ngalan 0.4
ny dzina dzina 0.4444444444444444
zh-cn 姓名 Xìngmíng 0.3333333333333333
zh-tw 姓名 Xìngmíng 0.3333333333333333
co nome nome 0.75
hr Ime Ime 0.5714285714285714
cs název název 0.4444444444444444
da navn navn 0.5
nl naam naam 0.75
en name name 1.0
eo nomo nomo 0.5
et nimi nimi 0.5
tl pangalan pangalan 0.3333333333333333
fi nimi nimi 0.5
fr Nom Nom 0.2857142857142857
fy namme namme 0.8888888888888888
gl nome nome 0.75
ka სახელი sakheli 0.36363636363636365
de Name Name 0.75
el όνομα ónoma 0.4444444444444444
gu નામ Nāma 0.25
ht non non 0.2857142857142857
ha suna suna 0.5
haw inoa inoa 0.5
iw שֵׁם שֵׁם 0.0
he שֵׁם שֵׁם 0.0
hi नाम naam 0.75
hmn lub npe lub npe 0.36363636363636365
hu név név 

41.87669552669551

In [None]:
# Random search: keep sampling English words and report each new best-scoring one.
# The search has no natural end, so it runs until the kernel is interrupted. The
# interrupt is caught so the cell finishes cleanly and the best result so far
# stays available in `best_word` / `max_score`.
max_score = 0
best_word = None
try:
  while True:
    en_word = random.choice(en_words)
    score = similarity_score(en_word, type_='pronounciation', languages=languages, similarity='SequenceMatcher')
    if score > max_score:
      print(en_word, score)
      max_score = score
      best_word = en_word
except KeyboardInterrupt:
  print('search interrupted; best so far:', best_word, max_score)