In [None]:
# Stage local working copies of the Hungarian, Gothic and West Old Turkic
# forms tables; later cells read them from the current directory.

from shutil import copy

# (forms.csv inside the neighbouring dataset directory, local file name)
for cldf_forms, local_copy in (
    ("../gerstnerhungarian/cldf/forms.csv", "hun_forms.csv"),
    ("../streitberggothic/cldf/forms.csv", "got_forms.csv"),
    ("../ronataswestoldturkic/cldf/forms.csv", "wot_forms.csv"),
):
    copy(cldf_forms, local_copy)

In [None]:
# Download the German word vectors, unless the model file is already present.

import os

import wget

GERMAN_MODEL_URL = "https://cloud.devmount.de/d2bc5672c523b086/german.model"
# file name under which the loan-search cell loads the vectors
GERMAN_MODEL_PATH = "german.model"

# Guard the download so that re-running the notebook does not fetch the
# model a second time.
if not os.path.exists(GERMAN_MODEL_PATH):
    wget.download(GERMAN_MODEL_URL, out=GERMAN_MODEL_PATH)

In [3]:
# Etym is the etymology class of loanpy's helpers module
from loanpy.helpers import Etym

# wrap the etymological forms table for the WOT -> EAH language pair
etym_obj = Etym(
    "ronataswestoldturkic/cldf/forms.csv",
    source_language="WOT",
    target_language="EAH",
)

In [None]:
# Build the dictionary of heuristic sound substitutions and write it to disk.
# Note: the resulting file is about 1.7MB.
etym_obj.get_scdictbase(
    write_to="GothicHungarian/scWOT2EAH.txt",
)

In [None]:
# Qfy is the class of loanpy's quantify-sound-correspondences module
from loanpy.qfysc import Qfy

# etymological data for WOT -> EAH in adapt mode, seeded with the heuristic
# substitutions stored in scWOT2EAH.txt
qfy_obj = Qfy(
    "ronataswestoldturkic/cldf/forms.csv",
    "WOT",
    "EAH",
    scdictbase="GothicHungarian/scWOT2EAH.txt",
    mode="adapt",
)

# extract the sound substitutions, add them to the heuristics and overwrite
# the sound-correspondence file
qfy_obj.get_sound_corresp("GothicHungarian/scWOT2EAH.txt")

In [None]:
# Do the same for historical reconstructions.
from loanpy.qfysc import Qfy

# Languages are passed as source "H", target "EAH": this is the direction every
# other reconstruct-mode call in this notebook uses (Adrc and eval_all) and the
# one encoded in the output file name scH2EAH.txt.
qfy_obj = Qfy(
    "ronataswestoldturkic/cldf/forms.csv",
    "H",
    "EAH",
    mode="reconstruct",
)

# write the extracted sound correspondences to file
qfy_obj.get_sound_corresp("GothicHungarian/scH2EAH.txt")

In [8]:
# Adrc is the class of loanpy's adapt-reconstruct module
from loanpy.adrc import Adrc

# load the WOT -> EAH sound correspondences into an adaptation object
ad_obj = Adrc(
    scdictlist="GothicHungarian/scWOT2EAH.txt",
    forms_csv="ronataswestoldturkic/cldf/forms.csv",
    source_language="WOT",
    target_language="EAH",
    mode="adapt",
)

In [42]:
# Use the tool to make predictions about loanword adaptation.
# The input is bɔ̃ʒuʀ (the IPA rendering of French "bonjour") because the result
# stored for this cell and the workflow inspected right after it were produced
# from that word. ɸuɣl̥s is an alternative test input.
ad_obj.adapt(
    "bɔ̃ʒuʀ",
    10,
    max_repaired_phonotactics=5,
    sort_by_nse=True,
    show_workflow=True,
    max_paths2repaired_phonotactics=5,
)

'boʃuh, poʃuh, doʃuh, ʃuh, buh, ʃoh, puh, duh, ʃh'

In [43]:
# white-box: inspect what happened
# `workflow` is read right after an adapt() call made with show_workflow=True;
# presumably that call fills it with the intermediate steps -- TODO confirm
ad_obj.workflow

OrderedDict([('tokenised', "['b', 'ɔ̃', 'ʒ', 'u', 'ʀ']"),
             ('donor_phonotactics', 'CVCVC'),
             ('predicted_phonotactics', "['CVCVC', 'CVC']"),
             ('adapted_phonotactics',
              "[['b', 'ɔ̃', 'ʒ', 'u', 'ʀ'], ['ʒ', 'u', 'ʀ'], ['b', 'u', 'ʀ']]"),
             ('before_combinatorics',
              "[[['b', 'p', 'd'], ['o'], ['ʃ'], ['u'], ['h']], [['ʃ'], ['u', 'o', ''], ['h']], [['b', 'p', 'd'], ['u'], ['h']]]")])

In [39]:
# Transliterate the French spelling "bonjour" with Epitran ("fra-Latn").
# The trailing comment lists further IPA strings; baːɡɛr is the word passed to
# reconstruct() in a later cell -- presumably these are other test inputs.
from epitran import Epitran
epi = Epitran("fra-Latn")
epi.transliterate("bonjour")  # hɪrɪ aːɟuː tɛlɛviːzioː baːɡɛr

'bɔ̃ʒuʀ'

In [44]:
# Read the etymological data and H -> EAH sound correspondences for
# reconstructions.
# The correspondence file is taken from GothicHungarian/, the directory the
# quantification step writes scH2EAH.txt to and from which the adaptation
# object reads its own correspondence file.
rc_obj = Adrc(
    scdictlist="GothicHungarian/scH2EAH.txt",
    forms_csv="ronataswestoldturkic/cldf/forms.csv",
    source_language="H",
    target_language="EAH",
    mode="reconstruct",
)

In [45]:
# predict reconstructions for a single word, with both filters switched off
rc_obj.reconstruct(
    "baːɡɛr",
    howmany=10,
    vowelharmony_filter=False,
    phonotactics_filter=False,
    clusterised=True,
)

'^(b)(a)(ɡ|n|ɡl)(ɛ|i|ø|y)(r)$'

In [46]:
# Sanity-check the reconstruction model over every combination of the three
# boolean switches (clusters, phonotactics filter, vowel harmony).

from loanpy.sanity import eval_all, cache
from loanpy.sanity import ArgumentsAlreadyTested

eval_all = cache(eval_all)

# keyword arguments that stay the same for each run of the grid
rc_settings = dict(
    forms_csv="ronataswestoldturkic/cldf/forms.csv",
    source_language="H",
    target_language="EAH",
    mode="reconstruct",
    write_to="GothicHungarian/eval_rc.csv",
    path2cache="GothicHungarian/cache_rc.csv",
)

for cl in (True, False):
    for ph_f in (True, False):
        for vh in (True, False):
            try:
                eval_all(**rc_settings,
                         clusters=cl,
                         vowelharmony=vh,
                         phonotactics_filter=ph_f)
            except ArgumentsAlreadyTested:
                # this combination is already covered by the cache file
                print("arguments already tested")

489it [01:01,  7.92it/s]
489it [01:01,  8.01it/s]
489it [01:03,  7.68it/s]
489it [00:55,  8.77it/s]
489it [01:02,  7.79it/s]
489it [00:57,  8.56it/s]
489it [00:58,  8.30it/s]
489it [00:51,  9.44it/s]


<Figure size 640x480 with 0 Axes>

In [48]:
# Sanity-check the adaptation model over every combination of the three
# boolean switches (clusters, phonotactics filter, vowel harmony).

from loanpy.sanity import eval_all, cache
from loanpy.sanity import ArgumentsAlreadyTested

eval_all = cache(eval_all)

for cl in [True, False]:
    for ph_f in [True, False]:
        for vh in [True, False]:
            # Same handling as in the reconstruction grid: a combination that
            # was evaluated before is reported and skipped, so the remaining
            # combinations still run when the cell is executed again.
            try:
                eval_all(forms_csv="ronataswestoldturkic/cldf/forms.csv",
                         source_language="WOT",
                         target_language="EAH",
                         mode="adapt",
                         write_to="GothicHungarian/eval_ad.csv",
                         path2cache="GothicHungarian/cache_ad.csv",
                         clusters=cl,
                         vowelharmony=vh,
                         phonotactics_filter=ph_f)
            except ArgumentsAlreadyTested:
                print("arguments already tested")

463it [01:02,  7.41it/s]
463it [01:05,  7.03it/s]
463it [01:02,  7.41it/s]
463it [01:03,  7.29it/s]
463it [01:01,  7.55it/s]
463it [01:02,  7.37it/s]
463it [01:01,  7.52it/s]
463it [01:00,  7.65it/s]


<Figure size 640x480 with 0 Axes>

In [50]:
# Sanity-check the adaptation model on a coarse grid (2, 10, 20) of
# max_repaired_phonotactics x max_paths2repaired_phonotactics, with the three
# boolean switches turned off.

from loanpy.sanity import eval_all, cache
from loanpy.sanity import ArgumentsAlreadyTested

eval_all = cache(eval_all)

for mrp in [2, 10, 20]:
    for mp2rp in [2, 10, 20]:
        # Same handling as in the reconstruction grid: a combination that was
        # evaluated before is reported and skipped, so the remaining
        # combinations still run when the cell is executed again.
        try:
            eval_all(forms_csv="ronataswestoldturkic/cldf/forms.csv",
                     source_language="WOT",
                     target_language="EAH",
                     mode="adapt",
                     write_to="GothicHungarian/eval_ad.csv",
                     path2cache="GothicHungarian/cache_ad2.csv",
                     clusters=False,
                     vowelharmony=False,
                     phonotactics_filter=False,
                     max_repaired_phonotactics=mrp,
                     max_paths2repaired_phonotactics=mp2rp)
        except ArgumentsAlreadyTested:
            print("arguments already tested")

463it [01:02,  7.35it/s]
463it [01:02,  7.44it/s]
463it [01:01,  7.58it/s]
463it [01:01,  7.50it/s]
463it [01:01,  7.49it/s]
463it [01:03,  7.27it/s]
463it [01:03,  7.24it/s]
463it [01:01,  7.49it/s]
463it [01:01,  7.58it/s]


<Figure size 640x480 with 0 Axes>

In [52]:
# Sanity-check the adaptation model on a finer grid (3, 5, 8) of
# max_repaired_phonotactics x max_paths2repaired_phonotactics, with the three
# boolean switches turned off.

from loanpy.sanity import eval_all, cache
from loanpy.sanity import ArgumentsAlreadyTested

eval_all = cache(eval_all)

for mrp in [3, 5, 8]:
    for mp2rp in [3, 5, 8]:
        # Same handling as in the reconstruction grid: a combination that was
        # evaluated before is reported and skipped, so the remaining
        # combinations still run when the cell is executed again.
        try:
            eval_all(forms_csv="ronataswestoldturkic/cldf/forms.csv",
                     source_language="WOT",
                     target_language="EAH",
                     mode="adapt",
                     write_to="GothicHungarian/eval_ad.csv",
                     path2cache="GothicHungarian/cache_ad2.csv",
                     clusters=False,
                     vowelharmony=False,
                     phonotactics_filter=False,
                     max_repaired_phonotactics=mrp,
                     max_paths2repaired_phonotactics=mp2rp)
        except ArgumentsAlreadyTested:
            print("arguments already tested")

463it [01:01,  7.47it/s]
463it [01:01,  7.56it/s]
463it [01:01,  7.51it/s]
463it [01:02,  7.45it/s]
463it [01:02,  7.42it/s]
463it [01:02,  7.38it/s]
463it [01:04,  7.14it/s]
463it [01:03,  7.24it/s]
463it [01:03,  7.27it/s]


<Figure size 640x480 with 0 Axes>

In [26]:
# Create pseudo-adaptations of the Gothic forms into Hungarian.

from loanpy.adrc import Adrc
import pandas as pd

# adaptation object built from the WOT -> EAH sound correspondences
ad_obj = Adrc(
    scdictlist="scWOT2EAH.txt",
    forms_csv="wot_forms.csv",
    source_language="WOT",
    target_language="EAH",
    mode="adapt",
)

# Gothic forms table
dfforms = pd.read_csv("got_forms.csv")

# One (prediction, workflow) pair per row. The workflow is read right after
# each adapt() call, before the next word is processed.
output = []
for word in dfforms["Segments"]:
    prediction = ad_obj.adapt(
        word.replace(" ", ""),  # segments are space-separated in the table
        howmany=1000,
        max_repaired_phonotactics=2,
        max_paths2repaired_phonotactics=2,
        show_workflow=True,
    )
    output.append((prediction, ad_obj.workflow))

# split the pairs into the two new columns
dfforms["ad"] = [prediction for prediction, _ in output]
dfforms["ad_workflow"] = [workflow for _, workflow in output]

# persist the extended table
dfforms.to_csv("ad_got_forms.csv", encoding="utf-8", index=False)

In [27]:
# Create pseudo-reconstructions of the Hungarian forms into Early Ancient
# Hungarian.

from loanpy.adrc import Adrc
import pandas as pd

# reconstruction object built from the H -> EAH sound correspondences
rc_obj = Adrc(
    scdictlist="scH2EAH.txt",
    forms_csv="hun_forms.csv",
    source_language="H",
    target_language="EAH",
    mode="reconstruct",
)

# Hungarian forms table
dfforms = pd.read_csv("hun_forms.csv")

# TODO: drop years after 1600 (no filtering is applied in this cell yet)

# one reconstruction per row, both filters switched off
reconstructions = []
for word in dfforms["Segments"]:
    reconstructions.append(
        rc_obj.reconstruct(
            word.replace(" ", ""),  # segments are space-separated in the table
            howmany=1000,
            vowelharmony_filter=False,
            phonotactics_filter=False,
            clusterised=True,
        )
    )
dfforms["rc"] = reconstructions

# persist the extended table
dfforms.to_csv("rc_hun_forms.csv", encoding="utf-8", index=False)

In [28]:
from gensim.models import KeyedVectors

from loanpy.helpers import plug_in_model
from loanpy.loanfinder import Search

# Search instance (loanpy.loanfinder) pairing the adapted Gothic forms with
# the reconstructed Hungarian forms
search_obj = Search(
    path2donordf="ad_got_forms.csv",
    path2recipdf="rc_hun_forms.csv",
    semsim=0,
    scdictlist_ad="scWOT2EAH.txt",
    scdictlist_rc="scH2EAH.txt",
)

# the default vectors are English, so the German model is plugged in manually
plug_in_model(
    KeyedVectors.load_word2vec_format("german.model", binary=True)
)

# search for loans and write the ranked matches to file
search_obj.loans(write_to="loans.csv")

searching for phonological matches: 100%|█████| 673/673 [00:51<00:00, 13.08it/s]
calculating semantic similarity of phonological matches: 100%|█| 8354/8354 [00:0
cutting off by semsim=0and ranking by semantic similarity
file written to loans.csv
done. Insert date and time later here.


Unnamed: 0,match,recipdf_idx,Meaning_x,Meaning_y,gensim_multiword
214,døs,127,Brett,Tor,0.474125
453,dis,127,Brett,Sack,0.456463
330,disɒ,127,Brett,unten,0.446397
300,teːjz,1507,Winter,See,0.422098
30,disɛ,127,Brett,Bett,0.412935
103,maːm,1384,heute,kommen,0.41132
408,maːmik,1384,heute,Bild,0.381895
11,maːmɛ,1384,heute,Menge,0.37
300,dɛs,127,Brett,See,0.357322
455,dis,127,Brett,Salz,0.337295
