# SOUND CORRESPONDENCE MINER

In [1]:
from loanpy.scminer import get_correspondences

input_table = [

    ['ID', 'COGID', 'DOCULECT', 'ALIGNMENT', 'PROSODY'],

    ['0', '1', 'LG1', 'a b', 'VC'],

    ['1', '1', 'LG2', 'c d', 'CC']

]

get_correspondences(input_table)

[{'a': ['c'], 'b': ['d']},
 {'a c': 1, 'b d': 1},
 {'a c': [1], 'b d': [1]},
 {'VC': ['CC']},
 {'VC CC': 1},
 {'VC CC': [1]}]

In [None]:
from loanpy.scminer import get_heur

get_heur("eng")

In [3]:
from loanpy.scminer import get_prosodic_inventory


data = [  ['ID', 'COGID', 'DOCULECT', 'ALIGNMENT', 'PROSODY'],

  [0, 1, 'H', '#aː t͡ʃ# -#', 'VC'],

  [1, 1, 'EAH', 'a.ɣ.a t͡ʃ i', 'VCVCV'],

  [2, 2, 'H', '#aː ɟ uː#', 'VCV'],

  [3, 2, 'EAH', 'a l.d a.ɣ', 'VCCVC'],

  [4, 3, 'H', '#ɒ j n', 'VCC'],

  [5, 3, 'EAH', 'a j.a n', 'VCVC']

]

get_prosodic_inventory(data)

['VCCVC', 'VCVCV', 'VCVC']

In [5]:
from loanpy.scminer import uralign

print(uralign("a b c", "f g h i j k").replace(" ", "\t"))

#a	b	c#	-#
f	g	h	ijk


# SOUND CORRESPONDENCE APPLIER

In [7]:
from loanpy.scapplier import Adrc

adrc = Adrc("examples/sc2.json", "examples/inv.json")

adrc.sc

[{'d': ['d', 't'], 'a': ['a', 'o']},
 {'d d': 5, 'd t': 4, 'a a': 7, 'a o': 1},
 {},
 {'CVCV': ['CVC']}]

In [8]:
adrc.prosodic_inventory

['CV', 'CVV']

In [9]:
from loanpy.scapplier import Adrc

adrc = Adrc("examples/sc2.json", "examples/inv.json")

adrc.adapt("d a d a")

['dada']

In [10]:
adrc.adapt("d a d a", 5)

['dada', 'data', 'doda', 'dota', 'tada']

In [11]:
adrc.adapt("d a d a", 5, "CVCV")  # sc2.json says CVCV to CVC

['dad', 'dat', 'dod', 'dot', 'tad']

In [12]:
adrc.adapt("d a d", 5, "CVC")   # no info on CVC in sc2.json

['da', 'do', 'ta', 'to']

In [15]:
from loanpy.scapplier import Adrc

adrc = Adrc("examples/sc2.json", "examples/inv.json")

adrc.get_closest_phonotactics("CVC")

'CV'

In [16]:
adrc.get_closest_phonotactics("CVCV")

'CVV'

In [17]:
from loanpy.scapplier import Adrc

adrc = Adrc()

adrc.set_sc([{}, {"k k": 2, "k c": 1, "i e": 2, "i o": 1}, {}, {}, {}, {}, {}])

sclistlist = [["k", "c", "$"], ["e", "o", "$"], ["k", "c", "$"], ["e", "o", "$"]]

adrc.get_diff(sclistlist, ["k", "i", "k", "i"])

[1, 1, 1, 1]

In [18]:
from loanpy.scapplier import Adrc

# Build an Adrc without input files and install the correspondence data
# by hand through set_sc().
adrc = Adrc()

adrc.set_sc([{"k": ["k", "h"], "i": ["e", "o"]},
             {"k k": 5, "k c": 3, "i e": 2, "i o": 1},
             {}, {}, {}, {}, {}])

# read_sc() is called with just the segment list and an integer argument;
# nothing else needs to be prepared for this example.
adrc.read_sc(["k", "i"], 2)

[['k'], ['e', 'o']]

In [19]:
from loanpy.scapplier import Adrc

adrc = Adrc("examples/sc2.json", "examples/inv.json")

adrc.reconstruct("d a d a")

'^(d)(a)(d)(a)$'

In [20]:
adrc.reconstruct("d a d a", 1000)

'^(d|t)(a|o)(d|t)(a|o)$'

In [21]:
adrc.reconstruct("l a l a")

'l not old'

In [22]:
from loanpy.scapplier import Adrc

adrc = Adrc("examples/sc2.json", "examples/inv.json")

adrc.repair_phonotactics(["d", "a", "d", "a"], "CVCV")

['d', 'a', 'd']

In [24]:
from loanpy.scapplier import Adrc

adrc = Adrc("examples/sc2.json", "examples/inv.json")

adrc.set_prosodic_inventory("rofl")

adrc.prosodic_inventory

'rofl'

In [25]:
from loanpy.scapplier import Adrc

adrc = Adrc("examples/sc2.json", "examples/inv.json")

adrc.set_sc("lol")

adrc.sc

'lol'

In [26]:
from loanpy.scapplier import add_edge

graph = {'A': {'B': 3}}

add_edge(graph, 'A', 'C', 7)

graph

{'A': {'B': 3, 'C': 7}}

In [27]:
from loanpy.scapplier import apply_edit

apply_edit(

      ['f', 'ɛ', 'r', 'i', 'h', 'ɛ', 'ɟ'],

      ('insert d',

       'insert u',

       'insert n',

       'insert ɒ',

       'insert p',

       'substitute f by ɒ',

       'delete ɛ',

       'keep r',

       'delete i',

       'delete h',

       'delete ɛ',

       'substitute ɟ by t')

)

['d', 'u', 'n', 'ɒ', 'p', 'ɒ', 'r', 't']

In [28]:
from loanpy.scapplier import dijkstra

# Weighted directed graph as a dict of dicts: graph1[node][neighbour] = weight.
# 'D' has no outgoing edges.
graph1 = dict(
    A=dict(B=1, C=4),
    B=dict(C=2, D=6),
    C=dict(D=3),
    D=dict(),
)

dijkstra(graph1, 'A', 'D')

['A', 'B', 'C', 'D']

In [29]:
from loanpy.scapplier import edit_distance_with2ops

edit_distance_with2ops("rajka", "ajka", w_del=100, w_ins=49)

100

In [30]:
edit_distance_with2ops("ajka", "rajka", w_del=100, w_ins=49)

49

In [31]:
edit_distance_with2ops("Bécs", "Pécs", w_del=100, w_ins=49)

149

In [32]:
edit_distance_with2ops("Hegyeshalom", "Mosonmagyaróvár", w_del=100, w_ins=49)

1388

In [33]:
from loanpy.scapplier import get_mtx

get_mtx("Bécs", "Pécs")

[[0, 1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [2, 3, 2, 3, 4],
 [3, 4, 3, 2, 3],
 [4, 5, 4, 3, 2]]

In [34]:
from loanpy.scapplier import list2regex

list2regex(["b", "k", "-", "v"])

'(b|k|v)?'

In [35]:
from loanpy.scapplier import move_sc

move_sc([["x", "x"]], 0, [[]])

([['x']], [['x']])

In [36]:
move_sc([["x", "x"], ["y", "y"], ["z"]], 1, [["a"], ["b"], ["c"]])

([['x', 'x'], ['y'], ['z']], [['a'], ['b', 'y'], ['c']])

In [37]:
from loanpy.scapplier import mtx2graph

mtx2graph([[0, 1, 2], [1, 2, 3], [2, 3, 2]])

{(0, 0): {(0, 1): 100, (1, 0): 49},
 (0, 1): {(0, 2): 100, (1, 1): 49},
 (0, 2): {(1, 2): 49},
 (1, 0): {(1, 1): 100, (2, 0): 49},
 (1, 1): {(1, 2): 100, (2, 1): 49, (2, 2): 0},
 (1, 2): {(2, 2): 49},
 (2, 0): {(2, 1): 100},
 (2, 1): {(2, 2): 100},
 (2, 2): {}}

In [38]:
from loanpy.scapplier import substitute_operations

substitute_operations(['insert A', 'delete B', 'insert C'])

['substitute B by A', 'insert C']

In [39]:
substitute_operations(['delete A', 'insert B', 'delete C', 'insert D'])

['substitute A by B', 'substitute C by D']

In [40]:
from loanpy.scapplier import tuples2editops

tuples2editops([(0, 0), (0, 1), (1, 1), (2, 2)], "ló", "hó")

['substitute l by h', 'keep ó']

In [41]:
tuples2editops([(0, 0), (1, 1), (2, 2), (2, 3)], "lóh", "ló")

['keep l', 'keep ó', 'delete h']

# EVALUATE SOUND CORRESPONDENCE APPLIER

In [42]:
from loanpy.eval_sca import eval_all

intable = [  ['ID', 'COGID', 'DOCULECT', 'ALIGNMENT', 'PROSODY'],

  ['0', '1', 'H', 'k i k i', 'VC'],

  ['1', '1', 'EAH', 'k i g i', 'VCVCV'],

  ['2', '2', 'H', 'i k k i', 'VCV'],

  ['3', '2', 'EAH', 'i g k i', 'VCCVC']

]


eval_all(intable, "", False, [1, 2, 3])

[(0.33, 0.0), (0.67, 1.0), (1.0, 1.0)]

In [43]:
from loanpy.eval_sca import eval_one

intable = [  # regular sound correspondences

    ['ID', 'COGID', 'DOCULECT', 'ALIGNMENT', 'PROSODY'],

    ['0', '1', 'H', 'k i k i', 'VC'],

    ['1', '1', 'EAH', 'g i g i', 'VCVCV'],

    ['2', '2', 'H', 'i k k i', 'VCV'],

    ['3', '2', 'EAH', 'i g g i', 'VCCVC']

]

eval_one(intable, "", False, 1)

1.0

In [44]:
intable = [  # not enough regular sound correspondences

  ['ID', 'COGID', 'DOCULECT', 'ALIGNMENT', 'PROSODY'],

  ['0', '1', 'H', 'k i k i', 'VC'],

  ['1', '1', 'EAH', 'g i g i', 'VCVCV'],

  ['2', '2', 'H', 'b u b a', 'VCV'],

  ['3', '2', 'EAH', 'p u p a', 'VCCVC']

]

eval_one(intable, "", False, 1)

0.0

In [45]:
intable = [  # irregular sound correspondences

  ['ID', 'COGID', 'DOCULECT', 'ALIGNMENT', 'PROSODY'],

  ['0', '1', 'H', 'k i k i', 'VC'],

  ['1', '1', 'EAH', 'k i g i', 'VCVCV'],

  ['2', '2', 'H', 'i k k i', 'VCV'],

  ['3', '2', 'EAH', 'i g k i', 'VCCVC']

]

eval_one(intable, "", False, 1)

0.0

In [46]:
intable = [  # irregular sound correspondences

  ['ID', 'COGID', 'DOCULECT', 'ALIGNMENT', 'PROSODY'],

  ['0', '1', 'H', 'k i k i', 'VC'],

  ['1', '1', 'EAH', 'k i g i', 'VCVCV'],

  ['2', '2', 'H', 'i k k i', 'VCV'],

  ['3', '2', 'EAH', 'i g k i', 'VCCVC']

]

eval_one(intable, "", False, 2)  # increase rate of false positives

1.0

# LOAN FINDER

In [48]:
from loanpy.loanfinder import phonetic_matches

donor = [

    ['a0', 'Donorese-0', 'igig'],

    ['a1', 'Donorese-1', 'iggi']

]

recipient = [

    ['0', 'Recipientese-0', '^(i|u)(g)(g)(i|u)$'],

    ['1', 'Recipientese-1', '^(i|u)(i|u)(g)(g)$']

]

outpath = "examples/phonetic_matches.tsv"

phonetic_matches(recipient, donor, outpath)

with open(outpath, "r") as f:

    print(f.read())

ID	ID_rc	ID_ad
0	Recipientese-0	Donorese-1



In [49]:
from loanpy.loanfinder import semantic_matches

def getsemsim(x, y):
    """Dummy similarity callback for the example.

    Both arguments are ignored; the result is always the constant 3.
    """
    return 3

# Input rows for semantic_matches: a three-column header followed by one
# data row that carries two extra trailing entries ("cat", "dog").
phmtsv = [
    ["ID", "ID_rc", "ID_ad"],
    ["0", "Recipientese-0", "Donorese-1", "cat", "dog"],
]

# Dedicated output file, so the phonetic_matches.tsv written by the
# phonetic_matches example is left intact.
outpath = "examples/semantic_matches.tsv"

semantic_matches(phmtsv, getsemsim, outpath)

# Print the file back to show what was written.
with open(outpath, "r") as f:
    print(f.read())

ID	ID_rc	ID_ad	semsim
0	Recipientese-0	Donorese-1	3



# UTILITY FUNCTIONS

In [50]:
from loanpy.utils import IPA

ipa = IPA()

type(ipa.vowels)

list

In [51]:
len(ipa.vowels)

1464

In [52]:
ipa.vowels[0]

'ʋ̥'

In [53]:
from loanpy.utils import IPA

ipa = IPA()

ipa.get_clusters(["r", "a", "u", "f", "l"])

'r a.u f.l'

In [54]:
from loanpy.utils import IPA

ipa = IPA()

ipa.get_cv("p")

'C'

In [55]:
ipa.get_cv("u")

'V'

In [56]:
from loanpy.utils import IPA

ipa = IPA()

ipa.get_prosody("l o l")

'CVC'

In [57]:
ipa.get_prosody("r o f.l")

'CVCC'

In [58]:
from loanpy.utils import cvgaps

cvgaps("b l -", "b l a")

['b l V', 'b l a']

In [59]:
cvgaps("b - a", "b l a")

['b C a', 'b l a']

In [60]:
from loanpy.utils import find_optimal_year_cutoff

tsv = [

    ['form', 'sense', 'Year', 'Etymology', 'Loan'],

    ['gulyás', 'goulash, Hungarian stew', '1800', 'unknown', ''],

    ['Tisza', 'a major river in Hungary', '1230', 'uncertain', ''],

    ['Pest', 'part of Budapest, the capital', '1241', 'Slavic', 'True'],

    ['paprika', 'ground red pepper, spice', '1598', 'Slavic', 'True']

]

find_optimal_year_cutoff(tsv, "Slavic")

1241

In [61]:
from loanpy.utils import is_same_length_alignments

is_same_length_alignments([[0, 1, 2, "a - c", 4, 5], [0, 1, 2, "d e f", 4, 5]])

True

In [62]:
is_same_length_alignments([[0, 1, 2, "a b c", 4, 5], [0, 1, 2, "d e", 4, 5]])

2023-04-27 00:58:15,790 - INFO - 0
['a', 'b', 'c']
['d', 'e']


False

In [63]:
from loanpy.utils import is_valid_language_sequence

# Six 11-field rows. Only index 2 ('de' / 'en') and index 9 ('0', '1', '6')
# differ between rows; every other field is the placeholder 'x'.
data = [
    ['x', 'x', lang, 'x', 'x', 'x', 'x', 'x', 'x', group, 'x']
    for group in ('0', '1', '6')
    for lang in ('de', 'en')
]

is_valid_language_sequence(data, "de", "en")

True

In [64]:
from loanpy.utils import is_valid_language_sequence

# Six 11-field rows; only index 2 and index 9 vary, the rest is 'x'.
# The final row has 'nl' at index 2, unlike the de/en pairs before it.
data = [
    ['x', 'x', lang, 'x', 'x', 'x', 'x', 'x', 'x', group, 'x']
    for lang, group in (
        ('de', '0'), ('en', '0'),
        ('de', '1'), ('en', '1'),
        ('de', '6'), ('nl', '6'),
    )
]

is_valid_language_sequence(data, "de", "en")

2023-04-27 00:58:31,502 - INFO - Problem in row 5


False

In [66]:
from loanpy.utils import prefilter

# Header row plus thirteen data rows of 11 fields each. Index 2 holds the
# 'Language_ID' column and index 9 the 'Cognacy' column; all remaining
# fields are the placeholder 'x'.
data = [
    ['x', 'x', language, 'x', 'x', 'x', 'x', 'x', 'x', cognacy, 'x']
    for language, cognacy in (
        ('Language_ID', 'Cognacy'),
        ('de', '0'), ('en', '0'),
        ('en', '1'), ('de', '1'),
        ('de', '2'),
        ('en', '3'),
        ('nl', '4'), ('de', '4'),
        ('nl', '5'), ('en', '5'),
        ('de', '6'), ('nl', '6'), ('en', '6'),
    )
]

prefilter(data, "de", "en")

[['x', 'x', 'Language_ID', 'x', 'x', 'x', 'x', 'x', 'x', 'Cognacy', 'x'],
 ['x', 'x', 'de', 'x', 'x', 'x', 'x', 'x', 'x', '0', 'x'],
 ['x', 'x', 'en', 'x', 'x', 'x', 'x', 'x', 'x', '0', 'x'],
 ['x', 'x', 'de', 'x', 'x', 'x', 'x', 'x', 'x', '1', 'x'],
 ['x', 'x', 'en', 'x', 'x', 'x', 'x', 'x', 'x', '1', 'x'],
 ['x', 'x', 'de', 'x', 'x', 'x', 'x', 'x', 'x', '6', 'x'],
 ['x', 'x', 'en', 'x', 'x', 'x', 'x', 'x', 'x', '6', 'x']]

In [67]:
from loanpy.utils import prod

prod([1, 2, 3])  # one times two times three

6

In [68]:
from loanpy.utils import read_ipa_all

ipa_all = read_ipa_all()

type(ipa_all)

list

In [69]:
len(ipa_all)

6490

In [70]:
ipa_all[:2]

[['ipa',
  'syl',
  'son',
  'cons',
  'cont',
  'delrel',
  'lat',
  'nas',
  'strid',
  'voi',
  'sg',
  'cg',
  'ant',
  'cor',
  'distr',
  'lab',
  'hi',
  'lo',
  'back',
  'round',
  'velaric',
  'tense',
  'long',
  'hitone',
  'hireg'],
 ['˩',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '-1',
  '-1']]

In [1]:
from loanpy.utils import scjson2tsv

scjson2tsv("examples/sc.json", "examples/sc.tsv", "examples/sc_p.tsv")

with open("examples/sc.tsv", "r") as f:

    print(f.read())

sc	src	tgt	freq	CogID
a o	a	o	1	512
a e	a	e	2	3, 4



In [2]:
with open("examples/sc_p.tsv", "r") as f:

    print(f.read())

sc	src	tgt	freq	CogID
CV CV	CV	CV	1	7



# THE END