In [2]:
import sys, os, csv, collections, time

In [4]:
from tf.fabric import Fabric

# Root directory under which the Text-Fabric data repositories are looked up.
DATABASE = '~/github'
# Location of the BHSA feature files, relative to DATABASE
# (the load log shows features being read from .../github/bhsa/tf/c).
BHSA = 'bhsa/tf/c'

# Create the Text-Fabric entry point for that location/module combination.
# silent=False presumably keeps Text-Fabric's own progress messages switched on.
TF = Fabric(locations=[DATABASE], modules=[BHSA], silent=False )

This is Text-Fabric 3.0.3
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

107 features found and 0 ignored


In [5]:
# Load only the features this notebook asks for explicitly:
#   otype - node type, used below to iterate over 'verse' and 'word' nodes
#   lex   - lexeme of a word, the raw material for the skip-grams
api = TF.load('''
    otype
    lex
''')

  0.00s loading features ...
   |     0.06s B otype                from C:/Users/Martijn/github/bhsa/tf/c
   |     0.10s B lex                  from C:/Users/Martijn/github/bhsa/tf/c
   |     0.00s Feature overview: 102 for nodes; 4 for edges; 1 configs; 7 computed
  4.03s All features loaded/computed - for details use loadLog()


In [6]:
# Print the detailed log of what was loaded.
api.loadLog()
# The cells below use the bare names F, L and T; presumably this call is what
# puts them into the notebook's global namespace.
api.makeAvailableIn(globals())

   |     0.00s M otext                from C:/Users/Martijn/github/bhsa/tf/c
   |     0.06s B otype                from C:/Users/Martijn/github/bhsa/tf/c
   |     0.49s B oslots               from C:/Users/Martijn/github/bhsa/tf/c
   |     0.00s M otext                from C:/Users/Martijn/github/bhsa/tf/c
   |     0.01s B book                 from C:/Users/Martijn/github/bhsa/tf/c
   |     0.01s B chapter              from C:/Users/Martijn/github/bhsa/tf/c
   |     0.01s B verse                from C:/Users/Martijn/github/bhsa/tf/c
   |     0.14s B g_cons               from C:/Users/Martijn/github/bhsa/tf/c
   |     0.18s B g_cons_utf8          from C:/Users/Martijn/github/bhsa/tf/c
   |     0.14s B g_lex                from C:/Users/Martijn/github/bhsa/tf/c
   |     0.19s B g_lex_utf8           from C:/Users/Martijn/github/bhsa/tf/c
   |     0.12s B g_word               from C:/Users/Martijn/github/bhsa/tf/c
   |     0.16s B g_word_utf8          from C:/Users/Martijn/github/bhsa/tf/c

In [7]:
# Characters that are dropped from a lexeme before its letters are counted or
# reduced: space, '/', '=', '[' and '_'.
SKIP_LETT = set(' /=[_')

def text_per_verse():
    '''
    Count how often every letter occurs in the lexemes of the corpus.

    Visits every verse node, then every word node that L.d(verse, 'word') yields for it,
    and tallies the characters of the word's lex feature. Characters in SKIP_LETT are ignored.

    Despite its name the function does not build or return any per-verse text.

    Returns:
        letter_count_dict: collections.defaultdict(int); keys are letters, values are the
        number of times that letter was seen over all visited words.
    '''
    letter_count_dict = collections.defaultdict(int)
    skip_lett = SKIP_LETT
    for verse in F.otype.s('verse'):
        for word in L.d(verse, 'word'):
            for lett in F.lex.v(word):
                if lett not in skip_lett:
                    letter_count_dict[lett] += 1

    return letter_count_dict

In [8]:
def make_short_str(lexem, letter_count_dict):
    '''
    Reduce a lexeme to at most two characters: its two rarest distinct letters.

    A lexeme of two characters or fewer is returned unchanged. Otherwise the two distinct
    letters with the lowest count in letter_count_dict are selected (ties are broken in favour
    of the letter that occurs first in the lexeme) and returned in the order in which they
    first occur in the lexeme.

    A longer lexeme that consists of one repeated letter is reduced to that single letter.

    Args:
        lexem: string, the (already cleaned) lexeme.
        letter_count_dict: mapping from letter to its corpus-wide count; it is indexed with
            every letter of lexem.

    Returns:
        short_word: string of length 0, 1 or 2.
    '''
    if len(lexem) <= 2:
        return lexem

    # One entry per distinct letter; a plain loop keeps first-occurrence order explicit.
    freq = {}
    for lett in lexem:
        freq[lett] = letter_count_dict[lett]

    # sorted() is stable, so letters with equal counts stay in first-occurrence order.
    # Slicing (rather than calling min() twice) also copes with a single distinct letter.
    rarest = set(sorted(freq, key=freq.get)[:2])

    short_word = ''
    for lett in lexem:
        if lett in rarest:
            short_word += lett
            # each selected letter is emitted once, at its first occurrence
            rarest.discard(lett)

    return short_word

In [9]:
def make_num_dict(letter_count_dict):
    '''
    Assign a numerical code to every one-letter and two-letter string over the alphabet.

    The alphabet is the set of keys of letter_count_dict. Codes are handed out consecutively,
    starting at 100: first a letter on its own, then that letter followed by each letter of
    the alphabet, then the next letter, and so on. For an alphabet of n letters this yields
    n*n + n keys.

    Returns:
        number_dict: dict, key is a one or two letter string, value is its numerical code.
    '''
    alphabet = list(letter_count_dict)
    codes = {}
    next_code = 100
    for first in alphabet:
        # the single letter comes first, directly followed by all of its two-letter extensions
        features = [first] + [first + second for second in alphabet]
        for feat in features:
            codes[feat] = next_code
            next_code += 1

    return codes

In [10]:
def make_skip_grams(number_dict, letter_count_dict=None):
    '''
    Index all skip-grams of the corpus by the numerical codes of their reduced words.

    Every word is reduced with make_short_str (after dropping the characters in SKIP_LETT)
    and a sliding window of five consecutive reduced words is kept. For every full window
    four skip-grams are registered: the first word combined with the words at window offsets
    (1,2,3), (1,2,4), (1,3,4) and (2,3,4).

    Args:
        number_dict: dict from one or two letter string to numerical code (see make_num_dict).
        letter_count_dict: mapping from letter to count, handed on to make_short_str.
            When omitted, the notebook-global B is used, which is what this function always
            did before the parameter existed.

    Returns:
        skips_and_info: dict, key is place (int, 1-based running count of the word that closes
            the window), value is the string book_chapter_verse of that word.
        sam_dict: nested defaultdict; first key is the numerical code of the first word of the
            window, second key is a string made by concatenating the codes of the three other
            words of the skip-gram, value is the list of places at which that skip-gram ends.
    '''
    if letter_count_dict is None:
        # backward compatible fallback for callers that pass number_dict only
        letter_count_dict = B

    sam_dict = collections.defaultdict(lambda: collections.defaultdict(list))
    skips_and_info = {}
    five_sam = []
    skip_lett = SKIP_LETT
    # offsets inside the window of the three words that follow the first one
    offsets = ((1, 2, 3), (1, 2, 4), (1, 3, 4), (2, 3, 4))

    for sam_order, word in enumerate(F.otype.s('word'), start=1):
        redu_lex = ''.join(lett for lett in F.lex.v(word) if lett not in skip_lett)
        # the window holds at most four words at this point, so appending is always right
        five_sam.append(make_short_str(redu_lex, letter_count_dict))
        if len(five_sam) < 5:
            continue

        numb = [number_dict[reduced] for reduced in five_sam]
        for combo in offsets:
            rest = ''.join(str(numb[x]) for x in combo)
            sam_dict[numb[0]][rest].append(sam_order)

        where = T.sectionFromNode(word)
        skips_and_info[sam_order] = where[0] + '_' + str(where[1]) + '_' + str(where[2])
        # slide the window by one word
        del five_sam[0]

    return skips_and_info, sam_dict

In [11]:
def make_grambi_dict(sam_dict):
    '''
    Turn the skip-gram index into a lookup from place to the places it shares a skip-gram with.

    Args:
        sam_dict: two-level mapping whose innermost values are lists of places (hashable,
            sortable values; ints in this notebook) at which one skip-gram occurs.

    Returns:
        grambi_dict: collections.defaultdict(list), key is place, value is the list of all
            places (the place itself included) that share at least one skip-gram with it,
            without duplicates, in order of first encounter.
        grambi_list: sorted list of all places that occur in sam_dict.
    '''
    grambi_dict = collections.defaultdict(list)
    # Parallel sets make the "already listed?" test O(1); the lists keep encounter order.
    seen = collections.defaultdict(set)
    for by_rest in sam_dict.values():
        for places in by_rest.values():
            for place in places:
                partners = grambi_dict[place]
                known = seen[place]
                for other_item in places:
                    if other_item not in known:
                        known.add(other_item)
                        partners.append(other_item)
    # every place got a key above, so the keys are exactly the set of all places
    grambi_list = sorted(grambi_dict)
    return grambi_dict, grambi_list

In [16]:
def find_clusters(grambi_dict, grambi_list, chap_dict, paral_size): # called below with HK, KL, FG
    '''
    Grow clusters of places whose skip-grams keep matching and collect the long ones.

    Args:
        grambi_dict: mapping from place to the list of places sharing a skip-gram with it.
            It is indexed with places that may be absent, so it presumably has to be the
            defaultdict returned by make_grambi_dict (such lookups then add empty entries).
        grambi_list: iterable of places, walked in the given order.
        chap_dict: mapping from place to a label of the place; indexed with [0] and [1] and
            used as a set element and in print().
        paral_size: int, used both as the minimal distance between the two start places of a
            cluster and as the length a cluster must exceed to be kept.

    cluster_dict is the dict in which clusters grow. Its keys are tuples (gram, item) of the two
    places where the cluster started; each value maps both start places to the list of places
    collected on that side. When a cluster stops matching it is either moved to
    finished_clusters (and reported with print) or dropped.

    Returns:
        finished_clusters: dict from (start place, start place) to the grown cluster.

    NOTE(review): clusters that are still growing when grambi_list is exhausted are never
    moved to finished_clusters - confirm that this is intended.
    '''
    finished_clusters = collections.defaultdict(list)
    # cluster_dict[(gram, item)][gram] and [...][item] are the two growing lists of places
    cluster_dict = collections.defaultdict(lambda: collections.defaultdict(list))
    # label pairs of clusters that were already finished; no new cluster starts on such a pair
    start_skip_list = []
    for gram in grambi_list:
        # New clusters are only started while no cluster is growing.
        if len(cluster_dict) == 0:
            if len(grambi_dict[gram]) > 1:
                for item in grambi_dict[gram]:
                    if item != gram:
                        # the partner has to lie further away than paral_size places
                        if abs(item - gram) > paral_size:
                            # NOTE(review): make_skip_grams builds these labels as strings like
                            # 'Genesis_10_6'; for such values [0] and [1] are the first two
                            # characters, not book and chapter - confirm this is intended.
                            if not (chap_dict[gram][0], chap_dict[gram][1]) == (chap_dict[item][0], chap_dict[item][1]):
                                if not set((chap_dict[gram], chap_dict[item])) in start_skip_list:
                                    cluster_dict[(gram, item)][gram].append(gram)
                                    cluster_dict[(gram, item)][item].append(item)
        else:
            # iterate over a snapshot, because clusters are popped inside the loop
            keys = list(cluster_dict.keys())
            for it1, it2 in keys:
                match = False
                # Accept a match if any of gram .. gram+5 shares a skip-gram with any of the
                # six places following the last place recorded on the it2 side.
                for probeer in range(6):
                    for keit in range(6):
                        if (gram + probeer) in grambi_dict[cluster_dict[(it1, it2)][it2][-1] + 1 + keit]:
                            match = True
                if match:
                    cluster_dict[(it1, it2)][it1].append(gram)
                    # the it2 side always advances by exactly one place, whichever offset matched
                    cluster_dict[(it1, it2)][it2].append(cluster_dict[(it1, it2)][it2][-1] + 1)
                    
                else:
                    # the cluster ended: keep it only if it grew beyond paral_size places
                    if len(cluster_dict[(it1, it2)][it1]) > paral_size:
                        length = len(cluster_dict[(it1, it2)][it1])
                        finished_clusters[(it1, it2)] = cluster_dict[(it1, it2)]
                        start_skip_list.append(set((chap_dict[it1], chap_dict[it2])))
                        cluster_dict.pop((it1, it2))
                        print(chap_dict[it1], chap_dict[it2], length)
                    else:
                        cluster_dict.pop((it1, it2))

    return(finished_clusters)

In the following cell the preprocessing of the data is done.

In [17]:
start_time = time.time()
# B: letter -> number of occurrences in the lexemes of the corpus.
# Note that B is also read as a global inside make_skip_grams.
B = text_per_verse()
# D: one or two letter string -> numerical code
D = make_num_dict(B)
# FG: place -> book_chapter_verse label; GH: skip-gram index (code -> codes -> places)
FG, GH = make_skip_grams(D)
# HK: place -> places sharing a skip-gram with it; KL: sorted list of all places
HK, KL = make_grambi_dict(GH)

print("--- %s seconds ---" % (time.time() - start_time))

--- 27.647458791732788 seconds ---


In the next cell the parallel texts are detected.

In [18]:
start_time = time.time()
# 30 is paral_size: the minimal distance between the two start places of a cluster and the
# length a cluster has to exceed before it is reported.
# LM: (start place, start place) -> finished cluster; each finished cluster is also printed.
LM = find_clusters(HK, KL, FG, 30)
print("--- %s seconds ---" % (time.time() - start_time))

Genesis_10_6 1_Chronicles_1_8 38
Genesis_10_13 1_Chronicles_1_11 70
Genesis_10_22 1_Chronicles_1_17 93
Genesis_36_32 1_Chronicles_1_43 95
Exodus_20_2 Deuteronomy_5_6 112
Leviticus_11_15 Deuteronomy_14_14 42
Numbers_21_33 Deuteronomy_3_1 55
Deuteronomy_3_2 Numbers_21_34 36
Deuteronomy_5_7 Exodus_20_3 101
Deuteronomy_14_15 Leviticus_11_16 36
Joshua_15_15 Judges_1_11 34
Joshua_15_17 Judges_1_13 54
Joshua_21_6 1_Chronicles_6_47 41
Joshua_21_13 1_Chronicles_6_42 47
Joshua_21_20 1_Chronicles_6_51 41
Joshua_21_27 1_Chronicles_6_56 36
Judges_1_14 Joshua_15_18 47
2_Samuel_5_1 1_Chronicles_11_1 55
2_Samuel_5_14 1_Chronicles_14_4 44
2_Samuel_6_8 1_Chronicles_13_11 37
2_Samuel_7_4 1_Chronicles_17_3 36
2_Samuel_7_7 1_Chronicles_17_6 89
2_Samuel_7_10 1_Chronicles_17_9 34
2_Samuel_7_12 1_Chronicles_17_11 31
2_Samuel_7_16 1_Chronicles_17_14 48
2_Samuel_7_22 1_Chronicles_17_20 33
2_Samuel_7_24 1_Chronicles_17_22 62
2_Samuel_8_3 1_Chronicles_18_3 101
2_Samuel_8_11 1_Chronicles_18_11 34
2_Samuel_8_14 1_C