In [1]:
import collections, itertools, os

import numpy as np
import pandas as pd
import pickle

from tf.fabric import Fabric

TF = Fabric(locations='~/github//extrabiblical/tf/0.2') 

api = TF.load('''
        otype lex language prs pdp
        det nu ps sp
        prs prs_gn
        g_suffix
        g_cons_utf8
        g_lex_utf8 g_nme_utf8 g_pfm_utf8 g_vbs_utf8 g_vbe_utf8 g_uvf_utf8
        g_lex g_nme g_pfm g_vbs g_vbe g_uvf
            ''')
    
api.loadLog()
api.makeAvailableIn(globals())

__boundary__         computed  
__characters__       computed  
__levDown__          computed  
__levUp__            computed  
__levels__           computed  
__order__            computed  
__rank__             computed  
__sections__         computed  
__structure__        NOT LOADED
book                 node (str)
book@en              node (str)
chapter              node (int)
code                 NOT LOADED
det                  node (str)
dist                 NOT LOADED
dist_unit            NOT LOADED
distributional_parent NOT LOADED
domain               NOT LOADED
function             NOT LOADED
functional_parent    NOT LOADED
g_cons               node (str)
g_cons_utf8          node (str)
g_lex                node (str)
g_lex_utf8           node (str)
g_nme                node (str)
g_nme_utf8           node (str)
g_pfm                node (str)
g_pfm_utf8           node (str)
g_prs                NOT LOADED
g_prs_utf8           NOT LOADED
g_suffix             node (str)
g_suffi

[('Computed',
  'computed-data',
  ('C Computed', 'Call AllComputeds', 'Cs ComputedString')),
 ('Features', 'edge-features', ('E Edge', 'Eall AllEdges', 'Es EdgeString')),
 ('Fabric', 'loading', ('TF',)),
 ('Locality', 'locality', ('L Locality',)),
 ('Nodes', 'navigating-nodes', ('N Nodes',)),
 ('Features',
  'node-features',
  ('F Feature', 'Fall AllFeatures', 'Fs FeatureString')),
 ('Search', 'search', ('S Search',)),
 ('Text', 'text', ('T Text',))]

In [2]:
MODEL_DIR = './models'
seq_length = 2
augment_factor = 10

# model details
num_hidden_layers = 2
num_attention_heads = 4

In [3]:
relevant_chars_utf8 = {' ',
 'א',
 'ב',
 'ג',
 'ד',
 'ה',
 'ו',
 'ז',
 'ח',
 'ט',
 'י',
 'ך',
 'כ',
 'ל',
 'ם',
 'מ',
 'ן',
 'נ',
 'ס',
 'ע',
 'ף',
 'פ',
 'ץ',
 'צ',
 'ק',
 'ר',
 'ש',
 'ת'}

In [4]:
alphabet_dict_heb = {char: char for char in relevant_chars_utf8}
double_chars = ['ןנ','ףפ', 'ץצ','ךכ','םמ']

for end_char, non_end_char in double_chars:
    alphabet_dict_heb[end_char] = non_end_char

In [5]:
# needed for conversion of lex to hebrew script, including markers =, / and [

alphabet_dict_heb_lat = {'א': '>',
                                      'ב': 'B',
                                      'ג': 'G',
                                      'ד': 'D',
                                      'ה': 'H',
                                      'ו': 'W',
                                      'ז': 'Z',
                                      'ח': 'X',
                                      'ט': 'V',
                                      'י': 'J',
                                      'כ': 'K',
                                      'ל': 'L',
                                      'מ': 'M',
                                      'נ': 'N',
                                      'ס': 'S',
                                      'ע': '<',
                                      'פ': 'P',
                                      'צ': 'Y',
                                      'ק': 'Q',
                                      'ר': 'R',
                                      'ש': 'C',
                                      'ת': 'T'}

alphabet_dict_lat_heb = {v:k for k,v in alphabet_dict_heb_lat.items()}

alphabet_dict_lat_heb['_'] = ' '
alphabet_dict_lat_heb['F'] = 'ש' + 'ׂ'
alphabet_dict_lat_heb['/'] = 'ֶ' # nouns/adjectives
alphabet_dict_lat_heb['['] = 'ַ' # verbs
alphabet_dict_lat_heb['='] = 'ֻ' # lex disambiguation marker

new_chars = ['ש', 'ׂ', 'ֶ', 'ַ', 'ֻ']

In [6]:
nme_marker =  '֜'
pfm_marker =  'ְ'
vbs_marker =  'ֱ'
vbe_marker =  'ֲ'
prs_marker =  'ֳ'
uvf_marker =  'ִ'

morpheme_markers = {nme_marker, pfm_marker, vbs_marker, vbe_marker, prs_marker, uvf_marker}

# keys indicate indices of morphemes in a word
morph_marker_dict = {
    4:  '֜',
    0:  'ְ',
    1:  'ֱ',
    3:  'ֲ',
    6:  'ֳ',
    5:  'ִ'
}

In [7]:
all_chars = set()
all_chars_utf8 = set()

for w in F.otype.s('word'):
    
    morphemes_utf8 = [F.g_lex_utf8.v(w), F.g_nme_utf8.v(w), F.g_pfm_utf8.v(w), F.g_vbs_utf8.v(w), F.g_vbe_utf8.v(w), F.g_uvf_utf8.v(w)] 
    morphemes_utf8 = [morph if morph else '' for morph in morphemes_utf8]
    for morph_utf8 in morphemes_utf8:
        all_chars_utf8.update(set(morph_utf8))

    morphemes = [F.g_lex_utf8.v(w), F.g_nme_utf8.v(w), F.g_pfm_utf8.v(w), F.g_vbs_utf8.v(w), F.g_vbe_utf8.v(w), F.g_uvf_utf8.v(w)]
    morphemes = [morph if morph else '' for morph in morphemes]
    for morph in morphemes:
        
        all_chars.update(set(morph))

all_chars

{'֜',
 'ׁ',
 'ׂ',
 'א',
 'ב',
 'ג',
 'ד',
 'ה',
 'ו',
 'ז',
 'ח',
 'ט',
 'י',
 'ך',
 'כ',
 'ל',
 'ם',
 'מ',
 'ן',
 'נ',
 'ס',
 'ע',
 'ף',
 'פ',
 'ץ',
 'צ',
 'ק',
 'ר',
 'ש',
 'ת'}

In [23]:
def make_non_overlapping_n_grams(input_list, n):
  return [input_list[i:i+n] for i in range(0, len(input_list), n)]

def make_n_clause_dict(n):
    """
    Makes sequences of n clauses in the Hebrew Bible, based on a running window.
    """
    n_clause_dict = {}

    for bo in F.otype.s('book'):
        cl_n_grams = list(make_non_overlapping_n_grams(L.d(bo, 'clause'), n))
        
        for cl_n_gram in cl_n_grams:
            ch = L.u(cl_n_gram[0], 'chapter')[0]
            book, chapter_number = T.sectionFromNode(ch)
            
            words_n_clause = sorted(list(itertools.chain(*[L.d(cl, 'word') for cl in cl_n_gram])))
            n_clause_dict[(book, chapter_number, cl_n_gram, 0)] = words_n_clause

    return n_clause_dict

In [9]:
def convert_lex_to_heb_script(tf_word_id):
    return ''.join([alphabet_dict_lat_heb[char] for char in F.lex.v(tf_word_id)])

def convert_ascii_string_to_heb_script(ascii_string):
    return ''.join([alphabet_dict_lat_heb[char] for char in ascii_string])

def update_char_dicts(relevant_chars_utf8, alphabet_dict_heb, new_chars):
    for new_char in new_chars:
        relevant_chars_utf8.add(new_char)
        alphabet_dict_heb[new_char] = new_char
    return relevant_chars_utf8, alphabet_dict_heb

def make_lex_representation_with_verb_noun_marker(tf_id):
    lex = F.lex.v(tf_id)
    if F.sp.v(tf_id) == 'verb':
        lex += '['
    elif F.sp.v(tf_id) in {'subs', 'adjv', 'nmpr'}:
        lex += '/'
    if lex == '=':
        lex = ''
    return lex
    

In [10]:
def process_one_word(w, lex_representation, relevant_chars_utf8, alphabet_dict_heb):
    if lex_representation == 'g_lex_utf8':
        morphs = [F.g_pfm.v(w), F.g_vbs.v(w), F.g_lex.v(w), F.g_vbe.v(w), F.g_nme.v(w), F.g_uvf.v(w)]
        morphs = [morph if morph else '' for morph in morphs]
    elif lex_representation == 'lex':
        lex_rep = make_lex_representation_with_verb_noun_marker(w)
        morphs = [F.g_pfm.v(w), F.g_vbs.v(w), lex_rep, F.g_vbe.v(w), F.g_nme.v(w), F.g_uvf.v(w)]
        morphs = [morph if morph else '' for morph in morphs]
        morphs = [morph.strip('-').strip('!').strip(']').strip('/').strip('[').strip('~') if morphs.index(morph) != 2 else morph for morph in morphs]

    morph_list = [''.join([alphabet_dict_lat_heb[char] for char in morph]) for morph in morphs]
   
    morph_list_with_markers = []
    for idx, morph in enumerate(morph_list):
        # check if it is prs
        if morph and idx == 2 and F.sp.v(w) == 'prps' and not F.g_suffix.v(w-1):
            #print(morph)
            morph = morph + morph_marker_dict.get(6, '')
        elif morph:
            morph = morph + morph_marker_dict.get(idx, '')
        morph_list_with_markers.append(morph)
                
    morph_string_with_markers = ' '.join(morph_list_with_markers)
    morph_string_with_markers = ' '.join(morph_string_with_markers.split())
    
    morph_string_without_markers = ' '.join(morph_list)
    morph_string_without_markers = ' '.join(morph_string_without_markers.split())

    return morph_string_with_markers, morph_string_without_markers

In [11]:
def make_morpheme_dicts(n_clause_dict, lex_representation, relevant_chars_utf8, alphabet_dict_heb):
    """
    returns:
    all_morph_strings_with_markers
    keys: (book: str, (clause ids))
    values: hebrew string with morphemes as separate words (with markers) for morpheme types
    """
    all_morph_strings_with_markers = {}
    all_morph_strings_without_markers = {}

    for key, words in n_clause_dict.items():
        morphemes_in_clause_with_markers = []
        morphemes_in_clause_without_markers = []
    
        for w in words:
            morph_string_with_markers, morph_string_without_markers = process_one_word(w, lex_representation, relevant_chars_utf8, alphabet_dict_heb)
            
            morphemes_in_clause_with_markers.append(morph_string_with_markers)
            morphemes_in_clause_without_markers.append(morph_string_without_markers)
            
        all_morph_strings_with_markers[key] = ' '.join(morphemes_in_clause_with_markers)
        all_morph_strings_without_markers[key] = ' '.join(morphemes_in_clause_without_markers)

    return all_morph_strings_with_markers, all_morph_strings_without_markers

In [12]:
n_clause_dict = make_n_clause_dict(seq_length)

In [13]:
len(n_clause_dict)

4456

In [14]:
# choose 'lex' or 'g_lex_utf8'
# g_lex_utf is the lexeme part of the word, but as it is found in the manuscript. A word can have various spellings
# lex is based on the lex feature. This means it has standard spelling and lexeme disambiguation.

morpheme_dataset, _ = make_morpheme_dicts(n_clause_dict, 'lex', relevant_chars_utf8, alphabet_dict_heb)

hebrew_bible_tokens = set()
for token_string in morpheme_dataset.values():
    token_set = set(token_string.split())
    hebrew_bible_tokens.update(token_set)

In [15]:
# Remove duplicates in values (= texts)

swapped_morpheme_dataset = {v:k for k, v in morpheme_dataset.items()}
morpheme_dataset = {v:k for k, v in swapped_morpheme_dataset.items()}
len(morpheme_dataset)     

4131

In [16]:
with open(f'xbib_dict_len_{seq_length}.pkl', 'wb') as f:
    pickle.dump(morpheme_dataset, f)     