In [1]:
import collections, itertools, os

import numpy as np
import pandas as pd
from scipy.spatial import distance

import torch
from torchinfo import summary
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModel,
    AutoTokenizer,
    BertConfig, 
    BertModel, 
    DataCollatorForLanguageModeling, 
    AutoModelForMaskedLM,
    BertForMaskedLM,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments
)

from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from tf.app import use
# Open the BHSA corpus. hoist=globals() makes the Text-Fabric API handles that
# the cells below rely on (F, L, T) available as notebook-level names.
A = use('etcbc/bhsa', hoist=globals())

# Per-word morpheme features (Hebrew script) that are read further down.
A.load(
    [
        'g_lex_utf8',
        'g_prs_utf8',
        'g_nme_utf8',
        'g_pfm_utf8',
        'g_vbs_utf8',
        'g_vbe_utf8',
        'g_uvf_utf8',
    ]
)

**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


True

In [2]:
# Notebook-wide settings.
seq_len = 5               # window size (number of clauses) for the clause n-grams
target_book = 'Genesis'   # default book used when selecting keys for evaluation

In [3]:
# The characters that are kept when normalising Hebrew text: the space plus the
# 27 consonant code points U+05D0 (alef) .. U+05EA (tav), a range that also
# contains the five final letter forms.
relevant_chars_utf8 = {' '} | {chr(code_point) for code_point in range(0x05D0, 0x05EA + 1)}

In [4]:
# Two-character strings: a final letter form followed by its regular form.
double_chars = ['ןנ','ףפ', 'ץצ','ךכ','םמ']

# Normalisation table: every relevant character maps to itself, except that the
# final forms are folded onto their regular counterparts.
alphabet_dict_heb = {letter: letter for letter in relevant_chars_utf8}
alphabet_dict_heb.update({final_form: regular_form for final_form, regular_form in double_chars})

In [5]:
# Needed for conversion of lex to Hebrew script, including the markers =, / and [.

# Hebrew consonant -> Latin transliteration character.
alphabet_dict_heb_lat = {'א': '>',
                                      'ב': 'B',
                                      'ג': 'G',
                                      'ד': 'D',
                                      'ה': 'H',
                                      'ו': 'W',
                                      'ז': 'Z',
                                      'ח': 'X',
                                      'ט': 'V',
                                      'י': 'J',
                                      'כ': 'K',
                                      'ל': 'L',
                                      'מ': 'M',
                                      'נ': 'N',
                                      'ס': 'S',
                                      'ע': '<',
                                      'פ': 'P',
                                      'צ': 'Y',
                                      'ק': 'Q',
                                      'ר': 'R',
                                      'ש': 'C',
                                      'ת': 'T'}

# Inverse table: Latin transliteration character -> Hebrew consonant.
alphabet_dict_lat_heb = {v:k for k,v in alphabet_dict_heb_lat.items()}

# Symbols that occur in lex values but have no entry in the table above.
# The three lex markers are each mapped to a single Hebrew point so that they
# survive the conversion to Hebrew script as one character.
alphabet_dict_lat_heb['_'] = ' '
alphabet_dict_lat_heb['F'] = 'ש' + 'ׂ'
alphabet_dict_lat_heb['/'] = 'ֶ' # nouns/adjectives
alphabet_dict_lat_heb['['] = 'ַ' # verbs
alphabet_dict_lat_heb['='] = 'ֻ' # lex disambiguation marker

# Characters produced by the extra mappings above; they are registered as
# relevant characters (via update_char_dicts) when the 'lex' representation is used.
new_chars = ['ש', 'ׂ', 'ֶ', 'ַ', 'ֻ']



In [6]:
# Sanity check: does concatenating the morpheme features of a word always give
# the same normalised consonant string as g_cons? Every word for which the two
# differ is printed.

for w in F.otype.s('word'):
    g_cons = ''.join(
        alphabet_dict_heb[char]
        for char in F.g_cons_utf8.v(w)
        if char in relevant_chars_utf8
    )
    morphs = [
        F.g_pfm_utf8.v(w),
        F.g_vbs_utf8.v(w),
        F.g_lex_utf8.v(w),
        F.g_vbe_utf8.v(w),
        F.g_nme_utf8.v(w),
        F.g_uvf_utf8.v(w),
        F.g_prs_utf8.v(w),
    ]
    # Flattened form of "normalise each morpheme, then concatenate".
    reconstr = ''.join(
        alphabet_dict_heb[char]
        for morph in morphs
        for char in morph
        if char in relevant_chars_utf8
    )
    if g_cons != reconstr:
        print(w, g_cons, reconstr, F.g_uvf_utf8.v(w))

In [7]:
# One marker character per affix type. The marker is appended to every morpheme
# of that type so its role stays recognisable in the flattened morpheme string.
nme_marker =  '֜'
pfm_marker =  'ְ'
vbs_marker =  'ֱ'
vbe_marker =  'ֲ'
prs_marker =  'ֳ'
uvf_marker =  'ִ'

# keys indicate indices of morphemes in a word, index 2 is the lexeme
# (the lexeme gets no marker, hence no entry for 2)
morph_marker_dict = {
    4: nme_marker,
    0: pfm_marker,
    1: vbs_marker,
    3: vbe_marker,
    6: prs_marker,
    5: uvf_marker,
}

In [8]:
# Inventory of every character that occurs in the morpheme features of any word.
# NOTE(review): both sets are filled from the same seven *_utf8 features, so they
# always end up identical — confirm whether all_chars was meant to be built from
# a different set of features.
all_chars = set()
all_chars_utf8 = set()

for w in F.otype.s('word'):
    morphemes_utf8 = [
        F.g_lex_utf8.v(w),
        F.g_nme_utf8.v(w),
        F.g_pfm_utf8.v(w),
        F.g_vbs_utf8.v(w),
        F.g_vbe_utf8.v(w),
        F.g_uvf_utf8.v(w),
        F.g_prs_utf8.v(w),
    ]
    for morph_utf8 in morphemes_utf8:
        all_chars_utf8.update(morph_utf8)
        all_chars.update(morph_utf8)

In [9]:
def make_overlapping_n_grams(input_list, n):
    """
    Slide a window of length n over input_list, one step at a time.

    Returns a zip iterator of n-tuples; it is empty when input_list has fewer
    than n items (or when n is 0).
    """
    # The i-th shifted copy supplies the i-th member of every window.
    shifted_copies = (input_list[offset:] for offset in range(n))
    return zip(*shifted_copies)

def make_non_overlapping_n_grams(input_list, n):
    """
    Cut input_list into consecutive chunks of length n.

    Returns a list of slices; the last chunk is shorter when len(input_list)
    is not a multiple of n.
    """
    chunks = []
    for start in range(0, len(input_list), n):
        chunks.append(input_list[start:start + n])
    return chunks

def convert_lex_to_heb_script(tf_word_id):
    """
    Return the lex feature of a word node rendered in Hebrew script.

    Each character of F.lex is looked up in alphabet_dict_lat_heb; a character
    without an entry raises KeyError.
    """
    transliterated_lex = F.lex.v(tf_word_id)
    hebrew_chars = [alphabet_dict_lat_heb[latin_char] for latin_char in transliterated_lex]
    return ''.join(hebrew_chars)

def update_char_dicts(relevant_chars_utf8, alphabet_dict_heb, new_chars):
    """
    Register extra characters as relevant and map each of them to itself.

    Both containers are modified in place; the same two objects are returned
    as a (relevant_chars_utf8, alphabet_dict_heb) tuple.
    """
    extra_chars = list(new_chars)
    relevant_chars_utf8.update(extra_chars)
    alphabet_dict_heb.update((extra_char, extra_char) for extra_char in extra_chars)
    return relevant_chars_utf8, alphabet_dict_heb

In [10]:
class HebrewBibleVerses:
    """
    Per-verse morpheme strings of the Hebrew Bible, built with Text-Fabric.

    The class relies on notebook-level names: the Text-Fabric handles F, L and
    T, the helpers convert_lex_to_heb_script / update_char_dicts /
    make_overlapping_n_grams, and the globals seq_len, new_chars and
    morph_marker_dict.

    Attributes set by __init__:
        seq_len: window size (in clauses) used by make_n_clause_dict; copied
            from the notebook-level seq_len.
        lex_representation: 'g_lex_utf8' or 'lex' (see make_morpheme_dict).
        relevant_chars_utf8: set of characters kept during normalisation.
        alphabet_dict_heb: character normalisation table.
        n_clause_morphemes_with_markers: {(book, chapter, verse): str} as
            returned by make_morpheme_dict.

    n_clause_dict and n_clause_hebrew_texts are not built by __init__ (the
    calls are commented out); make_hebrew_texts_dict builds n_clause_dict on
    first use.
    """

    def __init__(self, lex_representation, relevant_chars_utf8, alphabet_dict_heb):
        self.seq_len = seq_len
        self.lex_representation = lex_representation
        self.relevant_chars_utf8 = relevant_chars_utf8
        self.alphabet_dict_heb = alphabet_dict_heb
        #self.n_clause_dict = self.make_n_clause_dict()
        #self.n_clause_hebrew_texts = self.make_hebrew_texts_dict()
        self.n_clause_morphemes_with_markers = self.make_morpheme_dict(lex_representation, relevant_chars_utf8, alphabet_dict_heb)

    def make_n_clause_dict(self):
        """
        Makes sequences of n clauses in the Hebrew Bible, based on a running window.

        Returns a dict whose keys are (book, chapter, verse, clause_ids) tuples,
        where the section is that of the first clause in the window, and whose
        values are the sorted word nodes of all clauses in the window.
        Windows never cross a book boundary.
        """
        n_clause_dict = {}

        for bo in F.otype.s('book'):
            cl_n_grams = list(make_overlapping_n_grams(L.d(bo, 'clause'), self.seq_len))

            for cl_n_gram in cl_n_grams:
                book, chapter_number, verse_number = T.sectionFromNode(cl_n_gram[0])

                words = sorted(itertools.chain.from_iterable(L.d(cl, 'word') for cl in cl_n_gram))
                n_clause_dict[(book, chapter_number, verse_number, cl_n_gram)] = words

        return n_clause_dict

    def make_hebrew_texts_dict(self):
        """
        Return {n_clause_dict key: consonantal Hebrew text of that clause window}.

        A space is inserted after every word that has a non-empty trailer.
        """
        # __init__ does not build n_clause_dict, so without this guard the
        # method could only fail with AttributeError.
        if not hasattr(self, 'n_clause_dict'):
            self.n_clause_dict = self.make_n_clause_dict()

        hebrew_strings_dict = {}
        for key, words in self.n_clause_dict.items():
            hebrew_clauses = ''.join([F.g_cons_utf8.v(w) if not F.trailer.v(w) else F.g_cons_utf8.v(w) + ' ' for w in words]).strip()
            hebrew_strings_dict[key] = hebrew_clauses
        return hebrew_strings_dict

    def make_morpheme_dict(self, lex_representation, relevant_chars_utf8, alphabet_dict_heb):
        """
        Build, for every verse, a string in which each morpheme is a separate
        space-delimited token followed by the marker of its morpheme type.

        Parameters:
            lex_representation: 'g_lex_utf8' uses the realised lexeme
                (F.g_lex_utf8); 'lex' uses the lex feature converted to Hebrew
                script by convert_lex_to_heb_script.
            relevant_chars_utf8: set of characters to keep. For 'lex' it is
                extended IN PLACE with new_chars.
            alphabet_dict_heb: character normalisation table. For 'lex' it is
                extended IN PLACE with new_chars.

        Returns:
            dict with keys (book, chapter, verse) as given by T.sectionFromNode
            and values the marked morpheme string of that verse.

        Raises:
            ValueError: if lex_representation is not one of the two supported
                values.
        """
        if lex_representation not in ('g_lex_utf8', 'lex'):
            raise ValueError(
                f"Unknown lex_representation {lex_representation!r}; "
                "expected 'g_lex_utf8' or 'lex'"
            )
        use_lex_feature = lex_representation == 'lex'

        # The extra characters only need to be registered once, not per word.
        if use_lex_feature:
            relevant_chars_utf8, alphabet_dict_heb = update_char_dicts(relevant_chars_utf8, alphabet_dict_heb, new_chars)

        all_morph_strings_with_markers = {}

        for ve in F.otype.s('verse'):
            key = T.sectionFromNode(ve)
            morphemes_in_verse_with_markers = []

            for w in L.d(ve, 'word'):
                lexeme = convert_lex_to_heb_script(w) if use_lex_feature else F.g_lex_utf8.v(w)
                # The position in this list selects the marker in morph_marker_dict.
                morphs = [F.g_pfm_utf8.v(w), F.g_vbs_utf8.v(w), lexeme, F.g_vbe_utf8.v(w), F.g_nme_utf8.v(w), F.g_uvf_utf8.v(w), F.g_prs_utf8.v(w)]

                morph_list = [''.join([alphabet_dict_heb[char] for char in morph if char in relevant_chars_utf8]) for morph in morphs]

                # Empty morphemes are dropped; the lexeme (index 2) gets no marker.
                morph_list_with_markers = [
                    morph + morph_marker_dict.get(idx, '')
                    for idx, morph in enumerate(morph_list)
                    if morph
                ]

                # split/join collapses any whitespace inside a morpheme to single spaces.
                morph_string_with_markers = ' '.join(' '.join(morph_list_with_markers).split())
                morphemes_in_verse_with_markers.append(morph_string_with_markers)

            all_morph_strings_with_markers[key] = ' '.join(morphemes_in_verse_with_markers)

        return all_morph_strings_with_markers

In [11]:
# Build the per-verse morpheme strings with the lexeme taken from the lex feature.
hebrew_bible_verses = HebrewBibleVerses(
    lex_representation='lex',
    relevant_chars_utf8=relevant_chars_utf8,
    alphabet_dict_heb=alphabet_dict_heb,
)

In [12]:
# Run the model on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [13]:
import pickle

# Syriac verses as marked morpheme strings, keyed by (book, chapter, verse).
# NOTE: pickle.load can execute arbitrary code while deserialising; only load a
# file that you produced yourself or otherwise trust.
with open('../analyze_performance_models_on_parallel_texts_and_translation/syriac_verse_dict_for_multilingual_evaluation_s4.pkl', 'rb') as f:
    syriac_morphemes_dict = pickle.load(f)

In [14]:
def get_hebrew_text_representation(representation: str, hebrew_bible):
    """
    Return the text dictionary of hebrew_bible for the requested representation.

    Parameters:
        representation: 'raw' selects hebrew_bible.n_clause_hebrew_texts,
            'morphemes_with_markers' selects
            hebrew_bible.n_clause_morphemes_with_markers.
        hebrew_bible: object carrying the attribute named above.

    Raises:
        ValueError: if representation is not one of the two supported values.
        AttributeError: if hebrew_bible lacks the selected attribute.
    """
    attribute_by_representation = {
        'raw': 'n_clause_hebrew_texts',
        'morphemes_with_markers': 'n_clause_morphemes_with_markers',
    }
    if representation not in attribute_by_representation:
        # Previously this fell through to an UnboundLocalError on return.
        raise ValueError(
            f'Unknown representation {representation!r}; '
            f'expected one of {sorted(attribute_by_representation)}'
        )

    return getattr(hebrew_bible, attribute_by_representation[representation])

def load_model_and_tokenizer(model_name):
    """
    Load the tokenizer and the masked-LM BERT model stored under model_name.

    The model is configured to return all hidden states, moved to the
    notebook-level device and switched to evaluation mode.

    Returns:
        (tokenizer, model)
    """
    model = BertForMaskedLM.from_pretrained(
        model_name,
        return_dict_in_generate=True,
        output_hidden_states=True,
    )
    model = model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer, model

def get_hidden_states(heb_texts_dict, model, tokenizer):
    """
    Run every text through the model and keep the last hidden layer.

    Each text is tokenised on its own (truncated to 128 tokens) and evaluated
    without gradient tracking on the notebook-level device.

    Returns:
        dict mapping each key of heb_texts_dict to the final entry of
        outputs.hidden_states, converted to a NumPy array.
    """
    final_layer_by_key = {}

    with torch.no_grad():
        for key, text_chunk in heb_texts_dict.items():
            encoding = tokenizer(text_chunk, max_length=128, truncation=True, padding=True, return_tensors="pt")
            model_inputs = {name: tensor.to(device) for name, tensor in encoding.items()}

            final_layer = model(**model_inputs).hidden_states[-1]
            final_layer_by_key[key] = final_layer.cpu().numpy()

    return final_layer_by_key

def process_hidden_states(hidden_states, approach):
    """
    Reduce each hidden-state array to a single embedding.

    Parameters:
        hidden_states: dict of arrays indexed as [batch, token, feature]
            (axis 1 is averaged / indexed below).
        approach: 'mean' averages over axis 1; 'cls' takes position 0 of
            axis 1.

    Returns:
        dict with the same keys; every value has its length-1 axes removed
        with np.squeeze.

    Raises:
        ValueError: if approach is neither 'mean' nor 'cls'.
    """
    if approach not in ('mean', 'cls'):
        # Previously an unknown approach hit an unbound (or stale) `state`.
        raise ValueError(f"Unknown approach {approach!r}; expected 'mean' or 'cls'")

    processed_hidden_states = {}
    for key, hs in hidden_states.items():
        if approach == 'mean':
            state = np.mean(hs, 1)
        else:
            state = hs[:, 0, :]
        processed_hidden_states[key] = np.squeeze(state)
    return processed_hidden_states

def get_keys(hebrew_bible, target_book):
    """
    Split the keys of hebrew_bible.n_clause_dict by book.

    A key belongs to the target book when target_book is contained in key[0]
    (a containment test, not an equality test).

    Returns:
        (target_book_keys, non_target_book_keys), each in dictionary order.
    """
    target_book_keys = []
    non_target_book_keys = []

    for key in hebrew_bible.n_clause_dict.keys():
        bucket = target_book_keys if target_book in key[0] else non_target_book_keys
        bucket.append(key)

    return target_book_keys, non_target_book_keys

def make_heb_syr_cosine_distance_array(book, embeddings_dict_heb, embeddings_dict_syr):
    """
    Cosine distance between every Hebrew and every Syriac embedding.

    Parameters:
        book: currently unused; kept so existing callers keep working.
        embeddings_dict_heb: dict of 1-D embedding vectors (rows of the result,
            in dictionary order).
        embeddings_dict_syr: dict of 1-D embedding vectors (columns of the
            result, in dictionary order).

    Returns:
        float array of shape (len(embeddings_dict_heb), len(embeddings_dict_syr)).
    """
    heb_vectors = list(embeddings_dict_heb.values())
    syr_vectors = list(embeddings_dict_syr.values())

    # cdist needs two non-empty 2-D inputs; keep the (n, m) shape for empty input.
    if not heb_vectors or not syr_vectors:
        return np.zeros((len(heb_vectors), len(syr_vectors)))

    # One vectorised call instead of a Python-level double loop over all pairs.
    return distance.cdist(np.stack(heb_vectors), np.stack(syr_vectors), metric='cosine')

def make_min_dist_dict(distances, heb_keys, syr_keys):
    """
    Pair every Hebrew key with the Syriac key at the smallest distance.

    Row i of distances belongs to heb_keys[i], column j to syr_keys[j].

    Returns:
        dict {heb_key: syr_key of the row-wise minimum}.
    """
    nearest_columns = np.argmin(distances, axis=1)
    return {
        heb_keys[row]: syr_keys[column]
        for row, column in enumerate(nearest_columns)
    }

def calculate_model_performance(min_dist_dict: dict):
    """
    Count how many entries map a key onto an equal value.

    Returns:
        (correct, wrong): number of items with key == value, and the rest.
    """
    correct = sum(1 for heb_key, syr_value in min_dist_dict.items() if heb_key == syr_value)
    wrong = len(min_dist_dict) - correct
    return correct, wrong

def make_top_n_indices_array(distances_array, n):
    """
    Column indices of the n smallest distances in every row.

    Parameters:
        distances_array: 2-D array of distances (rows are queries, columns are
            candidates).
        n: number of candidates wanted per row; values larger than the number
            of columns are clamped to it.

    Returns:
        integer array of shape (rows, min(n, columns)). The indices within a
        row are NOT sorted by distance.
    """
    distances_array = np.asarray(distances_array)
    n_rows, n_columns = distances_array.shape[0], distances_array.shape[1]

    n = min(n, n_columns)
    if n <= 0:
        return np.empty((n_rows, 0), dtype=np.intp)

    # kth = n - 1 puts the n smallest values in the first n positions and,
    # unlike kth = n, stays in bounds when n equals the number of columns.
    top_n_min_indices = np.argpartition(distances_array, n - 1, axis=1)[:, :n]
    return top_n_min_indices

def calculate_top_n_model_performance(top_n_min_indices, syr_keys, heb_keys):
    """
    Count the rows whose own key occurs among their top-n candidates.

    Row i of top_n_min_indices holds indices into syr_keys; the row counts as
    a hit when heb_keys[i] equals one of the keys those indices point to.

    Returns:
        number of hits (int).
    """
    n_rows = top_n_min_indices.shape[0]
    return sum(
        heb_keys[row] in [syr_keys[column] for column in top_n_min_indices[row]]
        for row in range(n_rows)
    )

In [15]:
def get_target_book_keys(hebrew_bible, target_book=None):
    """
    Return the keys of hebrew_bible.n_clause_dict that belong to a book.

    Parameters:
        hebrew_bible: object with an n_clause_dict whose keys start with the
            book name.
        target_book: book to select; a key matches when target_book is
            contained in key[0]. When omitted, the notebook-level
            target_book is used, as before.

    Raises:
        NameError: if target_book is omitted and no notebook-level
            target_book exists.
    """
    if target_book is None:
        # Backward-compatible fallback to the notebook's configuration value.
        try:
            target_book = globals()['target_book']
        except KeyError:
            raise NameError("name 'target_book' is not defined") from None

    target_book_keys = [key for key in hebrew_bible.n_clause_dict.keys() if target_book in key[0]]
    return target_book_keys

In [16]:
def evaluate_model(model_name: str, text_representation: str, hebrew_bible, syriac_text, target_book):
    """
    Evaluate how well a model retrieves the parallel Syriac verse for every
    Hebrew verse of one book.

    For both embedding methods ('mean' and 'cls') the Hebrew verses of
    target_book are compared with ALL verses in syriac_text by cosine
    distance, and the share of Hebrew verses whose own key is among the 1, 5
    and 10 nearest Syriac verses is printed.

    Parameters:
        model_name: name or path passed to load_model_and_tokenizer.
        text_representation: forwarded to get_hebrew_text_representation.
        hebrew_bible: object holding the Hebrew text dictionaries.
        syriac_text: dict {key: Syriac text}, with keys comparable to the
            Hebrew keys.
        target_book: book name; Hebrew keys are selected with
            key[0] == target_book.

    Returns:
        The distance array of the last embedding method processed ('cls').

    Raises:
        ValueError: if no Hebrew verse matches target_book or syriac_text is
            empty (previously a ZeroDivisionError / argpartition error after
            the model had already been loaded).
    """
    # (n, label) pairs; the labels are the exact prefixes of the printed lines.
    top_n_labels = ((1, 'Completely correct'), (5, 'In top 5'), (10, 'In top 10'))

    print(f'Model name: {model_name}')
    print('Get text representation')
    hebrew_text = get_hebrew_text_representation(text_representation, hebrew_bible)

    hebrew_morphemes_target_book = {k:v for (k, v) in hebrew_text.items() if k[0] == target_book}
    if not hebrew_morphemes_target_book:
        raise ValueError(f'No Hebrew verses found for target book {target_book!r}')

    # All Syriac verses are candidates, not only those of the target book.
    syriac_morphemes = {k:v for (k, v) in syriac_text.items()}
    if not syriac_morphemes:
        raise ValueError('syriac_text is empty; there is nothing to compare with')

    heb_keys = list(hebrew_morphemes_target_book.keys())
    syr_keys = list(syriac_morphemes.keys())

    print('Load model and tokenizer')
    tokenizer, model = load_model_and_tokenizer(model_name)

    print('Retrieve hidden states from model')
    hidden_states_hebrew = get_hidden_states(hebrew_morphemes_target_book, model, tokenizer)
    hidden_states_syriac = get_hidden_states(syriac_morphemes, model, tokenizer)

    for embedding_method in ['mean', 'cls']:
        print(f'Process hidden states with {embedding_method}')
        processed_hidden_states_hebrew = process_hidden_states(hidden_states_hebrew, embedding_method)
        processed_hidden_states_syriac = process_hidden_states(hidden_states_syriac, embedding_method)

        print(f'Calculate cosine distances using {embedding_method}')
        distances_array = make_heb_syr_cosine_distance_array(target_book, processed_hidden_states_hebrew, processed_hidden_states_syriac)

        hit_rates = []
        for n, label in top_n_labels:
            top_n_min_indices = make_top_n_indices_array(distances_array, n)
            in_top_n_count = calculate_top_n_model_performance(top_n_min_indices, syr_keys, heb_keys)
            hit_rates.append((label, in_top_n_count / len(heb_keys)))

        print(f'Evaluation {embedding_method}')
        print(f'Evaluation of {len(heb_keys)} verses.')
        for label, hit_rate in hit_rates:
            print(f'{label}: {hit_rate}')
        print()

    return distances_array
    

In [17]:
model_name = 'martijn75/morphs_marks_lex_4_layers_8_att_heads_5_seqlen_20_augm_with_xbib_syr_20_augm'

# The book comes from the configuration cell (target_book) instead of a second,
# hard-coded 'Genesis', so there is a single place to change it.
distances_array = evaluate_model(
    model_name,
    'morphemes_with_markers',
    hebrew_bible_verses,
    syriac_morphemes_dict,
    target_book,
)

Model name: martijn75/morphs_marks_lex_4_layers_8_att_heads_5_seqlen_20_augm_with_xbib_syr_20_augm
Get text representation
Load model and tokenizer


tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Retrieve hidden states from model
Process hidden states with mean
Calculate cosine distances using mean
Evaluation mean
Evaluation of 1533 verses.
Completely correct: 0.22896281800391388
In top 5: 0.42857142857142855
In top 10: 0.5342465753424658

Process hidden states with cls
Calculate cosine distances using cls
Evaluation cls
Evaluation of 1533 verses.
Completely correct: 0.14220482713633398
In top 5: 0.3150684931506849
In top 10: 0.41943900848010435



In [111]:
# Spot check: the Syriac morpheme string stored for Genesis 41:6.
syriac_morphemes_dict['Genesis', 41, 6]

'ו הא שבעֶ שבלאֶ ינ֜ קטינֶ נ֜ ו שקפַ נ֜ ל רוחֶ אֽ ד שובֶ אֽ יעיַ נ֜ בתר הינ'

In [None]:
# WITH G_CONS INSTEAD OF LEX
martijn75/bert_mt_morph_with_markers_based_on_lex_3_layers_8_att_heads_5_seqlen_10_augm_with_xbib_syriac
{'mean_correct': 91, 'mean_wrong': 1442, 'cls_correct': 49, 'cls_wrong': 1484}


martijn75/morphs_marks_lex_3_layers_8_att_heads_5_seqlen_10_augm_with_xbib_syr_10_augm
{'mean_correct': 144, 'mean_wrong': 1389, 'cls_correct': 85, 'cls_wrong': 1448}

Model name: martijn75/morphs_marks_lex_4_layers_8_att_heads_5_seqlen_10_augm_with_xbib_syr_10_augm
{'mean_correct': 164, 'mean_wrong': 1369, 'cls_correct': 81, 'cls_wrong': 1452}


# WITH UPDATED SYRIAC DATA:
Model name: martijn75/morphs_marks_lex_4_layers_8_att_heads_5_seqlen_10_augm_with_xbib_syr_10_augm
{'mean_correct': 370, 'mean_wrong': 1163, 'cls_correct': 183, 'cls_wrong': 1350}