# Classifying Aramaic and Hebrew clauses using sequence analysis

In this notebook, Aramaic and Hebrew clauses are distinguished on the basis of a representation of each clause as a sequence of phrase functions or parts of speech. The analysis serves as a validation of the approach used in the analysis in which EBH and LBH clauses are distinguished.

In [None]:
import numpy as np
from pprint import pprint
import sys, os, csv, collections

import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.optimizers import Adam
from keras import backend as K

from sklearn.model_selection import train_test_split

Start Text-Fabric (TF) and load the BHSA dataset.

In [None]:
# Load the BHSA corpus through Text-Fabric. hoist=globals() is passed so that
# the TF API handles end up in the notebook's global namespace; the cells
# below rely on the names F, L and T being available this way.
from tf.app import use
A = use('bhsa', hoist=globals())

Define prose books

In [None]:
# Books that are treated as prose in this analysis.
prose = [
    'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
    'Joshua', 'Judges',
    '1_Samuel', '2_Samuel', '1_Kings', '2_Kings',
    'Jonah', 'Ruth', 'Esther',
    'Daniel', 'Ezra', 'Nehemiah',
    '1_Chronicles', '2_Chronicles',
]

A dictionary is made that assigns each book to either Early Biblical Hebrew (EBH) or Late Biblical Hebrew (LBH).

In [None]:
# Books assigned to Early Biblical Hebrew (EBH) and to Late Biblical Hebrew (LBH).
ebh = [
    'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
    'Joshua', 'Judges',
    '1_Samuel', '2_Samuel', '1_Kings', '2_Kings',
]
lbh = [
    'Esther', 'Daniel', 'Ezra', 'Nehemiah',
    '1_Chronicles', '2_Chronicles',
]

# Map every listed book to its period label: 'ebh' or 'lbh'.
ebh_lbh_dict = {**dict.fromkeys(ebh, 'ebh'), **dict.fromkeys(lbh, 'lbh')}

# Two-part books mapped to the name of the combined book.
double_books = {
    '1_Samuel': 'Samuel',
    '2_Samuel': 'Samuel',
    '2_Kings': 'Kings',
    '1_Kings': 'Kings',
    '1_Chronicles': 'Chronicles',
    '2_Chronicles': 'Chronicles',
}

Count N and Q clauses in Aramaic portions of the MT.

In [None]:
# Tally the Aramaic clauses of the corpus: overall, per text type and per book.
ara_tot = 0
ara_qnd = collections.defaultdict(int)   # keyed by the last character of the clause's txt value
ara_book = collections.defaultdict(int)  # keyed by the name of the book the clause occurs in

for cl in F.otype.s('clause'):

    # a clause is counted as Aramaic when its first word is Aramaic
    words1 = L.d(cl, 'word')
    if F.language.v(words1[0]) == 'Aramaic':
        ara_tot += 1
        ara_qnd[F.txt.v(cl)[-1]] += 1
        # count per book as well, so that the ara_book printout is not empty
        ara_book[T.bookName(cl)] += 1

print("total number of Aramaic clauses: ", ara_tot)
print(ara_qnd)
print(ara_book)

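Extract the data from the BHSA. Note that the function has one argument (level), which can have two values: 'word_level' (each clause becomes a sequence of parts of speech) and 'phrase_level' (each clause becomes a sequence of phrase functions).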

In [None]:
def _in_poetic_section(bo, ch, ve):
    """Return True when book/chapter/verse falls in one of the poetic passages that are left out."""
    return (
        (bo == 'Genesis' and ch == 49 and 1 < ve < 28)
        or (bo == 'Exodus' and ch == 15 and ve < 19)
        or (bo == 'Numbers' and ch in {23, 24})
        or (bo == 'Deuteronomy' and ch in {32, 33})
        or (bo == 'Judges' and ch == 5)
        or (bo == '1_Samuel' and ch == 2 and ve < 11)
        or (bo == '2_Samuel' and ch == 1 and ve > 18)
        or (bo == '2_Samuel' and ch == 22)
        or (bo == '2_Samuel' and ch == 23 and ve < 8)
        or (bo == 'Daniel' and ch == 2 and 19 < ve < 24)
        or (bo == 'Daniel' and ch == 8 and 22 < ve < 27)
        or (bo == 'Daniel' and ch == 12 and ve < 4)
        or (bo == 'Nehemiah' and ch == 9 and 5 < ve < 38)
        or (bo == '1_Chronicles' and ch == 16 and 7 < ve < 37)
    )


def _clause_elements(cl, words, level):
    """Return the clause as a list of phrase functions ('phrase_level') or parts of speech ('word_level')."""
    if level == 'phrase_level':
        return [F.function.v(ph) for ph in L.d(cl, 'phrase')]
    return [F.sp.v(w) for w in words]


def extract_data(level):
    """Collect Hebrew and Aramaic prose clauses as sequences of labels.

    Parameters
    ----------
    level : str
        'phrase_level' represents a clause by the functions of its phrases,
        'word_level' by the parts of speech of its words.

    Returns
    -------
    tuple
        (max_len, seq_h, seq_a, elem_count): the length of the longest
        collected sequence, the list of Hebrew sequences, the list of Aramaic
        sequences, and a defaultdict counting every label over both languages.

    Raises
    ------
    ValueError
        If level is not one of the two supported values; without this check a
        misspelled level would silently yield empty results.

    Notes
    -----
    A clause is skipped when the last character of its txt value is 'D', '?'
    or 'N', when it lies in one of the listed poetic passages, or when its book
    is not in the module-level list `prose`. The language of a clause is that
    of its first word. Hebrew clauses from Daniel and Ezra are left out.
    """
    if level not in ('phrase_level', 'word_level'):
        raise ValueError(
            "level must be 'phrase_level' or 'word_level', got {!r}".format(level)
        )

    # these lists contain all sequences
    seq_h = []  # hebrew sequences
    seq_a = []  # aramaic sequences

    elem_count = collections.defaultdict(int)  # frequency of every label

    max_len = 0  # length of the longest clause seen so far

    for cl in F.otype.s('clause'):

        # only clauses whose txt value ends in something other than D, ? or N are kept
        if F.txt.v(cl)[-1] in {'D', '?', 'N'}:
            continue

        # poetic sections are removed
        bo, ch, ve = T.sectionFromNode(cl)
        if _in_poetic_section(bo, ch, ve):
            continue

        words = L.d(cl, 'word')
        language = F.language.v(words[0])

        if language == 'Hebrew':
            book = T.bookName(cl)
            if book not in prose or book in {'Daniel', 'Ezra'}:
                continue
            target = seq_h
        elif language == 'Aramaic':
            if T.bookName(cl) not in prose:
                continue
            target = seq_a
        else:
            continue

        elems = _clause_elements(cl, words, level)
        for elem in elems:
            elem_count[elem] += 1
        target.append(elems)

        if len(elems) > max_len:
            max_len = len(elems)

    return (max_len, seq_h, seq_a, elem_count)

Creates a dict, called f2int_dict, for converting phrase functions or parts of speech to integers. 

In [None]:
def create_dict(elem_count):
    """Map every element to a unique integer rank based on its frequency.

    Parameters
    ----------
    elem_count : dict
        Maps an element (phrase function or part of speech) to its count.

    Returns
    -------
    dict
        Maps each element to an integer starting at 1; the most frequent
        element gets 1. Elements with equal counts receive consecutive,
        distinct integers in the order in which they appear in elem_count,
        so no two elements ever share an integer.
    """
    # sorted() is stable, also with reverse=True, so ties keep their insertion order
    ranked = sorted(elem_count, key=elem_count.get, reverse=True)
    return {elem: rank for rank, elem in enumerate(ranked, start=1)}

The sequences are converted to integers.

In [None]:
def _as_object_array(int_seqs):
    """Pack a list of integer lists into a 1-D object array, one list per cell."""
    arr = np.empty(len(int_seqs), dtype=object)
    # fill cell by cell: np.asarray would raise on ragged input in recent NumPy
    # and would build a 2-D array when all lists happen to have equal length
    for i, seq in enumerate(int_seqs):
        arr[i] = seq
    return arr


def convert_to_ints(seq_h, seq_a, f2int_dict):
    """Translate label sequences into integer sequences.

    Parameters
    ----------
    seq_h, seq_a : list of list
        Hebrew and Aramaic clauses, each a list of labels.
    f2int_dict : dict
        Maps every label to its integer code.

    Returns
    -------
    tuple
        (seq_arr_h, seq_arr_a): two 1-D NumPy object arrays whose cells are
        the integer lists of the individual clauses.

    Raises
    ------
    KeyError
        If a label is missing from f2int_dict.
    """
    ints_h = [[f2int_dict[elem] for elem in clause] for clause in seq_h]
    ints_a = [[f2int_dict[elem] for elem in clause] for clause in seq_a]

    return (_as_object_array(ints_h), _as_object_array(ints_a))

Select 900 clauses each from the Hebrew and from the Aramaic prose portions of the MT, pad them to a common length, and split them into a training and a test set.

In [None]:
def prepare_data(phr_ints_h, phr_ints_a, n_samples=900, test_size=0.15, pad_to=None):
    """Draw a balanced random sample and split it into training and test data.

    Parameters
    ----------
    phr_ints_h, phr_ints_a : 1-D array
        Integer-encoded Hebrew and Aramaic clauses.
    n_samples : int, optional
        Number of clauses drawn, without replacement, from each language.
    test_size : float, optional
        Fraction of the sample that is held out as test data.
    pad_to : int, optional
        Length to which every sequence is padded. When omitted, the
        module-level variable `max_len` is used, which therefore has to be
        defined before the call.

    Returns
    -------
    tuple
        (data_train, data_test, labels_train, labels_test); Hebrew clauses
        carry the label 0 and Aramaic clauses the label 1.
    """
    if pad_to is None:
        pad_to = max_len  # set by the cell that calls extract_data()

    sel_heb = np.random.choice(phr_ints_h, n_samples, replace=False)
    sel_ara = np.random.choice(phr_ints_a, n_samples, replace=False)

    selected_input = np.concatenate((sel_heb, sel_ara), axis=0)
    selected_targets = np.array([0] * len(sel_heb) + [1] * len(sel_ara))

    padded = sequence.pad_sequences(selected_input, maxlen=pad_to)
    data_train, data_test, labels_train, labels_test = train_test_split(
        padded, selected_targets, test_size=test_size, random_state=42
    )

    return data_train, data_test, labels_train, labels_test

In [None]:
# Test-set accuracy of every training run.
predictions = []

# Extracting and encoding the corpus gives the same result on every call, so it
# is done once here instead of inside the loop; only the random sampling in
# prepare_data() has to be repeated per run.
# choose 'word_level' or 'phrase_level' as argument of function extract_data()
max_len, seq_h, seq_a, elem_count = extract_data('phrase_level')
f2int_dict = create_dict(elem_count)
seq_arr_h, seq_arr_a = convert_to_ints(seq_h, seq_a, f2int_dict)

# model settings
top_words = 100
embedding_vector_length = 32

for i in range(200):

    print(i)

    # a fresh random sample of clauses for this run
    data_train, data_test, labels_train, labels_test = prepare_data(seq_arr_h, seq_arr_a)

    # define model
    model = Sequential()
    model.add(Embedding(top_words, embedding_vector_length, input_length=max_len))
    model.add(LSTM(200, activation = 'relu', return_sequences=True))
    model.add(Dropout(0.5)) # dropout is used to prevent overfitting
    model.add(LSTM(200, activation = 'relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    adam = Adam(lr=0.0006)

    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    # fit the model
    history = model.fit(data_train, labels_train, validation_data=(data_test, labels_test), epochs=50, batch_size=256)

    scores = model.evaluate(data_test, labels_test, verbose=0)

    predictions.append(scores[1])

    # remove model from memory
    K.clear_session()

    # plot history
    # the accuracy entry is called 'acc' or 'accuracy' depending on the Keras version
    acc_key = 'acc' if 'acc' in history.history else 'accuracy'

    # summarize history for accuracy
    plt.plot(history.history[acc_key])
    plt.plot(history.history['val_' + acc_key])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')

    plt.show()

    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')

    plt.show()

In [None]:
# Show the test-set accuracy that was recorded for each training run.
print(predictions)

In [None]:
# Save the accuracies as a one-column CSV file: a header line followed by one
# value per training run.
header = ['accuracy']

# the with-block closes the file even when writing fails
with open(r"aram_heb_phrases.csv", "w") as csvh:
    csvh.write('{}\n'.format(','.join(header)))

    for value in predictions:
        # write the number as a whole; joining str(value) with ',' would put a
        # comma between each of its characters
        csvh.write('{}\n'.format(value))