In [1]:
import os, collections

from Bio import pairwise2
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from data import AllBooks, Book
from utils import most_frequent

#import bible_datasets

from align_config import dss_version, sp_version
from align_functions import make_alignments, collect_matching_words

from tf.app import use

# Load the MT corpus (etcbc/bhsa) and keep short aliases to its F, T and L API members.
MT = use('etcbc/bhsa')
Fmt, Tmt, Lmt = MT.api.F, MT.api.T, MT.api.L

# Load the SP corpus at the version configured in align_config.
SP = use('dt-ucph/sp', version=sp_version)
Fsp, Tsp, Lsp = SP.api.F, SP.api.T, SP.api.L

# Load the DSS corpus at the version configured in align_config.
DSS = use('etcbc/dss', version=dss_version)
Fdss, Tdss, Ldss = DSS.api.F, DSS.api.T, DSS.api.L


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,5,79878.4,100
chapter,187,2135.79,100
verse,5841,68.38,100
word,114891,3.48,100
sign,399392,1.0,100


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


In [2]:
# Folder that holds the input CSV files, relative to this notebook.
DATA_FOLDER = '../data'
# File names (inside DATA_FOLDER) of the per-feature datasets; the part of each
# name before the first '.' is later stored in the `feature` column.
MATRES_FILE = ['nouns_adjectives.csv', 'ptca_qal.csv', 'ptcp_qal.csv', 'niph_hiph_pe_yod.csv', 'infa_qal.csv',
               'infc_qal_triliteral.csv', 'infc_qal_lamed_he.csv', 'hiphil_triliteral_with_hireq.csv']

In [3]:
# The five books for which an SP Book object is built next to the MT one.
PENTATEUCH_BOOKS = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
# First section component (the book name) of every `book` node in the MT corpus.
ALL_BOOK_NAMES = [Tmt.sectionFromNode(bo)[0] for bo in Fmt.otype.s('book')]

In [4]:
# Set of scroll identifiers.
# NOTE(review): presumably the scrolls classified as "QSP" — confirm the
# source of this list; it is not referenced in the visible cells.
QSP_SCROLLS = {'1Qisaa', '1QisaaI', '1QisaaII', '2Q3', '4Q13', '4Q20', '2Q7', '4Q27', '1Q4', '2Q12', '4Q37', '4Q38', '4Q38a', '4Q40', '4Q53',
               '4Q57', '2Q13', '4Q78', '4Q80', '4Q82', '4Q128', '4Q129', '4Q134', '4Q135', '4Q136',
                '4Q137', '4Q138', '4Q139', '4Q140', '4Q141', '4Q142', '4Q143', '4Q144', '4Q158', '4Q364',
                '4Q365', '4Q96', '4Q111', '4Q109', '11Q5', '11Q6', '11Q7', '11Q8'}

# Prepare MT and SP texts

Produce the dictionary `mt_sp_matches`, which has MT word nodes as keys and the matching SP word numbers as values.

In [5]:
# Prepare MT and SP books: build a Book for every MT book and, for the
# Pentateuch books only, a second Book from the SP corpus. Each Book is stored
# in all_books.data under the key (manuscript, book name).
MANUSCRIPTS = ['MT', 'SP']  # NOTE(review): not referenced in the visible cells — confirm it is still needed
all_books = AllBooks()
for book_name in ALL_BOOK_NAMES:
    book = Book('MT', book_name, Fmt, Tmt, Lmt)
    all_books.data[('MT', book_name)] = book
    
    if book_name in PENTATEUCH_BOOKS:
        book = Book('SP', book_name, Fsp, Tsp, Lsp)
        all_books.data[('SP', book_name)] = book

# Match words

In [6]:
# Read every per-feature file, tag its rows with the feature name (the file
# name up to the first '.'), and stack all of them into one table.
all_data = []

for file_name in MATRES_FILE:
    feature_name = file_name.split('.')[0]
    file_path = '/'.join((DATA_FOLDER, file_name))

    data_one_feature = pd.read_csv(file_path, sep='\t')
    data_one_feature['feature'] = feature_name
    all_data.append(data_one_feature)

# The original row index of each file is kept, so index labels repeat.
dat = pd.concat(all_data)
dat.shape

(62333, 39)

In [7]:
# Preview the first rows of the combined table.
dat.head()

Unnamed: 0,tf_id,scroll,book,chapter,verse,lex,g_cons,stem,pattern,pattern_g_cons,...,has_nme,rec_signs_stem,cor_signs_stem,type,vowel_letter,has_vowel_letter,neigh_vowel_letter,feature,has_hireq,mt_match
0,2,MT,Genesis,1,1,R>CJT/,R>CJT,R>CJT,CMCMC,CMCMC,...,0,nnnnn,nnnnn,first,>,1,0.0,nouns_adjectives,,
1,2,MT,Genesis,1,1,R>CJT/,R>CJT,R>CJT,CMCMC,CMCMC,...,0,nnnnn,nnnnn,last,J,1,0.0,nouns_adjectives,,
2,4,MT,Genesis,1,1,>LHJM/,>LHJM,>LH,CCC,CCCMC,...,1,nnn,nnn,last,,0,0.0,nouns_adjectives,,
3,20,MT,Genesis,1,2,XCK/,XCK,XCK,CCC,CCC,...,0,nnn,nnn,first,,0,0.0,nouns_adjectives,,
4,23,MT,Genesis,1,2,THWM/,THWM,THWM,CCMC,CCMC,...,0,nnnn,nnnn,last,W,1,0.0,nouns_adjectives,,


In [8]:
# Preview the last rows of the combined table.
dat.tail()

Unnamed: 0,tf_id,scroll,book,chapter,verse,lex,g_cons,stem,pattern,pattern_g_cons,...,has_nme,rec_signs_stem,cor_signs_stem,type,vowel_letter,has_vowel_letter,neigh_vowel_letter,feature,has_hireq,mt_match
3136,2104434,Mas1b,Leviticus,11,7,PRS[,MPRJS,PRJS,,,...,0,nnnn,nnnn,last,J,1,,hiphil_triliteral_with_hireq,1,57952
3137,2106782,Mas1e,Psalms,81,15,KN<[,>KNJ<,KNJ<,,,...,0,nnnn,nnnn,last,J,1,,hiphil_triliteral_with_hireq,1,324738
3138,2106808,Mas1e,Psalms,81,17,FB<[,>FBJ<K,FBJ<,,,...,0,nnnn,nnnn,last,J,1,,hiphil_triliteral_with_hireq,1,324762
3139,2106840,Mas1e,Psalms,82,3,YDQ[,HYDJQW,YDJQ,,,...,0,nnnn,nnnn,last,J,1,,hiphil_triliteral_with_hireq,1,324791
3140,2106849,Mas1e,Psalms,82,4,NYL[,HYJLW,YJL,,,...,0,nnn,nnn,last,J,1,,hiphil_triliteral_with_hireq,1,324799


In [9]:
# Inspect the column dtypes of the combined table.
dat.dtypes

tf_id                   int64
scroll                 object
book                   object
chapter                 int64
verse                   int64
lex                    object
g_cons                 object
stem                   object
pattern                object
pattern_g_cons         object
vs                     object
vt                     object
nu                     object
gn                     object
ps                     object
sp                     object
prs                    object
nme                    object
hloc                   object
prefix                 object
rec_signs              object
cor_signs              object
heb_g_cons             object
other_vowel_ending     object
line                   object
column                 object
has_prs                 int64
has_prefix              int64
has_hloc                int64
has_nme                 int64
rec_signs_stem         object
cor_signs_stem         object
type                   object
vowel_lett

In [10]:
# Remove hiphil without hireq
#
# Rows of every other feature are kept unchanged; rows of the
# 'hiphil_triliteral_with_hireq' feature are kept only when has_hireq is set.
# has_hireq is coerced to a number before the comparison so that the flag is
# recognised whether it was read as the string '1', the integer 1 or the
# float 1.0 (the dtype after pd.concat depends on the input files); a plain
# `== '1'` silently drops every hiphil row when the column is numeric.
dat = dat[
    (dat.feature != 'hiphil_triliteral_with_hireq')
    | (pd.to_numeric(dat.has_hireq, errors='coerce') == 1)
]

In [11]:
# book2: the book name with every '1_' and '2_' substring removed.
dat['book2'] = (
    dat.book
    .str.replace('1_', '')
    .str.replace('2_', '')
)

In [12]:
# dat_dss holds every row that is not from MT (so SP rows are included).
dat_dss = dat[dat.scroll != 'MT']

# Unique (scroll, book) pairs present in the non-MT rows.
scroll_book_combinations = list(set(zip(dat_dss['scroll'], dat_dss['book'])))
dat_dss.shape

(19026, 40)

In [13]:
# Build a Book for every non-MT (manuscript, book) pair found in the data:
# SP books come from the SP corpus, everything else from the DSS corpus.
for manuscript, book_name in scroll_book_combinations:
    if manuscript != 'MT':
        book = (Book(manuscript, book_name, Fsp, Tsp, Lsp) if manuscript == 'SP'
                else Book(manuscript, book_name, Fdss, Tdss, Ldss))
        all_books.data[(manuscript, book_name)] = book

In [14]:
def make_matching_word_dict(book_name, all_books):
    """Match the words of every ordered pair of manuscripts containing a book.

    Parameters
    ----------
    book_name
        Name of the book to process.
    all_books
        Object whose ``data`` mapping is keyed by ``(manuscript, book)``; the
        values must expose ``verse_text_dict`` and ``word2char``.

    Returns
    -------
    tuple
        ``(all_match_dicts, matching_manuscripts)``. ``all_match_dicts`` maps
        ``((man1, book), (man2, book))`` to a dict that sends each word of
        ``man1`` to the result of ``most_frequent`` over its candidate words
        in ``man2``. ``matching_manuscripts`` lists the manuscripts that have
        an entry for the book.
    """
    manuscripts_with_book = [scr for scr, bo in all_books.data.keys() if bo == book_name]
    pair_matches = {}

    for source in manuscripts_with_book:
        for target in manuscripts_with_book:
            # A manuscript is never aligned with itself.
            if source == target:
                continue

            source_book = all_books.data[(source, book_name)]
            target_book = all_books.data[(target, book_name)]

            alignments = make_alignments(source_book.verse_text_dict,
                                         target_book.verse_text_dict)
            candidates = collect_matching_words(alignments,
                                                source_book.word2char,
                                                target_book.word2char)

            # Reduce each candidate list to a single matching word.
            pair_matches[((source, book_name), (target, book_name))] = {
                source_word: most_frequent(target_words)
                for source_word, target_words in candidates.items()
            }

    return pair_matches, manuscripts_with_book

In [15]:
def read_dataset(file):
    """Read a tab-separated file (path or file-like object) into a DataFrame."""
    table = pd.read_csv(file, sep='\t')
    return table

In [16]:
class MaterData:
    """Plain record describing one word matched between two manuscripts.

    Attributes ending in ``1`` belong to ``man1`` and those ending in ``2`` to
    ``man2``: the vowel-letter value (``mater_val``), the word id (``tf_id``)
    and the consonantal form (``g_cons``). ``section`` and ``lex`` are shared.
    """

    def __init__(self, man1, man2, section, lex, mater_val1, mater_val2, tf_id1, tf_id2, g_cons1, g_cons2):
        self.man1, self.man2 = man1, man2
        self.section, self.lex = section, lex
        self.mater_val1, self.mater_val2 = mater_val1, mater_val2
        self.tf_id1, self.tf_id2 = tf_id1, tf_id2
        self.g_cons1, self.g_cons2 = g_cons1, g_cons2

In [17]:
def collect_matching_cases(matching_manuscripts, matching_book, dat):
    """Split ``dat`` into one sub-table per manuscript for a single book.

    Returns a dict keyed by ``(manuscript, matching_book)`` whose values are
    the rows of ``dat`` with that ``scroll`` and that ``book``.
    """
    return {
        (man, matching_book): dat[(dat.book == matching_book) & (dat.scroll == man)]
        for man in matching_manuscripts
    }

In [18]:
def collect_mater_data(matching_manuscripts, all_match_dicts, all_mater_datasets, matching_book):
    """Pair up dataset rows of two manuscripts that refer to matching words.

    Every pair ``(man, man2)`` with ``man`` listed before ``man2`` in
    ``matching_manuscripts`` is processed once. For each row of ``man`` the
    matching word id is looked up in ``all_match_dicts``; a ``MaterData``
    record is created when ``man2`` has a row with that id, the same ``lex``
    and the same ``type``.

    Returns
    -------
    tuple
        ``(manuscript_mater_match, manuscripts)``: a defaultdict mapping the
        first manuscript of a pair to its list of ``MaterData`` records, and
        the set of manuscripts that occur in at least one record.
    """
    records_by_manuscript = collections.defaultdict(list)
    seen_manuscripts = set()

    for first_pos, man in enumerate(matching_manuscripts):
        for second_pos, man2 in enumerate(matching_manuscripts):
            # Handle each unordered pair once, in list order.
            if first_pos >= second_pos:
                continue

            word_matches = all_match_dicts[((man, matching_book), (man2, matching_book))]
            first_rows = all_mater_datasets[(man, matching_book)]
            second_rows = all_mater_datasets[(man2, matching_book)]

            for _, record in first_rows.iterrows():
                tf_id = record.tf_id
                lex, typ, has_vl = record.lex, record.type, record.has_vowel_letter
                g_cons1 = record.g_cons
                section = (record.book, record.chapter, record.verse)

                # SP ids in the dataset carry an offset of 100000 relative to
                # the ids used as keys/values in the match dictionaries.
                if man == 'SP':
                    tf_id = tf_id - 100000

                partner_id = word_matches.get(tf_id, None)
                if not partner_id:
                    continue

                if man2 == 'SP':
                    partner_id = partner_id + 100000

                same_word = (
                    (second_rows.tf_id == partner_id)
                    & (second_rows.lex == lex)
                    & (second_rows.type == typ)
                )
                partner_rows = second_rows[same_word]
                if partner_rows.shape[0] == 0:
                    continue

                # Only the first matching row of man2 is used.
                has_vl2 = partner_rows.has_vowel_letter.iloc[0]
                g_cons2 = partner_rows.g_cons.iloc[0]

                records_by_manuscript[man].append(
                    MaterData(man, man2, section, lex, has_vl, has_vl2,
                              tf_id, partner_id, g_cons1, g_cons2)
                )
                seen_manuscripts.update((man, man2))

    return records_by_manuscript, seen_manuscripts

In [19]:
def register_similarities_with_mt(manuscripts, mt_ids, manuscript_mater_match):
    """Fill a manuscripts x MT-words array with vowel-letter codes.

    The array has shape ``(len(manuscripts), len(mt_ids))`` and starts as all
    zeros. For every record under ``manuscript_mater_match['MT']`` the MT row
    and the other manuscript's row get, in the column of the MT word, ``1``
    when the vowel-letter value is 1 and ``-1`` when it is 0.

    NOTE(review): row and column positions come from ``man2idx`` and
    ``mt_tf2idx``, which are not parameters and are not defined in the visible
    cells — they must exist as module-level names before this is called.
    """
    code_for = {0: -1, 1: 1}
    grid = np.zeros((len(manuscripts), len(mt_ids)))

    for match in manuscript_mater_match['MT']:
        # Resolve every lookup first, then write both cells.
        mt_row = man2idx['MT']
        other_row = man2idx[match.man2]
        mt_code = code_for[match.mater_val1]
        other_code = code_for[match.mater_val2]
        column = mt_tf2idx[match.tf_id1]

        grid[mt_row, column] = mt_code
        grid[other_row, column] = other_code

    return grid

In [20]:
def get_parallels(manuscript_mater_match, hif_match_dict):
    """Record, in both directions, which word ids are parallel to each other.

    For every record in ``manuscript_mater_match`` the nested mapping
    ``hif_match_dict`` (keyed by ``(tf_id, manuscript)``, then by the other
    manuscript's name) is updated in place so that each side of the pair
    points to the word id of the other side. The same mapping is returned.
    """
    for records in manuscript_mater_match.values():
        for record in records:
            first_key = (record.tf_id1, record.man1)
            second_key = (record.tf_id2, record.man2)

            hif_match_dict[second_key][record.man1] = record.tf_id1
            hif_match_dict[first_key][record.man2] = record.tf_id2

    return hif_match_dict


In [21]:
def count_parallel_cases(mater_match_array):
    """Count, per column, how many entries equal 1 and how many equal -1.

    Returns a float array of shape ``(2, n_columns)``: row 0 holds the number
    of ``1`` entries in each column of ``mater_match_array``, row 1 the number
    of ``-1`` entries.
    """
    with_vowel = (mater_match_array == 1).sum(axis=0)
    without_vowel = (mater_match_array == -1).sum(axis=0)
    return np.array([with_vowel, without_vowel], dtype=float)

In [22]:
# Nested mapping filled by get_parallels; missing inner entries default to 0.
hif_match_dict = collections.defaultdict(lambda: collections.defaultdict(int))

# Only Isaiah is processed here; the full book list is commented out.
for book in {'Isaiah'}: #ALL_BOOK_NAMES:
    print(book)
    # Match words between all manuscripts of the book, select the dataset
    # rows per manuscript, then pair up the rows of matching words.
    all_match_dicts, matching_manuscripts = make_matching_word_dict(book, all_books)
    all_mater_datasets = collect_matching_cases(matching_manuscripts, book, dat)
    manuscript_mater_match, manuscripts = collect_mater_data(matching_manuscripts, all_match_dicts, all_mater_datasets, book)
    
    # Accumulate the parallels and report how many keys are stored so far.
    hif_match_dict = get_parallels(manuscript_mater_match, hif_match_dict)
    print(len(hif_match_dict))
    

Isaiah
6647


In [24]:
def merge_dicts(dict_args):
    """
    Merge the dictionaries stored as the values of `dict_args` into one new
    dict (shallow copy). The keys of `dict_args` itself are ignored. When the
    same key occurs in several inner dictionaries, the one that comes later in
    the iteration order of `dict_args` wins.
    """
    merged = {}
    for inner in dict_args.values():
        merged.update(inner)
    return merged

In [25]:
# Collect, for each book of the Pentateuch, the word matches from MT to SP.
# The book list is taken from PENTATEUCH_BOOKS instead of being repeated here,
# so the two cannot drift apart.
all_book_dicts = {}

for book in PENTATEUCH_BOOKS:
    print(book)
    all_match_dicts, matching_manuscripts = make_matching_word_dict(book, all_books)
    # Keep only the MT -> SP direction of the pairwise matches.
    book_dict = all_match_dicts[(('MT', book), ('SP', book))]
    all_book_dicts[book] = book_dict

Genesis
Exodus
Leviticus
Numbers
Deuteronomy


In [27]:
# Merge the per-book MT -> SP match dictionaries into a single dictionary.
full_books_dict = merge_dicts(all_book_dicts)

In [28]:
# Number of entries in the merged dictionary.
len(full_books_dict)

110259

In [30]:
import json

# Save the merged MT -> SP mapping to mt_to_sp.json in the working directory.
# JSON object keys are always strings, so non-string keys are written as text.
with open("mt_to_sp.json", mode="w") as json_file:
    json.dump(full_books_dict, json_file)

In [35]:
# Print every matched MT/SP word pair whose lexemes differ, then the total.
n = 0

for m, s in full_books_dict.items():
    lex_mt, lex_sp = Fmt.lex.v(m), Fsp.lex.v(s)
    if lex_mt == lex_sp:
        continue
    n += 1
    print(lex_mt, lex_sp, m, s)

print(n)

RMF[ RMF/ 637 406061
CBJ<J/ CCJ/ 691 406114
TPR[ absent 1308 406734
HRWN/ HRJWN/ 1485 406908
<YB=/ <YBWN/ 1487 406910
LHV/ LXV/ 1655 407078
>J >JH 1813 407237
MXJJ>L/ MXWJ>L/ 1979 407402
>MRH/ absent 2057 407480
M>H/ CC/ 2376 407796
CNJM/ CB</ 2402 407827
CC/ >RB</ 2404 407829
TC</ CMNH/ 2407 407832
CMNH/ CC/ 2471 407896
CNH/ CC/ 2472 407896
M>H/ CC/ 2474 407896
CNJM/ CLC/ 2487 407909
CMNH/ XMC/ 2489 407911
CB</ CC/ 2492 407914
TC</ <FR/ 2505 407927
CC/ <FR/ 2507 407927
TC</ CB</ 2510 407930
CNJM/ CLC/ 2518 407938
CMNH/ XMC/ 2520 407940
CNH/ XMC/ 2521 407940
M>H/ XMC/ 2523 407940
XMC/ NX/ 2556 407972
CNH/ CC/ 2559 407973
XMC/ CC/ 2561 407973
CB</ CLC/ 2574 407986
CB</ XMC/ 2576 407988
CB</ CC/ 2579 407991
HMH HM 2687 408097
>L <L 2895 408303
W QWM[ 2942 408349
XJ/ XJH/ 2965 408371
HJH[ NQBH/ 2982 408389
JHWH/ >LHJM/ 3040 408452
L >L 3041 408453
>CH/ W 3074 408484
>JC/ CNJM/ 3084 408495
>CH/ W 3086 408497
>LHJM/ JHWH/ 3211 408627
HMH HM 3302 408714
GBH/ GBX/ 3429 408843
KL>[ KLH[ 3584 4

MYH[ MY>[ 52826 459688
<L >L 52828 459690
MR>H==/ MRH==/ 52835 459697
CM MN 52906 459770
QVR[ QRB[ 53072 459931
QLH[ QLJ/ 53122 459980
MN MZBX/ 53205 460063
ZBX/ MZBX/ 53206 460063
>CH=/ >CR 53273 460130
MN MZBX/ 53332 460195
ZBX/ MZBX/ 53333 460195
HNH= HN 53543 460408
MN ML>[ 53602 460468
DM/ JD/ 53603 460470
>L <L 53773 460642
HW> HJ> 53982 460860
HW> HJ> 54037 460916
NQBH/ TMJM/ 54120 461002
KBF/ KFBH/ 54201 461088
MN MZBX/ 54266 461154
ZBX/ MZBX/ 54267 461154
>W W 54364 461250
JVB[ VWB[ 54376 461262
>CM[ XV>[ 54400 461286
NG<[ NFG[ 54440 461332
<L >L 54495 461388
MYH[ MY>[ 54505 461397
HW> HJ> 54511 461403
FJM[ JYQ[ 54559 461450
M<L=/ <L 54758 461645
HNH= B 54799 461685
GZLH/ GZL/ 54810 461696
<CQ/ <CWQ/ 54816 461702
B BJN/ 55162 462040
XRF/ XRC/ 55276 462153
MN MZBX/ 55678 462544
ZBX/ MZBX/ 55679 462544
CQY/ CRY/ 55713 462578
MN MZBX/ 55844 462711
ZBX/ MZBX/ 55845 462711
XQH/ XQ/ 55986 462851
>L <L 56174 463033
FJM[ NTN[ 56192 463051
MGB<H/ GB<H/ 56288 463148
QRB[ NGC[ 56406 4632