In [41]:
import collections

from Bio import pairwise2
from Bio.Seq import Seq
import matplotlib.pyplot as plt
#import networkx as nx
import numpy as np
import pandas as pd

from data import AllBooks, Book
from utils import most_frequent

#import bible_datasets

#from tf.app import use
#MT = use('etcbc/bhsa')
#Fmt, Tmt, Lmt = MT.api.F, MT.api.T, MT.api.L

#SP = use('dt-ucph/sp', version=sp_version)
#Fsp, Tsp, Lsp = SP.api.F, SP.api.T, SP.api.L

#DSS = use('etcbc/dss', version=dss_version)
#Fdss, Tdss, Ldss = DSS.api.F, DSS.api.T, DSS.api.L

from align_config import dss_version, sp_version
from align_functions import make_alignments, collect_matching_words

from tf.app import use

MT = use('etcbc/bhsa')
Fmt, Tmt, Lmt = MT.api.F, MT.api.T, MT.api.L

SP = use('dt-ucph/sp', version=sp_version)
Fsp, Tsp, Lsp = SP.api.F, SP.api.T, SP.api.L

DSS = use('etcbc/dss', version=dss_version)
Fdss, Tdss, Ldss = DSS.api.F, DSS.api.T, DSS.api.L


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,5,79878.4,100
chapter,187,2135.79,100
verse,5841,68.38,100
word,114891,3.48,100
sign,399392,1.0,100


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


In [42]:
MATRES_FILE = '../data/ptca_qal.csv'

In [43]:
PENTATEUCH_BOOKS = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
ALL_BOOK_NAMES = [Tmt.sectionFromNode(bo)[0] for bo in Fmt.otype.s('book')]

In [44]:
QSP_SCROLLS = {'1Qisaa', '1QisaaI', '1QisaaII', '2Q3', '4Q13', '4Q20', '2Q7', '4Q27', '1Q4', '2Q12', '4Q37', '4Q38', '4Q38a', '4Q40', '4Q53',
               '4Q57', '2Q13', '4Q78', '4Q80', '4Q82', '4Q128', '4Q129', '4Q134', '4Q135', '4Q136',
                '4Q137', '4Q138', '4Q139', '4Q140', '4Q141', '4Q142', '4Q143', '4Q144', '4Q158', '4Q364',
                '4Q365', '4Q96', '4Q111', '4Q109', '11Q5', '11Q6', '11Q7', '11Q8'}

# Prepare MT and SP texts

Produce dictionary mt_sp_matches which has mt words nodes as keys and matching word numbers from SP as values.

In [45]:
# prepare mt and sp books
MANUSCRIPTS = ['MT', 'SP']
all_books = AllBooks()
for book_name in ALL_BOOK_NAMES:
    book = Book('MT', book_name, Fmt, Tmt, Lmt)
    all_books.data[('MT', book_name)] = book
    
    if book_name in PENTATEUCH_BOOKS:
        book = Book('SP', book_name, Fsp, Tsp, Lsp)
        all_books.data[('SP', book_name)] = book

# Match words

In [46]:
dat = pd.read_csv(MATRES_FILE, sep='\t')

# dat_dss means non-MT, SP is included
dat_dss = dat[~dat.scroll.isin(['MT'])]
scroll_book_combinations = list(set(zip(dat_dss.scroll, dat_dss.book)))
dat_dss.shape

(2029, 35)

In [47]:
for manuscript, book_name in scroll_book_combinations:
    if manuscript == 'MT':
        continue
    elif manuscript == 'SP':
        book = Book(manuscript, book_name, Fsp, Tsp, Lsp)
        all_books.data[(manuscript, book_name)] = book
    else:
        book = Book(manuscript, book_name, Fdss, Tdss, Ldss)
        all_books.data[(manuscript, book_name)] = book

In [48]:
def make_matching_word_dict(book_name, all_books):

    all_match_dicts = {}

    matching_book = book_name
    matching_manuscripts = [scr for scr, bo in all_books.data.keys() if bo == matching_book]
    #print(matching_manuscripts)

    for man1 in matching_manuscripts:
        for man2 in matching_manuscripts:
            if man1 != man2:
            
                man1_verse_texts = all_books.data[(man1, matching_book)].verse_text_dict
                man2_verse_texts = all_books.data[(man2, matching_book)].verse_text_dict

                alignments_dict = make_alignments(man1_verse_texts, man2_verse_texts)
    
                man1_word2char = all_books.data[(man1, matching_book)].word2char
                man2_word2char = all_books.data[(man2, matching_book)].word2char
 
                matching_words_dict = collect_matching_words(alignments_dict, man1_word2char, man2_word2char)

                matches = {}
                for man1_word, man2_list in matching_words_dict.items():
                    man2_word = most_frequent(man2_list)
                    matches[man1_word] = man2_word
                all_match_dicts[((man1, matching_book), (man2, matching_book))] = matches
                
    return all_match_dicts, matching_manuscripts

In [49]:
def read_dataset(file):

    return pd.read_csv(file, sep='\t')

In [50]:
class MaterData:
    def __init__(self, man1, man2, section, lex, mater_val1, mater_val2, tf_id1, tf_id2, g_cons1, g_cons2):
        self.man1 = man1
        self.man2 = man2
        self.section = section
        self.lex = lex
        self.mater_val1 = mater_val1
        self.mater_val2 = mater_val2
        self.tf_id1 = tf_id1
        self.tf_id2 = tf_id2
        self.g_cons1 = g_cons1
        self.g_cons2 = g_cons2

In [51]:
def collect_matching_cases(matching_manuscripts, matching_book, dat):

    all_mater_datasets = {}

    for man in matching_manuscripts:
        mater_data = dat[(dat.book == matching_book) & (dat.scroll == man)]
        all_mater_datasets[(man, matching_book)] = mater_data
        
    return all_mater_datasets

In [52]:
def collect_mater_data(matching_manuscripts, all_match_dicts, all_mater_datasets, matching_book):

    manuscript_mater_match = collections.defaultdict(list)
    manuscripts = set()

    for idx, man in enumerate(matching_manuscripts):
        for idx2, man2 in enumerate(matching_manuscripts):
            if idx < idx2:
            
                matching_ids = all_match_dicts[((man, matching_book), (man2, matching_book))]
                man_data = all_mater_datasets[(man, matching_book)]
                man2_data = all_mater_datasets[(man2, matching_book)]
                for _, row in man_data.iterrows():
                
                    tf_id = row.tf_id
                    lex, typ, has_vl = row.lex, row.type, row.has_vowel_letter
                    g_cons1 = row.g_cons
                    section = (row.book, row.chapter, row.verse)
                    if man == 'SP':
                        tf_id = tf_id - 100000

                    matching_tf_id = matching_ids.get(tf_id, None)
                
                    if not matching_tf_id:
                        continue
                    
                    if man2 == 'SP':
                        matching_tf_id = matching_tf_id + 100000
                
                    man2_row = man2_data[(man2_data.tf_id == matching_tf_id) & (man2_data.lex == lex) & (man2_data.type == typ)]
                    
                    if not man2_row.shape[0]:
                        continue
                    has_vl2 = man2_row.has_vowel_letter.iloc[0]
                    g_cons2 = man2_row.g_cons.iloc[0]
                 
                    mater_data = MaterData(man, man2, section, lex, has_vl, has_vl2, tf_id, matching_tf_id, g_cons1, g_cons2)
                
                    manuscript_mater_match[man].append(mater_data)
                    manuscripts.add(man)
                    manuscripts.add(man2)
                    
    return manuscript_mater_match, manuscripts

In [53]:
def register_similarities_with_mt(manuscripts, mt_ids, manuscript_mater_match):

    mater_value_dict = {0: -1,
                    1: 1}

    mater_match_array = np.zeros((len(manuscripts), len(mt_ids)))

    for dat_object in manuscript_mater_match['MT']:
        
        other_man = dat_object.man2
        mt_tfid = dat_object.tf_id1
        other_man_tfid = dat_object.tf_id2
    
        mt_mater = dat_object.mater_val1
        other_man_mater = dat_object.mater_val2
    
        mt_idx = man2idx['MT']
        other_man_idx = man2idx[other_man]
    
        mt_mater_value = mater_value_dict[mt_mater]
        other_man_mater_value = mater_value_dict[other_man_mater]
    
        mt_tf_id = mt_tf2idx[mt_tfid]
    
        mater_match_array[mt_idx, mt_tf_id] = mt_mater_value
        mater_match_array[other_man_idx, mt_tf_id] = other_man_mater_value
        
    return mater_match_array

In [54]:
def get_parallels(manuscript_mater_match, hif_match_dict):
    for scroll in manuscript_mater_match.keys():
        for dat_object in manuscript_mater_match[scroll]:
            man1 = dat_object.man1
            man2 = dat_object.man2
            if man1 == 'MT' or man2 == 'MT':
            
                hif_match_dict[dat_object.tf_id2] = dat_object.tf_id1
                hif_match_dict[dat_object.tf_id1] = dat_object.tf_id2
        
    return hif_match_dict


In [55]:
def count_parallel_cases(mater_match_array):

    mater_arr = np.zeros((2, mater_match_array.shape[1]))

    for col_idx in range(mater_match_array.shape[1]):
        col = mater_match_array[:, col_idx]
        col_counts = collections.Counter(col)
        with_vowel_count = col_counts.get(1, 0)
        without_vowel_count = col_counts.get(-1, 0)
        mater_arr[0, col_idx] = with_vowel_count
        mater_arr[1, col_idx] = without_vowel_count
        
    return mater_arr

In [56]:
#file_name = '../data/ptca_qal.csv'
#all_bib_books = [Tmt.sectionFromNode(bo)[0] for bo in Fmt.otype.s('book')]
#dat = read_dataset(MATRES_FILE)

hif_match_dict = {}

for book in ALL_BOOK_NAMES:
    print(book)
    all_match_dicts, matching_manuscripts = make_matching_word_dict(book, all_books)
    all_mater_datasets = collect_matching_cases(matching_manuscripts, book, dat)
    manuscript_mater_match, manuscripts = collect_mater_data(matching_manuscripts, all_match_dicts, all_mater_datasets, book)
    
    hif_match_dict = get_parallels(manuscript_mater_match, hif_match_dict)
    print(len(hif_match_dict))
    

Genesis
380
Exodus
764
Leviticus
938
Numbers
1314
Deuteronomy
1842
Joshua
1860
Judges
1864
1_Samuel
1876
2_Samuel
1894
1_Kings
1898
2_Kings
1898
Isaiah
2871
Jeremiah
2927
Ezekiel
2945
Hosea
2947
Joel
2955
Amos
2981
Obadiah
2988
Jonah
3008
Micah
3054
Nahum
3079
Habakkuk
3087
Zephaniah
3110
Haggai
3110
Zechariah
3118
Malachi
3136
Psalms
3371
Job
3375
Proverbs
3391
Ruth
3395
Song_of_songs
3403
Ecclesiastes
3405
Lamentations
3421
Esther
3421
Daniel
3433
Ezra
3433
Nehemiah
3433
1_Chronicles
3433
2_Chronicles
3433


In [58]:
len(hif_match_dict)

3433

In [59]:
all_scrolls = set(dat_dss.scroll) 
non_qsp_scrolls = all_scrolls.difference(QSP_SCROLLS)
non_qsp_data = dat_dss[dat_dss.scroll.isin(non_qsp_scrolls)]
non_qsp_pent = non_qsp_data[non_qsp_data.book.isin(PENTATEUCH_BOOKS)]
non_qsp_pent.shape

(1020, 35)

In [60]:
n = 0
full_count = collections.defaultdict(int)
scrolls = set()
scroll_counts = collections.defaultdict(int)

sp_mt_dict = collections.defaultdict(lambda: collections.defaultdict(int))

for index, row in non_qsp_data.iterrows():
    tf_id = row.tf_id, 
    scroll = row.scroll, 
    has_vl = row.has_vowel_letter
    book, ch, ve = row.book, row.chapter, row.verse
    g_cons = row.g_cons
    lex = row.lex
    prefix = row.prefix
    #print(tf_id)
    #if lex != 'NBJ>/':
    #    continue
    
    
    
    #if scroll[0] not in {'SP'}: #, '4Q55'}:
    #    continue
    #if book != 'Isaiah':
    #    continue
        
    #if scroll[0] not in {'Mur88'}: #, '4Q55'}:
    #    continue
    #if lex[2] != 'H':
    #    continue
    
    
    
    if tf_id[0] in hif_match_dict:
        corr_id = hif_match_dict[tf_id[0]]
        corr_row = dat[dat.tf_id == corr_id]
        #print(scroll, corr_row.scroll.iloc[0], row.chapter, row.verse, row.lex, g_cons, corr_row.g_cons.iloc[0], 
        #      book, prefix, corr_row.prefix.iloc[0], has_vl, corr_row.has_vowel_letter.iloc[0])
        n += 1
        
        mt_full = corr_row.has_vowel_letter.iloc[0]
        scroll_full = has_vl
        full_count[(scroll_full, mt_full)] += 1
        scrolls.add(scroll)
        scroll_counts[scroll] += 1
        
        # SP MT
        #if scroll[0] == 'SP' and  corr_row.scroll.iloc[0] == 'MT':
        #    sp_mt_dict[lex][(scroll_full, mt_full)] += 1
        #    if lex == 'JCB[':
        #        print(has_vl, corr_row.has_vowel_letter.iloc[0], g_cons, corr_row.g_cons.iloc[0], prefix, book, ch, ve)
        
        #if scroll[0] == 'SP': # and lex[0] != 'J':
        print(scroll, has_vl, corr_row.has_vowel_letter.iloc[0], g_cons, corr_row.g_cons.iloc[0], prefix, book, ch, ve)
        
    #else:
    #    #print(scroll, book, row.chapter, row.verse, row.lex, row.g_cons)
    #    if scroll[0] != 'SP' and lex[0] == 'J':
    #           print(scroll, book, row.chapter, row.verse, row.lex, row.g_cons)
    #    continue
    
print(n)
print(full_count)
print(len((scrolls)))
print(scroll_counts)

('SP',) 0 0 <FH <FH nan Genesis 1 11
('SP',) 0 0 <FH <FH nan Genesis 1 12
('SP',) 0 0 RMFT RMFT H Genesis 1 21
('SP',) 0 0 RMF RMF H Genesis 1 26
('SP',) 0 0 RMFT RMFT H Genesis 1 28
('SP',) 0 0 ZR< ZR< nan Genesis 1 29
('SP',) 0 0 ZR< ZR< nan Genesis 1 29
('SP',) 0 0 JY> JY> nan Genesis 2 10
('SP',) 1 0 SWBB SBB H Genesis 2 11
('SP',) 1 1 SWBB SWBB H Genesis 2 13
('SP',) 0 0 HLK HLK H Genesis 2 14
('SP',) 0 0 JD< JD< nan Genesis 3 5
('SP',) 0 0 JD<J JD<J nan Genesis 3 5
('SP',) 0 0 R<J R<H nan Genesis 4 2
('SP',) 0 0 <BD <BD nan Genesis 4 2
('SP',) 0 0 RBY RBY nan Genesis 4 7
('SP',) 0 0 CMR CMR H Genesis 4 9
('SP',) 0 0 Y<Q Y<QJM nan Genesis 4 10
('SP',) 0 0 MY>J MY>J nan Genesis 4 14
('SP',) 0 0 HRG HRG nan Genesis 4 15
('SP',) 0 0 MY>W MY>W nan Genesis 4 15
('SP',) 0 0 BNH BNH nan Genesis 4 17
('SP',) 0 0 JCB JCB nan Genesis 4 20
('SP',) 0 0 TPF TPF nan Genesis 4 21
('SP',) 0 0 LVC LVC nan Genesis 4 22
('SP',) 0 0 XRC XRC nan Genesis 4 22
('SP',) 0 0 RMF RMF nan Genesis 7 8
('SP',)

('SP',) 0 0 CKB CKB WH Leviticus 14 47
('SP',) 0 0 >KL >KL WH Leviticus 14 47
('SP',) 0 0 JCB JCB WH Leviticus 15 6
('SP',) 0 0 NG< NG< WH Leviticus 15 7
('SP',) 0 0 NG< NG< H Leviticus 15 10
('SP',) 1 1 NWF> NWF> WH Leviticus 15 10
('SP',) 0 0 NG< NG< H Leviticus 15 19
('SP',) 0 0 NG< NG< H Leviticus 15 21
('SP',) 0 0 NG< NG< H Leviticus 15 22
('SP',) 0 0 JCBT JCBT nan Leviticus 15 23
('SP',) 0 1 NG< NWG< H Leviticus 15 27
('SP',) 0 0 CKN CKN H Leviticus 16 16
('SP',) 0 0 FRP FRP WH Leviticus 16 28
('SP',) 0 0 ZBXJM ZBXJM nan Leviticus 17 5
('SP',) 0 0 ZNJM ZNJM nan Leviticus 17 7
('SP',) 0 0 >KLT >KLT H Leviticus 17 10
('SP',) 0 0 JCBJH JCBJH nan Leviticus 18 25
('SP',) 0 0 <FWT <FT H Leviticus 18 29
('SP',) 0 0 ZNJM ZNJM H Leviticus 20 5
('SP',) 0 0 N>P N>P H Leviticus 20 10
('SP',) 0 0 N>PT N>PT WH Leviticus 20 10
('SP',) 1 0 ZWNH ZNH nan Leviticus 21 7
('SP',) 1 0 ZWNH ZNH W Leviticus 21 14
('SP',) 0 0 NG< NG< WH Leviticus 22 4
('SP',) 0 0 NTN NTN nan Leviticus 23 10
('SP',) 0 0 C

('SP',) 1 0 CWPV CPV H Deuteronomy 17 9
('SP',) 0 0 <MD <MD H Deuteronomy 17 12
('SP',) 1 0 CWPV CPV H Deuteronomy 17 12
('SP',) 0 0 NTN NTN nan Deuteronomy 17 14
('SP',) 0 0 ZBXJ ZBXJ nan Deuteronomy 18 3
('SP',) 0 0 <MDJM <MDJM H Deuteronomy 18 7
('SP',) 0 0 NTN NTN nan Deuteronomy 18 9
('SP',) 0 0 QSM QSM nan Deuteronomy 18 10
('SP',) 0 0 XBR XBR nan Deuteronomy 18 11
('SP',) 0 0 C>L C>L nan Deuteronomy 18 11
('SP',) 0 0 DRC DRC W Deuteronomy 18 11
('SP',) 0 0 <FH <FH nan Deuteronomy 18 12
('SP',) 0 1 JRCJM JWRC nan Deuteronomy 18 14
('SP',) 0 0 QSMJM QSMJM H Deuteronomy 18 14
('SP',) 0 0 NTN NTN nan Deuteronomy 19 1
('SP',) 0 0 NTN NTN nan Deuteronomy 19 2
('SP',) 0 0 RYX RYX nan Deuteronomy 19 3
('SP',) 0 0 RYX RYX H Deuteronomy 19 4
('SP',) 0 0 FN> FN> nan Deuteronomy 19 4
('SP',) 0 0 G>L G>L nan Deuteronomy 19 6
('SP',) 0 0 RYX RYX H Deuteronomy 19 6
('SP',) 0 0 FN> FN> nan Deuteronomy 19 6
('SP',) 0 0 NTN NTN nan Deuteronomy 19 10
('SP',) 0 0 FN> FN> nan Deuteronomy 19 11
('SP'

('4Q2',) 0 0 <BD <BD nan Genesis 4 2
('4Q2',) 0 0 RBY RBY nan Genesis 4 7
('4Q2',) 0 0 CMR CMR H Genesis 4 9
('4Q3',) 0 0 <MD <MD nan Genesis 41 1
('4Q3',) 0 0 <LWT <LWT nan Genesis 41 5
('4Q3',) 0 0 YMXWT YMXWT nan Genesis 41 6
('4Q3',) 0 1 PTR PWTR nan Genesis 41 8
('4Q4',) 0 0 RMFT RMFT H Genesis 1 21
('4Q5',) 0 0 R<H R<H nan Genesis 37 2
('4Q5',) 1 0 <WMD <MD nan Genesis 41 1
('4Q7',) 0 0 RMFT RMFT H Genesis 1 21
('4Q9',) 0 0 <FH <FH nan Genesis 41 25
('4Q11',) 0 0 R<H R<H nan Exodus 3 1
('4Q11',) 0 0 HLK HLK H Exodus 14 19
('4Q11',) 0 0 NTN NTN nan Exodus 16 29
('4Q11',) 0 0 FN>J FN>J nan Exodus 18 21
('4Q11',) 0 0 FN>K FN>K nan Exodus 23 5
('4Q11',) 0 0 PRFJ PRFJ nan Exodus 25 20
('4Q11',) 0 0 RQM RQM nan Exodus 26 36
('4Q14',) 0 0 NGFJM NGFJM WH Exodus 5 13
('4Q14',) 1 0 NWGP NGP nan Exodus 7 27
('4Q14',) 1 0 HWLKJM HLKJM H Exodus 10 8
('4Q14',) 1 0 JWCBJ JCBJ nan Exodus 15 14
('4Q15',) 0 0 ZBX ZBX nan Exodus 13 15
('4Q15',) 0 0 RKBW RKBW W Exodus 15 1
('4Q22',) 1 0 DWBR DBR nan

('Mas1d',) 0 0 JCBJM JCBJM nan Ezekiel 36 17
('Mas1d',) 0 0 <FH <FH nan Ezekiel 36 22
('Mas1d',) 0 0 >MRJM >MRJM nan Ezekiel 37 11
('Mas1d',) 0 0 PTX PTX nan Ezekiel 37 12
('Mas1e',) 0 0 CM< CM< nan Psalms 81 14
('Mas1e',) 0 0 JCBJ JCBJ nan Psalms 83 8
('Xjoshua',) 0 0 <BRJM <BRJM nan Joshua 1 11
('Xjoshua',) 0 0 NTN NTN nan Joshua 1 11
('Xjudges',) 0 1 JCBJ JWCBJ nan Judges 1 11
1320
defaultdict(<class 'int'>, {(0, 0): 1070, (1, 0): 134, (1, 1): 68, (0, 1): 48})
98
defaultdict(<class 'int'>, {('SP',): 813, ('1Q5',): 5, ('1Q8',): 74, ('1Q11',): 1, ('1Q13',): 2, ('2Q5',): 1, ('2Q11',): 1, ('2Q16',): 1, ('4Q1',): 7, ('4Q2',): 7, ('4Q3',): 4, ('4Q4',): 1, ('4Q5',): 2, ('4Q7',): 1, ('4Q9',): 1, ('4Q11',): 7, ('4Q14',): 4, ('4Q15',): 2, ('4Q22',): 10, ('4Q23',): 9, ('4Q24',): 1, ('4Q26',): 1, ('4Q28',): 3, ('4Q29',): 1, ('4Q30',): 3, ('4Q31',): 3, ('4Q32',): 1, ('4Q33',): 2, ('4Q34',): 1, ('4Q35',): 3, ('4Q36',): 1, ('4Q41',): 10, ('4Q42',): 1, ('4Q45',): 4, ('4Q47',): 4, ('4Q48',): 3, ('4Q

In [26]:
len(hif_match_dict)

2028