In [1]:
import collections

from Bio import pairwise2
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

from tf.app import use
A = use('etcbc/bhsa', hoist=globals())
Fmt, Tmt, Lmt = F, T, L

B = use('dt-ucph/sp', version='3.4', hoist=globals())
Fsp, Tsp, Lsp = F, T, L

C = use('etcbc/dss', version='1.9', hoist=globals())
Fdss, Tdss, Ldss = F, T, L

del F, T, L

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,5,79878.4,100
chapter,187,2135.79,100
verse,5841,68.38,100
word,114890,3.48,100
sign,399392,1.0,100


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


In [2]:
PENTATEUCH_BOOKS = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
ALL_BOOK_NAMES = [Tmt.sectionFromNode(bo)[0] for bo in Fmt.otype.s('book')]

In [3]:
QSP_SCROLLS = {'1Qisaa', '1QisaaI', '1QisaaII', '2Q3', '4Q13', '4Q20', '2Q7', '4Q27', '1Q4', '2Q12', '4Q37', '4Q38', '4Q38a', '4Q40', '4Q53',
               '4Q57', '2Q13', '4Q78', '4Q80', '4Q82', '4Q128', '4Q129', '4Q134', '4Q135', '4Q136',
                '4Q137', '4Q138', '4Q139', '4Q140', '4Q141', '4Q142', '4Q143', '4Q144', '4Q158', '4Q364',
                '4Q365', '4Q96', '4Q111', '4Q109', '11Q5', '11Q6', '11Q7', '11Q8'}

In [4]:
class Book:
    def __init__(self, manuscript, book_name, F, T, L):
        self.manuscript = manuscript
        self.book_name = book_name
        self.F = F
        self.T = T
        self.L = L
        if self.manuscript in {'MT', 'SP'}:
            self.verse_g_cons, self.word2char = self.prepare_book_data()
        else:
            self.verse_g_cons, self.word2char = self.prepare_dss_book_data()
        self.verse_text_dict = self.make_verse_text()
        
    def prepare_book_data(self):
        verse_g_cons = collections.defaultdict(list)
        word2char = collections.defaultdict(list)
    
        for book_node in eval(self.F + ".otype.s('book')"):
            book_name = eval(self.T + '.sectionFromNode(book_node)[0]')
            if book_name != self.book_name:
                continue
            words = eval(self.L + ".d(book_node, 'word')")
            for w in words:
                bo, ch, ve = eval(self.T + '.sectionFromNode(w)')
                g_cons = eval(self.F + '.g_cons.v(w)')
                trailer = eval(self.F + '.trailer.v(w)')
                if trailer:
                    trailer = ' '
                verse_g_cons[(bo, ch, ve)].append(g_cons + trailer)
                for char in g_cons:
                    word2char[(bo, ch, ve)].append(w)
        return verse_g_cons, word2char
    
    def prepare_dss_book_data(self):
        verse_g_cons = collections.defaultdict(list)
        word2char = collections.defaultdict(list)
        
        for scr in eval(self.F + ".otype.s('scroll')"):
            if eval(self.T + '.scrollName(scr)') == self.manuscript:
                words = eval(self.L + ".d(scr, 'word')")
                for w in words:
                    bo = eval(self.F + '.book_etcbc.v(w)')
                    if bo != self.book_name:
                        continue            
                    ch = eval(self.F + '.chapter.v(w)')
                    ve = eval(self.F + '.verse.v(w)')
                
                    g_cons = eval(self.F + '.g_cons.v(w)')
                    after = eval(self.F + '.after.v(w)')

                    if after is None:
                        after = ''
                    if g_cons:
                        verse_g_cons[(bo, int(ch), int(ve))].append(g_cons + after)
                        for char in g_cons:
                            word2char[(bo, int(ch), int(ve))].append(w)
        return verse_g_cons, word2char
    
    def make_verse_text(self):
        return {section : ''.join(g_conss).strip() for (section, g_conss) in self.verse_g_cons.items()}

In [5]:
class AllBooks:
    def __init__(self):
        self.data = {}
        

In [6]:
def align_verses(str_1, str_2):
        
    seq1 = Seq(str_1)
    seq2 = Seq(str_2)
    
    alignments = pairwise2.align.globalxx(seq1, seq2)
    
    seq1_al = (alignments[0][0]).strip(' ')
    seq2_al = (alignments[0][1]).strip(' ')
        
    return seq1_al, seq2_al

In [7]:
def make_alignments(verse_text1, verse_text2):
    alignments_dict = {}

    for section, text1 in verse_text1.items():
        try:
            text2 = verse_text2[section]
            alignment1, alignment2 = align_verses(text1, text2)
            alignments_dict[section] = (alignment1, alignment2)
        except:
            continue
    return alignments_dict

In [8]:
def collect_matching_words(alignments_dict, word2char1, word2char2):
    man1_man2_dict = collections.defaultdict(list)

    for section, (al1, al2) in alignments_dict.items():
        man1_idx = 0
        man2_idx = 0
    
        word_chars1 = word2char1[section]
        word_chars2 = word2char2[section]
        for char1, char2 in zip(al1, al2):
            if char1 not in {' ', '-'}:
                man1_word = word_chars1[man1_idx]
                man1_idx += 1
            
            if char2 not in {' ', '-'}:
                man2_word = word_chars2[man2_idx]
                man2_idx += 1
            
            if char1 not in {' ', '-'} and char2 not in {' ', '-'}:
                man1_man2_dict[man1_word].append(man2_word)
                
    return man1_man2_dict

In [9]:
def most_frequent(List):
    return max(set(List), key = List.count)

# Prepare MT and SP texts

Produce dictionary mt_sp_matches which has mt words nodes as keys and matching word numbers from SP as values.

In [10]:
# prepare mt and sp books
MANUSCRIPTS = ['MT', 'SP']
all_books = AllBooks()
for book_name in ALL_BOOK_NAMES:
    book = Book('MT', book_name, 'Fmt', 'Tmt', 'Lmt')
    all_books.data[('MT', book_name)] = book
    
    if book_name in PENTATEUCH_BOOKS:
        book = Book('SP', book_name, 'Fsp', 'Tsp', 'Lsp')
        all_books.data[('SP', book_name)] = book

# Match words

In [19]:
dat = pd.read_csv('../data/infc_qal_lamed_he.csv', sep='\t')
dat_dss = dat[~dat.scroll.isin(['MT'])] #, 'SP'])]
scroll_book_combinations = list(set(zip(dat_dss.scroll, dat_dss.book)))
dat_dss.shape

(140, 35)

In [20]:
for manuscript, book_name in scroll_book_combinations:
    book = Book(manuscript, book_name, 'Fdss', 'Tdss', 'Ldss')
    all_books.data[(manuscript, book_name)] = book

In [21]:
def make_matching_word_dict(book_name, all_books):

    all_match_dicts = {}

    matching_book = book_name
    matching_manuscripts = [scr for scr, bo in all_books.data.keys() if bo == matching_book]

    for man1 in matching_manuscripts:
        for man2 in matching_manuscripts:
            if man1 != man2:
            
                man1_verse_texts = all_books.data[(man1, matching_book)].verse_text_dict
                man2_verse_texts = all_books.data[(man2, matching_book)].verse_text_dict

                alignments_dict = make_alignments(man1_verse_texts, man2_verse_texts)
    
                man1_word2char = all_books.data[(man1, matching_book)].word2char
                man2_word2char = all_books.data[(man2, matching_book)].word2char
 
                matching_words_dict = collect_matching_words(alignments_dict, man1_word2char, man2_word2char)

                matches = {}
                for man1_word, man2_list in matching_words_dict.items():
                    man2_word = most_frequent(man2_list)
                    matches[man1_word] = man2_word
                all_match_dicts[((man1, matching_book), (man2, matching_book))] = matches
                
    return all_match_dicts, matching_manuscripts

In [22]:
def read_dataset(file):

    return pd.read_csv(file, sep='\t')

In [23]:
class MaterData:
    def __init__(self, man1, man2, section, lex, mater_val1, mater_val2, tf_id1, tf_id2, g_cons1, g_cons2):
        self.man1 = man1
        self.man2 = man2
        self.section = section
        self.lex = lex
        self.mater_val1 = mater_val1
        self.mater_val2 = mater_val2
        self.tf_id1 = tf_id1
        self.tf_id2 = tf_id2
        self.g_cons1 = g_cons1
        self.g_cons2 = g_cons2

In [24]:
def collect_matching_cases(matching_manuscripts, matching_book, dat):

    all_mater_datasets = {}

    for man in matching_manuscripts:
        mater_data = dat[(dat.book == matching_book) & (dat.scroll == man)]
        all_mater_datasets[(man, matching_book)] = mater_data
        
    return all_mater_datasets

In [25]:
def collect_mater_data(matching_manuscripts, all_match_dicts, all_mater_datasets, matching_book):

    manuscript_mater_match = collections.defaultdict(list)
    manuscripts = set()

    for idx, man in enumerate(matching_manuscripts):
        for idx2, man2 in enumerate(matching_manuscripts):
            if idx < idx2:
            
                matching_ids = all_match_dicts[((man, matching_book), (man2, matching_book))]
                man_data = all_mater_datasets[(man, matching_book)]
                man2_data = all_mater_datasets[(man2, matching_book)]
                for _, row in man_data.iterrows():
                
                    tf_id = row.tf_id
                    lex, typ, has_vl = row.lex, row.type, row.has_vowel_letter
                    g_cons1 = row.g_cons
                    section = (row.book, row.chapter, row.verse)
                    if man == 'SP':
                        tf_id = tf_id - 100000

                    matching_tf_id = matching_ids.get(tf_id, None)
                
                    if not matching_tf_id:
                        continue
                    
                    if man2 == 'SP':
                        matching_tf_id = matching_tf_id + 100000
                
                    man2_row = man2_data[(man2_data.tf_id == matching_tf_id) & (man2_data.lex == lex) & (man2_data.type == typ)]
                    
                    if not man2_row.shape[0]:
                        continue
                    has_vl2 = man2_row.has_vowel_letter.iloc[0]
                    g_cons2 = man2_row.g_cons.iloc[0]
                 
                    mater_data = MaterData(man, man2, section, lex, has_vl, has_vl2, tf_id, matching_tf_id, g_cons1, g_cons2)
                
                    manuscript_mater_match[man].append(mater_data)
                    manuscripts.add(man)
                    manuscripts.add(man2)
                    
    return manuscript_mater_match, manuscripts

In [26]:
def register_similarities_with_mt(manuscripts, mt_ids, manuscript_mater_match):

    mater_value_dict = {0: -1,
                    1: 1}

    mater_match_array = np.zeros((len(manuscripts), len(mt_ids)))

    for dat_object in manuscript_mater_match['MT']:
        print(dat_object.g_cons1, dat_object.g_cons2)
        
        other_man = dat_object.man2
        mt_tfid = dat_object.tf_id1
        other_man_tfid = dat_object.tf_id2
    
        mt_mater = dat_object.mater_val1
        other_man_mater = dat_object.mater_val2
    
        mt_idx = man2idx['MT']
        other_man_idx = man2idx[other_man]
    
        mt_mater_value = mater_value_dict[mt_mater]
        other_man_mater_value = mater_value_dict[other_man_mater]
    
        mt_tf_id = mt_tf2idx[mt_tfid]
    
        mater_match_array[mt_idx, mt_tf_id] = mt_mater_value
        mater_match_array[other_man_idx, mt_tf_id] = other_man_mater_value
        
    return mater_match_array

In [27]:
def get_parallels(manuscript_mater_match, hif_match_dict):
    for scroll in manuscript_mater_match.keys():
        for dat_object in manuscript_mater_match[scroll]:
            man1 = dat_object.man1
            man2 = dat_object.man2
            if man1 == 'MT' or man2 == 'MT':
            
                hif_match_dict[dat_object.tf_id2] = dat_object.tf_id1
                hif_match_dict[dat_object.tf_id1] = dat_object.tf_id2
        
    return hif_match_dict


In [28]:
def count_parallel_cases(mater_match_array):

    mater_arr = np.zeros((2, mater_match_array.shape[1]))

    for col_idx in range(mater_match_array.shape[1]):
        col = mater_match_array[:, col_idx]
        col_counts = collections.Counter(col)
        with_vowel_count = col_counts.get(1, 0)
        without_vowel_count = col_counts.get(-1, 0)
        mater_arr[0, col_idx] = with_vowel_count
        mater_arr[1, col_idx] = without_vowel_count
        
    return mater_arr

In [30]:
file_name = '../data/infc_qal_lamed_he.csv'
all_bib_books = [Tmt.sectionFromNode(bo)[0] for bo in Fmt.otype.s('book')]
dat = read_dataset(file_name)

hif_match_dict = {}

for book in all_bib_books:
    #if book != 'Isaiah':
    #    continue
    print(book)
    all_match_dicts, matching_manuscripts = make_matching_word_dict(book, all_books)
    all_mater_datasets = collect_matching_cases(matching_manuscripts, book, dat)
    manuscript_mater_match, manuscripts = collect_mater_data(matching_manuscripts, all_match_dicts, all_mater_datasets, book)
    
    #print(manuscript_mater_match)
    #man2idx = {man:idx for idx, man in enumerate(manuscripts)}
    #mt_ids = sorted(list({data.tf_id1 for data in manuscript_mater_match['MT']}))
    #mt_tf2idx = {tf_id:idx for idx, tf_id in enumerate(mt_ids)}
    #idx2mt_tf = {v:k for k, v in mt_tf2idx.items()}
    
    hif_match_dict = get_parallels(manuscript_mater_match, hif_match_dict)
    print(len(hif_match_dict))
    

Genesis
8
Exodus
27
Leviticus
36
Numbers
42
Deuteronomy
83
Joshua
87
Judges
87
1_Samuel
89
2_Samuel
95
1_Kings
97
2_Kings
97
Isaiah
178
Jeremiah
184
Ezekiel
184
Hosea
184
Joel
184
Amos
186
Obadiah
188
Jonah
197
Micah
199
Nahum
199
Habakkuk
201
Zephaniah
201
Haggai
201
Zechariah
203
Malachi
205
Psalms
228
Job
228
Proverbs
230
Ruth
230
Song_of_songs
230
Ecclesiastes
230
Lamentations
230
Esther
230
Daniel
234
Ezra
234
Nehemiah
234
1_Chronicles
234
2_Chronicles
234


In [31]:
hif_match_dict

{1940261: 27490,
 27490: 1940261,
 1938559: 1796,
 1796: 1938559,
 1934988: 21663,
 21663: 1934988,
 2076020: 8232,
 8232: 2076020,
 2069167: 35702,
 35702: 2088585,
 1949016: 34651,
 34651: 1954379,
 1936731: 31063,
 31063: 1936731,
 1937398: 32128,
 32128: 1937398,
 2063943: 35702,
 1952530: 31867,
 31867: 1952530,
 1954379: 34651,
 1956773: 39308,
 39308: 1956773,
 2088585: 35702,
 1942958: 34744,
 34744: 1942958,
 1945479: 52096,
 52096: 1945479,
 2103927: 57323,
 57323: 2103927,
 2080027: 63808,
 63808: 2080027,
 2080209: 64517,
 64517: 2080209,
 2080296: 65108,
 65108: 1970734,
 1970734: 65108,
 1977788: 92258,
 92258: 1977788,
 1964031: 71814,
 71814: 1964031,
 1964808: 75866,
 75866: 1964808,
 1983771: 104307,
 104307: 1983771,
 1985960: 93080,
 93080: 1985960,
 1993477: 100288,
 100288: 1993477,
 1991174: 96344,
 96344: 2089521,
 2089280: 96929,
 96929: 2089280,
 2089335: 96981,
 96981: 2066797,
 2089383: 97025,
 97025: 2076949,
 1979447: 95174,
 95174: 1979447,
 1979466: 9546

In [32]:
len(hif_match_dict)

234

In [33]:
all_scrolls = set(dat_dss.scroll) 
non_qsp_scrolls = all_scrolls.difference(QSP_SCROLLS)
non_qsp_data = dat_dss[dat_dss.scroll.isin(non_qsp_scrolls)]
non_qsp_pent = non_qsp_data[non_qsp_data.book.isin(PENTATEUCH_BOOKS)]
non_qsp_pent.shape

(38, 35)

In [34]:
n = 0
full_count = collections.defaultdict(int)
scrolls = set()
scroll_counts = collections.defaultdict(int)

for index, row in non_qsp_data.iterrows():
    tf_id = row.tf_id, 
    scroll = row.scroll, 
    has_vl = row.has_vowel_letter
    book = row.book
    g_cons = row.g_cons
    lex = row.lex
    prefix = row.prefix
    #print(tf_id)
    #if lex != 'GBWL/':
    #    continue
    
    
    
    #if scroll[0] not in {'Mur88'}: #, '4Q55'}:
    #    continue
    #if book != 'Isaiah':
    #    continue
        
    #if scroll[0] not in {'Mur88'}: #, '4Q55'}:
    #    continue
    #if lex[2] != 'H':
    #    continue
    
    
    
    if tf_id[0] in hif_match_dict:
        corr_id = hif_match_dict[tf_id[0]]
        corr_row = dat[dat.tf_id == corr_id]
        print(scroll, corr_row.scroll.iloc[0], row.chapter, row.verse, row.lex, g_cons, book, prefix, corr_row.prefix.iloc[0], has_vl, corr_row.has_vowel_letter.iloc[0])
        n += 1
        
        mt_full = corr_row.has_vowel_letter.iloc[0]
        scroll_full = has_vl
        full_count[(scroll_full, mt_full)] += 1
        scrolls.add(scroll)
        scroll_counts[scroll] += 1
        
    else:
        print(scroll, book, row.chapter, row.verse, row.lex, row.g_cons)
        
    
print(n)
print(full_count)
print(len((scrolls)))
print(scroll_counts)

('1Q5',) MT 31 12 <FH[ <FWT Deuteronomy L L 1 1
('1Q8',) MT 55 6 HJH[ HJWTW Isaiah B B 1 1
('1Q8',) MT 56 2 <FH[ <FWT Isaiah M M 1 1
('1Q8',) MT 56 6 HJH[ HJWT Isaiah L L 1 1
('1Q8',) MT 58 13 <FH[ <FWT Isaiah nan nan 1 1
('1Q8',) MT 58 13 <FH[ <FWT Isaiah M M 1 1
('1Q8',) MT 60 15 HJH[ HJWTK Isaiah nan nan 1 1
('4Q1',) MT 39 11 <FH[ <FWT Genesis L L 1 1
('4Q1',) MT 5 13 HJH[ HJWT Exodus B B 1 1
('4Q1',) MT 7 18 CTH[ CTWT Exodus L L 1 1
('4Q2',) MT 4 8 HJH[ HJWTM Genesis B B 1 1
('4Q6',) MT 48 10 R>H[ R>WT Genesis L L 1 1
('4Q11',) MT 12 4 HJH[ HJWT Exodus M M 1 0
('4Q11',) MT 40 15 HJH[ HJWT Exodus L L 1 0
('4Q14',) MT 11 9 RBH[ RBWT Exodus nan nan 1 1
('4Q22',) MT 7 5 NVH[ NVTJ Exodus B B 0 0
('4Q22',) MT 11 9 RBH[ RBWT Exodus nan nan 1 1
('4Q22',) MT 19 12 <LH[ <LWT Exodus nan nan 1 1
('4Q23',) MT 4 3 <FH[ <FWT Numbers L L 1 1
('4Q23',) MT 9 6 <FH[ <FWT Numbers L L 1 0
('4Q26a',) MT 21 9 ZNH[ ZNWT Leviticus L L 1 1
('4Q28',) MT 24 4 HJH[ HJWT Deuteronomy L L 1 1
('4Q30',) MT 4 1 <FH