In [3]:
import collections

from Bio import pairwise2
from Bio.Seq import Seq
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from data import AllBooks, Book
from utils import most_frequent

from align_config import dss_version, sp_version
from align_functions import make_alignments, collect_matching_words

from tf.app import use

MT = use('etcbc/bhsa')
Fmt, Tmt, Lmt = MT.api.F, MT.api.T, MT.api.L

SP = use('dt-ucph/sp:clone', version=sp_version)
Fsp, Tsp, Lsp = SP.api.F, SP.api.T, SP.api.L

DSS = use('etcbc/dss', version=dss_version)
Fdss, Tdss, Ldss = DSS.api.F, DSS.api.T, DSS.api.L


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


**Locating corpus resources ...**

The requested data is not available offline
	~/text-fabric-data/github/dt-ucph/sp/tf/3.3.1 not found
rate limit is 60 requests per hour, with 30 left for this hour
To increase the rate,see https:/annotation.github.io/text-fabric/tf/advanced/repo.html#github
	connecting to online GitHub repo dt-ucph/sp ... connected
No directory /tf/3.3.1 in #1e77bc31b95aef860f1333fb1af423da1cf99b3cWill try something else
	Failed


No directory /tf/3.3.1 in #1e77bc31b95aef860f1333fb1af423da1cf99b3c	Failed
There were problems with loading data.
The TF API has not been loaded!
The app "dt-ucph/sp" will not work!


AttributeError: 'NoneType' object has no attribute 'F'

In [2]:
SP = use('dt-ucph/sp', version=sp_version)
Fsp, Tsp, Lsp = SP.api.F, SP.api.T, SP.api.L

**Locating corpus resources ...**

The requested data is not available offline
	~/text-fabric-data/github/dt-ucph/sp/tf/3.3.1 not found
rate limit is 60 requests per hour, with 45 left for this hour
To increase the rate,see https:/annotation.github.io/text-fabric/tf/advanced/repo.html#github
	connecting to online GitHub repo dt-ucph/sp ... connected
No directory /tf/3.3.1 in #1e77bc31b95aef860f1333fb1af423da1cf99b3cWill try something else
	Failed


No directory /tf/3.3.1 in #1e77bc31b95aef860f1333fb1af423da1cf99b3c	Failed
There were problems with loading data.
The TF API has not been loaded!
The app "dt-ucph/sp" will not work!


AttributeError: 'NoneType' object has no attribute 'F'

In [49]:
SYLLABLE_TYPE = 'last'

MATRES_FILE = '../data/nouns_adjectives.csv'

In [50]:
PENTATEUCH_BOOKS = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy']
ALL_BOOK_NAMES = [Tmt.sectionFromNode(bo)[0] for bo in Fmt.otype.s('book')]

In [51]:
QSP_SCROLLS = {'1Qisaa', '1QisaaI', '1QisaaII', '2Q3', '4Q13', '4Q20', '2Q7', '4Q27', '1Q4', '2Q12', '4Q37', '4Q38', '4Q38a', '4Q40', '4Q53',
               '4Q57', '2Q13', '4Q78', '4Q80', '4Q82', '4Q128', '4Q129', '4Q134', '4Q135', '4Q136',
                '4Q137', '4Q138', '4Q139', '4Q140', '4Q141', '4Q142', '4Q143', '4Q144', '4Q158', '4Q364',
                '4Q365', '4Q96', '4Q111', '4Q109', '11Q5', '11Q6', '11Q7', '11Q8'}

# Prepare MT and SP texts

Produce dictionary mt_sp_matches which has mt words nodes as keys and matching word numbers from SP as values.

In [52]:
# prepare mt and sp books
MANUSCRIPTS = ['MT', 'SP']
all_books = AllBooks()
for book_name in ALL_BOOK_NAMES:
    book = Book('MT', book_name, Fmt, Tmt, Lmt)
    all_books.data[('MT', book_name)] = book
    
    if book_name in PENTATEUCH_BOOKS:
        book = Book('SP', book_name, Fsp, Tsp, Lsp)
        all_books.data[('SP', book_name)] = book

# Match words

In [53]:
YOD_WORDS = """
 >C=/    >JBH/   >JL==/  >JL===/ >JMH/   >JMJ/   >JPH/   BJYH/   CJR/    CJXH/   FJX/   
 FJX=/   GJD/    GR=/    MT/     MX/     NDH/    NJR=/   QJNH/   RJB/    RJBH/   RJPWT/ 
 RJQ=/   SJG/    SJR/    XJL==/  XJQ/    YJDH/   YJR==/  YJR===/ YJY/    ZD/     ZJPJ/
"""

WAW_WORDS = """
  <GH/   <L/    <LH/   <NJ=/  <WD/   <WLH/  <WR=/  <Z/    <Z=/   >NH/   >PH/   >RT/  
  >V/    >WB/   >WDWT/ >WN/   >WR=/  >WR==/ >WT/   BCT/   BMH/   BWR/   BWY/   CW>H/ 
  CWQ/   CWXJ/  DB/    DQ=/   DWD/   DWR/   GWLH/  GWR/   GWR=/  JPJ/   JWRH/  KCJ/  
  KL/    KWS/   KX/    LV=/   LWBJ/  LWX/   LXJ/   MR/    MR==/  MS/    MWLH/  MWV/  
  MWVH/  MY/    ND/    NGH/   NS/    NWYH/  NXT/   PRH/   PWR/   QR/    QWL/   QWMH/ 
  QWP/   QWRH/  QWY/   R<===/ R>H/   RB=/   RNH/   RQ/    RWM=/  SKH/   SWD/   SWP/  
  SWP=/  SWPH/  SWR/   T>W/   TK/    TM=/   TMJM=/ TP/    TWR/   TWR=/  VWB/   VWBH/ 
  VWR/   XLJ/   XPH/   XQ/    XQH/   XR/    XR=/   XRJ=/  XWMH/  XWX/   XWY/   XZH=/ 
  Y>J/   YB=/   YWM/   YWR/   YX/    ZWB/
"""

YOD_WORDS = YOD_WORDS.split()
WAW_WORDS = WAW_WORDS.split()

In [54]:
dat = pd.read_csv(MATRES_FILE, sep='\t')
dat = dat[(dat.type == SYLLABLE_TYPE)] # & (dat.lex.isin(WAW_WORDS))]
print(dat.shape)

# dat_dss means non-MT, SP is included
dat_dss = dat[~dat.scroll.isin(['MT'])]
scroll_book_combinations = list(set(zip(dat_dss.scroll, dat_dss.book)))
dat_dss.shape

(20843, 37)


(6811, 37)

In [55]:
for manuscript, book_name in scroll_book_combinations:
    if manuscript == 'MT':
        continue
    elif manuscript == 'SP':
        book = Book(manuscript, book_name, Fsp, Tsp, Lsp)
        all_books.data[(manuscript, book_name)] = book
    else:
        book = Book(manuscript, book_name, Fdss, Tdss, Ldss)
        all_books.data[(manuscript, book_name)] = book

In [56]:
def make_matching_word_dict(book_name, all_books):

    all_match_dicts = {}

    matching_book = book_name
    matching_manuscripts = [scr for scr, bo in all_books.data.keys() if bo == matching_book]
    #print(matching_manuscripts)

    for man1 in matching_manuscripts:
        for man2 in matching_manuscripts:
            if man1 != man2:
            
                man1_verse_texts = all_books.data[(man1, matching_book)].verse_text_dict
                man2_verse_texts = all_books.data[(man2, matching_book)].verse_text_dict

                alignments_dict = make_alignments(man1_verse_texts, man2_verse_texts)
    
                man1_word2char = all_books.data[(man1, matching_book)].word2char
                man2_word2char = all_books.data[(man2, matching_book)].word2char
 
                matching_words_dict = collect_matching_words(alignments_dict, man1_word2char, man2_word2char)

                matches = {}
                for man1_word, man2_list in matching_words_dict.items():
                    man2_word = most_frequent(man2_list)
                    matches[man1_word] = man2_word
                all_match_dicts[((man1, matching_book), (man2, matching_book))] = matches
                
    return all_match_dicts, matching_manuscripts

In [57]:
def read_dataset(file):

    return pd.read_csv(file, sep='\t')

In [58]:
class MaterData:
    def __init__(self, man1, man2, section, lex, mater_val1, mater_val2, tf_id1, tf_id2, g_cons1, g_cons2):
        self.man1 = man1
        self.man2 = man2
        self.section = section
        self.lex = lex
        self.mater_val1 = mater_val1
        self.mater_val2 = mater_val2
        self.tf_id1 = tf_id1
        self.tf_id2 = tf_id2
        self.g_cons1 = g_cons1
        self.g_cons2 = g_cons2

In [59]:
def collect_matching_cases(matching_manuscripts, matching_book, dat):

    all_mater_datasets = {}

    for man in matching_manuscripts:
        mater_data = dat[(dat.book == matching_book) & (dat.scroll == man)]
        all_mater_datasets[(man, matching_book)] = mater_data
        
    return all_mater_datasets

In [60]:
def collect_mater_data(matching_manuscripts, all_match_dicts, all_mater_datasets, matching_book):

    manuscript_mater_match = collections.defaultdict(list)
    manuscripts = set()

    for idx, man in enumerate(matching_manuscripts):
        for idx2, man2 in enumerate(matching_manuscripts):
            if idx < idx2:
            
                matching_ids = all_match_dicts[((man, matching_book), (man2, matching_book))]
                man_data = all_mater_datasets[(man, matching_book)]
                man2_data = all_mater_datasets[(man2, matching_book)]
                for _, row in man_data.iterrows():
                
                    tf_id = row.tf_id
                    lex, typ, has_vl = row.lex, row.type, row.has_vowel_letter
                    g_cons1 = row.g_cons
                    section = (row.book, row.chapter, row.verse)
                    if man == 'SP':
                        tf_id = tf_id - 100000

                    matching_tf_id = matching_ids.get(tf_id, None)
                
                    if not matching_tf_id:
                        continue
                    
                    if man2 == 'SP':
                        matching_tf_id = matching_tf_id + 100000
                
                    man2_row = man2_data[(man2_data.tf_id == matching_tf_id) & (man2_data.lex == lex) & (man2_data.type == typ)]
                    
                    if not man2_row.shape[0]:
                        continue
                    has_vl2 = man2_row.has_vowel_letter.iloc[0]
                    g_cons2 = man2_row.g_cons.iloc[0]
                 
                    mater_data = MaterData(man, man2, section, lex, has_vl, has_vl2, tf_id, matching_tf_id, g_cons1, g_cons2)
                
                    manuscript_mater_match[man].append(mater_data)
                    manuscripts.add(man)
                    manuscripts.add(man2)
                    
    return manuscript_mater_match, manuscripts

In [61]:
def register_similarities_with_mt(manuscripts, mt_ids, manuscript_mater_match):

    mater_value_dict = {0: -1,
                        1: 1}

    mater_match_array = np.zeros((len(manuscripts), len(mt_ids)))

    for dat_object in manuscript_mater_match['MT']:
        
        other_man = dat_object.man2
        mt_tfid = dat_object.tf_id1
        other_man_tfid = dat_object.tf_id2
    
        mt_mater = dat_object.mater_val1
        other_man_mater = dat_object.mater_val2
    
        mt_idx = man2idx['MT']
        other_man_idx = man2idx[other_man]
    
        mt_mater_value = mater_value_dict[mt_mater]
        other_man_mater_value = mater_value_dict[other_man_mater]
    
        mt_tf_id = mt_tf2idx[mt_tfid]
    
        mater_match_array[mt_idx, mt_tf_id] = mt_mater_value
        mater_match_array[other_man_idx, mt_tf_id] = other_man_mater_value
        
    return mater_match_array

In [62]:
def get_parallels(manuscript_mater_match, hif_match_dict):
    for scroll in manuscript_mater_match.keys():
        for dat_object in manuscript_mater_match[scroll]:
            man1 = dat_object.man1
            man2 = dat_object.man2
            if man1 == 'MT' or man2 == 'MT':
            
                hif_match_dict[dat_object.tf_id2] = dat_object.tf_id1
                hif_match_dict[dat_object.tf_id1] = dat_object.tf_id2
        
    return hif_match_dict


In [63]:
def count_parallel_cases(mater_match_array):

    mater_arr = np.zeros((2, mater_match_array.shape[1]))

    for col_idx in range(mater_match_array.shape[1]):
        col = mater_match_array[:, col_idx]
        col_counts = collections.Counter(col)
        with_vowel_count = col_counts.get(1, 0)
        without_vowel_count = col_counts.get(-1, 0)
        mater_arr[0, col_idx] = with_vowel_count
        mater_arr[1, col_idx] = without_vowel_count
        
    return mater_arr

In [64]:
hif_match_dict = {}

for book in ALL_BOOK_NAMES:
    print(book)
    all_match_dicts, matching_manuscripts = make_matching_word_dict(book, all_books)
    all_mater_datasets = collect_matching_cases(matching_manuscripts, book, dat)
    manuscript_mater_match, manuscripts = collect_mater_data(matching_manuscripts, all_match_dicts, all_mater_datasets, book)
    
    hif_match_dict = get_parallels(manuscript_mater_match, hif_match_dict)
    print(len(hif_match_dict))
    

Genesis
626
Exodus
1075
Leviticus
1264
Numbers
1378
Deuteronomy
1945
Joshua
1959
Judges
1963
1_Samuel
2051
2_Samuel
2157
1_Kings
2183
2_Kings
2183
Isaiah
4271
Jeremiah
4365
Ezekiel
4405
Hosea
4429
Joel
4473
Amos
4525
Obadiah
4541
Jonah
4614
Micah
4676
Nahum
4698
Habakkuk
4728
Zephaniah
4758
Haggai
4774
Zechariah
4784
Malachi
4788
Psalms
5346
Job
5364
Proverbs
5380
Ruth
5396
Song_of_songs
5421
Ecclesiastes
5429
Lamentations
5471
Esther
5471
Daniel
5533
Ezra
5533
Nehemiah
5533
1_Chronicles
5533
2_Chronicles
5533


In [65]:
len(hif_match_dict)

5533

In [66]:
all_scrolls = set(dat_dss.scroll) 
non_qsp_scrolls = all_scrolls.difference(QSP_SCROLLS)
non_qsp_data = dat_dss[dat_dss.scroll.isin(non_qsp_scrolls)]
non_qsp_pent = non_qsp_data[non_qsp_data.book.isin(PENTATEUCH_BOOKS)]
non_qsp_pent.shape

qsp_data = dat_dss[dat_dss.scroll.isin(QSP_SCROLLS)]

In [67]:
n = 0
full_count = collections.defaultdict(int)
scrolls = set()
scroll_counts = collections.defaultdict(int)


sp_mt_dict = collections.defaultdict(lambda: collections.defaultdict(int))

for index, row in non_qsp_data.iterrows():
    tf_id = row.tf_id, 
    scroll = row.scroll, 
    has_vl = row.has_vowel_letter
    book, ch, ve = row.book, row.chapter, row.verse
    g_cons = row.g_cons
    lex = row.lex
    prefix = row.prefix
    reconstr = row.rec_signs
    
    
    if tf_id[0] in hif_match_dict:
        corr_id = hif_match_dict[tf_id[0]]
        corr_row = dat[dat.tf_id == corr_id]
        #print(scroll)

        if lex != 'YDJQ/':
            continue
        #if scroll[0] != 'SP':
        #    continue
        
        #print(scroll, corr_row.scroll.iloc[0], row.chapter, row.verse, row.lex, g_cons, corr_row.g_cons.iloc[0], 
        #      book, prefix, corr_row.prefix.iloc[0], has_vl, corr_row.has_vowel_letter.iloc[0])
        n += 1
        
        mt_full = corr_row.has_vowel_letter.iloc[0]
        scroll_full = has_vl
        full_count[(scroll_full, mt_full)] += 1
        scrolls.add(scroll)
        scroll_counts[scroll] += 1
        
        #if scroll[0] == '1Q8': # and lex[0] != 'J':
        #if has_vl != corr_row.has_vowel_letter.iloc[0]:
        print(scroll, has_vl, corr_row.has_vowel_letter.iloc[0], g_cons, corr_row.g_cons.iloc[0], prefix, reconstr, book, ch, ve)
            
            
    #else:
    #    print(scroll, book, row.chapter, row.verse, row.lex, row.g_cons)
    #    if scroll[0] != 'SP' and lex[0] == 'J':
    #           print(scroll, book, row.chapter, row.verse, row.lex, row.g_cons)
    #    continue
    
print(n)
print(full_count)
print(len((scrolls)))
print(scroll_counts)


('SP',) 1 1 YDJQ YDJQ nan nnnn Genesis 6 9
('SP',) 1 1 YDJQ YDJQ nan nnnn Genesis 7 1
('1Q8',) 1 1 YDJQ YDJQ H nnnn Isaiah 57 1
('1Q8',) 1 1 YDJQJM YDJQJM nan nnnnnn Isaiah 60 21
('2Q2',) 1 1 YDJQ YDJQ H rnnn Exodus 9 27
('4Q11',) 1 1 YDJQ YDJQ W nnnn Exodus 23 7
('4Q51',) 1 1 YDJQ YDJQ nan rnnn 2_Samuel 4 11
('4Q56',) 1 1 YDJQ YDJQ nan nnnn Isaiah 45 21
('4Q58',) 1 1 YDJQ YDJQ nan nnnn Isaiah 53 11
('4Q73',) 1 1 YDJQM YDJQM nan nnnnn Ezekiel 23 45
('4Q85',) 1 1 YDJQJM YDJQJM nan nnnnnn Psalms 52 8
('4Q87',) 1 1 YDJQJM YDJQJM H nnnnnn Psalms 125 3
('4Q87',) 1 1 YDJQJM YDJQJM H nnnnnn Psalms 125 3
('4Q98b',) 1 1 YDJQ YDJQ nan nnnn Psalms 5 13
('4Q103',) 1 1 YDJQ YDJQ nan nnnn Proverbs 14 32
('4Q103',) 1 1 YDJQ YDJQ nan nnnn Proverbs 15 28
('Mur88',) 1 1 YDJQ YDJQ nan nnnn Zephaniah 3 5
('5/6hev1b',) 1 1 YDJQ YDJQ nan nnnn Psalms 11 3
('5/6hev1b',) 1 1 YDJQ YDJQ nan nnnn Psalms 31 19
19
defaultdict(<class 'int'>, {(1, 1): 19})
14
defaultdict(<class 'int'>, {('SP',): 2, ('1Q8',): 2, ('2Q2

In [68]:
jbl = dat[(dat.lex == 'YDJQ/') & (dat.scroll == 'SP')]
jbl

Unnamed: 0,tf_id,scroll,book,chapter,verse,lex,g_cons,stem,pattern,pattern_g_cons,...,has_prs,has_prefix,has_hloc,has_nme,rec_signs_stem,cor_signs_stem,type,vowel_letter,has_vowel_letter,neigh_vowel_letter
30610,508180,SP,Genesis,6,9,YDJQ/,YDJQ,YDJQ,CCMC,CCMC,...,0,0,0,0,nnnn,nnnn,last,J,1,0
30637,508466,SP,Genesis,7,1,YDJQ/,YDJQ,YDJQ,CCMC,CCMC,...,0,0,0,0,nnnn,nnnn,last,J,1,0
30974,513734,SP,Genesis,18,23,YDJQ/,YDJQ,YDJQ,CCMC,CCMC,...,0,0,0,0,nnnn,nnnn,last,J,1,0
30975,513740,SP,Genesis,18,24,YDJQ/,YDJQJM,YDJQ,CCMC,CCMCMC,...,0,0,0,1,nnnn,nnnn,last,J,1,1
30977,513756,SP,Genesis,18,24,YDJQ/,YDQJM,YDQ,CCC,CCCMC,...,0,1,0,1,nnn,nnn,last,,0,1
30978,513770,SP,Genesis,18,25,YDJQ/,YDJQ,YDJQ,CCMC,CCMC,...,0,0,0,0,nnnn,nnnn,last,J,1,0
30979,513776,SP,Genesis,18,25,YDJQ/,YDJQ,YDJQ,CCMC,CCMC,...,0,1,0,0,nnnn,nnnn,last,J,1,0
30981,513797,SP,Genesis,18,26,YDJQ/,YDQJM,YDQ,CCC,CCCMC,...,0,0,0,1,nnn,nnn,last,,0,1
30984,513831,SP,Genesis,18,28,YDJQ/,YDQJM,YDQ,CCC,CCCMC,...,0,1,0,1,nnn,nnn,last,,0,1
31030,514835,SP,Genesis,20,4,YDJQ/,YDJQ,YDJQ,CCMC,CCMC,...,0,0,0,0,nnnn,nnnn,last,J,1,0


In [69]:
dat.columns

Index(['tf_id', 'scroll', 'book', 'chapter', 'verse', 'lex', 'g_cons', 'stem',
       'pattern', 'pattern_g_cons', 'vs', 'vt', 'nu', 'gn', 'ps', 'sp', 'prs',
       'nme', 'hloc', 'prefix', 'rec_signs', 'cor_signs', 'heb_g_cons',
       'feature', 'other_vowel_ending', 'line', 'column', 'has_prs',
       'has_prefix', 'has_hloc', 'has_nme', 'rec_signs_stem', 'cor_signs_stem',
       'type', 'vowel_letter', 'has_vowel_letter', 'neigh_vowel_letter'],
      dtype='object')

In [70]:
jbl_mtp = jbl[(jbl.scroll == 'MT') & (jbl.book.isin(PENTATEUCH_BOOKS))]
pd.crosstab(jbl_mtp.has_prefix, jbl_mtp.has_vowel_letter)

has_vowel_letter
has_prefix


In [71]:
jbl_mtp[jbl_mtp.has_prefix == 0]

Unnamed: 0,tf_id,scroll,book,chapter,verse,lex,g_cons,stem,pattern,pattern_g_cons,...,has_prs,has_prefix,has_hloc,has_nme,rec_signs_stem,cor_signs_stem,type,vowel_letter,has_vowel_letter,neigh_vowel_letter


In [72]:
jbl_fp = jbl[(jbl.scroll == 'MT') & (~jbl.book.isin(PENTATEUCH_BOOKS))]
pd.crosstab(jbl_fp.has_prefix, jbl_fp.has_vowel_letter)

has_vowel_letter
has_prefix


In [73]:
jbl_q = jbl[(jbl.scroll == '11Q1')]

In [74]:
jbl_q[['g_cons', 'prefix']]

Unnamed: 0,g_cons,prefix
