# Collect extra-biblical data of clauses with and without יש

In this notebook a dataset is made of verbless clauses with a subject and a predicate complement. The subject is an undetermined NP and the predicate is a PP. A distinction is made between clauses with and without the particle יש. In my research this is the dependent variable, which has the values "jc" and "no_jc". In the file JC_noJC_bib.ipynb a similar dataset is made of biblical data.

In [2]:
import sys, os, csv, collections, pprint

In [3]:
from tf.fabric import Fabric

# Create the Text-Fabric entry point for the data stored under the given local path.
# NOTE(review): the path contains a double slash ('github//extrabiblical') — confirm this is intended.
# NOTE(review): no TF.load(...) call is visible in this part of the notebook, while later
# cells use F, L, E and T — presumably these are made available in a cell not shown; verify.
TF = Fabric(locations='~/github//extrabiblical/tf/0.2')

This is Text-Fabric 7.4.8
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

72 features found and 0 ignored


First create some dictionaries in which "extra-database knowledge" is added to the dataset. These are genre and language phase.

In [8]:
# Book names grouped by genre.
prose = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings', 'Jonah', 'Ruth', 'Esther', 'Daniel', 'Ezra', 'Nehemiah', '1_Chronicles', '2_Chronicles']
prophecy = ['Isaiah', 'Jeremiah', 'Ezekiel', 'Hosea', 'Joel', 'Obadiah', 'Micah', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi', 'Amos', 'Nahum', 'Habakkuk']
poetry = ['Song_of_songs','Proverbs','Ecclesiastes', 'Lamentations', 'Psalms', 'Job']

# genre_dict maps every book name to the label of the list it occurs in;
# books are inserted in the order prose, prophecy, poetry.
genre_dict = {
    book_name: genre_label
    for genre_label, book_names in (
        ('prose', prose),
        ('prophecy', prophecy),
        ('poetry', poetry),
    )
    for book_name in book_names
}

In [10]:
# Book names per language phase; the labels stored are 'ebh' and 'lbh'.
ebh = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings']
lbh = ['Esther', 'Daniel', 'Ezra', 'Nehemiah', '1_Chronicles', '2_Chronicles']

# ebh_lbh_dict maps a book name to its phase label; books in neither list are absent.
ebh_lbh_dict = dict.fromkeys(ebh, 'ebh')
ebh_lbh_dict.update(dict.fromkeys(lbh, 'lbh'))

# Maps each half of a two-part book to the shared name without the number prefix.
double_books = {
    '1_Samuel': 'Samuel',
    '2_Samuel': 'Samuel',
    '2_Kings': 'Kings',
    '1_Kings': 'Kings',
    '1_Chronicles': 'Chronicles',
    '2_Chronicles': 'Chronicles',
}

In the following functions it is decided whether a clause is a main clause or one of the three kinds of dependent clauses that we have defined in the Syntactic Variation project.

In [11]:
# %load main_sub
def in_dep_calc(cl):
    """Return the main/dependent label of clause node *cl*.

    Three sources are consulted in order; the first one that yields a
    non-empty label wins:

    1. the clause relation, via ``rela_calc`` (for a resumptive clause,
       relation 'ReSu', the relation of its mother is used instead);
    2. the presence of a wayyiqtol form (``vt == 'wayq'``) among the words
       of the clause, which yields 'Main';
    3. the code of the first clause atom, via ``carc_calc``.

    The result is whatever label those helpers produce ('Main', 'SubArg',
    'SubMod', 'SubAdv', 'Undc'); it can be '' when ``carc_calc`` returns ''.
    """
    if F.rela.v(cl) == 'ReSu':  # is the clause resumptive?
        moth_obj = E.mother.f(cl)[0]
        in_dep = rela_calc(moth_obj)
    else:
        in_dep = rela_calc(cl)  # does the clause have a dependent CCR?

    # Is there a wayyiqtol? The label is assigned once, so a clause with
    # several wayyiqtol words yields 'Main' and not 'MainMain'.
    if in_dep == '' and any(F.vt.v(word) == 'wayq' for word in L.d(cl, 'word')):
        in_dep = 'Main'

    # If everything else does not give a result, we look at the CARC.
    if in_dep == '':
        in_dep = carc_calc(L.d(cl, 'clause_atom'))

    return in_dep


#####################################


def _carc_label(code):
    """Map an integer clause atom code to 'SubAdv' or 'Main' ('' if in neither group)."""
    if 499 < code < 999 or 9 < code < 17 or 50 < code < 75:
        return 'SubAdv'
    if code in {0, 999} or 99 < code < 168 or 299 < code < 500:
        return 'Main'
    return ''


def carc_calc(cl_atoms):
    """Classify a clause by the ``code`` feature of its first clause atom.

    Parameters
    ----------
    cl_atoms : sequence of clause atom nodes; only the first one is used.

    Returns
    -------
    'SubAdv' or 'Main' when the code falls in one of the ranges handled by
    ``_carc_label``.  For the codes 200 and 201 the mother chain is followed
    until an atom with another code is reached; that code is then classified,
    giving 'Undc' for 220-223 and '' when no range matches.  Any other code
    of the first atom gives 'Undc'.

    Raises
    ------
    IndexError if an atom with code 200/201 has no mother.
    """
    atom = cl_atoms[0]
    code = int(F.code.v(atom))

    label = _carc_label(code)
    if label:
        return label
    if code not in {200, 201}:
        return 'Undc'

    # Follow the mother edges while the code is 200/201.  The code is coerced
    # with int() here as well, so the walk behaves the same whether the
    # feature holds integers or numeric strings.
    while code in {200, 201}:
        atom = E.mother.f(atom)[0]
        code = int(F.code.v(atom))

    label = _carc_label(code)
    if label == '' and code in {220, 221, 222, 223}:
        return 'Undc'
    return label

###################################################################################################

def _rela_label(rela):
    """Map a clause relation value to 'SubArg', 'SubMod' or 'SubAdv' ('' otherwise)."""
    if rela in {'Subj', 'Objc', 'Cmpl', 'PreC', 'Voct', 'Frnt'}:
        return 'SubArg'
    if rela in {'Attr', 'RgRc', 'Spec'}:
        return 'SubMod'
    if rela in {'Adju', 'PrAd'}:
        return 'SubAdv'
    return ''


def rela_calc(cl):
    """Classify node *cl* by its ``rela`` feature.

    Returns 'SubArg', 'SubMod' or 'SubAdv' when the relation is one of the
    values handled by ``_rela_label``, and '' for any other relation except
    'Coor'.

    A coordinated clause (relation 'Coor') is classified through its mother:
    * mother is a word or phrase -> 'SubMod';
    * otherwise the mother chain is followed past all 'Coor' nodes and the
      relation of the node reached is classified;
    * if that gives no label: 'SubMod' when the node reached is not a
      clause, else the result of ``carc_calc`` on its clause atoms.
    """
    rela = F.rela.v(cl)
    if rela != 'Coor':
        return _rela_label(rela)

    moth_obj = E.mother.f(cl)[0]
    if F.otype.v(moth_obj) in {'word', 'phrase'}:
        return 'SubMod'

    while F.rela.v(moth_obj) == 'Coor':
        moth_obj = E.mother.f(moth_obj)[0]

    # Classify by the relation of the node the walk ended on.  Reading the
    # relation of `cl` again would always give 'Coor' and never match.
    in_dep_r = _rela_label(F.rela.v(moth_obj))
    if in_dep_r:
        return in_dep_r

    if F.otype.v(moth_obj) != 'clause':
        return 'SubMod'
    return carc_calc(L.d(moth_obj, 'clause_atom'))


In the function mother_tense_calc() the verb tense of the mother of a clause is retrieved.

In [12]:
def mother_tense_calc(cl):
    """Describe the mother of the first clause atom of clause *cl*.

    Returns
    -------
    * 'no_mother' when the first clause atom has no mother;
    * the object type ('word' or 'phrase') when the mother is one of those;
    * 'nominal' / 'no_pred' when the clause containing the mother has
      kind 'NC' / 'WP';
    * otherwise the concatenated ``vt`` values of the verbs in the last
      phrase with function Pred/PreS/PreO, or, if there is no such phrase,
      in the last phrase with function PreC/PtcO ('' when nothing is found).
    """
    first_atom = L.d(cl, 'clause_atom')[0]
    mothers = E.mother.f(first_atom)
    if not mothers:
        return 'no_mother'

    mother = mothers[0]
    mother_otype = F.otype.v(mother)
    if mother_otype in {'word', 'phrase'}:
        return mother_otype

    mother_clause = L.u(mother, 'clause')[0]
    mother_kind = F.kind.v(mother_clause)
    if mother_kind == 'NC':
        return 'nominal'
    if mother_kind == 'WP':
        return 'no_pred'

    # Remember the last phrase of each group; a predicate phrase takes
    # precedence over a predicate-complement phrase.
    last_pred = None
    last_prec = None
    for phrase in L.d(mother_clause, 'phrase'):
        func = F.function.v(phrase)
        if func in {'Pred', 'PreS', 'PreO'}:
            last_pred = phrase
        elif func in {'PreC', 'PtcO'}:
            last_prec = phrase

    verb_phrase = last_pred if last_pred is not None else last_prec
    if verb_phrase is None:
        return ''

    return ''.join(
        F.vt.v(word)
        for word in L.d(verb_phrase, 'word')
        if F.sp.v(word) == 'verb'
    )

In [13]:
# Phrase functions for which a presence/absence column is added to every row.
extra_cl_funcs = [
    'Adju', 'Cmpl', 'Conj', 'EPPr', 'ExsS', 'Exst', 'Frnt', 'IntS', 'Intj',
    'Loca', 'ModS', 'Modi', 'NCoS', 'NCop', 'Nega', 'Objc', 'PrAd', 'PrcS',
    'PreO', 'PreS', 'PtcO', 'Ques', 'Rela', 'Supp', 'Time', 'Voct', 'Unkn',
]

In [14]:
# jc_cl_list keeps the selected clause nodes in order of selection;
# jc_clauses maps each selected clause node to its row of feature strings.
jc_cl_list = []
jc_clauses = {}

# Language phase and genre label per book; books not listed get 'epigraphic'.
phase_by_book = {'1QM': 'qh', '1QH': 'qh', '1QS': 'qh', 'Shirata': 'rabbinic', 'Pirqe': 'rabbinic'}
genre_by_book = {'1QM': 'prose', '1QS': 'prose', 'Pirqe': 'prose', '1QH': 'poetry', 'Shirata': 'poetry'}

# A clause containing a phrase with one of these functions is left out.
excluded_funcs = ('Nega', 'NCop', 'NCoS')

for cl in F.otype.s('clause'):
    # clauses of kind 'VC' and 'WP' are not part of the dataset
    if F.kind.v(cl) in {'VC', 'WP'}:
        continue

    # the clause needs both a subject and a predicate complement
    cl_phrases = L.d(cl, 'phrase')
    funcs = [F.function.v(phrase) for phrase in cl_phrases]
    if not ('Subj' in funcs and 'PreC' in funcs):
        continue
    if any(func in funcs for func in excluded_funcs):
        continue

    # the first PreC phrase must be a PP, the first Subj phrase undetermined
    subj_at = funcs.index('Subj')
    prec_at = funcs.index('PreC')
    subj_phrase = cl_phrases[subj_at]
    prec_phrase = cl_phrases[prec_at]
    if F.typ.v(prec_phrase) != 'PP':
        continue
    if F.det.v(subj_phrase) != 'und':
        continue

    # dependent variable: does the clause contain the lexeme JC?
    # (note that 'JC' is 'JC/' in biblical coding)
    cl_words = L.d(cl, 'word')
    has_jc = any(F.lex.v(word) == 'JC' for word in cl_words)
    cl_type = 'jc' if has_jc else 'no_jc'

    bo, ch, ve = T.sectionFromNode(cl)

    # which conjunction: lexemes of the first CP phrase, joined by '_'
    types = [F.typ.v(phrase) for phrase in cl_phrases]
    if 'CP' in types:
        conj_words = L.d(cl_phrases[types.index('CP')], 'word')
        conj = '_'.join([F.lex.v(word) for word in conj_words])
    else:
        conj = 'no_conj'

    # clause length in phrases. Note: JC is not counted!!
    n_phrases = len(cl_phrases) - 1 if has_jc else len(cl_phrases)

    row = [
        cl_type,
        str(cl),                                   # clause id
        bo,                                        # book
        str(ch),                                   # chapter
        str(ve),                                   # verse
        F.typ.v(cl),                               # clause type of database
        F.rela.v(cl) + '_',                        # clause relation
        bo,                                        # second book column: the book name is repeated
        phase_by_book.get(bo, 'epigraphic'),       # language phase
        genre_by_book.get(bo, 'epigraphic'),       # genre
        F.txt.v(cl)[-1],                           # last character of txt: Q, D, N
        'SP' if subj_at < prec_at else 'PS',       # subj and prec order
        conj,
        str(n_phrases),
        F.typ.v(subj_phrase),                      # phrase type of subject
        F.det.v(subj_phrase),                      # determination of subject
        str(len(L.d(subj_phrase, 'word'))),        # length of subject in words
        F.typ.v(prec_phrase),                      # phrase type of prec
        F.det.v(prec_phrase),                      # determination of prec
        str(len(L.d(prec_phrase, 'word'))),        # length of prec in words
        in_dep_calc(cl),                           # main or subordinate clause
        'neg' if 'NegP' in types else 'non_neg',   # negation in clause
        F.language.v(cl_words[0]),                 # language of the first word
    ]

    # other phrase functions in the clause: present ('1') or absent ('0')
    row.extend('1' if func in funcs else '0' for func in extra_cl_funcs)

    # tense of the mother of the clause_atom
    row.append(mother_tense_calc(cl))

    jc_cl_list.append(cl)
    jc_clauses[cl] = row

    # show the clauses that contain JC
    if has_jc:
        print(T.sectionFromNode(cl), T.text(cl_words))


('1QH', 17, 35) כי ישׁ מקוה לשׁבי פשׁע ועוזבי חטאה 
('1QH', 21, 61) ישׁ מקוה בחסדיכה 
('1QH', 24, 64) כיא ישׁ מקוה לאישׁ 
('1QS', 4, 41) אשׁר ישׁ אתו דבר 
('1QS', 4, 41) ישׁ אתי דבר 
('Pirqe', 3, 3) וישׁ ביניהם דברי תורה 
('Pirqe', 3, 15) שׁישׁ בידו תורה ומעשׂים טובים 
('Pirqe', 3, 20) וישׁ להם על מה 
('Pirqe', 4, 12) ישׁ לכ בטלים הרבה כנגדכ 
('Pirqe', 4, 12) ישׁ לו שׂכר הרבה 
('Shirata', 1, 1) וישׁ אז לעתיד 
('Shirata', 1, 3) ישׁ אז לעתיד 
('Shirata', 4, 4) ישׁ גבור במדינה 
('Shirata', 4, 4) ישׁ לו כח וגבורה תכסיס ומלחמה 
('Shirata', 4, 5) ישׁ גבור במדינה 
('Shirata', 4, 6) ישׁ גבור במדינה 
('Shirata', 4, 8) ישׁ גבור במדינה 
('Shirata', 6, 9) ישׁ בהם ממשׁ 


In [15]:
# Column names: the fixed features, one column per extra phrase function,
# and finally the tense of the mother.
header = ['cl_type', 'cl_id', 'book', 'chapter', 'verse', 'clause_type_etcbc', 'clause_rela', 'book2', 'ebh_lbh', 'genre', 'txt_type', 's_p_order', 'CP', 'cl_len', 'subj_type', 'subj_det', 'subj_len', 'pc_type', 'pc_det', 'pc_len', 'main_sub', 'nega', 'language']
header = header + list(extra_cl_funcs) + ['mother']

# Write the header and one row per selected clause, in selection order.
# The with-block closes the file even when a write fails; csv.writer quotes a
# field if it ever contains a comma, and lineterminator='\n' keeps the same
# line ending the rows were written with before.
with open(r"jc_nojc_xbib.csv", "w", newline='', encoding='utf-8') as csvh:
    writer = csv.writer(csvh, lineterminator='\n')
    writer.writerow(header)
    writer.writerows(jc_clauses[key] for key in jc_cl_list)