In [1]:
import pandas as pd

In [34]:
from tf.app import use
A = use('etcbc/bhsa', hoist=globals())

A.load(['g_prs', 'g_nme', 'g_pfm', 'g_vbs', 'g_vbe'])

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


True

In [35]:
column_names = ['tf_id', 'book', 'chapter', 'verse', 'lex', 'lex_hebrew', 'g_cons', 'g_cons_hebrew', 'g_cons_no_suff', 'suff', 'vt', 'vs', 
               'gn', 'nu', 'ps', 'verb_ends_on_w', 'has_suffix', 'w_in_suffix', 'has_final_nun', 'hollow_root',
              'hollow_vowel_letter', 'ayin_ayin_root', 'ayin_vowel_letter']

In [36]:
valid_suff_chars = {'H', 'J', 'K', 'M', 'N', 'W'}

In [37]:
lexemes_with_suff = set()
for w in F.otype.s('word'):
    
    is_perfect = F.vt.v(w) == 'perf' and F.ps.v(w) == 'p3' and F.nu.v(w) == 'pl'
    is_imperfect = F.vt.v(w) == 'impf' and F.ps.v(w) in {'p2', 'p3'} and F.nu.v(w) == 'pl' and F.gn.v(w) == 'm'
    is_imperative = F.vt.v(w) == 'impv' and F.nu.v(w) == 'pl' and F.gn.v(w) == 'm'
    
    if is_perfect or is_imperfect or is_imperative:
        suff = ''.join([char for char in F.g_prs.v(w) if char in valid_suff_chars])
        if suff:
            lexemes_with_suff.add(F.lex.v(w))
            
len(lexemes_with_suff)

212

In [38]:
def get_hollow_root_info(lex, g_cons):
    hollow_root = 0
    hollow_vowel_letter = 0
    if lex[1] in {'J', 'W'} and lex[2] != 'H':
        hollow_root = 1
        first_letter_idx = g_cons.index(lex[0])
        if g_cons[first_letter_idx + 1] in {'W', 'J'}:
            hollow_vowel_letter = 1
            
    return hollow_root, hollow_vowel_letter

In [48]:
def get_ayin_ayin_info(lex, g_cons):

    ayin_ayin_root = 0
    ayin_vowel_letter = 0
    if lex[1] == lex[2]:
        ayin_ayin_root = 1
        first_letter_idx = g_cons.index(lex[0])
        
        try:
            if g_cons[first_letter_idx + 1] in {'W', 'J'}:
                ayin_vowel_letter = 1
        except:
            ayin_vowel_letter = 0
    return ayin_ayin_root, ayin_vowel_letter

In [54]:
vbe_set = set()

info_dict = {}

for w in F.otype.s('word'):
    
    is_perfect = F.vt.v(w) == 'perf' and F.ps.v(w) == 'p3' and F.nu.v(w) == 'pl'
    is_imperfect = F.vt.v(w) in {'impf', 'wayq'} and F.ps.v(w) in {'p2', 'p3'} and F.nu.v(w) == 'pl' and F.gn.v(w) == 'm'
    is_imperative = F.vt.v(w) == 'impv' and F.nu.v(w) == 'pl' and F.gn.v(w) == 'm'
    lang = F.language.v(w)
    if is_perfect or is_imperfect or is_imperative and lang == 'Hebrew':
        suff = ''.join([char for char in F.g_prs.v(w) if char in valid_suff_chars])
        
        vbe = F.g_vbe.v(w)
        vbe_set.add(vbe)
        
        lex = F.lex.v(w)
        g_cons = F.g_cons.v(w)
        has_suffix = int(len(suff) > 0)
        bo, ch, ve = T.sectionFromNode(w)
        if F.qere.v(w) != None:
            continue
        if lex in lexemes_with_suff:
            if vbe in {'[U', '[W', '[W.', '[W.-',} or 'N' in vbe:
                #print(w, bo, ch, ve, vbe, F.g_cons.v(w), suff, F.qere.v(w))
                #continue
            
                g_cons_no_suff = F.g_cons.v(w).rstrip(suff)
                final_nun = 0
                if 'N' in vbe:
                    g_cons_no_suff = F.g_cons.v(w).rstrip('N')
                    final_nun = 1
                suff_has_w = int('W' in suff)
                verb_ends_on_w = int(g_cons_no_suff[-1] == 'W')
            
                hollow_root, hollow_vowel_letter = get_hollow_root_info(lex, g_cons)
                ayin_ayin_root, ayin_vowel_letter = get_ayin_ayin_info(lex, g_cons)
                    

                info_dict[w] = [w, bo, ch, ve, lex, F.lex_utf8.v(w), g_cons, F.g_cons_utf8.v(w), g_cons_no_suff, suff, 
                    F.vt.v(w), '-', F.gn.v(w), F.nu.v(w), F.ps.v(w), verb_ends_on_w, has_suffix, suff_has_w, final_nun,
                               hollow_root, hollow_vowel_letter, ayin_ayin_root, ayin_vowel_letter]
                

In [55]:
vbe_set

{'[',
 '[H',
 '[J',
 '[ON',
 '[OW',
 '[OWN',
 '[TEM',
 '[U',
 '[UN',
 '[UN.',
 '[W',
 '[W.',
 '[W.-',
 '[W.>',
 '[W.N',
 '[W.N.',
 '[WJ',
 '[WN'}

In [56]:
len(info_dict)

7284

In [57]:
dat = pd.DataFrame(info_dict).T

dat.columns = column_names
dat.head(20)

Unnamed: 0,tf_id,book,chapter,verse,lex,lex_hebrew,g_cons,g_cons_hebrew,g_cons_no_suff,suff,...,nu,ps,verb_ends_on_w,has_suffix,w_in_suffix,has_final_nun,hollow_root,hollow_vowel_letter,ayin_ayin_root,ayin_vowel_letter
424,424,Genesis,1,22,ML>[,מלא,ML>W,מלאו,ML>W,,...,pl,p2,1,0,0,0,0,0,0,0
564,564,Genesis,1,28,ML>[,מלא,ML>W,מלאו,ML>W,,...,pl,p2,1,0,0,0,0,0,0,0
569,569,Genesis,1,28,KBC[,כבשׁ,KBCH,כבשׁה,KBC,H,...,pl,p2,0,1,0,0,0,0,0,0
675,675,Genesis,2,1,KLH[,כלה,JKLW,יכלו,JKLW,,...,pl,p3,1,0,0,0,0,0,0,0
1192,1192,Genesis,3,1,>KL[,אכל,T>KLW,תאכלו,T>KLW,,...,pl,p2,1,0,0,0,0,0,0,0
1224,1224,Genesis,3,3,>KL[,אכל,T>KLW,תאכלו,T>KLW,,...,pl,p2,1,0,0,0,0,0,0,0
1231,1231,Genesis,3,3,MWT[,מות,TMTWN,תמתון,TMTW,,...,pl,p2,1,0,0,1,1,0,0,0
1241,1241,Genesis,3,4,MWT[,מות,TMTWN,תמתון,TMTW,,...,pl,p2,1,0,0,1,1,0,0,0
1303,1303,Genesis,3,7,JD<[,ידע,JD<W,ידעו,JD<W,,...,pl,p3,1,0,0,0,0,0,0,0
1312,1312,Genesis,3,7,<FH[,עשׂה,J<FW,יעשׂו,J<FW,,...,pl,p3,1,0,0,0,0,0,0,0


In [58]:
dat.to_csv('plural_verbs_mt.csv', sep='\t', index=False)

In [59]:
pd.crosstab(dat.verb_ends_on_w, dat.has_suffix)

has_suffix,0,1
verb_ends_on_w,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4,354
1,6524,402


In [60]:
pd.crosstab(dat.verb_ends_on_w, dat.w_in_suffix)

w_in_suffix,0,1
verb_ends_on_w,Unnamed: 1_level_1,Unnamed: 2_level_1
0,106,252
1,6924,2


In [61]:
from tf.app import use
A = use('dt-ucph/sp', version='3.4', hoist=globals())

A.load(['g_prs', 'g_nme', 'g_pfm', 'g_vbs', 'g_vbe'])

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,5,79878.4,100
chapter,187,2135.79,100
verse,5841,68.38,100
word,114890,3.48,100
sign,399392,1.0,100


True

In [62]:
lexemes_with_suff

{'<BD[',
 '<BR[',
 '<CQ[',
 '<FH[',
 '<LH[',
 '<MM[',
 '<NH[',
 '<RK[',
 '<WD[',
 '<WT[',
 '<YB=[',
 '<YB[',
 '<ZB[',
 '<ZR[',
 '>CR=[',
 '>HB[',
 '>KL[',
 '>MR[',
 '>PP[',
 '>RH[',
 '>SR[',
 '>XZ[',
 'B<L[',
 'B<T[',
 'BHL[',
 'BL<[',
 'BNH[',
 'BQC[',
 'BQQ[',
 'BRK[',
 'BTQ[',
 'BW>[',
 'BXN[',
 'BZZ[',
 'C>L[',
 'CBH[',
 'CBR[',
 'CBX[',
 'CDD[',
 'CJT[',
 'CKL[',
 'CKR[',
 'CKX[',
 'CLK[',
 'CLL[',
 'CLM[',
 'CLX[',
 'CM<[',
 'CMC[',
 'CMD[',
 'CMV[',
 'CNH[',
 'CPV[',
 'CQH[',
 'CRT[',
 'CSS[',
 'CTH[',
 'CVP[',
 'CVX[',
 'CWB[',
 'CXR[',
 'DBQ[',
 'DJG[',
 'DK>[',
 'DLQ[',
 'DMH[',
 'DPQ[',
 'DQR[',
 'DRC[',
 'DRK[',
 'F<R==[',
 'FJM[',
 'FMX[',
 'FN>[',
 'FRP[',
 'FVM[',
 'FVN[',
 'FWF[',
 'G>L[',
 'GDL[',
 'GML[',
 'GNB[',
 'GRC[',
 'HDP[',
 'HLL[',
 'HLM[',
 'HRG[',
 'J<L[',
 'J<Y[',
 'JBL[',
 'JC<[',
 'JD<[',
 'JDH[',
 'JR>[',
 'JRC[',
 'JRD[',
 'JRH=[',
 'JRH[',
 'JSR[',
 'JY>[',
 'JYT[',
 'K<S[',
 'KBC[',
 'KBD[',
 'KLH[',
 'KLM[',
 'KRH[',
 'KSH[',
 'KTR[',
 'KWL[',
 'LJY

In [64]:
vbe_set = set()

info_dict = {}

for w in F.otype.s('word'):
    
    is_perfect = F.vt.v(w) == 'perf' and F.ps.v(w) == 'p3' and F.nu.v(w) == 'pl'
    is_imperfect = F.vt.v(w) in {'impf', 'wayq'} and F.ps.v(w) in {'p2', 'p3'} and F.nu.v(w) == 'pl' and F.gn.v(w) == 'm'
    is_imperative = F.vt.v(w) == 'impv' and F.nu.v(w) == 'pl' and F.gn.v(w) == 'm'
    lang = F.language.v(w)
    if is_perfect or is_imperfect or is_imperative and lang == 'Hebrew':
        suff = ''.join([char for char in F.g_prs.v(w) if char in valid_suff_chars])
        #print(F.lex_utf8.v(w))
        vbe = F.g_vbe.v(w)
        vbe_set.add(vbe)
        
        lex = F.lex.v(w)
        g_cons = F.g_cons.v(w)
        has_suffix = int(len(suff) > 0)
        bo, ch, ve = T.sectionFromNode(w)
        if lex in lexemes_with_suff:
            final_nun = 0
            g_cons_no_suff = F.g_cons.v(w).rstrip(suff)
            if vbe == '[WN':
                g_cons_no_suff = F.g_cons.v(w).rstrip('N')
                final_nun = 1
            
            suff_has_w = int('W' in suff)
            verb_ends_on_w = int(g_cons_no_suff[-1] == 'W')
            
            hollow_root, hollow_vowel_letter = get_hollow_root_info(lex, g_cons)
            ayin_ayin_root, ayin_vowel_letter = get_ayin_ayin_info(lex, g_cons) 
            
            if not g_cons_no_suff.endswith('WN'):
                info_dict[w] = [w, bo, ch, ve, lex, F.lex_utf8.v(w), g_cons, F.g_cons_utf8.v(w), g_cons_no_suff, suff, 
                    F.vt.v(w), '-', F.gn.v(w), F.nu.v(w), F.ps.v(w), verb_ends_on_w, has_suffix, suff_has_w, final_nun,
                               hollow_root, hollow_vowel_letter, ayin_ayin_root, ayin_vowel_letter]

In [65]:
dat = pd.DataFrame(info_dict).T

dat.columns = column_names
dat.head(20)

Unnamed: 0,tf_id,book,chapter,verse,lex,lex_hebrew,g_cons,g_cons_hebrew,g_cons_no_suff,suff,...,nu,ps,verb_ends_on_w,has_suffix,w_in_suffix,has_final_nun,hollow_root,hollow_vowel_letter,ayin_ayin_root,ayin_vowel_letter
405848,405848,Genesis,1,22,ML>[,מלא[,ML>W,מלאו,ML>W,,...,pl,p2,1,0,0,0,0,0,0,0
405987,405987,Genesis,1,28,ML>[,מלא[,ML>W,מלאו,ML>W,,...,pl,p2,1,0,0,0,0,0,0,0
405992,405992,Genesis,1,28,KBC[,כבשׁ[,KBCWH,כבשׁוה,KBCW,H,...,pl,p3,1,1,0,0,0,0,0,0
406099,406099,Genesis,2,1,KLH[,כלה[,JKLW,יכלו,JKLW,,...,pl,p3,1,0,0,0,0,0,0,0
406617,406617,Genesis,3,1,>KL[,אכל[,T>KLW,תאכלו,T>KLW,,...,pl,p2,1,0,0,0,0,0,0,0
406651,406651,Genesis,3,3,>KL[,אכל[,T>KLW,תאכלו,T>KLW,,...,pl,p2,1,0,0,0,0,0,0,0
406658,406658,Genesis,3,3,MWT[,מות[,TMWTWN,תמותון,TMWTW,,...,pl,p2,1,0,0,1,1,1,0,0
406668,406668,Genesis,3,4,MWT[,מות[,TMWTWN,תמותון,TMWTW,,...,pl,p2,1,0,0,1,1,1,0,0
406723,406723,Genesis,3,6,>KL[,אכל[,J>KLW,יאכלו,J>KLW,,...,pl,p3,1,0,0,0,0,0,0,0
406729,406729,Genesis,3,7,JD<[,ידע[,JD<W,ידעו,JD<W,,...,pl,p3,1,0,0,0,0,0,0,0


In [66]:
dat.to_csv('plural_verbs_sp.csv', sep='\t', index=False)

In [67]:
dat.shape

(1679, 23)

In [68]:
pd.crosstab(dat.verb_ends_on_w, dat.has_suffix)

has_suffix,0,1
verb_ends_on_w,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,29
1,1582,68
