# Linking clauses with נתן and שים with data from the ETCBC database

In this notebook all clauses with the verbs נתן and שים are extracted from the database, together with information about discourse environment, genre, and whether the clause is a main or subordinate clause. These data are linked to valence data of these verbs. 

In [1]:
import sys, os, csv, collections
import pandas as pd

In [2]:
from tf.app import use
# Load the BHSA app; hoist=globals() is what puts the API handles used below
# (F, E, L, T) into the notebook namespace.
A = use('bhsa', hoist=globals())

# displaySetup assigns the option value it is given, so three separate calls
# would leave only the last feature active. Pass all extra features at once.
A.displaySetup(extraFeatures='kind prs g_cons')

	connecting to online GitHub repo annotation/app-bhsa ... connected
Using TF-app in C:\Users\geitb/text-fabric-data/annotation/app-bhsa/code:
	rv1.2=#5fdf1778d51d938bfe80b37b415e36618e50190c (latest release)
	connecting to online GitHub repo etcbc/bhsa ... connected
Using data in C:\Users\geitb/text-fabric-data/etcbc/bhsa/tf/c:
	rv1.6=#bac4a9f5a2bbdede96ba6caea45e762fe88f88c5 (latest release)
	connecting to online GitHub repo etcbc/phono ... connected
Using data in C:\Users\geitb/text-fabric-data/etcbc/phono/tf/c:
	r1.2 (latest release)
	connecting to online GitHub repo etcbc/parallels ... connected
Using data in C:\Users\geitb/text-fabric-data/etcbc/parallels/tf/c:
	r1.2 (latest release)
   |     0.01s No structure info in otext, the structure part of the T-API cannot be used


In [4]:
# %load main_sub
def in_dep_calc(cl):
    """Classify a clause as main or subordinate.

    Three sources are consulted in order; the first one that yields a
    non-empty label wins:

    1. ``rela_calc`` on the clause itself, or on its mother when the clause
       is resumptive (``rela == 'ReSu'``);
    2. the presence of a word with ``vt == 'wayq'`` (wayyiqtol), which gives
       ``'Main'``;
    3. ``carc_calc`` on the clause atoms of the clause.

    Parameters
    ----------
    cl : int
        Text-Fabric node of the clause.

    Returns
    -------
    str
        The label produced by ``rela_calc`` or ``carc_calc``, or ``'Main'``.

    Raises
    ------
    IndexError
        If a resumptive clause has no mother.
    """
    # A resumptive clause is judged through the clause it resumes.
    target = E.mother.f(cl)[0] if F.rela.v(cl) == 'ReSu' else cl
    in_dep = rela_calc(target)

    # Is there a wayyiqtol? Assign rather than append, so that more than one
    # matching word can never produce 'MainMain'.
    if in_dep == '' and any(F.vt.v(word) == 'wayq' for word in L.d(cl, 'word')):
        in_dep = 'Main'

    # If everything else does not give a result, we look at the CARC.
    if in_dep == '':
        in_dep = carc_calc(L.d(cl, 'clause_atom'))

    return in_dep


def _carc_category(code):
    """Map one numeric clause-atom relation code to 'Main', 'SubAdv' or 'Undc'.

    NOTE(review): the ranges are taken over unchanged from the original
    if/elif chain; they presumably follow the ETCBC clause-atom relation code
    table -- confirm against the BHSA documentation of the `code` feature.
    """
    if code in {0, 999}:
        return 'Main'
    if 499 < code < 999:
        return 'SubAdv'
    if 9 < code < 17:
        return 'SubAdv'
    if 50 < code < 75:
        return 'SubAdv'
    if 99 < code < 168:
        return 'Main'
    if 299 < code < 500:
        return 'Main'
    # Everything else (including 220-223) is undecided.
    return 'Undc'


def carc_calc(cl_atoms):
    """Classify a clause by the relation code of its first clause atom.

    Codes 200 and 201 are not classified directly: the mother edge is
    followed until an atom with a different code is found, and that code is
    classified instead.

    Parameters
    ----------
    cl_atoms : sequence of int
        Clause atom nodes; only the first one is inspected.

    Returns
    -------
    str
        ``'Main'``, ``'SubAdv'`` or ``'Undc'``. A code that matches no range
        always yields ``'Undc'``, also when it was reached through a 200/201
        chain (the original returned ``''`` in that case).

    Raises
    ------
    IndexError
        If ``cl_atoms`` is empty or an atom with code 200/201 has no mother.
    """
    atom = cl_atoms[0]
    # int() is applied everywhere, so the 200/201 test behaves the same as
    # the range tests regardless of how the feature value is stored.
    code = int(F.code.v(atom))
    while code in {200, 201}:
        atom = E.mother.f(atom)[0]
        code = int(F.code.v(atom))
    return _carc_category(code)


def _rela_category(rela):
    """Map a `rela` value to 'SubArg', 'SubMod' or 'SubAdv'; '' for any other value."""
    if rela in {'Subj', 'Objc', 'Cmpl', 'PreC', 'Voct', 'Frnt'}:
        return 'SubArg'
    if rela in {'Attr', 'RgRc', 'Spec'}:
        return 'SubMod'
    if rela in {'Adju', 'PrAd'}:
        return 'SubAdv'
    return ''


def rela_calc(cl):
    """Classify a node by its `rela` feature.

    For a relation other than ``'Coor'`` the label follows directly from the
    relation (``''`` when the relation is not one of the listed values).

    For ``'Coor'``:

    * if the mother is a word or phrase, the result is ``'SubMod'``;
    * otherwise the mother edge is followed as long as the mother itself has
      ``rela == 'Coor'``, and the relation of the node where the chain ends
      is classified. If that gives no label, the result is ``'SubMod'`` when
      the chain ends in something other than a clause, else the result of
      ``carc_calc`` on that clause's clause atoms.

    Parameters
    ----------
    cl : int
        Text-Fabric node (a clause in the calls visible in this notebook).

    Returns
    -------
    str
        ``'SubArg'``, ``'SubMod'``, ``'SubAdv'``, ``''`` or whatever
        ``carc_calc`` returns.

    Raises
    ------
    IndexError
        If a node with ``rela == 'Coor'`` has no mother.
    """
    rela = F.rela.v(cl)
    if rela != 'Coor':
        return _rela_category(rela)

    head = E.mother.f(cl)[0]
    if F.otype.v(head) in {'word', 'phrase'}:
        return 'SubMod'

    # Climb to the first ancestor that is not itself coordinated.
    while F.rela.v(head) == 'Coor':
        head = E.mother.f(head)[0]

    # Classify by the relation of the chain head. The original re-read the
    # relation of `cl` here, which is always 'Coor', so this step never
    # produced a label.
    in_dep_r = _rela_category(F.rela.v(head))

    if in_dep_r == '':
        if F.otype.v(head) != 'clause':
            in_dep_r = 'SubMod'
        else:
            in_dep_r = carc_calc(L.d(head, 'clause_atom'))

    return in_dep_r

In [6]:
# One row (a list of nine strings) per clause, keyed by the clause node.
ntn_dict = {}
fjm_dict = {}

# (value of the `lex` feature to look for, label stored in the row, dict to fill)
verb_targets = (
    ('NTN[', 'NTN', ntn_dict),
    ('FJM[', 'FJM', fjm_dict),
)

for cl in F.otype.s('clause'):
    words = L.d(cl, 'word')
    lexemes = {F.lex.v(w) for w in words}

    # A clause containing both verbs gets a row in both dicts.
    for lexeme, label, target in verb_targets:
        if lexeme not in lexemes:
            continue

        bo, ch, ve = T.sectionFromNode(cl)
        target[cl] = [
            label,
            str(cl),
            bo,
            str(ch),
            str(ve),
            F.language.v(words[0]),                   # language of the first word
            in_dep_calc(cl),                          # main or subordinate clause
            F.txt.v(cl)[-1],                          # Q, D, N, ?
            " ".join(F.g_cons.v(w) for w in words),   # consonantal text of the clause
        ]

Make a dataframe of the data in the dicts.

In [8]:
# Rows are clauses (index = clause node); columns 0-8 are the positions of
# the per-clause feature list.
df_ntn = pd.DataFrame.from_dict(ntn_dict, orient='index')
df_ntn

Unnamed: 0,0,1,2,3,4,5,6,7,8
427611,NTN,427611,Genesis,1,17,Hebrew,Main,N,W JTN >TM >LHJM B RQJ< H CMJM
427664,NTN,427664,Genesis,1,29,Hebrew,Main,Q,HNH NTTJ LKM >T KL <FB W >T KL H <Y
427797,NTN,427797,Genesis,3,6,Hebrew,Main,N,W TTN GM L >JCH <MH
427823,NTN,427823,Genesis,3,12,Hebrew,SubMod,Q,>CR NTTH <MDJ
427824,NTN,427824,Genesis,3,12,Hebrew,Main,Q,HW> NTNH LJ MN H <Y
427922,NTN,427922,Genesis,4,12,Hebrew,SubArg,Q,TT KXH LK
428269,NTN,428269,Genesis,9,2,Hebrew,Main,Q,B JDKM NTNW
428272,NTN,428272,Genesis,9,3,Hebrew,Main,Q,K JRQ <FB NTTJ LKM >T KL
428298,NTN,428298,Genesis,9,12,Hebrew,SubMod,Q,>CR >NJ NTN BJNJ W BJNJKM W BJN KL NPC XJH
428300,NTN,428300,Genesis,9,13,Hebrew,Main,Q,>T QCTJ NTTJ B <NN


In [9]:
# Rows are clauses (index = clause node); columns 0-8 are the positions of
# the per-clause feature list.
df_fjm = pd.DataFrame.from_dict(fjm_dict, orient='index')
df_fjm

Unnamed: 0,0,1,2,3,4,5,6,7,8
427704,FJM,427704,Genesis,2,8,Hebrew,Main,N,W JFM CM >T H >DM
427936,FJM,427936,Genesis,4,15,Hebrew,Main,N,W JFM JHWH L QJN >WT
428111,FJM,428111,Genesis,6,16,Hebrew,Main,Q,W PTX H TBH B YDH TFJM
428332,FJM,428332,Genesis,9,23,Hebrew,Main,N,W JFJMW <L CKM CNJHM
428634,FJM,428634,Genesis,13,16,Hebrew,Main,Q,W FMTJ >T ZR<K K <PR H >RY
429401,FJM,429401,Genesis,21,13,Hebrew,Main,Q,L GWJ >FJMNW
429406,FJM,429406,Genesis,21,14,Hebrew,SubAdv,N,FM <L CKMH
429431,FJM,429431,Genesis,21,18,Hebrew,SubAdv,Q,KJ L GWJ GDWL >FJMNW
429510,FJM,429510,Genesis,22,6,Hebrew,Main,N,W JFM <L JYXQ BNW
429531,FJM,429531,Genesis,22,9,Hebrew,Main,N,W JFM >TW <L H MZBX M M<L L <YJM


Concatenate the two dataframes (the נתן rows followed by the שים rows).

In [11]:
# Stack the two frames row-wise, keeping the clause nodes as index.
verb_frames = [df_ntn, df_fjm]
df_ntn_fjm = pd.concat(verb_frames, axis=0, ignore_index=False)
df_ntn_fjm

Unnamed: 0,0,1,2,3,4,5,6,7,8
427611,NTN,427611,Genesis,1,17,Hebrew,Main,N,W JTN >TM >LHJM B RQJ< H CMJM
427664,NTN,427664,Genesis,1,29,Hebrew,Main,Q,HNH NTTJ LKM >T KL <FB W >T KL H <Y
427797,NTN,427797,Genesis,3,6,Hebrew,Main,N,W TTN GM L >JCH <MH
427823,NTN,427823,Genesis,3,12,Hebrew,SubMod,Q,>CR NTTH <MDJ
427824,NTN,427824,Genesis,3,12,Hebrew,Main,Q,HW> NTNH LJ MN H <Y
427922,NTN,427922,Genesis,4,12,Hebrew,SubArg,Q,TT KXH LK
428269,NTN,428269,Genesis,9,2,Hebrew,Main,Q,B JDKM NTNW
428272,NTN,428272,Genesis,9,3,Hebrew,Main,Q,K JRQ <FB NTTJ LKM >T KL
428298,NTN,428298,Genesis,9,12,Hebrew,SubMod,Q,>CR >NJ NTN BJNJ W BJNJKM W BJN KL NPC XJH
428300,NTN,428300,Genesis,9,13,Hebrew,Main,Q,>T QCTJ NTTJ B <NN


Save the df as [csv file](fjm_ntn_qn_mainsub.csv).

In [12]:
# Write the combined frame next to the notebook; the clause-node index is
# written as the first column.
df_ntn_fjm.to_csv('fjm_ntn_qn_mainsub.csv', index=True)