In [1]:
import os
import re
import json
import numpy as np
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm
from tpn.utils import most_common, oppose, load_data_from_tsv, load_data_from_perseg_tsv
from tpn.scene import get_scenes, postprocess_scenes
from tpn.coref import get_namedict, map_names, correct_coref, update_genderdict, prop_gen, correct_gender
from tpn.actions import get_interactions

In [2]:
files = os.listdir("./../files/tsv_split/")
files = [x for x in files if x.startswith("k00300000001")]
files.sort()

In [17]:
full_namelist = []
full_frame = pd.DataFrame()
gender_dict = {}
output = []

ich_pespective = False
erzähler = "john"
first_person = ["ich","mir","mein","meine","meiner","meinem","meinen"]

for fname in files:
    
    data = load_data_from_perseg_tsv("./../files/tsv_split/"+fname)
    sc = int(re.sub(".*\_|\.txt.*","",fname))
    data["scene"] = sc
    
    scene_namedict = get_namedict(data)
    data = map_names(data, scene_namedict)
    
    if ich_pespective:
        mistakes = data[(data.anys == "no") & (data.token.isin(first_person))].index
        for mis in mistakes:
            data.loc[mis,"named_coref"] = erzähler
    
    
    personal = list(set(scene_namedict.values()))
    full_namelist = list(set(full_namelist+personal))
    
    data = correct_coref(data, full_namelist)
    
    personal = list(set(data[~data.named_coref.isna()].named_coref))
    gender_dict = update_genderdict(data, personal, gender_dict)
    
    data = correct_gender(data, gender_dict)
    
    store = get_interactions(data, personal)
    
    scene_info = {}
    scene_info["personal"] = personal
    scene_info["time"] = sc
    scene_info["verbs"] = store[0]
    scene_info["objects"] = store[1]
    scene_info["constructs"] = store[2]
    scene_info["speech_prop"] = len(data[data.anys == "yes"])/len(data)
    scene_info["speech_prop"] = len(data[data.anys == "yes"])/len(data)
    scene_info["genders"] = gender_dict
    output.append(scene_info)
    
    full_frame = pd.concat([full_frame, data], axis=0)
    full_frame.to_csv("conll+/"+fname[:-11]+".tsv", sep="\t")
    with open('json_store/'+fname[:-11]+".json", 'w') as fout:
            json.dump(output, fout)
            

In [67]:
data[data.sent == 44]

Unnamed: 0,token,sent,pos,upos,morph,lemma,head,deprel,sd,si,sr,sf,coref,ner,anys,scene,named_coref


In [35]:
a = data[data.sent == 12].loc[:,["token","head","deprel","named_coref"]]
a.index = list(range(1,len(a)+1,1))

In [36]:
a

Unnamed: 0,token,head,deprel,named_coref
1,Sichtlich,2,adv,
2,zaudernd,3,adv,
3,hob,0,root,
4,er,3,subj,rashad
5,die,6,det,
6,Bettdecke,3,obja,
7,ein,8,det,
8,Stück,0,root,
9,an,0,root,
10,.,0,root,


In [61]:
full_frame[(full_frame.scene == 17) & (~full_frame.named_coref.isna())]

Unnamed: 0,token,sent,pos,upos,morph,lemma,head,deprel,sd,si,sr,sf,coref,ner,anys,scene,named_coref
10,ich,0,PPER,PRON,1|Sg|_|Nom,ich,10,subj,no,no,no,no,2.0,,no,17,john
22,mir,1,PPER,PRON,1|Sg|_|Dat,mir,3,objd,no,no,no,no,2.0,,no,17,john
77,ich,6,PPER,PRON,1|Sg|_|Nom,ich,4,subj,no,no,no,no,2.0,,no,17,john
83,mir,7,PPER,PRON,1|Sg|_|Dat,mir,5,objd,no,no,no,no,2.0,,no,17,john
101,ich,7,PPER,PRON,1|Sg|_|Nom,ich,21,subj,no,no,no,no,2.0,,no,17,john
103,Frau,7,NN,NOUN,Fem|Acc|Sg,Frau,21,obja,no,no,no,no,5.0,,no,17,wayne
105,Sie,8,PPER,PRON,3|Sg|Fem|Nom,sie,2,subj,no,no,no,no,5.0,,no,17,wayne
110,Mann,8,NN,NOUN,Masc|Dat|Sg,Mann,4,pn,no,no,no,no,6.0,,no,17,kojak
112,der,8,PRELS,PRON,Masc|Nom|Sg,der,9,subj,no,no,no,no,6.0,,no,17,kojak
115,Kojak,8,NE,PROPN,_,Kojak,10,cj,no,no,no,no,6.0,PER,no,17,kojak


In [24]:
s = os.listdir("./../files/out_split/")
s.sort()
len(s)

3956

In [4]:
Counter([x[:-6] for x in s])

Counter({'102568611X_': 21,
         '1025686411_': 16,
         '1025686527_': 19,
         '1025686667_': 18,
         '1025686926_': 18,
         '1025692829_': 16,
         '1025693256_': 16,
         'k00300000001_': 28,
         'k00300000002_': 35,
         'k00300000003_': 20,
         'k00300000006_': 38,
         'k00300000010_': 24,
         'k00300000011_': 28,
         'k00300000012_': 22,
         'k00300000013_': 26,
         'k00300000014_': 19,
         'k00300000018_': 34,
         'k00300000020_': 25,
         'k00300000025_': 29,
         'k00300000026_': 29,
         'k00300000027_': 35,
         'k00300000028_': 18,
         'k00300000032_': 29,
         'k00300000033_': 25,
         'k00300000034_': 13,
         'k00300000036_': 24,
         'k00300000037_': 37,
         'k00300000037_1': 4,
         'k00300000038_': 25,
         'k00300000041_': 33,
         'k00300000043_': 33,
         'k00300000045_': 19,
         'k00300000046_': 29,
         'k00300000047_'

In [93]:
len(s)

1412

In [10]:
z = os.listdir("./../files/in_split/")
len([x for x in z if x.startswith("k00300001152_")])

116

In [6]:
z

['k00300000086_24.txt',
 '102568611X_45.txt',
 'k00300000053_62.txt',
 'k00300000734_85.txt',
 'k00300000089_56.txt',
 'k00300000071_59.txt',
 'k00300000746_59.txt',
 'k00300000764_15.txt',
 'k00300000762_28.txt',
 'k00300000858_00.txt',
 '1025686926_29.txt',
 'k00300000070_52.txt',
 'k00300000027_48.txt',
 'k00300000041_35.txt',
 'k00300000747_02.txt',
 'k00300001140_38.txt',
 'k00300000034_27.txt',
 'k00300000066_09.txt',
 'k00300000003_05.txt',
 'k00300001157_32.txt',
 'k00300000864_31.txt',
 'k00300000736_35.txt',
 'k00300000089_78.txt',
 'k00300001158_32.txt',
 'k00300001164_16.txt',
 'k00300001135_77.txt',
 'k00300000765_23.txt',
 'k00300000762_26.txt',
 'k00300000084_47.txt',
 'k00300000740_52.txt',
 'k00300001162_54.txt',
 'k00300000864_30.txt',
 'k00300000844_21.txt',
 'k00300000026_38.txt',
 'k00300001166_51.txt',
 'k00300001153_01.txt',
 'k00300001162_48.txt',
 'k00300000073_54.txt',
 'k00300001175_13.txt',
 'k00300000769_17.txt',
 'k00300000768_01.txt',
 'k00300001153_50.tx

In [87]:
len(os.listdir("./../files/in_split/"))

11461