In [1]:
import os
import re
import json
import numpy as np
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm
from tpn.utils import most_common, oppose, load_data_from_tsv, load_data_from_perseg_tsv
from tpn.scene import get_scenes, postprocess_scenes
from tpn.coref import get_namedict, map_names, correct_coref, update_genderdict, prop_gen, correct_gender
from tpn.actions import get_interactions

In [None]:
# --- Input / output locations (fill in before running) ----------------------
file_id = ""                # only files whose name starts with this prefix are processed
input_folder = ""           # folder that is listed for the per-scene tsv files
output_folder = ""          # folder that receives the json input for graph creation
conll_output_folder = ""    # folder that receives the combined tsv of all scenes

# --- Settings for first-person ("ich") narratives ---------------------------
# When the flag is True, first-person tokens in rows with anys == "no" get the
# narrator's name written into their "named_coref" column.
# NOTE: the flag name is spelled exactly as the processing cell reads it.
ich_pespective = False
erzähler = "john"           # narrator name assigned to those tokens
first_person = [
    "ich",
    "mir",
    "mein",
    "meine",
    "meiner",
    "meinem",
    "meinen",
]

In [2]:
# Collect the names of all entries in `input_folder` that start with
# `file_id`, in sorted order so the scenes are processed deterministically.
files = sorted(
    entry for entry in os.listdir(input_folder) if entry.startswith(file_id)
)

In [17]:
# Build the graph input (one entry per scene) and one combined token table
# from the per-scene tsv files listed in `files`.

if not files:
    # Without this guard the loop is skipped and the final export fails with a
    # NameError on `fname`; fail early with an actionable message instead.
    raise FileNotFoundError(
        "no input files starting with {!r} found in {!r}".format(file_id, input_folder)
    )

full_namelist = []   # character names collected over all scenes so far
gender_dict = {}     # handed to and replaced by update_genderdict every scene
output = []          # one scene_info dict per scene, dumped as json
scene_frames = []    # per-scene dataframes, concatenated once after the loop

for fname in files:

    # Output files are named after the input file minus its last 11 characters
    # (presumably a "_NN.txt.tsv"-style scene suffix -- TODO confirm).
    out_stem = fname[:-11]

    # load tsv file
    data = load_data_from_perseg_tsv(os.path.join(input_folder, fname))

    # add scene number to dataframe: strip everything up to the last "_" and
    # everything from ".txt" onwards, leaving only the number
    sc = int(re.sub(r".*_|\.txt.*", "", fname))
    data["scene"] = sc

    # generate corefID/name dictionary
    scene_namedict = get_namedict(data)

    # map names to new column "named_coref"
    data = map_names(data, scene_namedict)

    # postprocessing for ich-narratives: first-person tokens in rows with
    # anys == "no" are attributed to the narrator
    if ich_pespective:
        narrator_rows = (data.anys == "no") & data.token.isin(first_person)
        data.loc[narrator_rows, "named_coref"] = erzähler

    # get all characters present and update full namelist
    # (dict.fromkeys de-duplicates like set() but keeps first-seen order, so
    # the lists -- and everything derived from them -- are the same on every run)
    personal = list(dict.fromkeys(scene_namedict.values()))
    full_namelist = list(dict.fromkeys(full_namelist + personal))

    # coreference postprocessing
    data = correct_coref(data, full_namelist)

    # characters actually referenced in this scene after the corrections
    personal = list(dict.fromkeys(data.named_coref.dropna()))
    gender_dict = update_genderdict(data, personal, gender_dict)

    data = correct_gender(data, gender_dict)

    # query for actions
    store = get_interactions(data, personal)

    # create input format for graph creation
    scene_info = {
        "personal": personal,
        "time": sc,
        "verbs": store[0],
        "objects": store[1],
        "constructs": store[2],
        # share of rows with anys == "yes"; an empty scene counts as 0.0
        # instead of raising ZeroDivisionError
        "speech_prop": len(data[data.anys == "yes"]) / len(data) if len(data) else 0.0,
        # NOTE(review): every scene stores the same gender_dict object; if
        # update_genderdict mutates it in place, all scenes end up sharing the
        # final state -- confirm whether a per-scene snapshot is wanted.
        "genders": gender_dict,
    }
    output.append(scene_info)

    # keep the scene for the combined table (concatenated once below instead
    # of growing a dataframe inside the loop)
    scene_frames.append(data)

    # store input for graph creation; rewritten after every scene with the
    # cumulative list, so the file always reflects the scenes processed so far
    with open(os.path.join(output_folder, out_stem + ".json"), "w", encoding="utf-8") as fout:
        json.dump(output, fout)

# store all scenes in one conll file (named after the last processed file)
full_frame = pd.concat(scene_frames, axis=0)
full_frame.to_csv(os.path.join(conll_output_folder, out_stem + ".tsv"), sep="\t")