In [1]:
import json
import os

import pandas as pd

In [10]:
### CONFIGURATION
#
# Which run should the notebook execute? 1, 2 or 3.
# Uncomment exactly one of the assignments below.
#chosen_run = 1 # one JSON per sentence
chosen_run = 2 # one JSON per document
#chosen_run = 3 # one JSON per relation
#
# Fail fast on a typo: every later cell is guarded by `chosen_run == N`, so an
# unsupported value would otherwise silently skip all of the export cells.
if chosen_run not in (1, 2, 3):
    raise ValueError("chosen_run must be 1, 2 or 3, got {!r}".format(chosen_run))
#
###

## Auxiliary methods

In [20]:
def create_denotation(start, end, id_counter, tipo):
    """Build one denotation (entity annotation) dictionary.

    Args:
        start: character offset where the entity begins in the text.
        end: character offset just past the entity's last character.
        id_counter: number used to build the denotation id ("T<id_counter>").
        tipo: entity type, stored under the 'obj' key.

    Returns:
        A dict with the keys 'id', 'obj' and 'span' (in that insertion order);
        'span' is itself a dict with the keys 'begin' and 'end'.
    """
    # Key order is kept the same as before so that json.dump emits identical files.
    return {
        'id': "T{}".format(id_counter),
        'obj': tipo,
        'span': {'begin': start, 'end': end},
    }

In [21]:
def create_relation(id_counter, tipo_rel, subj_entity, obj_entity):
    """Build one relation dictionary linking two denotations.

    Args:
        id_counter: number used to build the relation id ("R<id_counter>").
        tipo_rel: relation type, stored under the 'pred' key.
        subj_entity: id of the subject denotation (stored under 'subj').
        obj_entity: id of the object denotation (stored under 'obj').

    Returns:
        A dict with the keys 'id', 'pred', 'subj' and 'obj' (in that insertion order).
    """
    # Key order is kept the same as before so that json.dump emits identical files.
    return {
        'id': "R{}".format(id_counter),
        'pred': tipo_rel,
        'subj': subj_entity,
        'obj': obj_entity,
    }

In [22]:
def get_denotations_from_sentence(df, sentence, entity_indexes, p_index=None):
    """Create the denotations for every entity of one sentence.

    Entities are located in ``sentence`` from left to right: each entity is
    searched for only after the end of the previously located one, so the
    order of ``entity_indexes`` is assumed to follow the order in which the
    entities appear in the text.

    Args:
        df: dataset DataFrame; the columns p_index, entity_index, entity,
            entity_id and tipo_final are read.
        sentence: text of the sentence.
        entity_indexes: entity_index values of the entities in this sentence.
        p_index: index of the sentence in the dataset. When omitted, the
            module-level variable ``p_index`` is used, which is what callers
            written before this parameter existed rely on.

    Returns:
        A tuple ``(all_denotations, entity_id_to_kindred_id_dict)``: the list of
        denotation dicts, and a dict mapping each dataset entity ID to the
        corresponding Kindred JSON ID (i.e. T1, T2, T3 ...). The dict is used
        later to check whether both entities of a relation are in this sentence.

    Raises:
        NameError: if an entity has to be looked up but ``p_index`` was neither
            passed nor defined at module level.
        ValueError: if an entity's text cannot be found in the remaining part
            of the sentence.
    """
    if p_index is None:
        # Backward compatibility: the function used to read this global directly.
        p_index = globals().get("p_index")

    all_denotations = []
    entity_id_to_kindred_id_dict = {}

    # position in `sentence` right after the last entity that was located
    offset = 0

    for denotation_id, entity_index in enumerate(entity_indexes, start=1):
        if p_index is None:
            raise NameError("p_index was not passed and no global 'p_index' is defined")

        # build the row selector once and reuse it for the three columns
        row_mask = (df.p_index == p_index) & (df.entity_index == entity_index)
        entity = df.loc[row_mask, "entity"].values[0]
        entity_id = df.loc[row_mask, "entity_id"].values[0]
        tipo = df.loc[row_mask, "tipo_final"].values[0]

        # search only past the entities that have already been processed
        entity_start_index = sentence.find(entity, offset)
        if entity_start_index < 0:
            raise ValueError(
                "entity {!r} (id {}) not found in sentence {} after offset {}".format(
                    entity, entity_id, p_index, offset))
        entity_end_index = entity_start_index + len(entity)
        offset = entity_end_index

        denotation = create_denotation(
            entity_start_index,
            entity_end_index,
            denotation_id,
            tipo)

        all_denotations.append(denotation)

        # add this entity to the dictionary of the sentence
        entity_id_to_kindred_id_dict[entity_id] = denotation['id']

    return all_denotations, entity_id_to_kindred_id_dict

In [23]:
def get_denotations_from_doc(df, sentence, doc_index):
    """Create the denotations for every entity of one document.

    Args:
        df: dataset DataFrame; the columns doc_index, entity_id, entity and
            tipo_final are read.
        sentence: the text in which the entities are located (the caller passes
            the concatenation of all sentences of the document).
        doc_index: index of the document whose entities are processed.

    Returns:
        A tuple ``(denotations, id_map)``: the list of denotation dicts, and a
        dict mapping each dataset entity ID to the corresponding Kindred JSON
        ID (i.e. T1, T2, T3 ...).
    """
    denotations = []
    kindred_ids = {}

    # position in `sentence` right after the last entity that was located;
    # every search starts here, so already-processed text is never matched again
    cursor = 0

    # all the IDs of the entities that belong to this document
    doc_entity_ids = df.loc[df.doc_index == doc_index, "entity_id"].values

    for counter, entity_id in enumerate(doc_entity_ids, start=1):
        same_entity = df.entity_id == entity_id
        entity = df.loc[same_entity, "entity"].values[0]
        tipo = df.loc[same_entity, "tipo_final"].values[0]

        # str.index raises ValueError when the entity is not found past the cursor
        begin = sentence.index(entity, cursor)
        cursor = begin + len(entity)

        annotation = create_denotation(begin, cursor, counter, tipo)
        denotations.append(annotation)

        # remember which Kindred ID this dataset entity received
        kindred_ids[entity_id] = annotation['id']

    return denotations, kindred_ids

In [24]:
def get_relations_from_sentence(df, sentence, entity_indexes, entity_id_to_kindred_id_dict, p_index=None):
    """Create the relations whose subject and object are both in one sentence.

    Args:
        df: dataset DataFrame; the columns p_index, entity_index, entity,
            entity_id, corel and tiporel are read.
        sentence: text of the sentence (not used by this function; kept so the
            signature matches the other helpers).
        entity_indexes: entity_index values of the entities in this sentence.
        entity_id_to_kindred_id_dict: maps dataset entity IDs of this sentence
            to Kindred IDs (T1, T2, ...).
        p_index: index of the sentence in the dataset. When omitted, the
            module-level variable ``p_index`` is used, which is what callers
            written before this parameter existed rely on.

    Returns:
        A tuple ``(all_relations, discarded_relations)``: the list of relation
        dicts, and a list of human-readable messages for the relations that
        were not used.

    Raises:
        NameError: if an entity has to be looked up but ``p_index`` was neither
            passed nor defined at module level.
    """
    if p_index is None:
        # Backward compatibility: the function used to read this global directly.
        p_index = globals().get("p_index")

    all_relations = []

    # if the relation has one entity in one sentence and another entity in another sentence,
    # that relation will be discarded: kindred is only able to recognize relations between
    # entities in the same sentence
    discarded_relations = []

    for entity_index in entity_indexes:
        if p_index is None:
            raise NameError("p_index was not passed and no global 'p_index' is defined")

        # build the row selector once and reuse it for every column
        row_mask = (df.p_index == p_index) & (df.entity_index == entity_index)
        entity = df.loc[row_mask, "entity"].values[0]
        entity_id = df.loc[row_mask, "entity_id"].values[0]

        # corels is a String with the "object entities" of each relation, separated by a space
        # EX: H2-dftre765-12 H2-dftre765-9 H2-dftre765-1
        corels = df.loc[row_mask, "corel"].values[0]

        # ignore cases where there is no relation (missing value)
        if pd.isna(corels):
            continue

        # tiporels is a list of relations
        # EX: [autor_de, natural_de, participante_em]
        # a missing tiporel next to a non-missing corel is treated as "no relation types"
        tiporel = df.loc[row_mask, "tiporel"].values[0]
        tiporels = [] if pd.isna(tiporel) else tiporel.split()

        # each element "corel" is the object id of one relation
        for corel_index, corel in enumerate(corels.split()):
            # process the relation only if the obj is in the same sentence as the subj AKA it is in the dict
            # the verification "corel_index < len(tiporels)" is because there is some buggy data in the dataset
            if corel in entity_id_to_kindred_id_dict and corel_index < len(tiporels):
                relation = create_relation(
                    len(all_relations) + 1,
                    tiporels[corel_index],
                    entity_id_to_kindred_id_dict[entity_id],
                    entity_id_to_kindred_id_dict[corel])

                all_relations.append(relation)

            # add the unused relations to the discard list
            # the verification "len(obj_entity) == 1" is also due to buggy data in the dataset
            else:
                obj_entity = df.loc[df.entity_id == corel, "entity"].values
                if len(obj_entity) == 1:
                    text = "the subj {} ({}) is in sentence {}, but the obj {} ({}) is in other sentence".format(entity, entity_id, p_index, obj_entity[0], corel)
                    discarded_relations.append(text)

    return all_relations, discarded_relations

In [25]:
def get_relations_from_doc(df, sentence, doc_index, entity_id_to_kindred_id_dict):
    """Create the relations between the entities of one document.

    Args:
        df: dataset DataFrame; the columns doc_index, entity_id, corel and
            tiporel are read.
        sentence: text of the document (not used by this function; kept so the
            signature matches the other helpers).
        doc_index: index of the document whose relations are collected.
        entity_id_to_kindred_id_dict: maps dataset entity IDs of this document
            to Kindred IDs (T1, T2, ...).

    Returns:
        The list of relation dicts, with ids R1, R2, ... in creation order.
    """
    all_relations = []

    # all the IDs of the entities that belong to this document
    entity_ids = df.loc[df.doc_index == doc_index, "entity_id"].values

    for entity_id in entity_ids:
        row_mask = df.entity_id == entity_id

        # corels is a String with the "object entities" of each relation, separated by a space
        # EX: H2-dftre765-12 H2-dftre765-9 H2-dftre765-1
        corels = df.loc[row_mask, "corel"].values[0]

        # ignore cases where there is no relation (missing value)
        if pd.isna(corels):
            continue

        # tiporels is a list of relations
        # EX: [autor_de, natural_de, participante_em]
        # a missing tiporel next to a non-missing corel is treated as "no relation types"
        tiporel = df.loc[row_mask, "tiporel"].values[0]
        tiporels = [] if pd.isna(tiporel) else tiporel.split()

        # each element "corel" is the object id of one relation
        for corel_index, corel in enumerate(corels.split()):
            # process the relation only if the obj is in the dict
            # the verification "corel_index < len(tiporels)" is because there is some buggy data in the dataset
            if corel in entity_id_to_kindred_id_dict and corel_index < len(tiporels):
                relation = create_relation(
                    len(all_relations) + 1,
                    tiporels[corel_index],
                    entity_id_to_kindred_id_dict[entity_id],
                    entity_id_to_kindred_id_dict[corel])

                all_relations.append(relation)

    return all_relations

## Load the DataFrame from CSV and convert it to the PubAnnotation JSON format

#### Run 1: create a JSON file for each sentence

In [26]:
if (chosen_run == 1):
    # Run 1 export: one JSON file per sentence, plus a text file listing the
    # relations that had to be discarded.
    df = pd.read_csv("../Dataset/dataset_processed.csv")

    # make sure the output folder exists so the cell also works on a fresh checkout
    os.makedirs("run1", exist_ok=True)

    # flat list with the messages of every discarded relation
    all_discarded_relations = []

    # p_index ranges from 1 to 2273 (total number of sentences)
    p_indexes_set = set(df["p_index"])

    # NOTE: the loop variable has to stay a module-level variable named `p_index`:
    # the *_from_sentence helpers fall back to reading it as a global.
    # Sorted so that the order of the discarded-relations file is deterministic,
    # consistent with Runs 2 and 3.
    for p_index in sorted(p_indexes_set):
        sentence_rows = df.loc[df.p_index == p_index]

        sentence = sentence_rows["p_sentence_processed"].values[0]

        # entity_index values of the entities that are in the sentence
        entity_indexes = sentence_rows["entity_index"].values

        all_denotations, entity_id_to_kindred_id_dict = get_denotations_from_sentence(
            df, sentence, entity_indexes)

        all_relations, discarded_relations = get_relations_from_sentence(
            df, sentence, entity_indexes, entity_id_to_kindred_id_dict)

        all_discarded_relations.extend(discarded_relations)

        # each json file will have "sentence + entities + relations"
        json_per_sentence = {
            'text': sentence,
            'denotations': all_denotations,
            'relations': all_relations,
        }

        with open("run1/sentence_p_index_{}.json".format(p_index), "w", encoding="utf-8") as fileWriter:
            json.dump(json_per_sentence, fileWriter, indent=2, ensure_ascii=False)

    # saves all relations that have been discarded to a file, one message per line
    with open("run1/discarded_relations.txt", "w", encoding="utf-8") as fileWriter:
        for text in all_discarded_relations:
            print(text, file=fileWriter)

#### Run 2: create a JSON file for each document (so we can use all relations)

In [27]:
if (chosen_run == 2):
    # Run 2 export: one JSON file per document, so that relations between
    # entities of different sentences of the same document can be used.
    df = pd.read_csv("../Dataset/dataset_processed.csv")

    # make sure the output folder exists so the cell also works on a fresh checkout
    os.makedirs("run2", exist_ok=True)

    # doc_index ranges from 0 to 128 (total number of documents)
    doc_indexes_set = set(df["doc_index"])

    for doc_index in sorted(doc_indexes_set):

        # indexes of the sentences that belong to this document
        p_indexes_set = set(df.loc[df.doc_index == doc_index, "p_index"].values)

        # build a big sentence with all sentences from document: every sentence is
        # preceded by a space and terminated with a period if it does not have one
        big_sentence = ""
        for p_index in sorted(p_indexes_set):
            sentence = df.loc[df.p_index == p_index, "p_sentence_processed"].values[0]
            if not sentence.endswith('.'):
                sentence = sentence + "."
            big_sentence = big_sentence + " " + sentence

        all_denotations, ids_dict = get_denotations_from_doc(df, big_sentence, doc_index)

        all_relations = get_relations_from_doc(df, big_sentence, doc_index, ids_dict)

        big_json = {
            'text': big_sentence,
            'denotations': all_denotations,
            'relations': all_relations,
        }

        with open("run2/doc_index_{}.json".format(doc_index), "w", encoding="utf-8") as fileWriter:
            json.dump(big_json, fileWriter, indent=2, ensure_ascii=False)

#### Run 3: create a JSON file for each relation

In [28]:
if (chosen_run == 3):
    # Run 3 export: one JSON file per relation; each file holds the whole
    # document text, exactly one relation (R1) and its two entities (T1, T2).
    df = pd.read_csv("../Dataset/dataset_processed.csv")

    # make sure the output folder exists so the cell also works on a fresh checkout
    os.makedirs("run3", exist_ok=True)

    # numbers the output files across all documents
    relation_counter = 0

    # doc_index ranges from 0 to 128 (total number of documents)
    doc_indexes_set = set(df["doc_index"])

    for doc_index in sorted(doc_indexes_set):

        # indexes of the sentences that belong to this document
        p_indexes_set = set(df.loc[df.doc_index == doc_index, "p_index"].values)

        # build a big sentence with all sentences from document: every sentence is
        # preceded by a space and terminated with a period if it does not have one
        big_sentence = ""
        for p_index in sorted(p_indexes_set):
            sentence = df.loc[df.p_index == p_index, "p_sentence_processed"].values[0]
            if not sentence.endswith('.'):
                sentence = sentence + "."
            big_sentence = big_sentence + " " + sentence

        all_denotations, entity_id_to_kindred_id_dict = get_denotations_from_doc(df, big_sentence, doc_index)

        all_relations = get_relations_from_doc(df, big_sentence, doc_index, entity_id_to_kindred_id_dict)

        # here we already have all the entities and relations of the document ...
        # index the denotations by their Kindred id so that subject and object are
        # looked up independently. The previous if/elif scan never assigned the
        # object when subject and object were the same entity, silently reusing
        # the object of an earlier relation (or failing on the first one).
        denotations_by_id = {denotation['id']: denotation for denotation in all_denotations}

        # take each of the relations and create a JSON file that only has 1 relation and 2 entities
        for relation in all_relations:
            # copies of the subject and object entities, with IDs replaced to start from 1
            subj_denotation = dict(denotations_by_id[relation['subj']], id="T1")
            obj_denotation = dict(denotations_by_id[relation['obj']], id="T2")

            # renumber the relation itself; dict() keeps the original key order
            single_relation = dict(relation, id="R1", subj="T1", obj="T2")

            big_json = {
                'text': big_sentence,
                'denotations': [subj_denotation, obj_denotation],
                'relations': [single_relation],
            }

            with open("run3/relation_index_{}.json".format(relation_counter), "w", encoding="utf-8") as fileWriter:
                json.dump(big_json, fileWriter, indent=2, ensure_ascii=False)

            relation_counter += 1

## EXTRA STEP:
The next step will be done in the Google Colab environment.

Save JSON files to Google Drive and run colab_Kindred from Colab.

Take the files with the results and save them in the folder "runX_results".

After saving all the files, you can proceed to the next step.

NOTE: Run 3 exceeded the memory available in Google Colab, so it had to be run locally (notebook_Kindred).

## CALCULATE MACRO METRICS FROM RESULTS
MICRO metrics are already in the files.

In [9]:
# Results folder and number of result files (folds) for the chosen run.
if (chosen_run == 1):
    src_folder = "run1_results/"
    n_folds = 10
elif (chosen_run == 2):
    src_folder = "run2_results/"
    n_folds = 8
elif (chosen_run == 3):
    src_folder = "run3_results/"
    n_folds = 10
else:
    # without this branch an unsupported value surfaced later as a NameError on n_folds
    raise ValueError("chosen_run must be 1, 2 or 3, got {!r}".format(chosen_run))

# For every fold, average precision, recall and F1 over the rows of the results
# file (macro average) and print them.
for fold in range(n_folds):
    precisions = []
    recalls = []
    fscores = []

    with open(src_folder + "results_fold_{}.txt".format(fold), "r", encoding="utf-8") as fileReader:
        for line in fileReader:
            # a line containing "----" ends the rows that are averaged
            if "----" in line:
                break

            # each row is parsed as "<anything>\tP:<precision> R:<recall> F1:<fscore>"
            after_p_tag = line.partition("\tP:")[2]
            precision = after_p_tag.partition(" R:")[0]
            after_r_tag = after_p_tag.partition(" R:")[2]
            recall = after_r_tag.partition(" F1:")[0]
            fscore = after_r_tag.partition(" F1:")[2]

            # float() tolerates the trailing newline that is left in fscore
            precisions.append(float(precision))
            recalls.append(float(recall))
            fscores.append(float(fscore))

    print(fold)
    if not precisions:
        # avoid a ZeroDivisionError when the file has no rows before the separator
        print("no metric rows found in fold {}".format(fold))
        print("\n")
        continue

    textP = "P = {}".format(sum(precisions)/len(precisions))
    print(textP)
    textR = "R = {}".format(sum(recalls)/len(recalls))
    print(textR)
    textF = "F = {}".format(sum(fscores)/len(fscores))
    print(textF)
    print("\n")

0
P = 0.4227953703703704
R = 0.18460962962962965
F = 0.23466196296296293


1
P = 0.3376473461538462
R = 0.1556363846153846
F = 0.19240634615384614


2
P = 0.408187125
R = 0.17130583333333335
F = 0.22988916666666662


3
P = 0.33422672
R = 0.15622392
F = 0.19618123999999998


4
P = 0.25100730769230767
R = 0.10360723076923076
F = 0.13735165384615383


5
P = 0.3489621428571428
R = 0.14185178571428572
F = 0.18467007142857142


6
P = 0.347198
R = 0.17850613793103448
F = 0.21825382758620687


7
P = 0.35917028
R = 0.14306976000000002
F = 0.19165272000000003


8
P = 0.437477962962963
R = 0.16179711111111114
F = 0.22170718518518523


9
P = 0.3884756538461538
R = 0.15152119230769232
F = 0.20156484615384615


