## Importing libraries

In [2]:
import os
os.chdir("/home/airi/PycharmProjects/protagonist_tagger/")

import json
import pandas as pd
from pylighter import Annotation
import ast
import tabulate

from tool.gender_checker import get_personal_titles
from tool.pylighter_utils import read_annotations, annotations_to_pylighter, csv_to_json
from tool.annotations_utils import has_intersection, fix_personal_titles, personal_titles_stats

## Fixing gold_standard annotations by cutting personal_titles

In [4]:
# Read every annotation file of the gold standard, pass it through
# fix_personal_titles, and write the result under the same file name into a
# sibling directory.
gold_standard_path = "data/testing_sets/test_person_gold_standard"
new_gold_standard_path = "data/testing_sets/test_person_gold_standard_titles"
# exist_ok=True keeps the cell re-runnable: without it a second execution
# (e.g. Restart & Run All) raises FileExistsError.
os.makedirs(new_gold_standard_path, exist_ok=True)
# Not used by this cell; kept because cells outside this view may rely on them.
results_annotated = {}
results_not_annotated = {}
for title in os.listdir(gold_standard_path):
    annotations = read_annotations(os.path.join(gold_standard_path, title))
    annotations = fix_personal_titles(annotations)
    with open(os.path.join(new_gold_standard_path, title), 'w') as f:
        json.dump(annotations, f)

## Calculating error statistics

In [5]:
def get_errors_stats(path_1, path_2, path_3):
    """Build a per-file table comparing two sets of entity annotations.

    For every file name listed in ``path_1`` the same-named file is read from
    all three directories. Records of ``path_2`` and ``path_3`` are paired
    positionally and the ``'entities'`` lists of each pair are compared.

    Parameters
    ----------
    path_1 : str
        Directory whose listing selects the files to process. Its annotations
        are passed to ``personal_titles_stats``; the values of the first
        returned mapping are summed to get the personal-title count.
    path_2 : str
        Directory with the "previous" annotations (entities being checked).
    path_3 : str
        Directory with the annotations the previous ones are compared against.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``Title``,
        ``# previously annotated``, ``# correct annotations``,
        ``# personal titles annotated``,
        ``# annotations with wrong boundaries (except personal titles)`` and
        ``# missing annotations``.

    Notes
    -----
    * Each previous entity is counted at most once (as an exact match or as a
      boundary error), and an entity of ``path_3`` counts as missing only if
      no previous entity equals or overlaps it. Counting per matching *pair*
      instead would inflate the boundary-error count and could make the
      missing count negative.
    * Previous entities that neither equal nor overlap any reference entity
      are not reported in any column.
    * ``zip`` stops at the shorter list, so surplus records in either file
      are ignored. NOTE(review): presumably both files always hold the same
      records - confirm.
    """
    results = []
    for title in os.listdir(path_1):
        annotations = read_annotations(os.path.join(path_1, title))
        personal_title_annotated, _ = personal_titles_stats(annotations)
        titles_annotated_count = sum(personal_title_annotated.values())
        annotations_2 = read_annotations(os.path.join(path_2, title))
        annotations_3 = read_annotations(os.path.join(path_3, title))

        old_count = 0
        exact_count = 0
        intersections_count = 0
        missing_count = 0

        for anno, anno2 in zip(annotations_2, annotations_3):
            entities = anno['entities']
            entities2 = anno2['entities']

            old_count += len(entities)

            # Positions in entities2 that are covered by some previous entity.
            matched = set()
            for ent1 in entities:
                hits = [idx for idx, ent2 in enumerate(entities2) if ent1 == ent2]
                if hits:
                    exact_count += 1
                else:
                    # Only look for overlaps when there is no exact match.
                    hits = [idx for idx, ent2 in enumerate(entities2)
                            if has_intersection(ent1, ent2)]
                    if hits:
                        intersections_count += 1
                matched.update(hits)

            missing_count += len(entities2) - len(matched)

        # Personal-title annotations are reported in their own column, so they
        # are taken out of the exact-match total.
        # NOTE(review): assumes each of them produced an exact match - confirm.
        exact_count -= titles_annotated_count

        # splitext drops only the extension, so names containing extra dots
        # are not truncated at the first dot.
        title_results = {'Title': os.path.splitext(title)[0].replace('_', ' '),
                         '# previously annotated': old_count,
                         '# correct annotations': exact_count,
                         '# personal titles annotated': titles_annotated_count,
                         '# annotations with wrong boundaries (except personal titles)': intersections_count,
                         '# missing annotations': missing_count}
        results.append(title_results)
    return pd.DataFrame(results)

In [6]:
# Compute the comparison table for the three annotation directories; the bare
# expression on the last line displays the resulting DataFrame.
results_df = get_errors_stats(
    path_1="data/testing_sets/test_person_gold_standard",
    path_2="data/testing_sets/test_person_gold_standard_titles_corrected",
    path_3="data/testing_sets/test_person_gold_standard_corrected",
)
results_df

Unnamed: 0,Title,# previously annotated,# correct annotations,# personal titles annotated,# annotations with wrong boundaries (except personal titles),# missing annotations
0,Treasure Island,97,96,0,1,19
1,Emma,115,99,16,0,17
2,The Secret Garden,97,70,25,2,6
3,Frankenstein,93,90,0,3,6
4,The Picture of Dorian Gray,90,75,15,0,18
5,Pride and Prejudice,124,121,2,1,19
6,Dracula,97,97,0,0,15
7,The Great Gatsby,102,88,13,1,36
8,Jane Eyre,97,97,0,0,18
9,Wuthering Heights,108,104,1,3,2


In [7]:
# Export the statistics table as LaTeX with rotated, bold column headers.
table_latex = results_df.to_latex(index=False).split('\n')
# Normalise the spacing around the column separator on every line.
table_latex = [' & '.join(cell.strip() for cell in line.split('&')) for line in table_latex]

# Index 2 is the header row: it follows \begin{tabular}{...} and \toprule.
header = table_latex[2].rstrip()
row_end = '\\\\'
# Detach the LaTeX row terminator before wrapping the cells; otherwise it
# ends up inside the last \rot{\textbf{...}} and the row is never terminated.
ends_row = header.endswith(row_end)
if ends_row:
    header = header[:-len(row_end)]
# Strip each cell so no stray spaces land inside the braces.
# NOTE(review): \rot is presumably a macro defined in the target document - confirm.
header_cells = [cell.strip() for cell in header.split('&')]
table_latex[2] = ' & '.join('\\rot{\\textbf{' + cell + '}}' for cell in header_cells)
if ends_row:
    table_latex[2] += ' ' + row_end
print('\n'.join(table_latex))

\begin{tabular}{lrrrrr}
\toprule
\rot{\textbf{Title }} & \rot{\textbf{ \# previously annotated }} & \rot{\textbf{ \# correct annotations }} & \rot{\textbf{ \# personal titles annotated }} & \rot{\textbf{ \# annotations with wrong boundaries (except personal titles) }} & \rot{\textbf{ \# missing annotations \\}}
\midrule
Treasure Island & 97 & 96 & 0 & 1 & 19 \\
Emma & 115 & 99 & 16 & 0 & 17 \\
The Secret Garden & 97 & 70 & 25 & 2 & 6 \\
Frankenstein & 93 & 90 & 0 & 3 & 6 \\
The Picture of Dorian Gray & 90 & 75 & 15 & 0 & 18 \\
Pride and Prejudice & 124 & 121 & 2 & 1 & 19 \\
Dracula & 97 & 97 & 0 & 0 & 15 \\
The Great Gatsby & 102 & 88 & 13 & 1 & 36 \\
Jane Eyre & 97 & 97 & 0 & 0 & 18 \\
Wuthering Heights & 108 & 104 & 1 & 3 & 2 \\
Anne of Green Gables & 113 & 112 & 1 & 0 & 11 \\
The Catcher in the Rye & 74 & 64 & 9 & 1 & 29 \\
Adventures of Huckleberry Finn & 86 & 78 & 2 & 6 & 40 \\
\bottomrule
\end{tabular}



## Correcting annotations with pylighter

In [11]:
# Load one book's annotations, convert them with annotations_to_pylighter and
# hand the result to a pylighter Annotation object together with a CSV
# save path.
corrected_file = os.path.join("data/testing_sets/test_person_gold_standard_corrected", 'The_Catcher_in_the_Rye.json')
annotations = read_annotations(corrected_file)
labels, corpus = annotations_to_pylighter(annotations)
annotation = Annotation(
    corpus,
    labels_names=["PERSON"],
    labels=labels,
    save_path="notebooks/annotations/The_Catcher_in_the_Rye.csv",
)

Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<IPython.core.display.Javascript objec…

In [None]:
# Convert the CSV saved by the annotation cell into a JSON annotation file.
# The CSV path is relative to the project root (the working directory set by
# os.chdir in the first cell), matching the save_path given to Annotation,
# instead of repeating a machine-specific absolute path.
csv_path = "notebooks/annotations/The_Catcher_in_the_Rye.csv"
# NOTE(review): this target is relative to the working directory and has no
# "data/testing_sets/" prefix, unlike the directories read elsewhere in this
# notebook - confirm this is the intended output location.
json_path = "test_person_gold_standard/The_Catcher_in_the_Rye.json"
csv_to_json(csv_path, json_path)