In [15]:
import json
import sys
import numpy as np

# Append the parent directory to the module search path so that the
# project-local `utilities` package imported below can be found. The entry is
# a relative path, so it is resolved against the kernel's working directory.
UTILITIES = "../"
sys.path.append(UTILITIES)
from utilities.content_processor import tokenise_text
from utilities.paper_access import get_text

In [16]:
# Runtime stages as (table column, printed label, start, stop): each stage's
# duration is the sum of kg["times"][start:stop].
_RUNTIME_STAGES = (
    ("stage_0", "Stage 0 (Input Pre-processing)", 0, 1),
    ("stage_1", "Stage 1 (Entity Mention Extraction)", 1, 6),
    ("stage_2", "Stage 2 (Coreference Resolution)", 6, 13),
    ("stage_3", "Stage 3 (Local Relation Extraction)", 13, 16),
    ("stage_4", "Stage 4 (Global Relation Extraction)", 16, 20),
    ("stage_5", "Stage 5 (Schema Generation)", 20, 25),
    ("stage_6", "Stage 6 (Data Post-processing)", 25, 26),
)


def _count_isolated(nodes, triples):
    """Count the nodes that are neither element 0 nor element 2 of any triple."""
    connected = set()
    for triple in triples:
        # Element 1 is skipped: only the two end positions are matched
        # against the node identifiers.
        connected.update((triple[0], triple[2]))
    return sum(1 for node in nodes if node not in connected)


def statistics(dataset, index, llm_model, table, input_path):
    """Print the statistics of one knowledge-graph file and record them in ``table``.

    Reads ``{input_path}/kg_{dataset}_{index}_{llm_model}.json`` and reports
    its title, token count, entity / mention / relation counts, isolated
    entity counts and per-stage runtimes.

    The JSON document is expected to provide the keys read below:
    ``title``, ``sections`` (each with ``paragraphs``, each with
    ``sentences``), ``nodes`` (a mapping whose values carry a ``mentions``
    list), ``triples``, ``triples_typing`` and ``times``.

    Args:
        dataset: Dataset name used in the file name.
        index: Paper index used in the file name.
        llm_model: Model identifier used in the file name and in the title.
        table: Dict of lists, mutated in place; one value is appended to each
            of the columns ``titles``, ``tokens``, ``entities``, ``mentions``,
            ``relations_total``, ``relations_normal``, ``relations_typing``,
            ``isolated_entities``, ``isolated_entities_typing``,
            ``stage_0`` .. ``stage_6`` and ``total_runtime``.
        input_path: Directory that contains the JSON file.

    Returns:
        None. Results are printed and appended to ``table``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the document or ``table`` lacks one of the keys above.
    """
    # JSON interchange files are UTF-8; do not rely on the platform default.
    with open(f"{input_path}/kg_{dataset}_{index}_{llm_model}.json", encoding="utf-8") as f:
        kg = json.load(f)

    # Values are pulled into locals before formatting: reusing double quotes
    # inside an f-string replacement field only parses on Python >= 3.12.

    # Title
    paper_title = kg["title"]
    title = f"Title: {paper_title} (by {llm_model})"
    print(title)
    table["titles"].append(title)

    # Tokens: total over every sentence of every paragraph of every section.
    tokens = sum(
        len(tokenise_text(get_text(sentence)))
        for section in kg["sections"]
        for paragraph in section["paragraphs"]
        for sentence in paragraph["sentences"]
    )
    print(f"Tokens: {tokens}")
    table["tokens"].append(tokens)

    # Entities
    nodes = kg["nodes"]
    entity_count = len(nodes)
    print(f"Entities: {entity_count}")
    table["entities"].append(entity_count)

    # Mentions
    mentions = sum(len(entity["mentions"]) for entity in nodes.values())
    print(f"Mentions: {mentions}")
    table["mentions"].append(mentions)

    # Relations: total, then the normal / typing breakdown.
    triples = kg["triples"]
    triples_typing = kg["triples_typing"]
    normal_count = len(triples)
    typing_count = len(triples_typing)
    total_count = normal_count + typing_count

    print(f"Relations (Total): {total_count}")
    table["relations_total"].append(total_count)

    print(f"Relations (Normal): {normal_count}")
    table["relations_normal"].append(normal_count)

    print(f"Relations (Typing): {typing_count}")
    table["relations_typing"].append(typing_count)

    # Isolated entities with respect to the normal triples only.
    isolated = _count_isolated(nodes, triples)
    print(f"Isolated Entities: {isolated}")
    table["isolated_entities"].append(isolated)

    # Isolated entities when the typing (taxonomy) triples count as well.
    isolated_typing = _count_isolated(nodes, triples + triples_typing)
    print(f"Isolated Entities (including taxonomy): {isolated_typing}")
    table["isolated_entities_typing"].append(isolated_typing)

    # Runtime: values are divided by 60 and reported as minutes
    # (presumably recorded in seconds - TODO confirm against the producer).
    times = kg["times"]
    print()
    for column, label, start, stop in _RUNTIME_STAGES:
        minutes = sum(times[start:stop]) / 60
        print(f"{label}: {minutes:.4g} minutes")
        table[column].append(minutes)

    total_minutes = sum(times) / 60
    print(f"Total Runtime: {total_minutes:.4g} minutes")
    table["total_runtime"].append(total_minutes)

In [17]:
import pandas as pd

# --- Configuration -----------------------------------------------------------
dataset = "SciERC"
max_paper = 100  # papers are indexed 1..max_paper (inclusive)
llm_model = "l"
input_path = "../../data/input"
output_path = f"../../data/raw_results/gen_{dataset}_{llm_model}.csv"

# One list per output column; statistics() appends one value per paper to each.
table = {
    "titles": [],
    "tokens": [],
    "entities": [],
    "mentions": [],
    "relations_total": [],
    "relations_normal": [],
    "relations_typing": [],
    "isolated_entities": [],
    "isolated_entities_typing": [],
    "stage_0": [],
    "stage_1": [],
    "stage_2": [],
    "stage_3": [],
    "stage_4": [],
    "stage_5": [],
    "stage_6": [],
    "total_runtime": []
}

# --- Per-paper statistics ----------------------------------------------------
for index in range(1, max_paper + 1):
    statistics(dataset, index, llm_model, table, input_path)
    print()
    print()
    print("--------------------------------------------------")
    print()
    print()

# --- Summary rows ------------------------------------------------------------
# Two extra rows are appended to every column: the mean and the standard
# deviation over the per-paper values. np.std defaults to ddof=0, i.e. the
# population standard deviation.
for key, values in table.items():
    if key == "titles":
        values.append("Mean")
        values.append("STD")
    else:
        # Both aggregates are computed before either is appended, so the
        # summary rows never feed back into their own calculation.
        mean = float(np.mean(values))
        std = float(np.std(values))
        values.append(mean)
        # The row labelled "STD" must hold the standard deviation; the mean
        # was previously appended a second time here.
        values.append(std)

# --- Export ------------------------------------------------------------------
# Convert the dictionary into a DataFrame
df = pd.DataFrame(table)

# Save to CSV file
df.to_csv(output_path, index=False)

print(f"Table saved as {output_path}")


display(df)

Title: X96-1059 (by l)
Tokens: 214
Entities: 36
Mentions: 40
Relations (Total): 13
Relations (Normal): 11
Relations (Typing): 2
Isolated Entities: 22
Isolated Entities (including taxonomy): 19

Stage 0 (Input Pre-processing): 2.166e-06 minutes
Stage 1 (Entity Mention Extraction): 1.274 minutes
Stage 2 (Coreference Resolution): 1.642 minutes
Stage 3 (Local Relation Extraction): 0.7722 minutes
Stage 4 (Global Relation Extraction): 0.4299 minutes
Stage 5 (Schema Generation): 0.9994 minutes
Stage 6 (Data Post-processing): 0.0006801 minutes
Total Runtime: 5.119 minutes


--------------------------------------------------


Title: ICCV_2001_47_abs (by l)
Tokens: 145
Entities: 37
Mentions: 37
Relations (Total): 20
Relations (Normal): 9
Relations (Typing): 11
Isolated Entities: 23
Isolated Entities (including taxonomy): 15

Stage 0 (Input Pre-processing): 4.911e-06 minutes
Stage 1 (Entity Mention Extraction): 1.194 minutes
Stage 2 (Coreference Resolution): 1.754 minutes
Stage 3 (Local Relation

Unnamed: 0,titles,tokens,entities,mentions,relations_total,relations_normal,relations_typing,isolated_entities,isolated_entities_typing,stage_0,stage_1,stage_2,stage_3,stage_4,stage_5,stage_6,total_runtime
0,Title: X96-1059 (by l),214.00,36.00,40.00,13.00,11.00,2.00,22.00,19.00,0.000002,1.274372,1.642166,0.772186,0.429947,0.999438,0.000680,5.118791
1,Title: ICCV_2001_47_abs (by l),145.00,37.00,37.00,20.00,9.00,11.00,23.00,15.00,0.000005,1.193887,1.753739,0.538894,0.372148,1.054296,0.000222,4.913191
2,Title: INTERSPEECH_2013_21_abs (by l),119.00,37.00,39.00,17.00,9.00,8.00,25.00,16.00,0.000004,1.164843,1.569828,0.623569,0.334950,1.235191,0.000220,4.928604
3,Title: H01-1049 (by l),131.00,36.00,39.00,32.00,14.00,18.00,18.00,8.00,0.000004,1.053643,1.495907,0.628268,0.358485,1.159180,0.000219,4.695707
4,Title: C96-1062 (by l),109.00,26.00,27.00,15.00,8.00,7.00,15.00,8.00,0.000004,0.870966,1.115622,0.480700,0.187979,0.889065,0.000158,3.544495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Title: INTERSPEECH_2007_40_abs (by l),109.00,29.00,31.00,14.00,9.00,5.00,17.00,13.00,0.000004,0.820703,1.308771,0.520390,0.212860,0.987866,0.000173,3.850766
98,Title: CVPR_2003_30_abs (by l),93.00,32.00,38.00,25.00,12.00,13.00,17.00,6.00,0.000004,1.041376,1.705525,0.473327,0.349132,1.343912,0.000214,4.913490
99,Title: NIPS_2016_80_abs (by l),95.00,24.00,25.00,38.00,6.00,32.00,15.00,3.00,0.000003,0.845741,1.005258,0.380422,0.198624,0.744862,0.000157,3.175067
100,Mean,134.14,32.83,36.59,21.68,10.82,10.86,19.44,11.92,0.000005,1.054673,1.597898,0.592673,0.427677,1.086937,0.000208,4.760073
