In [10]:
import json
import sys

import numpy as np
import pandas as pd

UTILITIES = "../"
sys.path.append(UTILITIES)
from utilities.content_processor import tokenise_text
from utilities.paper_access import get_text

In [11]:
# (table key, printed label, start, stop) for every pipeline stage: the runtime
# of a stage is the sum of kg["times"][start:stop].
_STAGE_SLICES = (
    ("stage_0", "Stage 0 (Input Pre-processing)", 0, 1),
    ("stage_1", "Stage 1 (Entity Mention Extraction)", 1, 6),
    ("stage_2", "Stage 2 (Coreference Resolution)", 6, 13),
    ("stage_3", "Stage 3 (Local Relation Extraction)", 13, 16),
    ("stage_4", "Stage 4 (Global Relation Extraction)", 16, 20),
    ("stage_5", "Stage 5 (Schema Generation)", 20, 25),
    ("stage_6", "Stage 6 (Data Post-processing)", 25, 26),
)


def _count_isolated(nodes, *triple_lists):
    """Count the nodes that are neither element 0 nor element 2 of any triple."""
    connected = set()
    for triples in triple_lists:
        for triple in triples:
            connected.add(triple[0])
            connected.add(triple[2])
    return sum(1 for node in nodes if node not in connected)


def statistics(dataset, index, llm_model, table, input_path):
    """Print the statistics of one knowledge-graph file and record them in ``table``.

    Reads ``{input_path}/kg_{dataset}_{index}_{llm_model}.json`` and reports the
    title, token count, entity / mention / relation counts, isolated-entity
    counts and per-stage runtimes.

    Every value is computed before anything is printed or appended, so a file
    with a missing key raises before ``table`` is touched and its columns stay
    the same length.

    Args:
        dataset: Dataset name used in the file name.
        index: Paper index used in the file name.
        llm_model: Model identifier used in the file name and in the title.
        table: Dict of lists, mutated in place; one value is appended to each of
            "titles", "tokens", "entities", "mentions", "relations_total",
            "relations_normal", "relations_typing", "isolated_entities",
            "isolated_entities_typing", "stage_0" .. "stage_6" and
            "total_runtime".
        input_path: Directory containing the JSON file.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the JSON object or ``table`` lacks an expected key.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(f"{input_path}/kg_{dataset}_{index}_{llm_model}.json", encoding="utf-8") as f:
        kg = json.load(f)

    nodes = kg["nodes"]
    triples = kg["triples"]
    triples_typing = kg["triples_typing"]
    times = kg["times"]

    # Single quotes inside the replacement field keep this valid before Python 3.12.
    title = f"Title: {kg['title']} (by {llm_model})"

    tokens = sum(
        len(tokenise_text(get_text(sentence)))
        for section in kg["sections"]
        for paragraph in section["paragraphs"]
        for sentence in paragraph["sentences"]
    )

    mentions = sum(len(entity["mentions"]) for entity in nodes.values())

    counts = (
        ("tokens", "Tokens", tokens),
        ("entities", "Entities", len(nodes)),
        ("mentions", "Mentions", mentions),
        ("relations_total", "Relations (Total)", len(triples) + len(triples_typing)),
        ("relations_normal", "Relations (Normal)", len(triples)),
        ("relations_typing", "Relations (Typing)", len(triples_typing)),
        # Isolated with respect to the normal triples only.
        ("isolated_entities", "Isolated Entities", _count_isolated(nodes, triples)),
        # Isolated even when the typing triples are taken into account.
        (
            "isolated_entities_typing",
            "Isolated Entities (including taxonomy)",
            _count_isolated(nodes, triples, triples_typing),
        ),
    )

    # The division by 60 and the "minutes" label suggest times are stored in
    # seconds -- TODO confirm against the producer of the JSON files.
    runtimes = [
        (key, label, sum(times[start:stop]) / 60)
        for key, label, start, stop in _STAGE_SLICES
    ]
    runtimes.append(("total_runtime", "Total Runtime", sum(times) / 60))

    # Report: print each line and append the raw value to its column.
    print(title)
    table["titles"].append(title)

    for key, label, value in counts:
        print(f"{label}: {value}")
        table[key].append(value)

    print()
    for key, label, minutes in runtimes:
        print(f"{label}: {minutes:.4g} minutes")
        table[key].append(minutes)

In [12]:
# Run configuration.
dataset = "SciERC"
max_paper = 100  # papers are indexed 1..max_paper (inclusive)
llm_model = "g"
input_path = "../../data/input"
output_path = f"../../data/raw_results/gen_{dataset}_{llm_model}.csv"

# One list per output column; statistics() appends one value to each per paper.
table = {
    "titles": [],
    "tokens": [],
    "entities": [],
    "mentions": [],
    "relations_total": [],
    "relations_normal": [],
    "relations_typing": [],
    "isolated_entities": [],
    "isolated_entities_typing": [],
    "stage_0": [],
    "stage_1": [],
    "stage_2": [],
    "stage_3": [],
    "stage_4": [],
    "stage_5": [],
    "stage_6": [],
    "total_runtime": [],
}

# Collect the per-paper statistics, printing a separator after each report.
for index in range(1, max_paper + 1):
    statistics(dataset, index, llm_model, table, input_path)
    print("\n\n--------------------------------------------------\n\n")

# Append two summary rows ("Mean" and "STD") below the per-paper rows.
# Both statistics are computed before the column is extended, so the summary
# values never feed back into each other.
for column, values in table.items():
    if column == "titles":
        values.extend(["Mean", "STD"])
    else:
        mean = float(np.mean(values))
        std = float(np.std(values))  # numpy default: population std (ddof=0)
        values.extend([mean, std])

# Convert the dictionary into a DataFrame and save it as CSV.
df = pd.DataFrame(table)
df.to_csv(output_path, index=False)

print(f"Table saved as {output_path}")

display(df)

Title: X96-1059 (by g)
Tokens: 214
Entities: 35
Mentions: 50
Relations (Total): 44
Relations (Normal): 33
Relations (Typing): 11
Isolated Entities: 7
Isolated Entities (including taxonomy): 4

Stage 0 (Input Pre-processing): 1.065e-06 minutes
Stage 1 (Entity Mention Extraction): 1.694 minutes
Stage 2 (Coreference Resolution): 1.618 minutes
Stage 3 (Local Relation Extraction): 0.5112 minutes
Stage 4 (Global Relation Extraction): 0.1839 minutes
Stage 5 (Schema Generation): 3.546 minutes
Stage 6 (Data Post-processing): 0.000624 minutes
Total Runtime: 7.554 minutes


--------------------------------------------------


Title: ICCV_2001_47_abs (by g)
Tokens: 145
Entities: 29
Mentions: 33
Relations (Total): 30
Relations (Normal): 23
Relations (Typing): 7
Isolated Entities: 6
Isolated Entities (including taxonomy): 3

Stage 0 (Input Pre-processing): 1.069e-06 minutes
Stage 1 (Entity Mention Extraction): 1.377 minutes
Stage 2 (Coreference Resolution): 1.348 minutes
Stage 3 (Local Relation Extr

Unnamed: 0,titles,tokens,entities,mentions,relations_total,relations_normal,relations_typing,isolated_entities,isolated_entities_typing,stage_0,stage_1,stage_2,stage_3,stage_4,stage_5,stage_6,total_runtime
0,Title: X96-1059 (by g),214.00000,35.000000,50.000000,44.000000,33.000000,11.000000,7.000000,4.000000,1.064936e-06,1.693562,1.618394,0.511186,0.183870,3.546277,0.000624,7.553914
1,Title: ICCV_2001_47_abs (by g),145.00000,29.000000,33.000000,30.000000,23.000000,7.000000,6.000000,3.000000,1.068910e-06,1.377097,1.348309,0.458839,0.121343,2.961990,0.000184,6.267763
2,Title: INTERSPEECH_2013_21_abs (by g),119.00000,35.000000,38.000000,27.000000,15.000000,12.000000,17.000000,9.000000,1.736482e-06,1.518260,1.446607,0.471616,0.281954,2.661186,0.000579,6.380204
3,Title: H01-1049 (by g),131.00000,37.000000,42.000000,54.000000,21.000000,33.000000,14.000000,2.000000,1.180172e-06,1.341375,1.711918,0.437781,0.186616,2.679246,0.000226,6.357164
4,Title: C96-1062 (by g),109.00000,25.000000,31.000000,18.000000,11.000000,7.000000,13.000000,9.000000,1.096725e-06,0.989620,1.191557,0.385817,0.101076,2.069813,0.000191,4.738075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,Title: INTERSPEECH_2007_40_abs (by g),109.00000,23.000000,23.000000,28.000000,15.000000,13.000000,6.000000,2.000000,9.377797e-07,2.555701,6.639276,0.991223,0.063514,10.179162,0.000158,20.429035
98,Title: CVPR_2003_30_abs (by g),93.00000,22.000000,24.000000,16.000000,13.000000,3.000000,4.000000,4.000000,9.457270e-07,2.780790,6.674807,0.896232,0.219940,13.785232,0.000161,24.357163
99,Title: NIPS_2016_80_abs (by g),95.00000,23.000000,27.000000,36.000000,19.000000,17.000000,5.000000,1.000000,9.457270e-07,2.686916,7.156486,0.969131,0.075393,11.084035,0.000171,21.972134
100,Mean,134.14000,30.370000,37.320000,35.790000,20.400000,15.390000,9.790000,4.150000,3.106276e-06,1.619762,2.977685,0.563415,0.263125,4.766094,0.000234,10.190318
