In [158]:
from collections import defaultdict
import os
from pprint import pprint

In [159]:
def get_counts_for_file(dir_path: str, filename: str) -> tuple[defaultdict[str, int], defaultdict[str, int], defaultdict[str, int]]:
    """Count entity and relationship tags in one tab-separated annotation file.

    Each non-blank line is split on tabs. The first field is a record id:
    ids starting with "T" are counted as entities and ids starting with "R"
    as relationships; every other line is ignored. The tag is the first
    space-delimited token of the second field. For entity lines the third
    field is the annotated text, which is used to count distinct texts per
    tag. (Presumably brat standoff ``.ann`` files -- confirm against the
    dataset documentation.)

    Args:
        dir_path: Directory containing the file, with or without a trailing
            path separator.
        filename: Name of the annotation file inside ``dir_path``.

    Returns:
        A tuple ``(entity_tag_counts, entity_tag_unique_counts,
        relationship_tag_counts)`` of ``defaultdict(int)`` objects keyed by
        tag. ``entity_tag_unique_counts`` holds the number of distinct
        annotated texts seen for each entity tag within this file.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If an entity line has fewer than three tab-separated
            fields or a relationship line has fewer than two.
    """
    entity_texts: defaultdict[str, set] = defaultdict(set)
    entity_tag_counts: defaultdict[str, int] = defaultdict(int)
    relationship_tag_counts: defaultdict[str, int] = defaultdict(int)

    # os.path.join accepts dir_path both with and without a trailing separator.
    with open(os.path.join(dir_path, filename)) as f:
        # Iterating the file keeps the last line even when the file does not
        # end with a newline (slicing off the final split element would lose it).
        for raw_line in f:
            if not raw_line.strip():
                continue  # blank lines carry no annotation

            fields: list[str] = raw_line.rstrip("\n").split("\t")
            record_id: str = fields[0]

            if record_id.startswith("T"):
                tag: str = fields[1].split(" ")[0]
                entity_tag_counts[tag] += 1
                entity_texts[tag].add(fields[2])
            elif record_id.startswith("R"):
                tag = fields[1].split(" ")[0]
                relationship_tag_counts[tag] += 1

    # The number of distinct texts per tag is simply the size of each set.
    entity_tag_unique_counts: defaultdict[str, int] = defaultdict(
        int, {tag: len(texts) for tag, texts in entity_texts.items()}
    )

    return entity_tag_counts, entity_tag_unique_counts, relationship_tag_counts

In [161]:
def get_counts_for_dir(dir_path: str) -> tuple[defaultdict[str, int], defaultdict[str, int], defaultdict[str, int]]:
    """Sum the per-file tag counts over every ``.ann`` file in a directory.

    Args:
        dir_path: Directory to scan (non-recursive). It is passed unchanged
            to ``get_counts_for_file`` together with each file name.

    Returns:
        A tuple ``(entity_tag_counts, entity_tag_unique_counts,
        relationship_tag_counts)`` of ``defaultdict(int)`` objects keyed by
        tag, each the element-wise sum of the corresponding per-file result.

        Note that ``entity_tag_unique_counts`` is a sum of per-file unique
        counts: a text that appears under the same tag in two files is
        counted once for each file, not once overall.

    Raises:
        OSError: If ``dir_path`` cannot be listed or a file cannot be opened.
    """
    # os.listdir returns entries in arbitrary order; sort for a reproducible run.
    filenames: list[str] = sorted(
        name for name in os.listdir(dir_path) if name.endswith(".ann")
    )

    entity_tag_counts: defaultdict[str, int] = defaultdict(int)
    entity_tag_unique_counts: defaultdict[str, int] = defaultdict(int)
    relationship_tag_counts: defaultdict[str, int] = defaultdict(int)
    totals = (entity_tag_counts, entity_tag_unique_counts, relationship_tag_counts)

    for filename in filenames:
        per_file = get_counts_for_file(dir_path, filename)
        # Merge each per-file mapping into its directory-level total.
        for total, counts in zip(totals, per_file):
            for tag, count in counts.items():
                total[tag] += count

    return entity_tag_counts, entity_tag_unique_counts, relationship_tag_counts

In [162]:
# Aggregate tag counts over the .ann files under data/training_20180910/.
(
    entity_tag_counts,
    entity_tag_unique_counts,
    relationship_tag_counts,
) = get_counts_for_dir("data/training_20180910/")

In [163]:
# Print each result mapping under a heading that matches its variable name.
for heading, tag_counts in (
    ("entity_tag_counts", entity_tag_counts),
    ("entity_tag_unique_counts", entity_tag_unique_counts),
    ("relationship_tag_counts", relationship_tag_counts),
):
    print(heading)
    pprint(tag_counts)

entity_tag_counts
defaultdict(<class 'int'>,
            {'ADE': 959,
             'Dosage': 4221,
             'Drug': 16225,
             'Duration': 592,
             'Form': 6651,
             'Frequency': 6281,
             'Reason': 3855,
             'Route': 5476,
             'Strength': 6691})
entity_tag_unique_counts
defaultdict(<class 'int'>,
            {'ADE': 852,
             'Dosage': 1804,
             'Drug': 11687,
             'Duration': 464,
             'Form': 2342,
             'Frequency': 3488,
             'Reason': 3310,
             'Route': 1467,
             'Strength': 4847})
relationship_tag_counts
defaultdict(<class 'int'>,
            {'ADE-Drug': 1107,
             'Dosage-Drug': 4225,
             'Duration-Drug': 643,
             'Form-Drug': 6654,
             'Frequency-Drug': 6310,
             'Reason-Drug': 5169,
             'Route-Drug': 5538,
             'Strength-Drug': 6702})


In [164]:
# Aggregate tag counts over the .ann files under data/test/.
# This rebinds the same three names used for the previous directory's results.
(
    entity_tag_counts,
    entity_tag_unique_counts,
    relationship_tag_counts,
) = get_counts_for_dir("data/test/")

In [165]:
# Print each result mapping under a heading that matches its variable name.
for heading, tag_counts in (
    ("entity_tag_counts", entity_tag_counts),
    ("entity_tag_unique_counts", entity_tag_unique_counts),
    ("relationship_tag_counts", relationship_tag_counts),
):
    print(heading)
    pprint(tag_counts)

entity_tag_counts
defaultdict(<class 'int'>,
            {'ADE': 625,
             'Dosage': 2681,
             'Drug': 10575,
             'Duration': 378,
             'Form': 4359,
             'Frequency': 4012,
             'Reason': 2545,
             'Route': 3513,
             'Strength': 4230})
entity_tag_unique_counts
defaultdict(<class 'int'>,
            {'ADE': 539,
             'Dosage': 1165,
             'Drug': 7720,
             'Duration': 322,
             'Form': 1600,
             'Frequency': 2303,
             'Reason': 2198,
             'Route': 993,
             'Strength': 3150})
relationship_tag_counts
defaultdict(<class 'int'>,
            {'ADE-Drug': 733,
             'Dosage-Drug': 2695,
             'Duration-Drug': 426,
             'Form-Drug': 4374,
             'Frequency-Drug': 4034,
             'Reason-Drug': 3410,
             'Route-Drug': 3546,
             'Strength-Drug': 4244})
