In [None]:
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz

In [68]:
from collections import defaultdict
import os
from pprint import pprint
import spacy
import scispacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

In [90]:
def get_tag_counts(dir_path):
    """Count entity and relationship tags over every ``.ann`` file in a directory.

    Each annotation line is read as tab-separated fields: an ID, a field whose
    first space-separated token is the tag, and (for entity lines) the
    annotated text. Lines whose ID starts with "T" are counted as entities and
    lines whose ID starts with "R" as relationships; all other lines are
    ignored.

    Args:
        dir_path: Directory holding the ``.ann`` files. A trailing path
            separator is optional.

    Returns:
        A tuple ``(entity_tag_counts, entity_tag_unique_counts,
        relationship_tag_counts)`` of ``defaultdict(int)`` keyed by tag:
        total entity mentions, number of distinct entity texts, and total
        relationship annotations.

    Raises:
        OSError: If ``dir_path`` cannot be listed or a file cannot be read.
    """
    filenames = [filename for filename in os.listdir(dir_path) if filename.endswith(".ann")]

    entity_tags = defaultdict(set)
    entity_tag_counts = defaultdict(int)
    entity_tag_unique_counts = defaultdict(int)
    relationship_tag_counts = defaultdict(int)

    for filename in filenames:
        # os.path.join works whether or not dir_path ends with a separator.
        with open(os.path.join(dir_path, filename)) as f:
            text = f.read()

        # Iterate over every piece of the split (no [:-1]) so that a file
        # without a trailing newline does not lose its last annotation.
        for raw_line in text.split("\n"):
            fields = raw_line.split("\t")

            # Blank or tab-less lines carry no tag field; skip them.
            if len(fields) < 2:
                continue

            tag = fields[1].split(" ")[0]

            if fields[0].startswith("T"):
                # An entity line without a text field is malformed; skip it.
                if len(fields) < 3:
                    continue
                entity_tag_counts[tag] += 1
                if fields[2] not in entity_tags[tag]:
                    entity_tags[tag].add(fields[2])
                    entity_tag_unique_counts[tag] += 1
            elif fields[0].startswith("R"):
                relationship_tag_counts[tag] += 1

    return entity_tag_counts, entity_tag_unique_counts, relationship_tag_counts

In [91]:
# Tag counts for the training split. The counting function defined in this
# notebook is get_tag_counts; calling it by that name keeps the cell runnable
# on a fresh kernel.
entity_tag_counts, entity_tag_unique_counts, relationship_tag_counts = get_tag_counts("../../data/training_20180910/")

print("entity_tag_counts")
pprint(entity_tag_counts)
print("entity_tag_unique_counts")
pprint(entity_tag_unique_counts)
print("relationship_tag_counts")
pprint(relationship_tag_counts)

entity_tag_counts
defaultdict(<class 'int'>,
            {'ADE': 959,
             'Dosage': 4221,
             'Drug': 16225,
             'Duration': 592,
             'Form': 6651,
             'Frequency': 6281,
             'Reason': 3855,
             'Route': 5476,
             'Strength': 6691})
entity_tag_unique_counts
defaultdict(<class 'int'>,
            {'ADE': 852,
             'Dosage': 1804,
             'Drug': 11687,
             'Duration': 464,
             'Form': 2342,
             'Frequency': 3488,
             'Reason': 3310,
             'Route': 1467,
             'Strength': 4847})
relationship_tag_counts
defaultdict(<class 'int'>,
            {'ADE-Drug': 1107,
             'Dosage-Drug': 4225,
             'Duration-Drug': 643,
             'Form-Drug': 6654,
             'Frequency-Drug': 6310,
             'Reason-Drug': 5169,
             'Route-Drug': 5538,
             'Strength-Drug': 6702})


In [92]:
# Tag counts for the test split. The counting function defined in this
# notebook is get_tag_counts; calling it by that name keeps the cell runnable
# on a fresh kernel.
entity_tag_counts, entity_tag_unique_counts, relationship_tag_counts = get_tag_counts("../../data/test/")

print("entity_tag_counts")
pprint(entity_tag_counts)
print("entity_tag_unique_counts")
pprint(entity_tag_unique_counts)
print("relationship_tag_counts")
pprint(relationship_tag_counts)

entity_tag_counts
defaultdict(<class 'int'>,
            {'ADE': 625,
             'Dosage': 2681,
             'Drug': 10575,
             'Duration': 378,
             'Form': 4359,
             'Frequency': 4012,
             'Reason': 2545,
             'Route': 3513,
             'Strength': 4230})
entity_tag_unique_counts
defaultdict(<class 'int'>,
            {'ADE': 539,
             'Dosage': 1165,
             'Drug': 7720,
             'Duration': 322,
             'Form': 1600,
             'Frequency': 2303,
             'Reason': 2198,
             'Route': 993,
             'Strength': 3150})
relationship_tag_counts
defaultdict(<class 'int'>,
            {'ADE-Drug': 733,
             'Dosage-Drug': 2695,
             'Duration-Drug': 426,
             'Form-Drug': 4374,
             'Frequency-Drug': 4034,
             'Reason-Drug': 3410,
             'Route-Drug': 3546,
             'Strength-Drug': 4244})


In [None]:
# Build the scispaCy pipeline: the en_core_sci_lg model, followed by
# abbreviation detection and a UMLS entity linker that resolves abbreviations.
linker_config = {"resolve_abbreviations": True, "linker_name": "umls"}

nlp = spacy.load("en_core_sci_lg")
nlp.add_pipe("abbreviation_detector")
# add_pipe returns the component it just added, so keep that handle directly.
linker = nlp.add_pipe("scispacy_linker", config=linker_config)

In [83]:
# For every Drug / ADE annotation in the training split, run the annotated
# text through the pipeline and build a histogram of how many entities the
# pipeline finds in it.
dir_path = "../../data/training_20180910/"
filenames = [filename for filename in os.listdir(dir_path) if filename.endswith(".ann")]

drug_entity_counts = defaultdict(int)
ade_entity_counts = defaultdict(int)
# Map each tag of interest to the histogram it feeds; other tags are ignored.
counts_by_tag = {"Drug": drug_entity_counts, "ADE": ade_entity_counts}

for i, filename in enumerate(filenames):
    # os.path.join works whether or not dir_path ends with a separator.
    with open(os.path.join(dir_path, filename)) as f:
        text = f.read()

    # Iterate over every piece of the split (no [:-1]) so that a file without
    # a trailing newline does not lose its last annotation.
    for raw_line in text.split("\n"):
        line = raw_line.split("\t")

        # Only entity ("T") lines that carry both a tag and a text field are usable.
        if len(line) < 3 or not line[0].startswith("T"):
            continue

        counts = counts_by_tag.get(line[1].split(" ")[0])
        if counts is not None:
            doc = nlp(line[2])
            counts[len(doc.ents)] += 1

    if (i + 1) % 10 == 0:
        print(f"Completed for {i + 1} files...")

print("drug_entity_counts")
print(drug_entity_counts)
print("ade_entity_counts")
print(ade_entity_counts)

Completed for 10 files...
Completed for 20 files...
Completed for 30 files...
Completed for 40 files...
Completed for 50 files...
Completed for 60 files...
Completed for 70 files...
Completed for 80 files...
Completed for 90 files...
Completed for 100 files...
Completed for 110 files...
Completed for 120 files...
Completed for 130 files...
Completed for 140 files...
Completed for 150 files...
Completed for 160 files...
Completed for 170 files...
Completed for 180 files...
Completed for 190 files...
Completed for 200 files...
Completed for 210 files...
Completed for 220 files...
Completed for 230 files...
Completed for 240 files...
Completed for 250 files...
Completed for 260 files...
Completed for 270 files...
Completed for 280 files...
Completed for 290 files...
Completed for 300 files...
drug_entity_counts
defaultdict(<class 'int'>, {1: 14908, 2: 712, 0: 568, 4: 4, 3: 33})
ade_entity_counts
defaultdict(<class 'int'>, {2: 127, 1: 801, 3: 17, 0: 13, 4: 1})


In [93]:
# For every Drug / ADE annotation in the test split, run the annotated text
# through the pipeline and build a histogram of how many entities the
# pipeline finds in it.
dir_path = "../../data/test/"
filenames = [filename for filename in os.listdir(dir_path) if filename.endswith(".ann")]

drug_entity_counts = defaultdict(int)
ade_entity_counts = defaultdict(int)
# Map each tag of interest to the histogram it feeds; other tags are ignored.
counts_by_tag = {"Drug": drug_entity_counts, "ADE": ade_entity_counts}

for i, filename in enumerate(filenames):
    # os.path.join works whether or not dir_path ends with a separator.
    with open(os.path.join(dir_path, filename)) as f:
        text = f.read()

    # Iterate over every piece of the split (no [:-1]) so that a file without
    # a trailing newline does not lose its last annotation.
    for raw_line in text.split("\n"):
        line = raw_line.split("\t")

        # Only entity ("T") lines that carry both a tag and a text field are usable.
        if len(line) < 3 or not line[0].startswith("T"):
            continue

        counts = counts_by_tag.get(line[1].split(" ")[0])
        if counts is not None:
            doc = nlp(line[2])
            counts[len(doc.ents)] += 1

    if (i + 1) % 10 == 0:
        print(f"Completed for {i + 1} files...")

print("drug_entity_counts")
print(drug_entity_counts)
print("ade_entity_counts")
print(ade_entity_counts)

  global_matches = self.global_matcher(doc)


Completed for 10 files...
Completed for 20 files...
Completed for 30 files...
Completed for 40 files...
Completed for 50 files...
Completed for 60 files...
Completed for 70 files...
Completed for 80 files...
Completed for 90 files...
Completed for 100 files...
Completed for 110 files...
Completed for 120 files...
Completed for 130 files...
Completed for 140 files...
Completed for 150 files...
Completed for 160 files...
Completed for 170 files...
Completed for 180 files...
Completed for 190 files...
Completed for 200 files...
drug_entity_counts
defaultdict(<class 'int'>, {1: 9728, 2: 455, 0: 368, 3: 20, 4: 4})
ade_entity_counts
defaultdict(<class 'int'>, {1: 510, 2: 88, 3: 12, 0: 12, 4: 2, 6: 1})


In [120]:
# Slice one character span out of a single training note and, when the
# pipeline recognises that span as exactly one entity, print the canonical
# name and the de-duplicated aliases of each linked knowledge-base candidate.
start, end = 11014, 11024

with open("../../data/training_20180910/100035.txt", "r") as f:
    text = f.read()

entity = text[start:end]
doc = nlp(entity)

is_exact_single_entity = len(doc.ents) == 1 and doc.ents[0].text == entity
if is_exact_single_entity:
    for candidate in doc.ents[0]._.kb_ents:
        # The first element of each candidate is the key into cui_to_entity.
        kb_entry = linker.kb.cui_to_entity[candidate[0]]
        print(kb_entry.canonical_name)
        for alias in set(kb_entry.aliases):
            print(alias)
        print("")
        print("")

vancomycin
Vancomicina
(1S,2R,18R,22S,25R,28R,40S)-22-(2-amino-2-oxoethyl)-48-[2-O-(3-amino-2,3,6-trideoxy-3-methyl-α-L-lyxo-hexopyranosyl)-β-D-glucopyranosyloxy]-5,15-dichloro-2,18,32,35,37-pentahydroxy-19-[(N-methyl-D-leucyl)amino]-20,23,26,42,44-pentaoxo-7,13-dioxa-21,24,27,41,43-pentaazaoctacyclo[26.14.2.23,6.214,17.18,12.129,33.010,25.034,39]pentaconta-3,5,8(48),9,11,14,16,29(45),30,32,34,36,38,46,49-pentadecaene-40-carboxylic acid
Vancomycin (substance)
Vancomycine
(2.2Sp,3.5Sa,2.6Sp)-O4.2,C3.4:C5.4,O4.6:C3.5,C2.7-tricyclo[N-methyl-D-leucyl-3-chloro-(R)-β-hydroxy-D-tyrosyl-L-asparaginyl-D-2-(4-{[2-O-(3-amino-2,3,6-trideoxy-3-C-methyl-α-L-lyxo-hexopyranosyl)-β-D-glucopyranosyl]oxy}phenyl)glycyl-D-2-(4-hydroxyphenyl)glycyl-3-chloro-(R)-β-hydroxy-L-tyrosyl-L-2-(3,5-dihydroxyphenyl)glycine]
vancomycin
Vancomycin
Product containing vancomycin (medicinal product)
Vancomycinum
Vancomycin-containing product
VANCOMYCIN
VANCO

vancomycin biosynthetic process
vancomycin biosynthesis
vancomy

  global_matches = self.global_matcher(doc)
