In [6]:
import pandas as pd

In [4]:
import pandas as pd
import re
import random
import spacy
from spacy.util import minibatch, compounding
import warnings
import matplotlib.pyplot as plt

In [9]:
# read in the food csv file
abstracts_df = pd.read_csv('../part2/data/abstracts_train.csv', sep='\t')

# print row and column information
abstracts_df.head()

Unnamed: 0,abstract_id,title,abstract
0,1353340,Late-onset metachromatic leukodystrophy: molec...,We report on a new allele at the arylsulfatase...
1,1671881,Two distinct mutations at a single BamHI site ...,Classical phenylketonuria is an autosomal rece...
2,1848636,Debrisoquine phenotype and the pharmacokinetic...,The metabolism of the cardioselective beta-blo...
3,2422478,Midline B3 serotonin nerves in rat medulla are...,Previous experiments in this laboratory have s...
4,2491010,Molecular and phenotypic analysis of patients ...,Eighty unrelated individuals with Duchenne mus...


In [10]:
abstracts_df.shape

(400, 3)

In [11]:
# print the size 
abstracts_df["abstract"].size

400

In [12]:
texts=  abstracts_df["abstract"].apply(lambda text: text.lower())
# filter out foods with more than 3 words, drop any duplicates
#texts = texts[texts.str.split().apply(len) <= 3].drop_duplicates()

# print the size of the documents
texts.size

400

In [13]:
entities_df = pd.read_csv('../part2/data/entities_train.csv', sep='\t')
entities_df.head()

Unnamed: 0,id,abstract_id,offset_start,offset_finish,type,mention,entity_ids
0,0,1353340,11,39,DiseaseOrPhenotypicFeature,metachromatic leukodystrophy,D007966
1,1,1353340,111,126,GeneOrGeneProduct,arylsulfatase A,410
2,2,1353340,128,132,GeneOrGeneProduct,ARSA,410
3,3,1353340,159,187,DiseaseOrPhenotypicFeature,metachromatic leukodystrophy,D007966
4,4,1353340,189,192,DiseaseOrPhenotypicFeature,MLD,D007966


In [14]:
entities_df['type'].unique()

array(['DiseaseOrPhenotypicFeature', 'GeneOrGeneProduct',
       'SequenceVariant', 'OrganismTaxon', 'ChemicalEntity', 'CellLine'],
      dtype=object)

In [15]:
id2abstr ={ abs_id: abs_text for abs_id, abs_text in abstracts_df[['abstract_id','abstract']].values }

In [16]:
id2title ={ abs_id: title for abs_id, title in abstracts_df[['abstract_id','title']].values }

In [17]:
# create training and valid NER corpus using spacy format
training_data =[]

for key, df in entities_df.groupby('abstract_id'):
    #print (key)
    entity_mentions = []
    text = id2title[key]+' '+id2abstr[key]
    text = text.lower()
    for i, row in df.iterrows():
          entity_mention = (row['offset_start'], row['offset_finish'],row['type'])
          #print (text[row['offset_start']:row['offset_finish']])
          entity_mentions.append(entity_mention)
    training_data.append( (text,  {'entities': entity_mentions}))
                                                       

In [18]:
# split the data into training and validation
train_size = int(len(training_data)*90/100)
test_size = len(training_data) - train_size
test_size

40

In [19]:
def select_subset_of_overlapping_chain(
    chain ):
    """
    Select the subset of entities in an overlapping chain to return by greedily choosing the
    longest entity in the chain until there are no entities remaining
    """
    sorted_chain = sorted(chain, key=lambda x: x[1] - x[0], reverse=True)
    selections_from_chain = []
    chain_index = 0
    # dump the current chain by greedily keeping the longest entity that doesn't overlap
    while chain_index < len(sorted_chain):
        entity = sorted_chain[chain_index]
        match_found = False
        for already_selected_entity in selections_from_chain:
            max_start = max(entity[0], already_selected_entity[0])
            min_end = min(entity[1], already_selected_entity[1])
            if len(range(max_start, min_end)) > 0:
                match_found = True
                break

        if not match_found:
            selections_from_chain.append(entity)

        chain_index += 1

    return selections_from_chain

In [20]:
def getOverlap(a, b):
  return max(0, min(a[1], b[1]) - max(a[0], b[0]))

In [21]:
def remove_overlapping_entities(
  sorted_spacy_format_entities):
  """
  Removes overlapping entities from the entity set, by greedilytaking the longest
  entity from each overlapping chain. The input list of entities should be sorted
  and follow the spacy format.
  """

  sorted_spacy_format_entities= sorted(sorted_spacy_format_entities, key=lambda x: x[0])
  spacy_format_entities_without_overlap = []
  current_overlapping_chain = []
  current_overlapping_chain_start = 0
  current_overlapping_chain_end = 0
  for i, current_entity in enumerate(sorted_spacy_format_entities):
    current_entity = sorted_spacy_format_entities[i]
    current_entity_start = int(current_entity[0])
    current_entity_end = int(current_entity[1])

    if len(current_overlapping_chain) == 0:
        current_overlapping_chain.append(current_entity)
        current_overlapping_chain_start = current_entity_start
        current_overlapping_chain_end = current_entity_end
    else:
        min_end = min(current_entity_end, current_overlapping_chain_end)
        max_start = max(current_entity_start, current_overlapping_chain_start)
        if min_end - max_start > 0:
            current_overlapping_chain.append(current_entity)
            current_overlapping_chain_start = min(
                current_entity_start, current_overlapping_chain_start
            )
            current_overlapping_chain_end = max(
                current_entity_end, current_overlapping_chain_end
            )
        else:
            selections_from_chain = select_subset_of_overlapping_chain(
                current_overlapping_chain
            )

            current_overlapping_chain = []
            spacy_format_entities_without_overlap.extend(selections_from_chain)
            current_overlapping_chain.append(current_entity)
            current_overlapping_chain_start = current_entity_start
            current_overlapping_chain_end = current_entity_end

  spacy_format_entities_without_overlap.extend(
      select_subset_of_overlapping_chain(current_overlapping_chain)
  )

  return sorted(spacy_format_entities_without_overlap, key=lambda x: x[0])

In [22]:
test_entities = [(198, 211, 'ChemicalEntity'), (198, 210, 'ChemicalEntity')]
result = remove_overlapping_entities(test_entities)
print (result)
assert result == [(198, 211, 'ChemicalEntity')]

[(198, 211, 'ChemicalEntity')]


In [23]:
test_entities= training_data[2][1]['entities']
test_entities

[(0, 12, 'ChemicalEntity'),
 (52, 67, 'GeneOrGeneProduct'),
 (88, 98, 'ChemicalEntity'),
 (171, 181, 'ChemicalEntity'),
 (214, 226, 'ChemicalEntity'),
 (227, 236, 'ChemicalEntity'),
 (390, 409, 'GeneOrGeneProduct'),
 (442, 452, 'ChemicalEntity'),
 (511, 530, 'GeneOrGeneProduct'),
 (545, 555, 'ChemicalEntity'),
 (603, 613, 'ChemicalEntity'),
 (617, 628, 'ChemicalEntity'),
 (637, 648, 'DiseaseOrPhenotypicFeature'),
 (732, 742, 'ChemicalEntity'),
 (838, 848, 'ChemicalEntity'),
 (944, 955, 'ChemicalEntity'),
 (1059, 1069, 'ChemicalEntity'),
 (1149, 1160, 'ChemicalEntity'),
 (1227, 1236, 'ChemicalEntity'),
 (1238, 1249, 'ChemicalEntity'),
 (1251, 1261, 'ChemicalEntity'),
 (1294, 1317, 'ChemicalEntity'),
 (1389, 1399, 'ChemicalEntity'),
 (1409, 1419, 'ChemicalEntity'),
 (1434, 1445, 'ChemicalEntity'),
 (1512, 1522, 'ChemicalEntity'),
 (1523, 1546, 'ChemicalEntity'),
 (1652, 1662, 'ChemicalEntity'),
 (1691, 1701, 'ChemicalEntity')]

In [24]:
# enrich the data with DailyMed annotations (drug and disease entities)

In [25]:
!wget https://raw.githubusercontent.com/MaastrichtU-IDS/prodigy-drug-indication-annotation/master/relation/drugdisease_relations.jsonl

--2022-03-10 14:40:20--  https://raw.githubusercontent.com/MaastrichtU-IDS/prodigy-drug-indication-annotation/master/relation/drugdisease_relations.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 435558 (425K) [text/plain]
Saving to: ‘drugdisease_relations.jsonl.1’


2022-03-10 14:40:21 (37.5 MB/s) - ‘drugdisease_relations.jsonl.1’ saved [435558/435558]



In [34]:
import json
extra_training_data = []
with open('drugdisease_relations.jsonl') as fr:
    for count, line in enumerate(fr):
        #print (line)
        annotations = json.loads(line)
        
        text = annotations['text']
        #print (annotations['relations'])
        entity_mentions= []
        for a in annotations['relations']:
            #print (a)
            if a['child_span']['label'] == 'DISEASE':
                entity_mention1 = (a['child_span']['start'], a['child_span']['end'], 'DiseaseOrPhenotypicFeature')
                entity_mention2 = (a['head_span']['start'], a['head_span']['end'], 'ChemicalEntity')
                #print (annotations['text'][a['child_span']['start']: a['child_span']['end']], 'DISEASE')
                #print (annotations['text'][a['head_span']['start']: a['head_span']['end']], 'DRUG')
            else:
                entity_mention1 = (a['child_span']['start'], a['child_span']['end'], 'ChemicalEntity')
                entity_mention2 = (a['head_span']['start'], a['head_span']['end'], 'DiseaseOrPhenotypicFeature')
            
            #if entity_mention1 not in entity_mentions:
            entity_mentions.append(entity_mention1)
                
            #if entity_mention2 not in entity_mentions:
            entity_mentions.append(entity_mention2)
        if len(entity_mentions) > 0:
            entity_mentions= remove_overlapping_entities(entity_mentions)
            extra_training_data.append( (text,  {'entities': entity_mentions}))   
        #if count > 5:
        #    break

In [35]:
len(extra_training_data)

26

In [36]:
extra_training_data[:1]

[('              *    1.1    1.2         *    1.3         *    1.4         *    1.5         *    1.6         *    1.7         *    1.8         *    1.9         *    1.10         *    1.11         *    1.12         *    1.13         *    1.14         *    1.16    Ciprofloxacin tablets are indicated in adult patients for treatment of urinary tract infections caused by     Escherichia coli    Klebsiella pneumoniae    Enterobacter cloacae    Serratia marcescens    Proteus mirabilis    Providencia rettgeri    Morganella morganii    Citrobacter koseri    Citrobacter freundii    Pseudomonas aeruginosa    Staphylococcus epidermidis    Staphylococcus saprophyticus    Enterococcus faecalis    Ciprofloxacin tablets are indicated in adult female patients for treatment of acute uncomplicated cystitis caused by     Escherichia coli     Staphylococcus saprophyticus.',
  {'entities': [(260, 273, 'ChemicalEntity'),
    (331, 355, 'DiseaseOrPhenotypicFeature'),
    (689, 702, 'ChemicalEntity'),
    (787

In [37]:
random.shuffle(extra_training_data)

extra_train_data = extra_training_data[:20]
extra_test_data = extra_training_data[20:] 

In [38]:
len(extra_train_data), len(extra_test_data)

(20, 6)

In [39]:
# create trainig and validation file with Spacy  file format

In [41]:
#import pandas as pd
from tqdm import tqdm
#import spacy
from spacy.tokens import DocBin
nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object

for text, annot in tqdm(training_data[:-test_size] + extra_train_data): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        #print (start, end, label)
        span = doc.char_span(start, end, label=label, alignment_mode="strict")
        if span is None:
            print("Skipping entity")
            print (start, end, label)
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("train.spacy") # save the docbin object

db = DocBin()
for text, annot in tqdm(training_data[-test_size:]+ extra_test_data): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="strict")
        if span is None:
            print("Skipping entity")
            print (start, end, label)
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("valid.spacy") # save the docbin object


 15%|█▌        | 57/380 [00:00<00:01, 261.56it/s]

Skipping entity
203 206 DiseaseOrPhenotypicFeature
Skipping entity
425 431 ChemicalEntity
Skipping entity
1150 1152 ChemicalEntity
Skipping entity
1153 1162 ChemicalEntity
Skipping entity
1167 1169 ChemicalEntity
Skipping entity
1170 1179 ChemicalEntity
Skipping entity
169 175 ChemicalEntity
Skipping entity
817 823 ChemicalEntity
Skipping entity
1506 1511 GeneOrGeneProduct
Skipping entity
1512 1524 SequenceVariant
Skipping entity
1574 1580 ChemicalEntity
Skipping entity
1663 1668 GeneOrGeneProduct
Skipping entity
1669 1674 SequenceVariant
Skipping entity
748 752 GeneOrGeneProduct
Skipping entity
1341 1347 GeneOrGeneProduct
Skipping entity
1348 1354 SequenceVariant
Skipping entity
1405 1409 GeneOrGeneProduct
Skipping entity
1468 1474 GeneOrGeneProduct
Skipping entity
1475 1481 SequenceVariant
Skipping entity
1491 1495 GeneOrGeneProduct
Skipping entity
1844 1848 GeneOrGeneProduct
Skipping entity
1932 1936 GeneOrGeneProduct
Skipping entity
1989 1995 GeneOrGeneProduct
Skipping entity
1996 

 39%|███▉      | 150/380 [00:00<00:00, 281.54it/s]

83 88 SequenceVariant
Skipping entity
189 192 ChemicalEntity
Skipping entity
816 821 SequenceVariant
Skipping entity
448 452 ChemicalEntity
Skipping entity
676 686 ChemicalEntity
Skipping entity
119 145 GeneOrGeneProduct
Skipping entity
1058 1062 ChemicalEntity
Skipping entity
1100 1104 ChemicalEntity
Skipping entity
1505 1509 ChemicalEntity
Skipping entity
160 163 ChemicalEntity
Skipping entity
359 366 SequenceVariant
Skipping entity
254 258 GeneOrGeneProduct
Skipping entity
258 264 SequenceVariant
Skipping entity
867 870 GeneOrGeneProduct
Skipping entity
1249 1253 GeneOrGeneProduct
Skipping entity
1267 1270 GeneOrGeneProduct
Skipping entity
1360 1363 GeneOrGeneProduct
Skipping entity
1455 1459 GeneOrGeneProduct
Skipping entity
1459 1464 SequenceVariant
Skipping entity
1469 1473 GeneOrGeneProduct
Skipping entity
1473 1479 SequenceVariant
Skipping entity
1617 1621 GeneOrGeneProduct
Skipping entity
1621 1626 SequenceVariant
Skipping entity
1631 1635 GeneOrGeneProduct
Skipping entity
163

 65%|██████▌   | 247/380 [00:00<00:00, 306.23it/s]

Skipping entity
735 740 ChemicalEntity
Skipping entity
595 608 ChemicalEntity
Skipping entity
734 747 ChemicalEntity
Skipping entity
672 679 GeneOrGeneProduct
Skipping entity
680 684 GeneOrGeneProduct
Skipping entity
844 851 GeneOrGeneProduct
Skipping entity
1247 1254 GeneOrGeneProduct
Skipping entity
1257 1261 GeneOrGeneProduct
Skipping entity
513 545 DiseaseOrPhenotypicFeature
Skipping entity
96 99 GeneOrGeneProduct
Skipping entity
132 135 ChemicalEntity
Skipping entity
268 271 GeneOrGeneProduct
Skipping entity
392 395 ChemicalEntity
Skipping entity
487 490 GeneOrGeneProduct
Skipping entity
724 727 GeneOrGeneProduct
Skipping entity
875 878 GeneOrGeneProduct
Skipping entity
939 942 GeneOrGeneProduct
Skipping entity
997 1000 GeneOrGeneProduct
Skipping entity
1408 1411 GeneOrGeneProduct
Skipping entity
1441 1444 GeneOrGeneProduct
Skipping entity
1637 1640 GeneOrGeneProduct
Skipping entity
1824 1827 GeneOrGeneProduct
Skipping entity
1116 1125 GeneOrGeneProduct
Skipping entity
958 962 Gen

 84%|████████▍ | 320/380 [00:01<00:00, 318.51it/s]

Skipping entity
257 264 ChemicalEntity
Skipping entity
611 618 ChemicalEntity
Skipping entity
628 635 ChemicalEntity
Skipping entity
925 932 ChemicalEntity
Skipping entity
946 953 ChemicalEntity
Skipping entity
1167 1174 ChemicalEntity
Skipping entity
1257 1264 ChemicalEntity
Skipping entity
1324 1331 ChemicalEntity
Skipping entity
1533 1540 ChemicalEntity
Skipping entity
28 34 GeneOrGeneProduct
Skipping entity
34 63 SequenceVariant
Skipping entity
910 914 ChemicalEntity
Skipping entity
10 14 GeneOrGeneProduct
Skipping entity
14 21 SequenceVariant
Skipping entity
190 209 DiseaseOrPhenotypicFeature
Skipping entity
192 218 GeneOrGeneProduct
Skipping entity
219 223 GeneOrGeneProduct
Skipping entity
12 16 GeneOrGeneProduct
Skipping entity
111 114 GeneOrGeneProduct
Skipping entity
844 854 GeneOrGeneProduct
Skipping entity
756 760 GeneOrGeneProduct
Skipping entity
765 776 GeneOrGeneProduct
Skipping entity
784 789 GeneOrGeneProduct
Skipping entity
852 857 GeneOrGeneProduct
Skipping entity
862

100%|██████████| 380/380 [00:01<00:00, 324.91it/s]


Skipping entity
419 423 GeneOrGeneProduct
Skipping entity
424 428 GeneOrGeneProduct
Skipping entity
11 14 GeneOrGeneProduct
Skipping entity
15 20 GeneOrGeneProduct
Skipping entity
54 57 GeneOrGeneProduct
Skipping entity
58 63 GeneOrGeneProduct
Skipping entity
64 69 GeneOrGeneProduct
Skipping entity
187 190 GeneOrGeneProduct
Skipping entity
368 371 GeneOrGeneProduct
Skipping entity
374 379 GeneOrGeneProduct
Skipping entity
382 387 GeneOrGeneProduct
Skipping entity
685 688 GeneOrGeneProduct
Skipping entity
828 831 GeneOrGeneProduct
Skipping entity
904 907 GeneOrGeneProduct
Skipping entity
1149 1152 GeneOrGeneProduct
Skipping entity
1293 1296 GeneOrGeneProduct
Skipping entity
1428 1431 GeneOrGeneProduct
Skipping entity
1654 1657 GeneOrGeneProduct
Skipping entity
1836 1839 GeneOrGeneProduct
Skipping entity
1105 1134 GeneOrGeneProduct
Skipping entity
1104 1109 GeneOrGeneProduct
Skipping entity
1086 1088 GeneOrGeneProduct
Skipping entity
1091 1095 GeneOrGeneProduct
Skipping entity
1098 1102 

100%|██████████| 46/46 [00:00<00:00, 383.52it/s]


Skipping entity
0 33 GeneOrGeneProduct
Skipping entity
34 39 GeneOrGeneProduct
Skipping entity
1209 1212 GeneOrGeneProduct
Skipping entity
1213 1218 GeneOrGeneProduct
Skipping entity
537 540 ChemicalEntity
Skipping entity
634 639 GeneOrGeneProduct
Skipping entity
640 644 GeneOrGeneProduct
Skipping entity
980 985 GeneOrGeneProduct
Skipping entity
986 990 GeneOrGeneProduct
Skipping entity
992 1001 GeneOrGeneProduct
Skipping entity
164 176 GeneOrGeneProduct
Skipping entity
881 885 GeneOrGeneProduct
Skipping entity
1461 1465 GeneOrGeneProduct
Skipping entity
1470 1474 GeneOrGeneProduct
Skipping entity
1614 1620 GeneOrGeneProduct
Skipping entity
1660 1664 GeneOrGeneProduct
Skipping entity
1926 1930 GeneOrGeneProduct
Skipping entity
1935 1939 GeneOrGeneProduct
Skipping entity
1358 1363 GeneOrGeneProduct
Skipping entity
1365 1369 GeneOrGeneProduct
Skipping entity
1155 1159 GeneOrGeneProduct
Skipping entity
1161 1187 GeneOrGeneProduct
Skipping entity
1189 1192 GeneOrGeneProduct
Skipping entity