In [1]:
import spacy

from scispacy.abbreviation import AbbreviationDetector

In [2]:
import pandas as pd

In [3]:
nlp = spacy.load("en_core_sci_md")

In [4]:


# Add the abbreviation pipe to the spacy pipeline.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)

In [5]:

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
	print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
SBMA 	 (6, 7) Spinal and bulbar muscular atrophy
SBMA 	 (33, 34) Spinal and bulbar muscular atrophy
AR 	 (29, 30) androgen receptor


In [6]:
from scispacy.umls_linking import UmlsEntityLinker
linker = UmlsEntityLinker(resolve_abbreviations=True)

nlp.add_pipe(linker)

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

# Let's look at a random entity!
entity = doc.ents[1]

print("Name: ", entity)

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmpxd70cvx4
Finished download, copying /tmp/tmpxd70cvx4 to cache at /home/jovyan/.scispacy/datasets/ea855fd121a193f03190a91417c209d4cd97e63d3ce4b456c248ef7c13a4ca77.03518aabd12de2103a27a50302f37c3d87b0f313a8be08b5ec306c9c4334b9b1.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/nmslib_index.bin not found in cache, downloading to /tmp/tmp0r9hd1aq
Finished download, copying /tmp/tmp0r9hd1aq to cache at /home/jovyan/.scispacy/datasets/5f620d1bd549a98c005ed601a73806ea2cd1a86ae6c54bbc62bcb3b452ca2630.27a7ac6807fde6628311ff7d70b86fefc640d0eb70637b544c591722a2c16c2a.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmpc9hp96ta
Finished download, copying /tmp/tmpc9hp96ta to cache at /home/jovyan/.scispacy/datasets/ff

In [8]:
df = pd.read_csv('/data/output/XMLProduct_DBID.csv')

In [9]:
df.head()

Unnamed: 0,Label_ID,Set_ID,Drug_Brand_Name,Active_ingredient,UNII_ID,Section,Text,WordCount,DB_ID,Drug_name
0,9fb5dedd-b589-42b7-850a-41dfc5bd8688.xml,3fa9bff0-676b-45d9-9d77-a6363c63395f,Omnipaque,Iohexol,4419T9MX03,Indications,OMNIPAQUE 300 is indicated for intrathecal adm...,4207,DB01362,Iohexol
1,2e8a4140-97a5-4031-a6df-4b2663b5bf83.xml,442aed6e-6242-4a96-90aa-d988b62d55e8,Omnipaque,Iohexol,4419T9MX03,Indications,OMNIPAQUE 300 is indicated for intrathecal adm...,4207,DB01362,Iohexol
2,e653596f-6792-4e6b-a50f-2d02b5159a8a.xml,f9e9d558-064c-4b86-8d96-2f8a678c89f9,OMNIPAQUE,Iohexol,4419T9MX03,Indications,OMNIPAQUE (iohexol) injection is a radiographi...,365,DB01362,Iohexol
3,daf75b73-a930-4baa-8b04-703c9f89e2e7.xml,eee15ebe-d349-4497-acef-6abe7a8247fb,OMNIPAQUE,Iohexol,4419T9MX03,Indications,OMNIPAQUE (iohexol) injection is a radiographi...,365,DB01362,Iohexol
4,9fb5dedd-b589-42b7-850a-41dfc5bd8688.xml,3fa9bff0-676b-45d9-9d77-a6363c63395f,Omnipaque,Iohexol,4419T9MX03,Contraindications,OMNIPAQUE should not be administered to patien...,157,DB01362,Iohexol


In [36]:
data =[]
for index, row in df.iterrows():
    if index % 100  == 0:
        perc = index/len(df) *100
        print(str(index) + " : "+ str(perc))
    text =row['Text']
    db_id = row["DB_ID"]
    drug_name = row['Active_ingredient']
    drug_brand_name = row["Drug_Brand_Name"]
    label_id = row["Label_ID"]
    set_id = row["Set_ID"]
    section = row['Section']
    doc = nlp(text)
    if len(doc.ents) > 0:
        for entity in doc.ents:
            for umls_ent in entity._.umls_ents:
                
                umls_concept = linker.umls.cui_to_entity[umls_ent[0]]
                if 'T047' in umls_concept.types:
                    #print("Name: ", entity)
                    #print (umls_concept)
                    data.append([label_id, set_id, entity.start_char, entity.end_char, '|'.join(umls_concept.types), umls_concept.canonical_name, 
                                 text, umls_concept.concept_id, db_id, drug_name, drug_brand_name,  section])

0 : 0.0
100 : 0.1584183511818009
200 : 0.3168367023636018
300 : 0.4752550535454027
400 : 0.6336734047272036
500 : 0.7920917559090045
600 : 0.9505101070908054
700 : 1.1089284582726062
800 : 1.2673468094544071
900 : 1.425765160636208
1000 : 1.584183511818009
1100 : 1.7426018629998101
1200 : 1.9010202141816108
1300 : 2.0594385653634117
1400 : 2.2178569165452124
1500 : 2.3762752677270136
1600 : 2.5346936189088143
1700 : 2.6931119700906154
1800 : 2.851530321272416
1900 : 3.0099486724542173
2000 : 3.168367023636018
2100 : 3.3267853748178187
2200 : 3.4852037259996203
2300 : 3.643622077181421
2400 : 3.8020404283632216
2500 : 3.9604587795450223
2600 : 4.1188771307268235
2700 : 4.277295481908625
2800 : 4.435713833090425
2900 : 4.594132184272226
3000 : 4.752550535454027
3100 : 4.910968886635828
3200 : 5.069387237817629
3300 : 5.22780558899943
3400 : 5.386223940181231
3500 : 5.544642291363031
3600 : 5.703060642544832
3700 : 5.861478993726633
3800 : 6.0198973449084345
3900 : 6.178315696090235
4000 

31500 : 49.90178062226728
31600 : 50.06019897344909
31700 : 50.218617324630884
31800 : 50.377035675812685
31900 : 50.535454026994486
32000 : 50.69387237817629
32100 : 50.85229072935809
32200 : 51.01070908053988
32300 : 51.1691274317217
32400 : 51.32754578290349
32500 : 51.48596413408529
32600 : 51.644382485267094
32700 : 51.802800836448895
32800 : 51.96121918763069
32900 : 52.11963753881249
33000 : 52.2780558899943
33100 : 52.4364742411761
33200 : 52.5948925923579
33300 : 52.7533109435397
33400 : 52.911729294721496
33500 : 53.0701476459033
33600 : 53.2285659970851
33700 : 53.38698434826691
33800 : 53.54540269944871
33900 : 53.70382105063051
34000 : 53.8622394018123
34100 : 54.020657752994104
34200 : 54.179076104175905
34300 : 54.33749445535771
34400 : 54.495912806539515
34500 : 54.654331157721316
34600 : 54.81274950890311
34700 : 54.97116786008491
34800 : 55.12958621126671
34900 : 55.28800456244851
35000 : 55.446422913630315
35100 : 55.60484126481211
35200 : 55.763259615993924
35300 : 

63100 : 99.96197959571637


In [37]:
columns =['Label_ID','Set_ID', 'From','To','Type', 'Annotation', 'Context','CUI_ID','DB_ID','DrugName', 'Drug_Brand_Name', 'Section' ]
newdf = pd.DataFrame(data, columns= columns)

In [40]:
newdf.tail()

Unnamed: 0,Label_ID,Set_ID,From,To,Type,Annotation,Context,CUI_ID,DB_ID,DrugName,Drug_Brand_Name
912832,544ddf40-c75f-564c-e054-00144ff8d46c.xml,544ddf40-c75e-564c-e054-00144ff8d46c,18,29,T047,Cycloplegia,For mydriasis and cycloplegia for diagnostic p...,C0235238,DB00809,TROPICAMIDE,Tropicamide
912833,314f1ebc-353d-413e-beff-ddf3336e051a.xml,4c294bea-5a61-4756-af3c-17ed30f2c0d8,56,69,T047,Hair follicle disorder,This product is for topical application direct...,C0178668,DB00121,BIOTIN,SPAI-SONS
912834,dd8b3696-fb4b-481f-b774-508a38f82e1a.xml,e9ea2ae7-1b98-4034-9fba-8ce2225e0145,15,31,T047,Dentin Sensitivity,AZOPT\r\n(r)\r\n \r\nHypersensitivity to any c...,C0011432,DB01194,BRINZOLAMIDE,Azopt
912835,dd8b3696-fb4b-481f-b774-508a38f82e1a.xml,e9ea2ae7-1b98-4034-9fba-8ce2225e0145,15,31,T047,Familial cold urticaria,AZOPT\r\n(r)\r\n \r\nHypersensitivity to any c...,C0343068,DB01194,BRINZOLAMIDE,Azopt
912836,890daf89-0304-41e5-a66c-5109d2333633.xml,13148b04-9295-41bb-bd8b-76dbd4ce7c00,12,21,T047,Infection caused by Helicobacter pylori,PYtest\r\n14\r\nH. pylori\r\n14\r\n2,C0850666,DB09513,Urea C-14,PYtest


In [41]:
def getDiseaseTerm(row):
   #print (row)
    from_ = row['From']
    to_ = row['To']
    return row['Context'][from_-1:to_]

newdf['DieaseName']= newdf.apply(getDiseaseTerm, axis='columns')

In [43]:
newdf.tail()

Unnamed: 0,Label_ID,Set_ID,From,To,Type,Annotation,Context,CUI_ID,DB_ID,DrugName,Drug_Brand_Name,DieaseName
912832,544ddf40-c75f-564c-e054-00144ff8d46c.xml,544ddf40-c75e-564c-e054-00144ff8d46c,18,29,T047,Cycloplegia,For mydriasis and cycloplegia for diagnostic p...,C0235238,DB00809,TROPICAMIDE,Tropicamide,cycloplegia
912833,314f1ebc-353d-413e-beff-ddf3336e051a.xml,4c294bea-5a61-4756-af3c-17ed30f2c0d8,56,69,T047,Hair follicle disorder,This product is for topical application direct...,C0178668,DB00121,BIOTIN,SPAI-SONS,hair follicle
912834,dd8b3696-fb4b-481f-b774-508a38f82e1a.xml,e9ea2ae7-1b98-4034-9fba-8ce2225e0145,15,31,T047,Dentin Sensitivity,AZOPT\r\n(r)\r\n \r\nHypersensitivity to any c...,C0011432,DB01194,BRINZOLAMIDE,Azopt,\nHypersensitivity
912835,dd8b3696-fb4b-481f-b774-508a38f82e1a.xml,e9ea2ae7-1b98-4034-9fba-8ce2225e0145,15,31,T047,Familial cold urticaria,AZOPT\r\n(r)\r\n \r\nHypersensitivity to any c...,C0343068,DB01194,BRINZOLAMIDE,Azopt,\nHypersensitivity
912836,890daf89-0304-41e5-a66c-5109d2333633.xml,13148b04-9295-41bb-bd8b-76dbd4ce7c00,12,21,T047,Infection caused by Helicobacter pylori,PYtest\r\n14\r\nH. pylori\r\n14\r\n2,C0850666,DB09513,Urea C-14,PYtest,\nH. pylori


In [39]:
newdf.to_csv('/data/output/XMLProduct_annotations_scispacy.csv', index=False)