# PyMetaMapLite Example

In [1]:
from collections import namedtuple
import nltk
from nltk.tokenize import TreebankWordTokenizer
from metamaplite import MetaMapLite, postings_utils

In [2]:
def loadtextfile(inputfile, encoding=None):
    """ Load textfile and return as string

    inputfile: path of the text file to read.
    encoding: text encoding handed to open(); the default of None keeps
        open()'s platform-dependent default, so existing callers are
        unaffected.  Pass e.g. 'utf-8' to decode the file the same way
        on every platform.
    Returns the entire content of the file as one string.
    """
    with open(inputfile, encoding=encoding) as fp:
        return fp.read()

Load input text:

In [3]:
# Relative path: the file is looked up in the current working directory.
inputfilename = 'neonatal2.txt'
# Whole file content as one string; the later cells slice it by character offset.
inputtext = loadtextfile(inputfilename)

Location of MetaMapLite UMLS indexes:

In [4]:
# NOTE: 'pathto' looks like a placeholder -- replace it with the directory
# that contains your public_mm_lite installation before running.
ivfdir = 'pathto/public_mm_lite/data/ivf/2020AA/USAbase'

Assign the allowed vocabulary source list, semantic type list, stopwords, and terms excluded by concept unique identifier (CUI).

In [5]:
# All four filters are left empty in this example; presumably an empty list
# means "no restriction" -- confirm against the MetaMapLite documentation.
use_sources = []      # allowed vocabulary sources
use_semtypes = []     # allowed semantic types
stopwords = []        # stopwords
excludedterms = []    # terms excluded by concept unique identifier

A token's part-of-speech tag must be in the ``postags`` set for the token to be looked up in the dictionary.

In [6]:
# Part-of-speech tags that make a token eligible for dictionary lookup.
postags = {
    "CD", "FW", "RB", "IN", "NN", "NNS",
    "NNP", "NNPS", "JJ", "JJR", "JJS", "LS",
}

Instantiate a MetaMapLite instance.

In [7]:
# Build the recognizer from the index directory and the filter settings
# assigned in the cells above.
mminst = MetaMapLite(ivfdir, use_sources, use_semtypes, postags,
                     stopwords, excludedterms)

The named tuple ``Token`` holds the token text in ``text``, the part-of-speech tag in ``tag_``, and the character offset in ``idx``.

In [8]:
# Lightweight token record with fields: text, tag_ and idx.
Token = namedtuple('Token', ['text', 'tag_', 'idx'])

In [9]:
# Display the loaded text to check that the file was read.
inputtext

"The 24-h period of exposure to chemically defined medium was not\nsufficient to reverse the effects imposed on the cells by the serum\nused in the first phase of growth. Effect of change in growth\nenvironment on cultured myocardial cells investigated in a\nstandardized medium. Neonatal rat heart cells cultivated in either of\ntwo different media which varied only in their serum supplements were\ntransferred to chemically defined medium (Ham's F10) for 24 h before\nmeasuring a variety of parameters. The 24-h period of exposure to\nchemically defined medium was not sufficient to reverse the effects\nimposed on the cells by the serum used in the first phase of\ngrowth. The cells differed in rate and duration of action potentials\nand contractions. The initial serum composition affected the response\nof the cells to calcium deficiency. Studies involving the effects of\npharmaceutical reagents such as isoproterenol were also influenced by\nthe serum. In attempting to determine the cause a

Convert the text into sentences, tokenize each sentence, and apply the part-of-speech tagger to each sentence's token list. Then apply the recognizer to the tagged token list.

In [10]:
sentlist = nltk.sent_tokenize(inputtext)
sent_resultlist = []
# Build the word tokenizer once instead of once per sentence.
wordtokenizer = TreebankWordTokenizer()
# Position in inputtext from which the next sentence is searched for.
searchfrom = 0
sentidx = 0
for sentence in sentlist:
    # Locate the sentence in the original text instead of assuming that
    # exactly one character separates consecutive sentences: the sentence
    # splitter drops the whitespace between sentences, so a wider gap
    # (double space, blank line, leading whitespace) would otherwise shift
    # every later character offset.
    found = inputtext.find(sentence, searchfrom)
    # Fallback should not trigger, because each sentence is expected to be
    # a substring of inputtext; it only keeps the offsets monotonic.
    sentidx = found if found >= 0 else searchfrom

    # Character spans of the tokens, relative to the start of the sentence.
    spanlist = list(wordtokenizer.span_tokenize(sentence))
    texttokenlist = [sentence[start:end] for start, end in spanlist]

    postokenlist = nltk.pos_tag(texttokenlist)
    # idx is the token's character offset within the whole input text.
    tokenlist = [Token(text=word, tag_=tag, idx=sentidx + span[0])
                 for (word, tag), span in zip(postokenlist, spanlist)]
    matches = mminst.get_entities(tokenlist, span_info=True)
    sent_resultlist.append(((sentence, sentidx), matches))
    searchfrom = sentidx + len(sentence)

In [11]:
# Sanity check: there should be one result entry per sentence.
len(sentlist), len(sent_resultlist)

(10, 10)

In [12]:
def display_results(mminst, sent_resultlist):
    """ Print every sentence followed by the terms matched in it.

    mminst: MetaMapLite instance, passed on to
        postings_utils.add_semantic_types.
    sent_resultlist: iterable of ((sentence, offset), matches) pairs.
        Each match must provide text, start, end and postings attributes.
    Nothing is returned; all output goes to stdout.
    """
    for sentinfo, termlist in sent_resultlist:
        # sentinfo is the (sentence, offset) pair, which fills both
        # placeholders of the format string.
        print('sentence: "%s, %d"' % sentinfo)
        for match in termlist:
            print('  "%s"' % match.text)
            print('    start: %d' % match.start)
            print('    end: %d' % match.end)
            print('    postings:')
            enriched = postings_utils.add_semantic_types(mminst,
                                                         match.postings)
            for posting in enriched:
                print('      {}'.format(posting))

In [13]:
# Print every sentence with its matched terms and their postings.
display_results(mminst, sent_resultlist)

sentence: "The 24-h period of exposure to chemically defined medium was not
sufficient to reverse the effects imposed on the cells by the serum
used in the first phase of growth., 0"
  "period"
    start: 9
    end: 15
    postings:
      PostingSTS(cui='C0025344', sui='S1237140', idx='41', str='period', src='CHV', termtype='PT', semtypeset=['orgf'])
      PostingSTS(cui='C0025344', sui='S6137818', idx='42', str='Period', src='MEDLINEPLUS', termtype='SY', semtypeset=['orgf'])
      PostingSTS(cui='C0439531', sui='S1237140', idx='4', str='period', src='CHV', termtype='PT', semtypeset=['tmco'])
      PostingSTS(cui='C1561960', sui='S1237140', idx='2', str='period', src='HL7V3.0', termtype='PT', semtypeset=['qnco'])
      PostingSTS(cui='C1948053', sui='S6137818', idx='2', str='Period', src='NCI', termtype='PT', semtypeset=['tmco'])
      PostingSTS(cui='C2347804', sui='S6137818', idx='3', str='Period', src='NCI', termtype='SY', semtypeset=['resa'])
  "exposure to"
    start: 19
    end: 

      PostingSTS(cui='C0007584', sui='S0023469', idx='24', str='Cells', src='NCI_CDISC', termtype='SY', semtypeset=['lbpr'])
      PostingSTS(cui='C0007584', sui='S0023469', idx='25', str='Cells', src='NCI_CDISC', termtype='PT', semtypeset=['lbpr'])
      PostingSTS(cui='C0007584', sui='S0827167', idx='26', str='CELLS', src='NCI_CDISC', termtype='PT', semtypeset=['lbpr'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='1', str='Cells', src='LCH', termtype='PT', semtypeset=['cell'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='2', str='Cells', src='MSH', termtype='MH', semtypeset=['cell'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='3', str='Cells', src='MTH', termtype='SY', semtypeset=['cell'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='4', str='Cells', src='LNC', termtype='LPN', semtypeset=['cell'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='5', str='Cells', src='LNC', termtype='CN', semtypeset=['cell'])
      PostingSTS(cui='C0007634', s

      PostingSTS(cui='C1704788', sui='S7670164', idx='5', str='Defined', src='NCI', termtype='AD', semtypeset=['inpr'])
      PostingSTS(cui='C3539106', sui='S7670164', idx='7', str='Defined', src='SNOMEDCT_US', termtype='PT', semtypeset=['inpr'])
      PostingSTS(cui='C4724435', sui='S7670164', idx='3', str='Defined', src='NCI', termtype='SY', semtypeset=['cnce'])
  "medium"
    start: 428
    end: 434
    postings:
      PostingSTS(cui='C0009458', sui='S0943869', idx='9', str='Medium', src='NCI', termtype='SY', semtypeset=['inpr'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='1', str='Medium', src='NCI', termtype='PT', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='2', str='Medium', src='NCI', termtype='SY', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='3', str='Medium', src='LNC', termtype='LA', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='4', str='Medium', src='SNOMEDCT_US', termtype='PT

    postings:
      PostingSTS(cui='C0009458', sui='S0943869', idx='9', str='Medium', src='NCI', termtype='SY', semtypeset=['inpr'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='1', str='Medium', src='NCI', termtype='PT', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='2', str='Medium', src='NCI', termtype='SY', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='3', str='Medium', src='LNC', termtype='LA', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='4', str='Medium', src='SNOMEDCT_US', termtype='PT', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S0943869', idx='5', str='Medium', src='MTH', termtype='PN', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S11912821', idx='6', str='medium', src='CHV', termtype='PT', semtypeset=['qnco'])
      PostingSTS(cui='C0439536', sui='S6941619', idx='7', str='MEDIUM', src='NCI_CDISC', termtype='PT', semtypeset=['qnco'])
      PostingSTS(cu

      PostingSTS(cui='C0205265', sui='S0324686', idx='8', str='Initial', src='SNOMEDCT_US', termtype='PT', semtypeset=['tmco'])
      PostingSTS(cui='C0205265', sui='S6325504', idx='9', str='initial', src='CHV', termtype='PT', semtypeset=['tmco'])
      PostingSTS(cui='C1279901', sui='S0324686', idx='5', str='Initial', src='NCI', termtype='SY', semtypeset=['qlco'])
      PostingSTS(cui='C1555582', sui='S0324686', idx='4', str='Initial', src='NCI', termtype='SY', semtypeset=['idcn'])
      PostingSTS(cui='C1555582', sui='S6325504', idx='5', str='initial', src='HL7V3.0', termtype='PT', semtypeset=['idcn'])
      PostingSTS(cui='C1555582', sui='S6325504', idx='6', str='initial', src='HL7V3.0', termtype='PT', semtypeset=['idcn'])
      PostingSTS(cui='C1705684', sui='S12203170', idx='3', str='INITIAL', src='NCI_FDA', termtype='PT', semtypeset=['inpr'])
      PostingSTS(cui='C1705685', sui='S12203170', idx='3', str='INITIAL', src='NCI_FDA', termtype='PT', semtypeset=['ftcn'])
  "serum"
    

      PostingSTS(cui='C0013058', sui='S6323368', idx='13', str='Pharmaceutical', src='NCI_NCPDP', termtype='PT', semtypeset=['bodm'])
      PostingSTS(cui='C0013058', sui='S6323368', idx='14', str='Pharmaceutical', src='NCI', termtype='SY', semtypeset=['bodm'])
      PostingSTS(cui='C0031336', sui='S6323368', idx='9', str='Pharmaceutical', src='NCI', termtype='AD', semtypeset=['bmod'])
      PostingSTS(cui='C1135440', sui='S11925827', idx='2', str='pharmaceutical', src='CHV', termtype='PT', semtypeset=['topp'])
      PostingSTS(cui='C1553890', sui='S6323368', idx='2', str='Pharmaceutical', src='HL7V3.0', termtype='PT', semtypeset=['hlca'])
  "reagents"
    start: 881
    end: 889
    postings:
      PostingSTS(cui='C0034760', sui='S0080191', idx='1', str='Reagents', src='MSH', termtype='PEP', semtypeset=['irda'])
      PostingSTS(cui='C0034760', sui='S11932217', idx='2', str='reagents', src='CHV', termtype='SY', semtypeset=['irda'])
  "isoproterenol"
    start: 898
    end: 911
    pos

      PostingSTS(cui='C0596952', sui='S1234365', idx='3', str='mitochondrial membrane', src='CSP', termtype='PT', semtypeset=['celc'])
      PostingSTS(cui='C0596952', sui='S1234365', idx='4', str='mitochondrial membrane', src='GO', termtype='PT', semtypeset=['celc'])
      PostingSTS(cui='C0596952', sui='S3011716', idx='5', str='Mitochondrial membrane', src='FMA', termtype='SY', semtypeset=['celc'])
      PostingSTS(cui='C0596952', sui='S3011716', idx='6', str='Mitochondrial membrane', src='UWDA', termtype='SY', semtypeset=['celc'])
      PostingSTS(cui='C0596952', sui='S3011716', idx='7', str='Mitochondrial membrane', src='SNOMEDCT_US', termtype='PT', semtypeset=['celc'])
      PostingSTS(cui='C0596952', sui='S6133573', idx='8', str='Mitochondrial Membrane', src='MSH', termtype='PM', semtypeset=['celc'])
      PostingSTS(cui='C0596952', sui='S6133573', idx='9', str='Mitochondrial Membrane', src='NCI', termtype='PT', semtypeset=['celc'])
  "for"
    start: 1062
    end: 1065
    posti

      PostingSTS(cui='C0007584', sui='S0023469', idx='24', str='Cells', src='NCI_CDISC', termtype='SY', semtypeset=['lbpr'])
      PostingSTS(cui='C0007584', sui='S0023469', idx='25', str='Cells', src='NCI_CDISC', termtype='PT', semtypeset=['lbpr'])
      PostingSTS(cui='C0007584', sui='S0827167', idx='26', str='CELLS', src='NCI_CDISC', termtype='PT', semtypeset=['lbpr'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='1', str='Cells', src='LCH', termtype='PT', semtypeset=['cell'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='2', str='Cells', src='MSH', termtype='MH', semtypeset=['cell'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='3', str='Cells', src='MTH', termtype='SY', semtypeset=['cell'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='4', str='Cells', src='LNC', termtype='LPN', semtypeset=['cell'])
      PostingSTS(cui='C0007634', sui='S0023469', idx='5', str='Cells', src='LNC', termtype='CN', semtypeset=['cell'])
      PostingSTS(cui='C0007634', s

      PostingSTS(cui='C1274040', sui='S3513919', idx='1', str='Result', src='SNOMEDCT_US', termtype='OAP', semtypeset=['ftcn'])
      PostingSTS(cui='C1274040', sui='S3513919', idx='2', str='Result', src='SNOMEDCT_US', termtype='OAP', semtypeset=['ftcn'])
      PostingSTS(cui='C1274040', sui='S3513919', idx='3', str='Result', src='SNOMEDCT_US', termtype='PT', semtypeset=['ftcn'])
      PostingSTS(cui='C1274040', sui='S3513919', idx='4', str='Result', src='SNOMEDCT_US', termtype='OF', semtypeset=['ftcn'])
      PostingSTS(cui='C1274040', sui='S3513919', idx='5', str='Result', src='SNOMEDCT_US', termtype='OF', semtypeset=['ftcn'])
      PostingSTS(cui='C1274040', sui='S3513919', idx='6', str='Result', src='NCI', termtype='SY', semtypeset=['ftcn'])
      PostingSTS(cui='C1274040', sui='S3513919', idx='7', str='Result', src='MTH', termtype='PN', semtypeset=['ftcn'])
      PostingSTS(cui='C1274040', sui='S6175685', idx='8', str='result', src='NCI_BRIDG_3_0_3', termtype='PT', semtypeset=['ft

In [14]:
# Spot-check the character offsets: slice the original text with a
# start/end pair (presumably taken from a match printed above -- confirm).
inputtext[1639:1660]

'nitroblue tetrazolium'

In [15]:
# Second offset spot-check against the original text (start/end presumably
# taken from a match printed above -- confirm).
inputtext[821:831]

'deficiency'