# PyMetaMapLite Example

In [1]:
from collections import namedtuple
import nltk
from nltk.tokenize import TreebankWordTokenizer
from metamaplite import MetaMapLite

In [2]:
def loadtextfile(inputfile):
    """Read the file at ``inputfile`` and return its whole content as one string."""
    with open(inputfile) as handle:
        return handle.read()

Load input text:

In [3]:
# Relative file name, so it is resolved against the current working directory.
inputfilename = 'neonatal2.txt'
# Whole file as a single string; the sentence splitting below operates on it.
inputtext = loadtextfile(inputfilename)

Location of MetaMapLite UMLS indexes:

In [4]:
# Placeholder: point this at the UMLS index directory of your local
# MetaMapLite installation before running the notebook.
ivfdir = '/path/to/public_mm_lite/data/ivf/2020AA/USAbase'

Assign the allowed vocabulary source list, the semantic type list, the stopwords, and the terms excluded by concept unique identifier.

In [5]:
# All four settings are left empty in this example.
# NOTE(review): presumably an empty list means "no restriction" / "nothing
# excluded" — confirm against the MetaMapLite constructor.
use_sources = []    # allowed vocabulary sources
use_semtypes = []   # allowed semantic types
stopwords = []      # stopwords
excludedterms = []  # terms excluded by concept unique identifier

A token is looked up in the dictionary only if its part-of-speech tag is in the ``postags`` set.

In [6]:
# Part-of-speech tags whose tokens are eligible for dictionary lookup.
postags = {
    "CD", "FW", "RB", "IN",
    "NN", "NNS", "NNP", "NNPS",
    "JJ", "JJR", "JJS", "LS",
}

Instantiate a MetaMapLite instance.

In [7]:
# Build the recognizer from the index directory and the settings assigned in
# the cells above; the same instance is reused for every sentence below.
mminst = MetaMapLite(ivfdir, use_sources, use_semtypes, postags,
                     stopwords, excludedterms)

The named tuple ``Token`` holds the token text in ``text``, the part-of-speech tag in ``tag_``, and the character offset in ``idx``.

In [8]:
# Lightweight token record passed to the recognizer.
# NOTE(review): the field names resemble spaCy's Token attributes
# (text, tag_, idx); presumably MetaMapLite reads tokens by these names — confirm.
Token = namedtuple('Token', ['text', 'tag_', 'idx'])

In [9]:
# Show the raw text that was loaded from the input file.
inputtext

"The 24-h period of exposure to chemically defined medium was not\nsufficient to reverse the effects imposed on the cells by the serum\nused in the first phase of growth. Effect of change in growth\nenvironment on cultured myocardial cells investigated in a\nstandardized medium. Neonatal rat heart cells cultivated in either of\ntwo different media which varied only in their serum supplements were\ntransferred to chemically defined medium (Ham's F10) for 24 h before\nmeasuring a variety of parameters. The 24-h period of exposure to\nchemically defined medium was not sufficient to reverse the effects\nimposed on the cells by the serum used in the first phase of\ngrowth. The cells differed in rate and duration of action potentials\nand contractions. The initial serum composition affected the response\nof the cells to calcium deficiency. Studies involving the effects of\npharmaceutical reagents such as isoproterenol were also influenced by\nthe serum. In attempting to determine the cause a

Split the text into sentences, tokenize each sentence, and apply the part-of-speech tagger to each sentence's token list. Then apply the recognizer to the tagged token list.

In [10]:
# Split the text into sentences, tokenize and POS-tag each sentence, then run
# the recognizer on the tagged tokens.  Each token's ``idx`` is its character
# offset in the whole input text, not in the sentence.
sentlist = nltk.sent_tokenize(inputtext)
word_tokenizer = TreebankWordTokenizer()  # built once, reused for every sentence
sent_resultlist = []
sentidx = 0     # offset of the current sentence within inputtext
next_start = 0  # position in inputtext just past the previous sentence
for sentence in sentlist:
    # Locate the sentence in the original text instead of assuming that
    # consecutive sentences are separated by exactly one character: any other
    # amount of whitespace (or leading whitespace before the first sentence)
    # would shift every later offset.
    found = inputtext.find(sentence, next_start)
    # If the sentence is not found verbatim, continue from the end of the
    # previous sentence rather than failing.
    sentidx = found if found >= 0 else next_start

    spanlist = list(word_tokenizer.span_tokenize(sentence))
    texttokenlist = [sentence[start:end] for start, end in spanlist]

    postokenlist = nltk.pos_tag(texttokenlist)
    # Pair each (word, tag) with its span; spans are sentence-relative, so the
    # sentence offset is added to obtain the offset in the full text.
    tokenlist = [Token(text=word, tag_=tag, idx=sentidx + start)
                 for (word, tag), (start, _end) in zip(postokenlist, spanlist)]
    matches = mminst.get_entities(tokenlist, span_info=True)
    sent_resultlist.append(((sentence, sentidx), matches))
    next_start = sentidx + len(sentence)

In [11]:
# Sanity check: the loop appends one result entry per sentence, so the two
# lengths should be equal.
len(sentlist), len(sent_resultlist)

(10, 10)

In [12]:
def display_results(sent_resultlist):
    """Print each sentence followed by the entities matched in it.

    ``sent_resultlist`` is an iterable of ``((sentence, offset), matches)``
    pairs.  For every match the text, start, end and each posting are
    printed on indented lines.  Nothing is returned.
    """
    for sentence_info, entities in sent_resultlist:
        # sentence_info is the (sentence text, offset) pair; it fills both
        # placeholders of the format string.
        print('sentence: "%s, %d"' % sentence_info)
        for entity in entities:
            print('  "%s"' % entity.text)
            print('    start: %d' % entity.start)
            print('    end: %d' % entity.end)
            print('    postings:')
            for posting in entity.postings:
                print('      %s' % posting)


In [13]:
# Print every sentence with its matches, offsets and postings.
display_results(sent_resultlist)

sentence: "The 24-h period of exposure to chemically defined medium was not
sufficient to reverse the effects imposed on the cells by the serum
used in the first phase of growth., 0"
  "period"
    start: 9
    end: 15
    postings:
      C0025344|S1237140|33|period|CHV|PT
      C0025344|S6137818|34|Period|MEDLINEPLUS|SY
      C0439531|S1237140|2|period|CHV|PT
      C1561960|S1237140|2|period|HL7V3.0|PT
      C1948053|S6137818|2|Period|NCI|PT
      C2347804|S6137818|3|Period|NCI|SY
  "exposure to"
    start: 19
    end: 30
    postings:
      C0332157|S0642558|1|Exposure to|MTH|PN
      C0332157|S11885829|2|exposure to|CHV|SY
  "chemically"
    start: 31
    end: 41
    postings:
      C0220806|S7669275|24|Chemically|NCI|AD
  "defined"
    start: 42
    end: 49
    postings:
      C1704788|S7670164|5|Defined|NCI|AD
      C4724435|S7670164|3|Defined|NCI|SY
  "medium"
    start: 50
    end: 56
    postings:
      C0009458|S0943869|11|Medium|NCI|SY
      C0439536|S0943869|1|Medium|RCD|PT


      C1279901|S0324686|2|Initial|NCI|SY
      C1555582|S0324686|4|Initial|NCI|SY
      C1555582|S6325504|5|initial|HL7V3.0|PT
      C1555582|S6325504|6|initial|HL7V3.0|PT
      C1705684|S12203170|3|INITIAL|NCI_FDA|PT
      C1705685|S12203170|3|INITIAL|NCI_FDA|PT
  "serum"
    start: 757
    end: 762
    postings:
      C0229671|S0403972|1|Serum|RCD|PT
      C0229671|S0403972|2|Serum|SNM|PT
      C0229671|S0403972|3|Serum|LNC|LPN
      C0229671|S0403972|4|Serum|LNC|LS
      C0229671|S0403972|5|Serum|LCH_NW|PT
      C0229671|S0403972|6|Serum|NCI|SY
      C0229671|S0403972|7|Serum|UWDA|PT
      C0229671|S0403972|8|Serum|FMA|SY
      C0229671|S0403972|9|Serum|MSH|MH
      C0229671|S0403972|10|Serum|NCI|PT
      C0229671|S0403972|11|Serum|MTH|PN
      C0229671|S0401563|12|SERUM|NCI_CDISC|PT
      C0229671|S1239724|13|serum|AOD|DE
      C0229671|S1239724|14|serum|CSP|PT
      C0229671|S1239724|15|serum|CHV|PT
      C0229671|S1239724|16|serum|NCI_NCI-GLOSS|PT
      C1546774|S0403972|2|Serum|

    start: 1501
    end: 1507
    postings:
      C1274040|S3513919|1|Result|NCI|SY
      C1274040|S3513919|2|Result|MTH|PN
      C1274040|S6175685|3|result|NCI_BRIDG_3_0_3|PT
      C1546471|S3513919|2|Result|HL7V2.5|PT
      C2825142|S3513919|3|Result|NCI|SY
  "changes"
    start: 1511
    end: 1518
    postings:
      C0392747|S11866307|6|changes|CHV|SY
      C0443172|S11866307|7|changes|CHV|SY
  "fatty acid"
    start: 1526
    end: 1536
    postings:
      C0015684|S0001341|13|Fatty acid|SNM|PT
      C0015684|S0001341|14|Fatty acid|FMA|PT
      C0015684|S0418706|15|fatty acid|CSP|PT
      C0015684|S0418706|16|fatty acid|CHV|PT
      C0015684|S0418706|17|fatty acid|NCI_NCI-GLOSS|PT
      C0015684|S0418706|18|fatty acid|MMSL|IN
      C0015684|S2228055|19|FATTY ACID|VANDF|IN
      C0015684|S6126086|20|Fatty Acid|NCI_CTRP|DN
      C0015684|S6126086|21|Fatty Acid|NCI|PT
  "composition"
    start: 1537
    end: 1548
    postings:
      C0486616|S1937865|1|Composition|NCI|PT
      C048661

In [14]:
# Spot check: slice the input text by character offsets.
# NOTE(review): presumably 1639/1660 are the start/end reported for a match
# in the output above — confirm.
inputtext[1639:1660]

'nitroblue tetrazolium'

In [15]:
# Second spot check of character offsets against the input text.
# NOTE(review): presumably 821/831 are the start/end of a reported match — confirm.
inputtext[821:831]

'deficiency'