## Find external code content using Web Scraping

*requirements: Python 3.7.12+, requests, and, bs4 libraries. See the `requirements.txt` file*
- search implementation guide web pages
-  use package/spec internal resource to get all urls for IG
 - match text using regex
 - match codesystem urls ...
 - save as dict
 - print output 

In [2]:
from bs4 import BeautifulSoup
from requests import get
from re import compile
from pathlib import Path
from json import loads

base_url = "http://hl7.org/fhir/us/davinci-alerts"  # change this to your IG's base_url
exclude =['.history.html','.ttl.html','.xml.html']  # exclude redundant pages that duplicate the content
my_package = Path(r'/Users/ehaas/.fhir/packages/hl7.fhir.us.davinci-alerts#current/package/other/spec.internals')  # local package file to fetch the spec_internal files that lists all the IG page urls
my_targets=loads(my_package.read_text())["targets"] # read the spec internals files and get all the page url to scrape
my_targets = [t for t in my_targets if not t.endswith(tuple(exclude))]
print(len(my_targets))
print(f"my_targets= {my_targets[0:10]}\n...]")

100
my_targets= ['StructureDefinition-discharge-notification-messageheader.profile.json.html', 'StructureDefinition-notifications-bundle-examples.html', 'roles.html', 'StructureDefinition-notifications-messageheader-mappings.html', 'StructureDefinition-adt-notification-encounter-mappings.html', 'StructureDefinition-notifications-messageheader-testing.html', 'Bundle-discharge-notification-message-bundle-01.json.html', 'Bundle-admit-notification-message-bundle-01.html', 'StructureDefinition-adt-notification-encounter.html', 'StructureDefinition-adt-notification-coverage-definitions.html']
...]


In [3]:
search_strings = ['LOINC','SNOMED', 'CPT', 'ICD', 'X12','NUCC', 'NUBC']  #External Codes Acronyms
# search_strings = ['http://loinc.org',
                  # 'http://snomed.info/sct',
                  # 'http://www.ama-assn.org/go/cpt',
                  # 'http://hl7.org/fhir/sid/icd-10',
                  # 'http://hl7.org/fhir/sid/icd-9',
                  # 'https://codesystem.x12.org',
                  # 'http://cts.nlm.nih.gov/fhir/ValueSet/2.16.840.1.114222.4.11.1066',
                  # 'https://www.nubc.org/CodeSystem/PatDischargeStatus',
                  # 'http://www.ada.org/cdt',
                  #  ]  #External Codes Systems
# search_strings = ['X12']
matches = {}
for i,page in enumerate(my_targets):
    URL = f'{base_url}/{page}'
    # print(f"{i+1}) {URL}")
    matches[URL]=[]
    response = get(URL) # fetch the ig pages
    html_content = response.text
    # print(response.status_code)   
    soup = BeautifulSoup(html_content)  # scrape the pages 
    # mypage = soup.get_text(' ', strip=True)
    for match in search_strings:  # look for matches on each page
      pattern = compile(match)
      for element in soup.find_all(string=True):
          if pattern.search(element):
            pos = html_content.find(element.strip())  # get approx position of matches
            matches[URL].append({
            "match": match,
            "text": element.strip(),
            "pos": pos if pos != -1 else None,
             })
            # print(f"Found {match} mentioned in {URL} at character position {pos if pos != -1 else None}")
            print('------------------MATCH  FOUND !!!--------------------')        

------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------
------------------MATCH  FOUND !!!--------------------


In [4]:
# Print out all nice n pretty like in markdown
i = 0
for k,v in sorted(matches.items()):
    
    if v:
      i += 1
      print(f"{i}) <{k}>")
      for mymatch in v:
        print(mymatch['match'])
        print(f"    - Found {mymatch['match']} at character {mymatch['pos']} (matched text:`{mymatch['text'][0:50].replace('>','&gt;').replace('<','&lt;')}...`)")

1) <http://hl7.org/fhir/us/davinci-alerts/StructureDefinition-adt-notification-condition-definitions.html>
SNOMED
    - Found SNOMED at character 36972 (matched text:`The data type is CodeableConcept because clinicalS...`)
SNOMED
    - Found SNOMED at character 39577 (matched text:`verificationStatus is not required.  For example, ...`)
SNOMED
    - Found SNOMED at character 48800 (matched text:`SNOMEDCTBodyStructures...`)
2) <http://hl7.org/fhir/us/davinci-alerts/StructureDefinition-adt-notification-condition-mappings.html>
SNOMED
    - Found SNOMED at character 9400 (matched text:`Mappings for SNOMED CT Concept Domain Binding (htt...`)
SNOMED
    - Found SNOMED at character 18394 (matched text:`Mappings for SNOMED CT Attribute Binding (http://s...`)
3) <http://hl7.org/fhir/us/davinci-alerts/StructureDefinition-adt-notification-condition.html>
SNOMED
    - Found SNOMED at character 55893 (matched text:`SNOMEDCTBodyStructures...`)
SNOMED
    - Found SNOMED at character 55893 (matched t