In [1]:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from asset_scraper import get_variables
import os

'''
Scrapes any project within the main_variables.json file; the output is an HTML page within the registry that includes each IG's title, url, and description.
Uses Selenium due to needing to wait until the guides are loaded on the website

DOCS
https://www.selenium.dev/selenium/docs/api/py/index.html
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
'''


def get_guides(url, timeout=5):
    ''' Opens the Simplifier Guides page for a project and returns its parsed HTML.

    Note that as this does not include login details no private IGs are scraped.

    url     -- address of the project's guides page to load in the browser
    timeout -- seconds to wait for the guide rows to appear (default 5)

    Returns a BeautifulSoup object built from the page source. If the guide rows
    do not appear within the timeout, a message is printed and whatever has been
    loaded so far is still parsed and returned.
    '''
    driver = webdriver.Firefox()
    try:
        driver.get(url)

        # The guide rows (class="guides-table-row") carry the relative urls, so
        # wait for one to be present before reading the page source.
        try:
            element_present = EC.presence_of_element_located((By.CLASS_NAME, 'guides-table-row'))
            WebDriverWait(driver, timeout).until(element_present)
        except TimeoutException:
            print ("Timed out waiting for page to load")

        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always end the browser session, even if loading or parsing raised;
        # quit() also shuts down the driver process, which close() may leave running.
        driver.quit()

def get_attributes(soup, url, guides_dict):
    ''' Extracts the guides listed on a parsed project page and files them under the project's organization.

    soup        -- parsed guides page (as returned by get_guides)
    url         -- project url; not used in the result, kept so existing callers keep working
    guides_dict -- accumulator mapping organization link element -> [description, guides],
                   where guides is a list of (title, url, description) tuples

    Returns guides_dict (the same object, updated in place).
    '''
    # The organization <a> element itself is used as the dictionary key; callers read
    # its .text for the name and format the element directly to get the link markup.
    organization = soup.find("div", class_="pre-title").find("a")
    print(organization)
    project_description = soup.find("div", class_="description").text.strip()

    results = soup.find(id="guides")

    # Titles are stripped of surrounding newlines/whitespace; descriptions are kept as found.
    titles = [element.text.strip() for element in results.find_all("div", class_="title")]
    descriptions = [element.text for element in results.find_all("div", class_="description")]

    # Add the site prefix and drop the query string (e.g. ?version=current) so that
    # the guide opens with 'default'.
    prefix = 'https://simplifier.net'
    guide_urls = [prefix+row['data-url'].split('?')[0]
                  for row in results.find_all("tr", class_="guides-table-row")]

    guides = list(zip(titles, guide_urls, descriptions))
    if organization not in guides_dict:
        # The first project seen for an organization supplies the description stored for it.
        guides_dict[organization] = [project_description, guides]
    else:
        # extend, not append: the stored list must stay a flat list of tuples so that
        # sorting and page rendering can index guide[0], guide[1], guide[2].
        guides_dict[organization][1].extend(guides)
    return guides_dict

def sort_ukcore(guides):
    ''' Reorders guide tuples so implementation guides come first, highest title first.

    A guide counts as an implementation guide when its title (first tuple item,
    compared case-insensitively) contains 'implementation' but not 'development'.
    Those are sorted in descending order; every other guide follows in ascending order.
    Returns a new list.
    '''
    implementation_guides, remaining = [], []
    for entry in guides:
        lowered_title = entry[0].lower()
        is_released_ig = 'implementation' in lowered_title and 'development' not in lowered_title
        bucket = implementation_guides if is_released_ig else remaining
        bucket.append(entry)

    implementation_guides.sort(reverse=True)
    remaining.sort()
    return implementation_guides + remaining


'''
#### Create webpage ####
This creates the html for the page. Note: it is hard to read, Flask is potentially a better way '''

# Directory (relative to the working directory) that guides_to_html writes each
# organization's '<name>.page.md' file into.
path = './guides/Interoperability-Standard-Registry-Guide/About-Interoperability/FHIR-Guides/'

def guides_to_html(org, guides):
    ''' Writes the '<organization name>.page.md' page for one organization.

    org    -- organization link element; its .text names the file and the element
              itself is formatted into the page heading
    guides -- [description, guide_list], where each guide is indexed as
              guide[0] title, guide[1] url, guide[2] description

    Any existing page with the same name is replaced. Returns None.
    '''
    page = path+'/'+org.text+'.page.md'
    if os.path.exists(page):
        os.remove(page)
    # The with-block guarantees the file is flushed and closed, even if formatting
    # one of the guides raises part-way through.
    with open(page,"w") as md_file:
        print(f'''
<div class="container-nhs-pale-grey">

## {org}
{guides[0]}

</div>
<br>
<div class="col-grid">
''',file=md_file)
        for guide in guides[1]:
            print(f'''
<div class="col-grid-content">
<div class="col-grid-body">
    <h4 class="col-grid-title"><b><a href="{guide[1]}">{guide[0]}</a></b></h4>
    <p class="col-grid-text">{guide[2]}</p>
</div>
</div>
''',file=md_file)
        print("</div>\n\n---",file=md_file)

# Load the 'repo_to_url' entry from main_variables.json; only its values are used
# below, as the project urls to scrape (presumably keyed by repository name — confirm).
repo_to_url = get_variables('main_variables.json', 'repo_to_url')
project_urls = repo_to_url.values()

# Scrape each project's guides page and accumulate the results per organization.
guides_dict = {}
for url in project_urls:
    soup = get_guides(url+'/~guides')
    guides_dict = get_attributes(soup, url, guides_dict)

# Debug dump of everything collected, then one page written per organization.
print(f"DICT:{guides_dict}")
for org, guides in guides_dict.items():
    # NOTE(review): org.text is the organization link text (the recorded output shows
    # 'HL7 UK' and 'NHS England'), so this 'uk core' test may never match — confirm
    # whether it was meant to test something else.
    if 'uk core' in org.text.lower() and 'stu' not in org.text.lower():
        guides[1] = sort_ukcore(guides[1])
    guides_to_html(org, guides)


Files ignored:
<a href="https://simplifier.net/organization/hl7uk">HL7 UK</a>
<a href="https://simplifier.net/organization/nhsdigital">NHS England</a>
<a href="https://simplifier.net/organization/nhsdigital">NHS England</a>
DICT:{<a href="https://simplifier.net/organization/hl7uk">HL7 UK</a>: ['Query', [('UK Core Implementation Guide 2.0.1 - STU2 Sequence', 'https://simplifier.net/guide/uk-core-implementation-guide-stu2', 'The UK Core for STU2 Sequence ballot, by HL7 UK'), ('UK Core Implementation Guide STU3 Sequence', 'https://simplifier.net/guide/uk-core-implementation-guide-stu3-sequence', 'Development build of the UK Core for STU3 Sequence '), ('UK Core Hazard Log', 'https://simplifier.net/guide/uk-core-hazard-log', 'This generic hazard logs captures the generic hazards for the UK Core.'), ('UK Core Implementation Guidance Directory', 'https://simplifier.net/guide/uk-core-implementation-guidance-directory', 'This guide provides a directory of information and guidance that will assi

In [2]:
# Re-apply the UK Core ordering to the matching organizations' guide lists.
# Each entry is [description, guide_list]; the entry being iterated is the one
# updated, so this cell does not rely on variables left over from earlier cells.
for org_link, org_entry in guides_dict.items():
    org_name = org_link.text.lower()
    if 'uk core' in org_name and 'stu' not in org_name:
        org_entry[1] = sort_ukcore(org_entry[1])

In [3]:
# Show which organization link elements ended up as keys of guides_dict.
guides_dict.keys()

dict_keys([<a href="https://simplifier.net/organization/hl7uk">HL7 UK</a>, <a href="https://simplifier.net/organization/nhsdigital">NHS England</a>])