In [164]:
import requests
from bs4 import BeautifulSoup
import logging
import re

# Module-level logger named after this module; none of the cells visible in this
# notebook log through it yet.
logger = logging.getLogger(__name__)

# Retrieving cert details

In [165]:
# Download the certification page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops an HTTP error page from being
# parsed (which would otherwise surface later as a confusing "Didn't find ..." error).
r = requests.get(
    "https://docs.microsoft.com/en-us/learn/certifications/azure-database-administrator-associate",
    timeout=30,
)
r.raise_for_status()
parsed_html = BeautifulSoup(r.content, "html.parser")

In [166]:
def get_title(html):
    """Return the text of the page's single ``<h1 class="title">`` heading.

    Args:
        html: Parsed page (BeautifulSoup object or tag) to search.

    Returns:
        The heading text.

    Raises:
        ValueError: If no matching heading exists, if more than one exists,
            or if the heading contains no text.
    """
    # Seek element
    title_els = html.find_all("h1", {"class": "title"})
    if len(title_els) > 1:
        raise ValueError("Found multiple title elements")
    if not title_els:
        raise ValueError("Didn't find title element")
    # Seek content
    title_el = title_els[0]
    # `.string` is None when the heading holds more than one child node
    # (e.g. text plus a nested <span>), so fall back to joining every text node.
    title = title_el.string or "".join(title_el.strings)
    if not title:
        raise ValueError("Title was empty")
    return title

def get_summary_hilights(html):
    """Collect highlighted notices from the page's ``<div class="summary">``.

    A highlight is the direct parent element of a ``<strong>`` tag found inside
    the summary. Each such parent is reported once, with its full text and the
    links it contains.

    Args:
        html: Parsed page (BeautifulSoup object or tag) to search.

    Returns:
        A list of dicts shaped ``{"content": str, "links": [{"text": str, "url": str}]}``,
        in document order. Empty when the summary has no highlights.

    Raises:
        ValueError: If the page has no summary element or more than one.
    """
    # Seek summary element
    summary_els = html.find_all("div", {"class": "summary"})
    if len(summary_els) > 1:
        raise ValueError("Found multiple summary elements")
    if not summary_els:
        raise ValueError("Didn't find summary element")

    # Seek any (optional) hilights
    hilights_with_links = []
    seen_parent_ids = set()
    for strong_el in summary_els[0].find_all("strong"):
        parent = strong_el.parent
        # Several <strong> tags may share one parent; report that parent once.
        # Object identity is used so that two distinct elements with the same
        # markup are still reported separately.
        if id(parent) in seen_parent_ids:
            continue
        seen_parent_ids.add(id(parent))

        # `.strings` is a generator and therefore always truthy, so emptiness
        # has to be checked on the joined text instead.
        content = "".join(parent.strings)
        if not content.strip():
            continue

        hilights_with_links.append(
            {
                "content": content,
                "links": [
                    {"text": "".join(a.strings), "url": a["href"]}
                    # href=True skips anchors that carry no href attribute.
                    for a in parent.find_all("a", href=True)
                ],
            }
        )
    return hilights_with_links

def get_skills_measured_topics(html, heading_tag="h3"):
    """Return the topics listed under the page's "Skills measured" heading.

    Args:
        html: Parsed page (BeautifulSoup object or tag) to search.
        heading_tag: Tag name of the "Skills measured" heading. Defaults to
            ``"h3"``; pass another tag name (e.g. ``"h2"``) for pages that use
            a different heading level.

    Returns:
        A list of whitespace-stripped topic strings, with the "This list is not
        definitive or exhaustive" note removed.

    Raises:
        ValueError: If the heading is missing, appears more than once, or has
            no following ``<div>`` sibling.
    """
    # Seek heading element (right before topics)
    heading_string = "Skills measured"
    heading_els = html.find_all(heading_tag, string=heading_string)
    if len(heading_els) > 1:
        raise ValueError("Found multiple headings with text "+heading_string)
    if not heading_els:
        raise ValueError("Didn't find heading with "+heading_string)
    # Seek topics
    topics_el = heading_els[0].find_next_sibling("div")
    if not topics_el:
        raise ValueError("Didn't find next sibling containing actual 'skills measured' topics")
    # Filter out notes if exist (present in some pages, in some not)
    topics = [t for t in topics_el.stripped_strings if "This list is not definitive or exhaustive" not in t]
    return topics

def get_skills_measured_link(html):
    """Return the href of the page's single "skills outline" link.

    Args:
        html: Parsed page (BeautifulSoup object or tag) to search.

    Returns:
        The ``href`` attribute of the one ``<a>`` whose text contains
        "skills outline".

    Raises:
        ValueError: If no such link exists or more than one does.
    """
    link_string = "skills outline"

    def _is_outline_anchor(tag):
        # Only anchors qualify; the text is inspected for anchors alone.
        if tag.name != "a":
            return False
        return link_string in "".join(tag.strings)

    matches = html.find_all(_is_outline_anchor)
    if not matches:
        raise ValueError("Didn't find link with "+link_string)
    if len(matches) > 1:
        raise ValueError("Found multiple links with "+link_string)
    return matches[0]["href"]

In [167]:
get_title(parsed_html)

'Microsoft Certified: Azure Database Administrator Associate'

In [168]:
get_summary_hilights(parsed_html)

[{'content': 'In response to the coronavirus (COVID-19) situation, Microsoft is implementing several temporary changes to our training and certification program. Learn more.',
  'links': [{'text': 'Learn more',
    'url': 'https://www.microsoft.com/en-us/learning/community-blog-post.aspx?BlogId=8&Id=375289'}]}]

In [169]:
get_skills_measured_topics(parsed_html)

['Plan and implement data platform resources',
 'Implement a secure environment',
 'Monitor and optimize operational resources',
 'Optimize query performance',
 'Perform automation of tasks',
 'Plan and implement a High Availability and Disaster Recovery (HADR) environment',
 'Perform administration by using T-SQL']

In [170]:
get_skills_measured_link(parsed_html)

'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4qjf6'

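# Retrieving exam details

Note: the function cell in this section redefines `get_title`, `get_summary_hilights`, `get_skills_measured_topics` and `get_skills_measured_link`, so the certification-page versions defined earlier are shadowed once it runs.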

In [171]:
# Download the exam page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops an HTTP error page from being parsed
# (which would otherwise surface later as a confusing "Didn't find ..." error).
r = requests.get(
    "https://docs.microsoft.com/en-us/learn/certifications/exams/az-220",
    timeout=30,
)
r.raise_for_status()
parsed_html = BeautifulSoup(r.content, "html.parser")

In [176]:
def get_title(html):
    """Return the exam title from the page's single "Exam <code>: <name>" ``<h1>``.

    Args:
        html: Parsed page (BeautifulSoup object or tag) to search.

    Returns:
        The text of the one ``<h1>`` whose text contains both "Exam " and ": ".

    Raises:
        ValueError: If no matching heading exists, if more than one exists,
            or if the heading contains no text.
    """
    def _is_exam_heading(tag):
        if tag.name != "h1":
            return False
        # Join the text nodes once and test both markers against the result.
        text = "".join(tag.strings)
        return "Exam " in text and ": " in text

    # Seek element
    title_els = html.find_all(_is_exam_heading)
    if len(title_els) > 1:
        raise ValueError("Found multiple title elements")
    if not title_els:
        raise ValueError("Didn't find title element")
    # Seek content
    title_el = title_els[0]
    # The matcher above looks at the joined text, but `.string` is None when the
    # heading holds more than one child node (e.g. text plus a nested <span>),
    # so fall back to the same joined text instead of reporting an empty title.
    title = title_el.string or "".join(title_el.strings)
    if not title:
        raise ValueError("Title was empty")
    return title

def get_summary_hilights(html):
    """Collect highlighted notices from the page's ``<div class="summary">``.

    A highlight is the direct parent element of a ``<strong>`` tag found inside
    the summary. Each such parent is reported once, with its full text and the
    links it contains.

    Args:
        html: Parsed page (BeautifulSoup object or tag) to search.

    Returns:
        A list of dicts shaped ``{"content": str, "links": [{"text": str, "url": str}]}``,
        in document order. Empty when the summary has no highlights.

    Raises:
        ValueError: If the page has no summary element or more than one.
    """
    # Seek summary element
    summary_els = html.find_all("div", {"class": "summary"})
    if len(summary_els) > 1:
        raise ValueError("Found multiple summary elements")
    if not summary_els:
        raise ValueError("Didn't find summary element")

    # Seek any (optional) hilights
    hilights_with_links = []
    seen_parent_ids = set()
    for strong_el in summary_els[0].find_all("strong"):
        parent = strong_el.parent
        # Several <strong> tags may share one parent; report that parent once.
        # Object identity is used so that two distinct elements with the same
        # markup are still reported separately.
        if id(parent) in seen_parent_ids:
            continue
        seen_parent_ids.add(id(parent))

        # `.strings` is a generator and therefore always truthy, so emptiness
        # has to be checked on the joined text instead.
        content = "".join(parent.strings)
        if not content.strip():
            continue

        hilights_with_links.append(
            {
                "content": content,
                "links": [
                    {"text": "".join(a.strings), "url": a["href"]}
                    # href=True skips anchors that carry no href attribute.
                    for a in parent.find_all("a", href=True)
                ],
            }
        )
    return hilights_with_links

def get_skills_measured_topics(html, heading_tag="h2"):
    """Return the topics listed under the page's "Skills measured" heading.

    Args:
        html: Parsed page (BeautifulSoup object or tag) to search.
        heading_tag: Tag name of the "Skills measured" heading. Defaults to
            ``"h2"``; pass another tag name (e.g. ``"h3"``) for pages that use
            a different heading level.

    Returns:
        A list of whitespace-stripped topic strings, with the "This list is not
        definitive or exhaustive" note removed.

    Raises:
        ValueError: If the heading is missing, appears more than once, or has
            no following ``<div>`` sibling.
    """
    # Seek heading element (right before topics)
    heading_string = "Skills measured"
    heading_els = html.find_all(heading_tag, string=heading_string)
    if len(heading_els) > 1:
        raise ValueError("Found multiple headings with text "+heading_string)
    if not heading_els:
        raise ValueError("Didn't find heading with "+heading_string)
    # Seek topics
    topics_el = heading_els[0].find_next_sibling("div")
    if not topics_el:
        raise ValueError("Didn't find next sibling containing actual 'skills measured' topics")
    # Filter out notes if exist (present in some pages, in some not)
    topics = [t for t in topics_el.stripped_strings if "This list is not definitive or exhaustive" not in t]
    return topics

def get_skills_measured_link(html):
    """Return the href of the page's single "skills outline" link.

    Args:
        html: Parsed page (BeautifulSoup object or tag) to search.

    Returns:
        The ``href`` attribute of the one ``<a>`` whose text contains
        "skills outline".

    Raises:
        ValueError: If no such link exists or more than one does.
    """
    link_string = "skills outline"

    def _mentions_outline(tag):
        # Non-anchor tags are rejected before their text is looked at.
        return tag.name == "a" and link_string in "".join(tag.strings)

    candidates = html.find_all(_mentions_outline)
    found = len(candidates)
    if found > 1:
        raise ValueError("Found multiple links with "+link_string)
    if found == 0:
        raise ValueError("Didn't find link with "+link_string)
    anchor = candidates[0]
    return anchor["href"]

In [173]:
get_title(parsed_html)

'Exam AZ-220: Microsoft Azure IoT Developer (beta)'

In [175]:
get_summary_hilights(parsed_html)

[{'content': 'In response to the coronavirus (COVID-19) situation, Microsoft is implementing several temporary changes to our training and certification program. Learn more.',
  'links': [{'text': 'Learn more',
    'url': 'https://www.microsoft.com/en-us/learning/community-blog-post.aspx?BlogId=8&Id=375289'}]},
 {'content': 'Beta exams are not scored immediately because we are gathering data on the quality of the questions and the exam. Learn more about the value and importance of beta exams.',
  'links': [{'text': 'about the value and importance of beta exams',
    'url': '/en-us/learn/certifications/certification-exams#participating-in-beta-exams'}]}]

In [177]:
get_skills_measured_topics(parsed_html)

['Implement the IoT solution infrastructure (15-20%)',
 'Provision and manage devices (20-25%)',
 'Implement Edge (15-20%)',
 'Process and manage data (15-20%)',
 'Monitor, troubleshoot, and optimize IoT solutions (15-20%)',
 'Implement security (15-20%)']

In [178]:
get_skills_measured_link(parsed_html)

'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4nBeC'