# NAB Policy Scraper
The policies are stored behind a login page. 
1. Interesting content is under  <main id="main-content"></main>

In [1]:
import os
import requests
from urllib.parse import urlparse, urljoin
import shutil

from loguru import logger

from IPython.display import HTML
from bs4 import BeautifulSoup

### Scraper Code
# Notes
1. On save remove scripts that check if session is expired.
2. att .html to urls with no extention


In [2]:
def clean_html_nab(html: str) -> str:
    '''Remove script elements that cause session expired messages'''
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup("script"):
      element.decompose()

    return str(soup)

def fetch_content(url:str, cookie:str):
    ''' Get the contents of a URL using a logged in cookie'''
    session = requests.Session()

    # Send the request with the cookies
    try:
      response = session.get(url, cookies={'Cookie': cookie})
    
      if not response:
        logger.warning(f"No response for {url}")
        return ""
      if response.status_code != 200:
        logger.warning(f"Response {response.status} for {url}")
        return ""
    
    except Exception as e:
      logger.error(f"Error {repr(e)} for {url}")
      return ""
    
    return response.text

downloads=set()
def save_url(url:str, download_path:str, cookie:str) -> str:
  '''Save the url to the download path and return the contents'''
  html_content = fetch_content(url,cookie)
  html_content = clean_html_nab(html_content)

  full_path =  download_path + urlparse(url).path.strip()
  if full_path[-1] == "/":
    full_path = full_path[-1]
  
  extension = os.path.splitext(full_path)[-1].lower()
  if extension == "":
    full_path += ".html"
  
  directory = os.path.dirname(full_path)
  
  if not os.path.exists(directory):
    os.makedirs(directory) 

  logger.info(f"Downloading {url} to {full_path}")
  downloads.add(full_path)
  try:
    with open(full_path, 'w') as f:
        f.write(html_content)
  except Exception as e:
    logger.error(f"Error {e}")
    
  return html_content


cache=set()
domain = "https://www.nabbroker.com.au"
def scrape(url:str, cookie, download_path:str, depth:int, starting_page:bool, search_only:bool) -> None:
    # Stop if when we get to the bottom
    if depth and depth <= 0:
        logger.info(f"Bottomed out {url}")
        return 
    
    path = urlparse(url).path
    extension = os.path.splitext(path)[-1].lower()
      
    # skip already processed urls   
    if path in cache:
        return
    cache.add(path)

    if not url.startswith(domain):
      logger.info(f"Skipping domain {url}")
      return
    
    # WBC uses url without an extention
    if extension == "":
        extension = ".html"
    
    # Skip non .html
    if extension not in [".html", ".aspx", ".pdf"]:
        logger.info(f"Skipping extention {url}")
        return
    if '/templates/' in path or '/forms/' in path:
       logger.info(f"Skipping form or template {url}")
       return

    # download all resources except starting page
    if starting_page:
        html_content = fetch_content(url,cookie)
        
    else:
        if search_only:
            logger.info(f"Downloading {url}")
            downloads.add(path)
            html_content = fetch_content(url,cookie)
        else:
            html_content = save_url(url, download_path, cookie)
        
    # if we have not yet exceeded the depth find links and recurse
    if extension == ".html" and depth and depth > 0:
        soup = BeautifulSoup(html_content)
        #only read withing the main tag
        soup = soup.find('main', id='main-content')
        for link in soup.select('a'):
            if href := link.get('href'):
                if not urlparse(href).netloc:
                    href = urljoin(domain, href)

                scrape(href, cookie, download_path, depth-1, starting_page=False, search_only=search_only)
    
    return


# NAB Download

In [3]:
cookie="""affinity="74bbd0877705a77c"; AMCVS_4986658252DDA4900A490D4D%40AdobeOrg=1; s_ecid=MCMID%7C87752352428887483822390507444315316928; AMCV_4986658252DDA4900A490D4D%40AdobeOrg=1585540135%7CMCIDTS%7C20092%7CMCMID%7C87752352428887483822390507444315316928%7CMCAAMLH-1736476889%7C8%7CMCAAMB-1736476889%7C6G1ynYcLPuiQxYZrsz_pkqfLG9yMXBpb2zX5dvJdYQJzPXImdj0y%7CMCOPTOUT-1735879289s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0; s_vnum=1738339200634%26vn%3D1; s_invisit=true; s_cc=true; ak_bmsc=644785C2534C73D8EDE3D92994179FFA~000000000000000000000000000000~YAAQPgNiaOo7MSmUAQAAwBEKKho9qs480nLh7XQ0FTpRbXHnGXHyh3t+XAWaVdK5pI2CAx1bnKbSL4fQ2JblK3Tk9wCWY+8m2UdTUYkyLf2kD5LEqho8hiq6GIhcg5bmkMJZq41Xsqtmx+p2uI1gIsEdmX9SfkpyY6O/Q6SpGKCQ0sKZdeJzz0xo55dsUDrBnLZy4GmjiNSS8RozA6+nBnwLgYWBQXODJuz7jeGDe9D6Jc7++F38Awtymha7Gd+9gNKug+TPAexPrNG4CSB9gInF73MYvlOmtKHzuFXp0rqpqlAKSvYO/3bOAHWHlfO8ws6JqcoUtwcX+5SC/8OTGXvxGA11kWEaU7ZdNdjv2Ij84TRKPloL+GgPWS/jHQF2vbU1aaQHbMnw+I6WBlV+9CzE8llHWbZ//Fz9/qMhEBkKDwpLorJuYQArKNXzVcnu0m7ezFXALRuuU1wRiOKNvj0rAs2u; bearer=%7B%22type%22%3A%22Bearer%22%2C%22name%22%3A%22Authorization%22%2C%22value%22%3A%22AT.AiLbcOeZ3BCIXL8H1Fz_vSvnJQ-caQyC26Cg2RHfeG8%22%2C%22expires%22%3A14399%7D; userName=Paul Booth; groups=portal-application-aem-broker; aemToken=eyJraWQiOiJRelUzX05HT1g3Vjc4Qk91RmRXY0dhUmgyVkUiLCJ4NXQiOiJ2UGVaTHp0aW85Tk9PczVmaEtvcnhsZVBkUUUiLCJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiI2QTE1MTk1NS00N0Y3LTIxRkEtMDk5MS1ERDZDM0FBMzRFODciLCJuYmYiOjE3MzU4NzIwOTMsImlhdCI6MTczNTg3MjA5MywiZXhwIjoxNzM1ODcyMzkzLCJpc3MiOiJodHRwOi8vbmFiYXBpZGV2Lm5hYmRldi5jb20uYXUiLCJzdWIiOiJlOm5hYmJyb2tlcjpIU0w1ODY3MiIsImF1ZCI6WyJhZW1fdWEiXSwiYWNyIjoiMiIsImFtciI6InB3ZCIsInR0aSI6MCwidmVyIjoiMS4wIiwidXNyIjoiSFNMNTg2NzIiLCJwYXJlbnRVaWQiOiI1ODY3MiIsInVpZCI6IkhTTDU4NjcyIiwiY24iOiJQYXVsIEJvb3RoIiwibW9mIjpbInBvcnRhbC1hcHBsaWNhdGlvbi1hZW0tYnJva2VyIl0sInNwaWQiOiI1ODY3MiJ9.FSZQgfHWWvlj6h7tttThQXgHfWNtcWd60GxtZjwund689O96oou0d4s0f9z_GRcbQkcrkYnfjT6ChWmsJjSseVnJCDMiyHMhIFJfRhmYsI-nPnVOpXxch5ykElvkrNLjXono3Hb6z1oZcrt9faj2QDQxRG4HUtliC3HBJVnertygcnedl2kGfP2KsCzvzoRCNEtiKW44sZEoo_LluaruZi94Gg4er3MsjkgPeWqIAYWYax8N1fphEnhzX0bTrFZZl3-Eg-5YuERcXDiH6E1z8u8Gjh6I1Erc36ybdyOaomgacDyw1CpOa5jP8loF5C8VmS6fREta3HLiT7OSlKdwGw; bearerToken=AT.AiLbcOeZ3BCIXL8H1Fz_vSvnJQ-caQyC26Cg2RHfeG8; dafGroups=eyJraWQiOiJRelUzX05HT1g3Vjc4Qk91RmRXY0dhUmgyVkUiLCJ4NXQiOiJ2UGVaTHp0aW85Tk9PczVmaEtvcnhsZVBkUUUiLCJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiI2QTE1MTk1NS00N0Y3LTIxRkEtMDk5MS1ERDZDM0FBMzRFODciLCJuYmYiOjE3MzU4NzIwOTMsImlhdCI6MTczNTg3MjA5MywiZXhwIjoxNzM1ODcyMzkzLCJpc3MiOiJodHRwOi8vbmFiYXBpZGV2Lm5hYmRldi5jb20uYXUiLCJzdWIiOiJlOm5hYmJyb2tlcjpIU0w1ODY3MiIsImF1ZCI6WyJhZW1fdWEiXSwiYWNyIjoiMiIsImFtciI6InB3ZCIsInR0aSI6MCwidmVyIjoiMS4wIiwidXNyIjoiSFNMNTg2NzIiLCJwYXJlbnRVaWQiOiI1ODY3MiIsInVpZCI6IkhTTDU4NjcyIiwiY24iOiJQYXVsIEJvb3RoIiwibW9mIjpbInBvcnRhbC1hcHBsaWNhdGlvbi1hZW0tYnJva2VyIl0sInNwaWQiOiI1ODY3MiJ9.FSZQgfHWWvlj6h7tttThQXgHfWNtcWd60GxtZjwund689O96oou0d4s0f9z_GRcbQkcrkYnfjT6ChWmsJjSseVnJCDMiyHMhIFJfRhmYsI-nPnVOpXxch5ykElvkrNLjXono3Hb6z1oZcrt9faj2QDQxRG4HUtliC3HBJVnertygcnedl2kGfP2KsCzvzoRCNEtiKW44sZEoo_LluaruZi94Gg4er3MsjkgPeWqIAYWYax8N1fphEnhzX0bTrFZZl3-Eg-5YuERcXDiH6E1z8u8Gjh6I1Erc36ybdyOaomgacDyw1CpOa5jP8loF5C8VmS6fREta3HLiT7OSlKdwGw; isFirstLogin=true; _abck=73DA5D38CE22E95FBA477FA9C8B1D89C~0~YAAQPgNiaJY9MSmUAQAA8yAKKg2FrjrqePq16LrFKLcrf+ehGttsc7zlGX1an8wawjAE9k9fHmYDsbHyybPLHjG3cBhXULc5prIs+Bx+pQs/4LHHHnYz7+s+P4oGxnIJfa0nmw7rBkVGBjzVPiNab+vD04ISPlIAmns6gG+MGxVCAdRQ2hZfsSv/ZPpqRSephdIQ88iLF+FtlYX0Ck07QhKPcKveWt8KNtV/MCiNT+XCOnxNMH1qSc37WomrvzMrhOi6ONTv/0OeiNwQzlkC6Y3IUg1TBni2ZfA53337OAJ/jK3SpGgSk2KFLK9qI694PGCi3HlmoYtcggnQ4E+Olthb89ojxmpANlwwXR+uT7ZgT9w59l68u5h8V5p3hinm4CFlLYn9+HNGYZwu5xr+FSlLEt6BcxKLUlawm2hCdLNhP+9egIT19kFTKCE5eYoeT3JuhjszD+QVxQMn8KAPGoiBw6gHDSpDYCNSYnhnCRhLCQvZNw==~-1~-1~-1; bm_sz=433C54F08AEA07E1CCB30E6F68E93C31~YAAQPgNiaLU/MSmUAQAAZjAKKhrH7xZ5vm776kwsVU/sXeuj73O4B3bHdAa9oKEu5jKJ9ZjFPGJ9yBuglp+tT1o5ZbCz87gPFe4TbJ5VoLdkiZNhmGj4UvKvN0xnFNyEyb7AMioft0J60Kt2F54lntKVIcv5DjaPJlwXqhIIxj9c6l94d2vFXbzG4K7HcsSdBHjGe9ZP2phuEjASe92SW9k3dzO/ZtJScHits1pbM5la4K7IIyVmajr3Nq84x10olEf2jc2tcCVYsIWc1ooh2JwNYZ7Qnr7ul8/CjWCdAJ7I4ApeZdWyyIKe/tvvt+2xh4c+sY+gTK4AasLFr6GWGVRnoM+JPov17N5XkK0QMuy2nA+bgwtpW86NArk3u6t0Uu4+8IUAjMpuidibHb8UmqvYlt18dO1LLVQ6Y6VvHZq8~3486512~3290416; bm_sv=7A359CCAF77A72FDD0667D785CAD4EAC~YAAQPgNiaMc/MSmUAQAAIjEKKhptEt8rDz288BEvQcDiwymM4kyzI3k0F44WDMpr8E+jKmrUB+XQ5tf/qalqsR6mlCecCEZzQ6AHcMT7JZyCTeF3wDgWWs7e/J9VUynKAxlkwsXiGrUfYTCbO09kmi/4Nxaak6pPsk74Qe+zI5OUUtaUsgG6WALkciNgyqgVCzvuPuEpnspXc4uG2buTQe7oNOR385hSwCyPq1dNgfg0Imi30PQJbXHnXu8O1tBW2hiCX7fo~1; gpv_pN=nab-broker%3Aportal%3Acredit%20policies; s_nr=1735872159253-New; s_sq=nab-broker-prd%3D%2526pid%253Dnab-broker%25253Aportal%25253Acredit%252520policies%2526pidt%253D1%2526oid%253Dhttps%25253A%25252F%25252Fwww.nabbroker.com.au%25252Fsecure%25252Fcredit-policies%25252Floan-purpose%2526ot%253DA; ADRUM=s=1735872159266&r=https%3A%2F%2Fwww.nabbroker.com.au%2Fsecure%2Fcredit-policies"""
directory_path=r"/home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal"

if os.path.exists(directory_path):
    shutil.rmtree(directory_path)  
os.makedirs(directory_path) 

#Need depth 8 
cache = set()
downloads = set()
scrape("https://www.nabbroker.com.au/secure/credit-policies" ,    
       cookie, depth=8, download_path=directory_path, 
       starting_page=True, search_only=False)
print(len(downloads), "downloads from", len(cache), "urls")

[32m2025-01-03 10:43:48.117[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_url[0m:[36m49[0m - [1mDownloading https://www.nabbroker.com.au/secure/credit-policies/index to /home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal/secure/credit-policies/index.html[0m
[32m2025-01-03 10:43:48.385[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_url[0m:[36m49[0m - [1mDownloading https://www.nabbroker.com.au/secure/credit-policies/taking-an-application to /home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal/secure/credit-policies/taking-an-application.html[0m
[32m2025-01-03 10:43:48.581[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_url[0m:[36m49[0m - [1mDownloading https://www.nabbroker.com.au#intro to /home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal.html[0m
[32m2025-01-03 10:43:48.983[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_url[0m:[36m49[0m - [1mDownloading https://www.nabbroker.com.au/serv

62 downloads from 120 urls
