# NAB Policy Scraper
The policies are stored behind a login page. 
1. Interesting content is under  <main id="main-content"></main>

In [1]:
import os
import requests
from urllib.parse import urlparse, urljoin
import shutil

from loguru import logger

from IPython.display import HTML
from bs4 import BeautifulSoup

### Scraper Code
# Notes
1. On save remove scripts that check if session is expired.
2. att .html to urls with no extention


In [18]:
def clean_html_nab(html: str) -> str:
    '''Remove script elements that cause session expired messages'''
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup("script"):
      element.decompose()

    return str(soup)

def fetch_content(url:str, cookie:str):
    ''' Get the contents of a URL using a logged in cookie'''
    session = requests.Session()

    # Send the request with the cookies
    try:
      response = session.get(url, cookies={'Cookie': cookie})
    
      if not response:
        logger.warning(f"No response for {url}")
        return ""
      if response.status_code != 200:
        logger.warning(f"Response {response.status} for {url}")
        return ""
    
    except Exception as e:
      logger.error(f"Error {repr(e)} for {url}")
      return ""
    
    return response.text

downloads=set()
def save_url(url:str, download_path:str, cookie:str) -> str:
  '''Save the url to the download path and return the contents'''
  html_content = fetch_content(url,cookie)
  html_content = clean_html_nab(html_content)

  full_path =  download_path + urlparse(url).path.strip()
  if full_path[-1] == "/":
    full_path = full_path[-1]
  
  extension = os.path.splitext(full_path)[-1].lower()
  if extension == "":
    full_path += ".html"
  
  directory = os.path.dirname(full_path)
  
  if not os.path.exists(directory):
    os.makedirs(directory) 

  logger.info(f"Downloading {url} to {full_path}")
  downloads.add(full_path)
  try:
    with open(full_path, 'w') as f:
        f.write(html_content)
  except Exception as e:
    logger.error(f"Error {e}")
    
  return html_content


cache=set()
domain = "https://www.nabbroker.com.au"
def scrape(url:str, cookie, download_path:str, depth:int, starting_page:bool, search_only:bool) -> None:
    # Stop if when we get to the bottom
    if depth and depth <= 0:
        logger.info(f"Bottomed out {url}")
        return 
    
    path = urlparse(url).path
    extension = os.path.splitext(path)[-1].lower()
      
    # skip already processed urls   
    if path in cache:
        return
    cache.add(path)

    if not url.startswith(domain):
      logger.info(f"Skipping domain {url}")
      return
    
    # WBC uses url without an extention
    if extension == "":
        extension = ".html"
    
    # Skip non .html
    if extension not in [".html", ".aspx", ".pdf"]:
        logger.info(f"Skipping extention {url}")
        return
    if '/templates/' in path or '/forms/' in path:
       logger.info(f"Skipping form or template {url}")
       return

    # download all resources except starting page
    if starting_page:
        html_content = fetch_content(url,cookie)
        
    else:
        if search_only:
            logger.info(f"Downloading {url}")
            downloads.add(path)
            html_content = fetch_content(url,cookie)
        else:
            html_content = save_url(url, download_path, cookie)
        
    # if we have not yet exceeded the depth find links and recurse
    if extension == ".html" and depth and depth > 0:
        soup = BeautifulSoup(html_content)
        #only read withing the main tag
        soup = soup.find('main', id='main-content')
        for link in soup.select('a'):
            if href := link.get('href'):
                if not urlparse(href).netloc:
                    href = urljoin(domain, href)

                scrape(href, cookie, download_path, depth-1, starting_page=False, search_only=search_only)
    
    return


# Testing

In [19]:
cookie="""AMCVS_4986658252DDA4900A490D4D%40AdobeOrg=1; s_ecid=MCMID%7C42645900095515735650247837997914772494; s_cc=true; affinity="92c14e6e1f91af68"; _abck=33BBE8C790C695E7B9E7E227AD1DFF00~0~YAAQPgNiaCe+E9uTAQAAP4mT3A1N7QzvX6JNn9wKOq5WZrKeGng9TklWSzCzRl4LDcMehtNfloNIaR3yxkp5U8b+KS5LnqWKl1tXPPzdPqOYwc9+q+ta3iTqBPkLqHtlMF9JROPKucNyTyBoQcW3PvOFBDma84CUnVWD1iyDZ3FJCysY+kcmBBXExghg4l9nmUvo7dEGdLZz7y5soup7TMIeDC0vdmcg5pM+o7xdxqHUo0vT/FriwoTPhGm/4riuMNfqxA0zKSjTIdudJgwwGcX/wKvfbLi+oRt9lBEEtJ80Nw0jlaEKTIqpUSy/E+1pr37iCYOlLJ0fHLcQ3yAp1RW6qRL+CFj5f85xsT5CiGxQq2WbfUDvhS4ktjrk7ZMhpOxP3+TN/M2wcG8LCiy+bGLv3072SlK7JtjRBkmVccg09K0XpQQF9Ap8+BMsKpgvvrxfETVnRBvzZEHrgdllP76/hRrHDB+h5ghtVEt1hmwQ96rn3Q==~-1~-1~-1; bm_mi=AEEEC9830B7E1AE20F6F1354A874271E~YAAQPgNiaHC+E9uTAQAAcouT3BptAcm6re58lgsbZWBBdiZjYtk2+1foFsjOvbxZMRR/diUdY14xC97+QjdYnwXBHqm/XibGc8lrbb/9O7KtbZ8vKz4kN4NwlLpZqS/VePPsq1yTWagZ3DROMNdkMuiVZjgUtEClgXci9+5dTBGMDFtThtv4jpsAFpdLm+BZX+EgC0I45u763n7gbIeowf9AOb/cAf5n/LpmhBh2nRg/sI0WaKS5ZULCM4R1wk5ruOgtDOLyED5ulRUqezc/fJYvReP2+o6vHK6dyhAuei+2jkVb/K0uurvOpQlPNVi8InAJ~1; ak_bmsc=7F510CA9B51C7008FB9FEF88B1FD5AA1~000000000000000000000000000000~YAAQPgNiaEm/E9uTAQAA5JGT3BrHpPupoAJGr7m3tthKWFUJ5Uqe9+7argOJhuLFMAuR2RMlGpwVpS6UHnYxrPEGEtwTnQmezGDntlAQHCocQfsu6WehvVnN+x3Apd2I0Pn3JqN0G2sNQ9r3tzdt1WPUxRfZdYGiwSGYpPgHqSUkGFFyn862PEstDtJ3BoYgLKbLds5CB8jeAEk2SpjbvQeDdyDsTPJuyFUZ7a7m/8Uqbh+qerGpZohs+fyKBgdcvzDLVyu9n3X9It+n1abfx6nlWxygvv+0EmC34P/Mkbquml0BeDFvYRgdnAqCE0H8DIGUFWsgebrM49SY/PYW1Hzv+exMYkrRT4xo/aLRqybyCYF4IY6Tfoj5EIzkVLBDvn4V3mBZMnc6yverNErwGQv8yoZf3fHazjbRs0KGo10EnVvGaQFYYztZhr8wROkzJ12nzpdVwltV3CHR3TsGmu4NdPy8up1Ud58NPhMLO8767lmY; AMCV_4986658252DDA4900A490D4D%40AdobeOrg=1585540135%7CMCIDTS%7C20076%7CMCMID%7C42645900095515735650247837997914772494%7CMCAAMLH-1735177278%7C8%7CMCAAMB-1735177278%7C6G1ynYcLPuiQxYZrsz_pkqfLG9yMXBpb2zX5dvJdYQJzPXImdj0y%7CMCOPTOUT-1734579678s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0; s_vnum=1735660800016%26vn%3D6; s_invisit=true; bearer=%7B%22type%22%3A%22Bearer%22%2C%22name%22%3A%22Authorization%22%2C%22value%22%3A%22AT.ujyzQp0O-YzxsjnMVkg3nVvkJdZJApf7KXkNK-QpNgI%22%2C%22expires%22%3A14399%7D; userName=Paul Booth; groups=portal-application-aem-broker; aemToken=eyJraWQiOiJRelUzX05HT1g3Vjc4Qk91RmRXY0dhUmgyVkUiLCJ4NXQiOiJ2UGVaTHp0aW85Tk9PczVmaEtvcnhsZVBkUUUiLCJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiIwNDI2MDhEMi0xRjJFLUY0QzctMjgwRS03QTdEQkZEMTlGQUYiLCJuYmYiOjE3MzQ1NzM4OTIsImlhdCI6MTczNDU3Mzg5MiwiZXhwIjoxNzM0NTc0MTkyLCJpc3MiOiJodHRwOi8vbmFiYXBpZGV2Lm5hYmRldi5jb20uYXUiLCJzdWIiOiJlOm5hYmJyb2tlcjpIU0w1ODY3MiIsImF1ZCI6WyJhZW1fdWEiXSwiYWNyIjoiMiIsImFtciI6InB3ZCIsInR0aSI6MCwidmVyIjoiMS4wIiwidXNyIjoiSFNMNTg2NzIiLCJwYXJlbnRVaWQiOiI1ODY3MiIsInVpZCI6IkhTTDU4NjcyIiwiY24iOiJQYXVsIEJvb3RoIiwibW9mIjpbInBvcnRhbC1hcHBsaWNhdGlvbi1hZW0tYnJva2VyIl0sInNwaWQiOiI1ODY3MiJ9.Bskfm0oFRCn9sfEO0lthqcnGeFnLXHtWMvt0uYp1oU4420h9eaXMG3x-4B573SfgJ7Bd4e_eEiqJCC8kM339lXzJuJHYSUmRJfRFJqi8MUopnAqHGeGEN5RPWeAS1FP-hjaUoZ0jeGMYeorCnN17WUmpI4NhVROU0yVqbeAEuAHOvrwC__flBrOBjEmdT7uY3BZn4Gwq64AmKWNdLoTPN-PHFvs9J6jrQ8AdifJozvq4c8L8rCioVl_XIA8nvSv1KzJpra5JOGqj-eJZ6ilEdIh_NDfBv2wKMjyvNEwUmM0yGsvbht_zP4Bn3HrTDX9Oo4L-q84tJCZAUNHGX4r_0Q; bearerToken=AT.ujyzQp0O-YzxsjnMVkg3nVvkJdZJApf7KXkNK-QpNgI; dafGroups=eyJraWQiOiJRelUzX05HT1g3Vjc4Qk91RmRXY0dhUmgyVkUiLCJ4NXQiOiJ2UGVaTHp0aW85Tk9PczVmaEtvcnhsZVBkUUUiLCJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiIwNDI2MDhEMi0xRjJFLUY0QzctMjgwRS03QTdEQkZEMTlGQUYiLCJuYmYiOjE3MzQ1NzM4OTIsImlhdCI6MTczNDU3Mzg5MiwiZXhwIjoxNzM0NTc0MTkyLCJpc3MiOiJodHRwOi8vbmFiYXBpZGV2Lm5hYmRldi5jb20uYXUiLCJzdWIiOiJlOm5hYmJyb2tlcjpIU0w1ODY3MiIsImF1ZCI6WyJhZW1fdWEiXSwiYWNyIjoiMiIsImFtciI6InB3ZCIsInR0aSI6MCwidmVyIjoiMS4wIiwidXNyIjoiSFNMNTg2NzIiLCJwYXJlbnRVaWQiOiI1ODY3MiIsInVpZCI6IkhTTDU4NjcyIiwiY24iOiJQYXVsIEJvb3RoIiwibW9mIjpbInBvcnRhbC1hcHBsaWNhdGlvbi1hZW0tYnJva2VyIl0sInNwaWQiOiI1ODY3MiJ9.Bskfm0oFRCn9sfEO0lthqcnGeFnLXHtWMvt0uYp1oU4420h9eaXMG3x-4B573SfgJ7Bd4e_eEiqJCC8kM339lXzJuJHYSUmRJfRFJqi8MUopnAqHGeGEN5RPWeAS1FP-hjaUoZ0jeGMYeorCnN17WUmpI4NhVROU0yVqbeAEuAHOvrwC__flBrOBjEmdT7uY3BZn4Gwq64AmKWNdLoTPN-PHFvs9J6jrQ8AdifJozvq4c8L8rCioVl_XIA8nvSv1KzJpra5JOGqj-eJZ6ilEdIh_NDfBv2wKMjyvNEwUmM0yGsvbht_zP4Bn3HrTDX9Oo4L-q84tJCZAUNHGX4r_0Q; isFirstLogin=true; bm_sz=4069B7D627B2148A3FC335904F816A48~YAAQLpZUuM/k2tuTAQAApzWp3BqLsqHW647TecxcHMTgE355cvJZrSnBIa5EDB58t2UBYJ9QZYPiCNM1c55f3WdHYCCAla2YLhhVI/NcZhYiiVQGJ6MZ/9+SRgXHs2TesTTgkT3iwQV6/h8dEl4OhYpef/sAPBJ/wzM2d6CP01fx3fpF+hUjRPxorkvdVAQW1N92QbdMUW7sUmjFz4EOzNzA17ezIl9ib1YsWIgckW8jOojBjFQH3Lkq8i2S4B3iA9chSHQnxcT36OX4uKhqJ4PSoZ+PnXu7Vhie+73DI42kGq4nN9ZeWteIZ6FcNGU8qXSYQN2+q8Nmwejwuc6vWBdQj0QTJ7YlpoHic06/IXBL7PSY6CsetWFY3oOtmG7AsMVNM6ofjTkVpASVEyXeKb5cvA/71PTHNQBcULWULYg03tzzrrHAfx9RSgPJpOooYcfBvLY+a/5RxM6BJUiKlNPsJbSsrorn0oEuxVZPeQ==~3425847~3687986; bm_sv=4A1754CBE921B58CDE6DFADDBF2DD89C~YAAQLpZUuPrk2tuTAQAA0Dap3BratZ35HYO1RQN4PHE75y6ix1sRQ5bZJ8Q7oYEhN2ACfvD/Ho1yOZVzdOUPTvLKu/hvINosl90dMIBekAWA54PcZMfamT/O54roWpVVGC7XQePM9chcRE+pSoK++R2w2ikbA33fe0mm+qMcEbID7+YBun0IdlG4WEOq5ol1zdwFiTTlE9RWDVFU4S8k/rb86vNm3zAyM2z8wUG0LZACPlEyVrq6xrHZ5bIPv2t0zbJ5o0p7/g==~1; gpv_pN=nab-broker%3Aportal%3Acredit%20policies; s_sq=%5B%5BB%5D%5D; s_nr=1734573903235-Repeat; RT="z=1&dm=www.nabbroker.com.au&si=eed04e22-1c47-459a-804d-9fa4b6e4de5f&ss=m4uno0v3&sl=b&tt=4cg&obo=5&rl=1&ld=umiv&r=539nin05&ul=umix"; ADRUM=s=1734573905641&r=https%3A%2F%2Fwww.nabbroker.com.au%2Fsecure%2Fcredit-policies"""
directory_path=r"/home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal"

#depth 3: 23/35
#depth 4: 51 downloads from 76 urls
#depth 5: 64 downloads from 97 urls
#depth 6:77 downloads from 116 urls
#depth 7: 80 downloads from 120 urls
cache = set()
downloads = set()
scrape("https://www.nabbroker.com.au/secure/credit-policies" ,    
       cookie, depth=7, download_path=directory_path, 
       starting_page=True, search_only=True)
print(len(downloads), "downloads from", len(cache), "urls")

[32m2024-12-19 10:22:40.901[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape[0m:[36m98[0m - [1mDownloading https://www.nabbroker.com.au/secure/credit-policies/index[0m
[32m2024-12-19 10:22:41.125[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape[0m:[36m98[0m - [1mDownloading https://www.nabbroker.com.au/secure/credit-policies/taking-an-application[0m
[32m2024-12-19 10:22:41.314[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape[0m:[36m98[0m - [1mDownloading https://www.nabbroker.com.au#intro[0m
[32m2024-12-19 10:22:41.497[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape[0m:[36m98[0m - [1mDownloading https://www.nabbroker.com.au/service-levels[0m
[32m2024-12-19 10:22:41.715[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape[0m:[36m98[0m - [1mDownloading https://www.nabbroker.com.au/content/dam/nabbroker/public/documents/reports/nab-market-megatrends-report-2024.pdf[0m
[32m2024-12-19 10:22:43.546[0m | [1mINFO    [0m | [36m__main

62 downloads from 120 urls


# NAB Download

In [20]:
cookie="""AMCVS_4986658252DDA4900A490D4D%40AdobeOrg=1; s_ecid=MCMID%7C42645900095515735650247837997914772494; s_cc=true; affinity="92c14e6e1f91af68"; _abck=33BBE8C790C695E7B9E7E227AD1DFF00~0~YAAQPgNiaCe+E9uTAQAAP4mT3A1N7QzvX6JNn9wKOq5WZrKeGng9TklWSzCzRl4LDcMehtNfloNIaR3yxkp5U8b+KS5LnqWKl1tXPPzdPqOYwc9+q+ta3iTqBPkLqHtlMF9JROPKucNyTyBoQcW3PvOFBDma84CUnVWD1iyDZ3FJCysY+kcmBBXExghg4l9nmUvo7dEGdLZz7y5soup7TMIeDC0vdmcg5pM+o7xdxqHUo0vT/FriwoTPhGm/4riuMNfqxA0zKSjTIdudJgwwGcX/wKvfbLi+oRt9lBEEtJ80Nw0jlaEKTIqpUSy/E+1pr37iCYOlLJ0fHLcQ3yAp1RW6qRL+CFj5f85xsT5CiGxQq2WbfUDvhS4ktjrk7ZMhpOxP3+TN/M2wcG8LCiy+bGLv3072SlK7JtjRBkmVccg09K0XpQQF9Ap8+BMsKpgvvrxfETVnRBvzZEHrgdllP76/hRrHDB+h5ghtVEt1hmwQ96rn3Q==~-1~-1~-1; bm_mi=AEEEC9830B7E1AE20F6F1354A874271E~YAAQPgNiaHC+E9uTAQAAcouT3BptAcm6re58lgsbZWBBdiZjYtk2+1foFsjOvbxZMRR/diUdY14xC97+QjdYnwXBHqm/XibGc8lrbb/9O7KtbZ8vKz4kN4NwlLpZqS/VePPsq1yTWagZ3DROMNdkMuiVZjgUtEClgXci9+5dTBGMDFtThtv4jpsAFpdLm+BZX+EgC0I45u763n7gbIeowf9AOb/cAf5n/LpmhBh2nRg/sI0WaKS5ZULCM4R1wk5ruOgtDOLyED5ulRUqezc/fJYvReP2+o6vHK6dyhAuei+2jkVb/K0uurvOpQlPNVi8InAJ~1; ak_bmsc=7F510CA9B51C7008FB9FEF88B1FD5AA1~000000000000000000000000000000~YAAQPgNiaEm/E9uTAQAA5JGT3BrHpPupoAJGr7m3tthKWFUJ5Uqe9+7argOJhuLFMAuR2RMlGpwVpS6UHnYxrPEGEtwTnQmezGDntlAQHCocQfsu6WehvVnN+x3Apd2I0Pn3JqN0G2sNQ9r3tzdt1WPUxRfZdYGiwSGYpPgHqSUkGFFyn862PEstDtJ3BoYgLKbLds5CB8jeAEk2SpjbvQeDdyDsTPJuyFUZ7a7m/8Uqbh+qerGpZohs+fyKBgdcvzDLVyu9n3X9It+n1abfx6nlWxygvv+0EmC34P/Mkbquml0BeDFvYRgdnAqCE0H8DIGUFWsgebrM49SY/PYW1Hzv+exMYkrRT4xo/aLRqybyCYF4IY6Tfoj5EIzkVLBDvn4V3mBZMnc6yverNErwGQv8yoZf3fHazjbRs0KGo10EnVvGaQFYYztZhr8wROkzJ12nzpdVwltV3CHR3TsGmu4NdPy8up1Ud58NPhMLO8767lmY; AMCV_4986658252DDA4900A490D4D%40AdobeOrg=1585540135%7CMCIDTS%7C20076%7CMCMID%7C42645900095515735650247837997914772494%7CMCAAMLH-1735177278%7C8%7CMCAAMB-1735177278%7C6G1ynYcLPuiQxYZrsz_pkqfLG9yMXBpb2zX5dvJdYQJzPXImdj0y%7CMCOPTOUT-1734579678s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0; s_vnum=1735660800016%26vn%3D6; s_invisit=true; bearer=%7B%22type%22%3A%22Bearer%22%2C%22name%22%3A%22Authorization%22%2C%22value%22%3A%22AT.ujyzQp0O-YzxsjnMVkg3nVvkJdZJApf7KXkNK-QpNgI%22%2C%22expires%22%3A14399%7D; userName=Paul Booth; groups=portal-application-aem-broker; aemToken=eyJraWQiOiJRelUzX05HT1g3Vjc4Qk91RmRXY0dhUmgyVkUiLCJ4NXQiOiJ2UGVaTHp0aW85Tk9PczVmaEtvcnhsZVBkUUUiLCJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiIwNDI2MDhEMi0xRjJFLUY0QzctMjgwRS03QTdEQkZEMTlGQUYiLCJuYmYiOjE3MzQ1NzM4OTIsImlhdCI6MTczNDU3Mzg5MiwiZXhwIjoxNzM0NTc0MTkyLCJpc3MiOiJodHRwOi8vbmFiYXBpZGV2Lm5hYmRldi5jb20uYXUiLCJzdWIiOiJlOm5hYmJyb2tlcjpIU0w1ODY3MiIsImF1ZCI6WyJhZW1fdWEiXSwiYWNyIjoiMiIsImFtciI6InB3ZCIsInR0aSI6MCwidmVyIjoiMS4wIiwidXNyIjoiSFNMNTg2NzIiLCJwYXJlbnRVaWQiOiI1ODY3MiIsInVpZCI6IkhTTDU4NjcyIiwiY24iOiJQYXVsIEJvb3RoIiwibW9mIjpbInBvcnRhbC1hcHBsaWNhdGlvbi1hZW0tYnJva2VyIl0sInNwaWQiOiI1ODY3MiJ9.Bskfm0oFRCn9sfEO0lthqcnGeFnLXHtWMvt0uYp1oU4420h9eaXMG3x-4B573SfgJ7Bd4e_eEiqJCC8kM339lXzJuJHYSUmRJfRFJqi8MUopnAqHGeGEN5RPWeAS1FP-hjaUoZ0jeGMYeorCnN17WUmpI4NhVROU0yVqbeAEuAHOvrwC__flBrOBjEmdT7uY3BZn4Gwq64AmKWNdLoTPN-PHFvs9J6jrQ8AdifJozvq4c8L8rCioVl_XIA8nvSv1KzJpra5JOGqj-eJZ6ilEdIh_NDfBv2wKMjyvNEwUmM0yGsvbht_zP4Bn3HrTDX9Oo4L-q84tJCZAUNHGX4r_0Q; bearerToken=AT.ujyzQp0O-YzxsjnMVkg3nVvkJdZJApf7KXkNK-QpNgI; dafGroups=eyJraWQiOiJRelUzX05HT1g3Vjc4Qk91RmRXY0dhUmgyVkUiLCJ4NXQiOiJ2UGVaTHp0aW85Tk9PczVmaEtvcnhsZVBkUUUiLCJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiIwNDI2MDhEMi0xRjJFLUY0QzctMjgwRS03QTdEQkZEMTlGQUYiLCJuYmYiOjE3MzQ1NzM4OTIsImlhdCI6MTczNDU3Mzg5MiwiZXhwIjoxNzM0NTc0MTkyLCJpc3MiOiJodHRwOi8vbmFiYXBpZGV2Lm5hYmRldi5jb20uYXUiLCJzdWIiOiJlOm5hYmJyb2tlcjpIU0w1ODY3MiIsImF1ZCI6WyJhZW1fdWEiXSwiYWNyIjoiMiIsImFtciI6InB3ZCIsInR0aSI6MCwidmVyIjoiMS4wIiwidXNyIjoiSFNMNTg2NzIiLCJwYXJlbnRVaWQiOiI1ODY3MiIsInVpZCI6IkhTTDU4NjcyIiwiY24iOiJQYXVsIEJvb3RoIiwibW9mIjpbInBvcnRhbC1hcHBsaWNhdGlvbi1hZW0tYnJva2VyIl0sInNwaWQiOiI1ODY3MiJ9.Bskfm0oFRCn9sfEO0lthqcnGeFnLXHtWMvt0uYp1oU4420h9eaXMG3x-4B573SfgJ7Bd4e_eEiqJCC8kM339lXzJuJHYSUmRJfRFJqi8MUopnAqHGeGEN5RPWeAS1FP-hjaUoZ0jeGMYeorCnN17WUmpI4NhVROU0yVqbeAEuAHOvrwC__flBrOBjEmdT7uY3BZn4Gwq64AmKWNdLoTPN-PHFvs9J6jrQ8AdifJozvq4c8L8rCioVl_XIA8nvSv1KzJpra5JOGqj-eJZ6ilEdIh_NDfBv2wKMjyvNEwUmM0yGsvbht_zP4Bn3HrTDX9Oo4L-q84tJCZAUNHGX4r_0Q; isFirstLogin=true; bm_sz=4069B7D627B2148A3FC335904F816A48~YAAQLpZUuM/k2tuTAQAApzWp3BqLsqHW647TecxcHMTgE355cvJZrSnBIa5EDB58t2UBYJ9QZYPiCNM1c55f3WdHYCCAla2YLhhVI/NcZhYiiVQGJ6MZ/9+SRgXHs2TesTTgkT3iwQV6/h8dEl4OhYpef/sAPBJ/wzM2d6CP01fx3fpF+hUjRPxorkvdVAQW1N92QbdMUW7sUmjFz4EOzNzA17ezIl9ib1YsWIgckW8jOojBjFQH3Lkq8i2S4B3iA9chSHQnxcT36OX4uKhqJ4PSoZ+PnXu7Vhie+73DI42kGq4nN9ZeWteIZ6FcNGU8qXSYQN2+q8Nmwejwuc6vWBdQj0QTJ7YlpoHic06/IXBL7PSY6CsetWFY3oOtmG7AsMVNM6ofjTkVpASVEyXeKb5cvA/71PTHNQBcULWULYg03tzzrrHAfx9RSgPJpOooYcfBvLY+a/5RxM6BJUiKlNPsJbSsrorn0oEuxVZPeQ==~3425847~3687986; bm_sv=4A1754CBE921B58CDE6DFADDBF2DD89C~YAAQLpZUuPrk2tuTAQAA0Dap3BratZ35HYO1RQN4PHE75y6ix1sRQ5bZJ8Q7oYEhN2ACfvD/Ho1yOZVzdOUPTvLKu/hvINosl90dMIBekAWA54PcZMfamT/O54roWpVVGC7XQePM9chcRE+pSoK++R2w2ikbA33fe0mm+qMcEbID7+YBun0IdlG4WEOq5ol1zdwFiTTlE9RWDVFU4S8k/rb86vNm3zAyM2z8wUG0LZACPlEyVrq6xrHZ5bIPv2t0zbJ5o0p7/g==~1; gpv_pN=nab-broker%3Aportal%3Acredit%20policies; s_sq=%5B%5BB%5D%5D; s_nr=1734573903235-Repeat; RT="z=1&dm=www.nabbroker.com.au&si=eed04e22-1c47-459a-804d-9fa4b6e4de5f&ss=m4uno0v3&sl=b&tt=4cg&obo=5&rl=1&ld=umiv&r=539nin05&ul=umix"; ADRUM=s=1734573905641&r=https%3A%2F%2Fwww.nabbroker.com.au%2Fsecure%2Fcredit-policies"""
directory_path=r"/home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal"

if os.path.exists(directory_path):
    shutil.rmtree(directory_path)  
os.makedirs(directory_path) 

#Need depth 8 
cache = set()
downloads = set()
scrape("https://www.nabbroker.com.au/secure/credit-policies" ,    
       cookie, depth=8, download_path=directory_path, 
       starting_page=True, search_only=False)
print(len(downloads), "downloads from", len(cache), "urls")

[32m2024-12-19 10:23:48.765[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_url[0m:[36m49[0m - [1mDownloading https://www.nabbroker.com.au/secure/credit-policies/index to /home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal/secure/credit-policies/index.html[0m
[32m2024-12-19 10:23:48.944[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_url[0m:[36m49[0m - [1mDownloading https://www.nabbroker.com.au/secure/credit-policies/taking-an-application to /home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal/secure/credit-policies/taking-an-application.html[0m
[32m2024-12-19 10:23:49.067[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_url[0m:[36m49[0m - [1mDownloading https://www.nabbroker.com.au#intro to /home/kevinmcisaac/Projects/policy-pal-pages/policy/NAB/Broker_Portal.html[0m
[32m2024-12-19 10:23:49.252[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_url[0m:[36m49[0m - [1mDownloading https://www.nabbroker.com.au/serv

62 downloads from 120 urls
