In [None]:
import pandas as pd
import urllib.parse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC





# Site root; the relative hrefs collected from the listing page are appended to it.
base_url = 'https://www.legislation.act.gov.au'

# Listing page handed to get_legislation_links. The parameters are kept in
# this exact order (including the empty ones) so the resulting URL is stable.
url = base_url + '/results?' + urllib.parse.urlencode([
    ('category', 'cAct'),
    ('classifier', ''),
    ('status', 'Current'),
    ('alpha', ''),
    ('query', ''),
    ('action', 'browse'),
])



def fetch_page_content(url, timeout=30):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (connection error, timeout) or the status code is not 200.
        A message is printed in either failure case.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures are reported the same way as a bad status so
        # callers only have to deal with the None result.
        print(f"Failed to fetch the page content: {url} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch the page content: {url}")
        return None

    return BeautifulSoup(response.content, 'html.parser')

def get_legislation_links(url):
    """Collect absolute links to individual acts from a listing page.

    An anchor is kept when its ``href`` starts with ``/a/`` and contains no
    ``#`` fragment. Each kept href is prefixed with ``base_url``.

    Args:
        url: Address of the listing page to scan.

    Returns:
        A list of absolute URLs in page order with duplicates removed. The
        list is empty when the page could not be fetched or holds no
        matching anchors.
    """
    soup = fetch_page_content(url)
    if soup is None:
        # fetch_page_content has already reported the failure.
        return []

    legislation_links = []
    seen_hrefs = set()

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href or not href.startswith('/a/') or '#' in href:
            continue
        if href in seen_hrefs:
            # Every returned link is later rendered in a browser, so a
            # repeated href would only produce duplicate rows.
            continue
        seen_hrefs.add(href)
        legislation_links.append(f'{base_url}{href}')

    return legislation_links

def get_html_version_link(legislation_link):
    """Build the URL of the current HTML version of an act.

    Args:
        legislation_link: Absolute link to an act, expected to contain an
            ``/a/<id>`` path (with or without a trailing slash).

    Returns:
        ``<base_url>/View/a/<id>/current/html/<id>.html``. The URL is also
        printed.
    """
    segments = legislation_link.split('/')

    # The id is the path segment right after "a". Looking it up by position
    # from the end only works when the link ends with a slash.
    try:
        legislation_id = segments[segments.index('a') + 1]
    except (ValueError, IndexError):
        legislation_id = ''
    if not legislation_id:
        # No usable "/a/<id>" part: fall back to the second-to-last segment.
        legislation_id = segments[-2]

    html_link = f"{base_url}/View/a/{legislation_id}/current/html/{legislation_id}.html"
    print(html_link)
    return html_link

def scrape_legislation_sections(html_link):
    """Render an act's HTML page in headless Chrome and extract its text.

    The page is loaded with Selenium, the function waits up to 10 seconds
    for the element with id ``progressBar`` to become visible, and the
    rendered source is then parsed with BeautifulSoup.

    Args:
        html_link: URL of the HTML version of an act, in the form produced
            by ``get_html_version_link``
            (``.../View/a/<id>/current/html/<id>.html``).

    Returns:
        A list of dicts with the keys ``LegislationId``,
        ``LegislationTitle``, ``Type`` and ``Text``. Records are grouped by
        element type (section titles first, then definitions, notes, main
        text and paragraphs), not in document order.

    Raises:
        Any Selenium error raised while starting the browser, loading the
        page or waiting for ``progressBar`` (``WebDriverWait.until`` raises
        ``TimeoutException`` when the wait expires). The browser is shut
        down before the error propagates.
    """
    options = Options()
    options.add_argument('--headless')

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(html_link)
        wait = WebDriverWait(driver, 10)
        wait.until(EC.visibility_of_element_located((By.ID, "progressBar")))
        page_source = driver.page_source
    finally:
        # Always release the browser process, even when loading or waiting
        # fails; otherwise every failed page leaves a Chrome instance behind.
        driver.quit()

    legislation_soup = BeautifulSoup(page_source, 'html.parser')

    title_tag = legislation_soup.find('title')
    legislation_title = title_tag.text.strip() if title_tag is not None else ''

    # The id is the file name without its extension. Counting segments from
    # the end is fragile: in ".../a/<id>/current/html/<id>.html" the third
    # segment from the end is the literal "current".
    file_name = html_link.rstrip('/').rsplit('/', 1)[-1]
    if file_name.endswith('.html'):
        legislation_id = file_name[:-len('.html')]
    else:
        legislation_id = file_name

    section_data = [
        process_section_title(heading, legislation_id, legislation_title)
        for heading in legislation_soup.find_all('p', {'class': 'AH5Sec'})
    ]

    # CSS class of the <p> element -> value stored in the "Type" column.
    # The order here is the order in which the groups appear in the result.
    text_element_types = (
        ('aDef', 'Definition'),
        ('aDefpara', 'DefPara'),
        ('aNote', 'Note'),
        ('aNoteBullet', 'NoteBullet'),
        ('Amain', 'Main'),
        ('Apara', 'Para'),
        ('Asubpara', 'SubPara'),
    )
    for css_class, element_type in text_element_types:
        section_data.extend(
            process_text_element(element, legislation_id, legislation_title, element_type)
            for element in legislation_soup.find_all('p', {'class': css_class})
        )

    return section_data

def process_section_title(section_title, legislation_id, legislation_title):
    """Build the record for a section heading, without its section number.

    Args:
        section_title: Parsed heading element. It must offer
            ``get_text(strip=True)`` and ``find(name, attrs)`` the way a
            BeautifulSoup tag does.
        legislation_id: Identifier stored under ``LegislationId``.
        legislation_title: Title stored under ``LegislationTitle``.

    Returns:
        A dict with the keys ``LegislationId``, ``LegislationTitle``,
        ``Type`` (always ``'SectionTitle'``) and ``Text``. ``Text`` is the
        heading text with the first occurrence of the section number
        removed; when the heading has no ``CharSectNo`` span, or the span is
        empty, the heading text is kept unchanged.
    """
    text = section_title.get_text(strip=True)

    number_span = section_title.find('span', {'class': 'CharSectNo'})
    if number_span is not None:
        # Use the stripped number: the heading text above is stripped too, so
        # a number with surrounding whitespace would never match otherwise.
        section_number = number_span.get_text(strip=True)
        if section_number:
            text = text.replace(section_number, '', 1)

    return {
        'LegislationId': legislation_id,
        'LegislationTitle': legislation_title,
        'Type': 'SectionTitle',
        'Text': text,
    }

def process_text_element(element, legislation_id, legislation_title, element_type):
    """Build the record for a single text element of an act.

    Args:
        element: Parsed element offering ``get_text(strip=True)``.
        legislation_id: Identifier stored under ``LegislationId``.
        legislation_title: Title stored under ``LegislationTitle``.
        element_type: Label stored under ``Type``.

    Returns:
        A dict with the keys ``LegislationId``, ``LegislationTitle``,
        ``Type`` and ``Text``, where ``Text`` is the element's stripped text.
    """
    stripped_text = element.get_text(strip=True)
    record = dict(
        LegislationId=legislation_id,
        LegislationTitle=legislation_title,
        Type=element_type,
        Text=stripped_text,
    )
    return record

# Crawl every act linked from the listing page and gather the extracted text
# elements into one DataFrame (`df`). `data` and `df` are kept as module-level
# names so they stay available after this code has run.
legislation_links = get_legislation_links(url)

if not legislation_links:
    print("No legislation links were found.")
else:
    data = []

    for legislation_link in legislation_links:
        print(f"Processing legislation link: {legislation_link}")

        html_link = get_html_version_link(legislation_link)
        if not html_link:
            print(f"No HTML version link found for {legislation_link}.")
            continue

        try:
            scraped_sections = scrape_legislation_sections(html_link)
        except Exception as exc:
            # Top-level boundary of the crawl: a single page that fails to
            # load or parse is reported and skipped so the records already
            # collected from other acts are not lost.
            print(f"Failed to scrape {html_link}: {exc}")
            continue

        print(html_link)
        if not scraped_sections:
            print(f"No sections were extracted for {legislation_link}.")
        else:
            print(f"Extracted {len(scraped_sections)} sections for {legislation_link}.")
            data.extend(scraped_sections)

    if data:
        # Create a DataFrame and store the data
        df = pd.DataFrame(data, columns=['LegislationId', 'LegislationTitle', 'Type', 'Text'])

        # Print the DataFrame
        print(df)
    else:
        print("No data was extracted.")

