In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://www.legislation.act.gov.au'
url = f'{base_url}/results?category=cAct&classifier=&status=Current&alpha=&query=&action=browse'

def fetch_page_content(url, timeout=30):
    """Download *url* and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        built-in ``html.parser``.

    Raises:
        requests.RequestException: On connection failure or timeout.

    Note:
        The HTTP status code is not checked, so an error page returned by
        the server is parsed like any other document.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def get_legislation_links(url):
    """Collect absolute URLs for the legislation links on the page at *url*.

    An anchor qualifies when its ``href`` is present and begins with ``/a/``;
    each qualifying path is prefixed with ``base_url``. Results keep document
    order and duplicates are not removed.
    """
    anchors = fetch_page_content(url).find_all('a')
    paths = (anchor.get('href') for anchor in anchors)
    return [
        f'{base_url}{path}'
        for path in paths
        if path and path.startswith('/a/')
    ]

def get_html_version_link(legislation_link):
    """Build the URL of the current HTML version for a legislation link.

    The identifier is the second-to-last ``/``-separated piece of
    *legislation_link*, so a link of the form ``.../a/<id>/`` yields ``<id>``.
    The resulting URL is printed and then returned.
    """
    pieces = legislation_link.split('/')
    act_id = pieces[-2]
    html_url = f"{base_url}/View/a/{act_id}/current/html/{act_id}.html"
    print(html_url)
    return html_url

def scrape_legislation_sections(html_link):
    """Fetch a legislation HTML page and return its ``AH5Sec`` paragraphs.

    Args:
        html_link: URL of the HTML version of the legislation.

    Returns:
        The result of ``find_all`` for every ``<p>`` element whose class is
        ``AH5Sec``, in document order (empty when the page has none).
    """
    legislation_soup = fetch_page_content(html_link)

    # The page <title> is intentionally not read here: it is not part of the
    # returned value, and ``soup.title`` is None on pages without a <title>,
    # which would make an attribute lookup on it raise AttributeError.
    return legislation_soup.find_all('p', {'class': 'AH5Sec'})


def _section_row(section):
    """Turn one ``AH5Sec`` paragraph tag into an ``(Act, Section, Text)`` row."""
    # The act name is taken from the page's <title>, which sits in the
    # document head and therefore precedes every body paragraph.
    title_tag = section.find_previous('title')
    act_title = title_tag.text.strip() if title_tag is not None else ''

    # NOTE(review): assumes a section's body is the run of sibling elements
    # between its AH5Sec paragraph and the next one -- confirm against the
    # actual page markup.
    body_parts = []
    for sibling in section.find_next_siblings():
        if sibling.name == 'p' and 'AH5Sec' in (sibling.get('class') or []):
            break
        body_parts.append(sibling.get_text(' ', strip=True))

    return act_title, section.get_text(' ', strip=True), '\n'.join(body_parts)


# Get the legislation links
legislation_links = get_legislation_links(url)
print(f'Found {len(legislation_links)} legislation links.')

# Process the first two legislation links
test_legislation_links = legislation_links[:2]
data = []

for legislation_link in test_legislation_links:
    html_link = get_html_version_link(legislation_link)
    # scrape_legislation_sections yields parsed tags, not rows; convert each
    # one to the three fields the DataFrame below declares as columns.
    data.extend(
        _section_row(section)
        for section in scrape_legislation_sections(html_link)
    )

print(f'Scraped content for {len(data)} sections.')

# Create a DataFrame and store the data
df = pd.DataFrame(data, columns=['Act', 'Section', 'Text'])

# Print the DataFrame
print(df)