# Functions for web-scraping the Scottish Health Survey supplementary tables

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

In [None]:
def sanitize_filename(filename):
    """
    Strip every character that is not safe to keep in a file name.

    Only regex word characters, hyphens, underscores, dots and spaces
    survive; any other character is dropped (not replaced).

    Args:
        filename (str): Candidate file name, possibly with invalid characters.

    Returns:
        str: The input with all disallowed characters removed.
    """
    disallowed_chars = re.compile(r'[^\w\-_. ]')
    cleaned = disallowed_chars.sub('', filename)
    return cleaned

In [None]:
def _excel_extension(url):
    """Return ".xlsx" or ".xls" if the URL points at an Excel workbook, else None."""
    from urllib.parse import urlparse  # local import: only this helper needs it

    # Check the path on its own first so query strings / fragments do not hide
    # the extension, then fall back to the full URL (the original matching rule).
    for candidate in (urlparse(url).path.lower(), url.lower()):
        # ".xlsx" is tested first for readability; neither suffix ends with the other.
        for extension in (".xlsx", ".xls"):
            if candidate.endswith(extension):
                return extension
    return None


def scrape_health_survey_data(year, timeout=30):
    """
    Scrape health survey data for a specific year.

    Fetches the gov.scot "supplementary tables" publication page for the
    given year, collects every link that points at an Excel workbook and
    saves each workbook in a folder named after the year. The sanitized link
    text is used as the file name and the workbook keeps its real extension
    (".xls" or ".xlsx").

    Args:
        year (int): The year for which to scrape data.
        timeout (float): Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Downloads Excel files and saves them in a folder with the year as the name.

    Raises:
        requests.HTTPError: If the publication page responds with an error
            status other than 404 (404 is reported and skipped).
        requests.RequestException: If the publication page cannot be fetched
            at all (connection failure, timeout, ...).
    """
    base_url = f"https://www.gov.scot/publications/scottish-health-survey-{year}-supplementary-tables/"

    response = requests.get(base_url, timeout=timeout)
    if response.status_code == 404:
        print(f"No data available for {year}. Skipping...")
        return

    response.raise_for_status()  # Check for any other HTTP errors
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect (absolute URL, link text, extension) for every Excel link on the page.
    excel_links = []
    for link in soup.find_all("a", href=True):
        # Convert relative URLs to absolute URLs
        absolute_url = urljoin(base_url, link["href"])
        extension = _excel_extension(absolute_url)
        if extension is not None:
            excel_links.append((absolute_url, link.text.strip(), extension))

    # Create a folder with the year as the name (no-op if it already exists)
    folder_name = str(year)
    os.makedirs(folder_name, exist_ok=True)

    # Download the Excel files with link names as file names in the folder
    for idx, (link_url, link_name, extension) in enumerate(excel_links, start=1):
        if not link_name:
            print(f"Skipping download for {year} - File {idx} - Empty link name.")
            continue

        # Validate the sanitized name itself: the joined path is never empty,
        # so checking it would let a blank name through as "<year>/.xls".
        safe_name = sanitize_filename(link_name).strip()
        if not safe_name:
            print(f"Skipping download for {year} - File {idx} - Invalid link name: {link_name}")
            continue

        # Keep the workbook's real extension instead of forcing ".xls",
        # and avoid doubling it when the link text already carries it.
        if not safe_name.lower().endswith(extension):
            safe_name += extension
        file_name = os.path.join(folder_name, safe_name)

        # A failed download must not be written to disk as if it were a
        # workbook, and one bad file should not abort the remaining ones.
        try:
            file_response = requests.get(link_url, timeout=timeout)
            file_response.raise_for_status()
        except requests.RequestException as error:
            print(f"Skipping download for {year} - File {idx} - Request failed: {error}")
            continue

        with open(file_name, "wb") as file:
            file.write(file_response.content)
        print(f"Downloaded {file_name} for {year}.")

In [None]:
def scrape_health_survey_data_for_years(start_year, end_year):
    """
    Run the single-year scraper once for every year in an inclusive range.

    Args:
        start_year (int): First year to scrape.
        end_year (int): Last year to scrape (included in the range).

    Returns:
        None. Each year is handed to scrape_health_survey_data, which saves
        the downloaded Excel files in a folder named after that year.
    """
    # range() excludes its upper bound, so step one past end_year to include it.
    years_to_scrape = range(start_year, end_year + 1)
    for current_year in years_to_scrape:
        scrape_health_survey_data(current_year)

In [None]:
# Example usage: scrape every survey year from 2011 through 2022 (inclusive).
scrape_health_survey_data_for_years(start_year=2011, end_year=2022)