In [1]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Address of the air-quality viewer page to scrape. The literal is split at
# the query-string parameters purely for readability; adjacent string
# literals are joined into one string.
URL = (
    'https://discomap.eea.europa.eu/App/AQViewer/index.html'
    '?fqn=Airquality_Dissem.hra.countries_sel'
    '&EUCountries=Yes'
    '&ScenarioDescription=WHO_2021_AQG_Scen_Base'
    '&AirPollutant=PM2.5'
    '&UrbanisationDegree=All%20Areas%20(incl.unclassified)'
    '&Year=2020#'
)

# Start a Chrome session, open the page, and create the shared explicit wait
# (30-second timeout) that the helper functions below receive as `wait`.
driver = webdriver.Chrome()
driver.get(URL)
wait = WebDriverWait(driver, 30)

def scroll_through_table(driver, wait):
    """
    Scroll the main data table down to its bottom edge.

    Waits (via `wait`) until an element matching 'div#mainTable table' is
    present, sets its scrollTop to its scrollHeight through JavaScript, and
    then pauses for two seconds.

    Args:
        driver: WebDriver used to run the scrolling script.
        wait: WebDriverWait used to locate the table element.

    Returns:
        None. Any exception raised along the way is caught and printed
        rather than propagated.
    """
    scroll_to_bottom_js = "arguments[0].scrollTop = arguments[0].scrollHeight"
    try:
        table_locator = (By.CSS_SELECTOR, 'div#mainTable table')
        table_element = wait.until(EC.presence_of_element_located(table_locator))
        driver.execute_script(scroll_to_bottom_js, table_element)
        time.sleep(2)  # Fixed pause after scrolling, before the caller reads the table
    except Exception as err:
        print("Error while scrolling:", err)

# Zero-based positions of the <td> cells that extract_table_data keeps from
# each table row, in output order.
column_indices = [0, 2, 3, 4, 5, 6, 8, 11]  # Eight entries, matching the eight header names written by save_to_csv

def extract_table_data(driver, wait, all_data, page_count):
    """
    Append the selected cells of every data row on the current page to all_data.

    The table is located with the selector 'div#mainTable table'. Rows that
    contain no <td> cells are skipped. For each remaining row, the text of
    the cells at the positions listed in the module-level `column_indices`
    is collected; positions beyond the end of the row are left out.

    Args:
        driver: WebDriver for the page (not used directly by this function).
        wait: WebDriverWait used to locate the table element.
        all_data: List that receives one list of cell texts per row (mutated).
        page_count: One-element list holding the current page number; it is
            printed and then incremented in place on success.

    Returns:
        None. Any exception is caught and printed, in which case the page
        counter is not incremented.
    """
    try:
        table_locator = (By.CSS_SELECTOR, 'div#mainTable table')
        table_element = wait.until(EC.presence_of_element_located(table_locator))
        for table_row in table_element.find_elements(By.TAG_NAME, "tr"):
            cells = table_row.find_elements(By.TAG_NAME, "td")
            if not cells:
                continue  # No <td> cells in this row, so nothing to record
            cell_total = len(cells)
            selected_texts = [cells[idx].text for idx in column_indices if idx < cell_total]
            all_data.append(selected_texts)
        print(f"Scraped data from page {page_count[0]}")
        page_count[0] += 1  # Advance the shared counter only after a successful scrape
    except Exception as err:
        print("Error extracting table data:", err)

# Filters Setup
year_2020_checkbox_id = "Year_14"
years_to_select = ["Year_12", "Year_13"]  # Checkbox ids for the years 2018 and 2019

# One explicit wait (20-second timeout) shared by all year-checkbox lookups
checkbox_wait = WebDriverWait(driver, 20)

# Untick the year 2020 checkbox if it is currently ticked
year_2020_checkbox = checkbox_wait.until(
    EC.element_to_be_clickable((By.ID, year_2020_checkbox_id))
)
if year_2020_checkbox.is_selected():
    year_2020_checkbox.click()

# Tick each requested year, pausing after every checkbox so the page can update
for year_id in years_to_select:
    year_checkbox = checkbox_wait.until(EC.element_to_be_clickable((By.ID, year_id)))
    already_ticked = year_checkbox.is_selected()
    if not already_ticked:
        year_checkbox.click()
    time.sleep(3)

# Switch the 'Air Pollutant' filter to its '[ all ]' option
pollutant_locator = (By.ID, "AirPollutant")
WebDriverWait(driver, 10).until(EC.presence_of_element_located(pollutant_locator))
pollutant_dropdown = driver.find_element(*pollutant_locator)
all_pollutants_option = pollutant_dropdown.find_element(By.XPATH, ".//option[text()='[ all ]']")
all_pollutants_option.click()
time.sleep(2)  # Give the filter time to apply

# Accumulators shared with extract_table_data: the scraped rows, and a
# one-element list used as a mutable page counter
all_data = []
page_count = [1]

# Pagination Logic
current_page = 1
total_pages = 8  # Number of result pages to walk through

# Visit pages 1..total_pages in order: scrape the current page, then click the
# pager link that leads to the next one. Stops early on the first error.
for current_page in range(1, total_pages + 1):
    try:
        scroll_through_table(driver, wait)
        extract_table_data(driver, wait, all_data, page_count)

        if current_page >= total_pages:
            break  # Last page has been scraped; there is nothing further to click

        # The link clicked is the pager <li> at position current_page + 2
        # (presumably the items before the page numbers are navigation
        # controls - confirm against the page markup).
        page_number_selector = f"#tableFooter > div > div.col-md-8.col-6 > nav > ul > li:nth-child({current_page + 2}) > a"
        next_page_link = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, page_number_selector))
        )
        next_page_link.click()
        time.sleep(3)  # Fixed pause while the next page loads
    except Exception as e:
        print(f"Error occurred on page {current_page}: {e}")
        break

# Shut the browser down once scraping has finished or been aborted
driver.quit()

def save_to_csv(all_data, filename='1.scraped_airquality_data.csv'):
    """
    Write the scraped rows to a CSV file, preceded by a fixed header row.

    Args:
        all_data: Iterable of rows (each an iterable of cell values) to write.
        filename: Path of the CSV file to create; an existing file is
            overwritten. The file is written as UTF-8.

    Returns:
        None.
    """
    column_titles = (
        "Year",
        "Country",
        "Air Pollutant",
        "Population",
        "Populated Area [km2]",
        "Air Pollution Average [ug/m3]",
        "Premature Deaths",
        "Years Of Life Lost",
    )
    # newline='' lets the csv module control line endings itself
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(column_titles)
        out.writerows(all_data)

# Save the scraped data to a CSV file and report the path that was actually
# written. The same variable feeds both the call and the message, so the
# message cannot name a different file than the one created.
output_filename = '1.scraped_airquality_data.csv'
save_to_csv(all_data, output_filename)
print(f"Data saved to '{output_filename}'")


Scraped data from page 1
Scraped data from page 2
Scraped data from page 3
Scraped data from page 4
Scraped data from page 5
Scraped data from page 6
Scraped data from page 7
Scraped data from page 8
Data saved to '1.scraped_airquality_data.csv'
