In [None]:
# if needed, install and load selenium
#!pip install selenium
#!apt-get update
#!apt install -y chromium-chromedriver
#!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [1]:
# Load libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import random

In [2]:
# Set up Selenium WebDriver: a Chrome instance configured to run headless
options = webdriver.ChromeOptions()
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)

In [3]:
# Landing page of the feedback listing (no page parameter) ...
main_url = "https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=8242911"
# ... and the paginated variant, with a placeholder for the page number
base_url = main_url + "&page={}"

# Accumulator for the feedback records collected across all pages
all_feedback_data = list()

# Define a function to extract feedback data from a page
def extract_feedback_data(driver):
    """Collect one record per ``feedback-item`` element on the currently loaded page.

    Parameters
    ----------
    driver
        Object exposing ``find_elements(by, value)``; the caller is expected to
        have navigated it to a feedback listing page already.

    Returns
    -------
    list of dict
        One dict per feedback item with the keys ``"Organization"``,
        ``"Feedback Content"`` and ``"Feedback Link"``. When a lookup inside an
        item fails, the corresponding placeholder string is stored instead.
    """
    feedback_data = []
    feedback_elements = driver.find_elements(By.TAG_NAME, "feedback-item")
    for feedback in feedback_elements:
        # Each lookup is best-effort. Only ``Exception`` is caught (not a bare
        # ``except``) so that KeyboardInterrupt / SystemExit still stop the scrape.

        # Extract the organization name
        try:
            organization = feedback.find_element(By.CLASS_NAME, "ecl-u-type-prolonged-m").text.strip()
        except Exception:
            organization = "No organization listed"

        # Extract the feedback link (href of the first anchor inside the item)
        try:
            link_element = feedback.find_element(By.TAG_NAME, "a")
            feedback_link = link_element.get_attribute("href")
        except Exception:
            feedback_link = "No link available"

        # Extract the feedback content
        try:
            feedback_content = feedback.find_element(By.CLASS_NAME, "ecl-u-type-paragraph").text.strip()
        except Exception:
            feedback_content = "No feedback content"

        # Append the data to the list
        feedback_data.append({
            "Organization": organization,
            "Feedback Content": feedback_content,
            "Feedback Link": feedback_link
        })
    return feedback_data

# Scrape the landing page and then pages 1-14. The whole run sits inside
# try/finally so the headless browser is always shut down, even when the scrape
# is interrupted part-way (KeyboardInterrupt is not caught by `except Exception`).
try:
    # Process the main URL (without page parameter)
    print(f"Accessing main URL: {main_url}...")
    try:
        driver.get(main_url)
        time.sleep(5)  # Wait for JavaScript to load content
        feedback_data = extract_feedback_data(driver)
        for entry in feedback_data:
            entry["Page"] = 0  # Mark these as from the main URL
            all_feedback_data.append(entry)
    except Exception as e:
        print(f"Error accessing main URL: {e}")

    # Iterate through pages 1 to 14
    for page in range(1, 15):  # 1 to 14 inclusive
        print(f"Accessing page {page}...")
        try:
            # Navigate to the page
            driver.get(base_url.format(page))
            time.sleep(5)  # Wait for JavaScript to load content
            feedback_data = extract_feedback_data(driver)
            for entry in feedback_data:
                entry["Page"] = page
                all_feedback_data.append(entry)
        except Exception as e:
            # A failing page is reported and skipped so the remaining pages are still scraped
            print(f"Error accessing page {page}: {e}")
finally:
    # Close the browser
    driver.quit()

# Save the data to a CSV file
if all_feedback_data:
    df = pd.DataFrame(all_feedback_data)
    df.to_csv("../data/processed/structured_feedback_data.csv", index=False)
    print("Data saved to structured_feedback_data.csv")
else:
    print("No data to save.")

Accessing main URL: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=8242911...
Accessing page 1...
Accessing page 2...
Accessing page 3...
Accessing page 4...
Accessing page 5...
Accessing page 6...
Accessing page 7...
Accessing page 8...
Accessing page 9...
Accessing page 10...
Accessing page 11...
Accessing page 12...
Accessing page 13...
Accessing page 14...
Data saved to structured_feedback_data.csv


In [24]:
# Set up Selenium WebDriver: a Chrome instance configured to run headless.
# NOTE(review): the enrichment cell further down builds its own driver and only
# quits that one, so the instance created here appears to be left running --
# consider dropping this cell.
options = webdriver.ChromeOptions()
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)

In [5]:
# Read the listing-level scrape back from disk into a DataFrame
input_file = "../data/processed/structured_feedback_data.csv"
data = pd.read_csv(filepath_or_buffer=input_file)

In [27]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Read the listing-level scrape back from disk
input_file = "../data/processed/structured_feedback_data.csv"
data = pd.read_csv(input_file)

# Fields to be filled in from each feedback detail page
columns = [
    "Feedback reference",
    "Submitted on",
    "Submitted by",
    "User type",
    "Organisation",
    "Organisation size",
    "Transparency register number",
    "Country of origin",
    "Initiative",
    "Additional Message",
]
# Create every field that the frame does not have yet as an empty-string column
for col in [name for name in columns if name not in data.columns]:
    data[col] = ""

# Define the extraction function
def extract_fields(driver, feedback_link, wait_seconds=5):
    """Load one feedback detail page and scrape its labelled fields.

    Parameters
    ----------
    driver
        Object exposing ``get(url)`` and ``find_element(by, value)``.
    feedback_link
        URL of the feedback detail page to open.
    wait_seconds : int or float, default 5
        Pause after navigation before the page is queried. The default keeps
        the previous fixed five-second wait.

    Returns
    -------
    dict
        Maps each labelled field name plus ``"Additional Message"`` to the
        scraped text. A field that cannot be located is ``""``; if loading the
        page itself fails, the error is printed and every value stays ``""``.
    """
    # Labels as they are matched on the page; they double as the result keys.
    labelled_fields = (
        "Feedback reference",
        "Submitted on",
        "Submitted by",
        "User type",
        "Organisation",
        "Organisation size",
        "Transparency register number",
        "Country of origin",
        "Initiative",
    )
    extracted_data = {field: "" for field in labelled_fields}
    extracted_data["Additional Message"] = ""

    def safe_extract_field(label_text):
        """Return the text of the div following the div whose text equals ``label_text``, or ""."""
        try:
            # Find the label matching the field and then get its sibling value
            label = driver.find_element(By.XPATH, f"//div[text()='{label_text}']")
            return label.find_element(By.XPATH, "./following-sibling::div").text.strip()
        except Exception:
            return ""  # Return empty string if label or value is missing

    try:
        driver.get(feedback_link)
        time.sleep(wait_seconds)  # Wait for the page to load

        # Extract each field using its label
        for field in labelled_fields:
            extracted_data[field] = safe_extract_field(field)

        # Extract additional message content
        try:
            extracted_data["Additional Message"] = driver.find_element(
                By.CSS_SELECTOR, "#feedback__messages > div:nth-child(1) > p"
            ).text.strip()
        except Exception:
            extracted_data["Additional Message"] = ""

    except Exception as e:
        print(f"Error accessing link {feedback_link}: {e}")

    return extracted_data

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode for efficiency
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

# The loop runs inside try/finally so the browser is always closed, even when
# the run is interrupted before the last row.
try:
    # Process each row in the dataset
    for index, row in data.iterrows():
        feedback_link = row["Feedback Link"]
        # Skip rows without a usable URL: either the placeholder written by the
        # listing scrape or a missing value (e.g. an empty CSV cell read back as NaN).
        if pd.isna(feedback_link) or feedback_link == "No link available":
            continue
        print(f"Processing feedback at index {index}: {feedback_link}")
        try:
            extracted_fields = extract_fields(driver, feedback_link)
            for key, value in extracted_fields.items():
                data.at[index, key] = value  # Update the dataframe with extracted data
        except Exception as e:
            print(f"Error processing link {feedback_link}: {e}")
finally:
    # Close the browser
    driver.quit()

# Save the updated data to a new CSV file
output_file = "../data/processed/round1_feedback.csv"
data.to_csv(output_file, index=False)
print(f"Data saved to {output_file}")

Processing feedback at index 0: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F551055_en
Processing feedback at index 1: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F551054_en
Processing feedback at index 2: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F551053_en
Processing feedback at index 3: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F551052_en
Processing feedback at index 4: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F551051_en
Processing feedback at index 5: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/1252

In [28]:
# Show the enriched frame as the cell output (the notebook renders a truncated preview)
data

Unnamed: 0,Organization,Feedback Content,Feedback Link,Page,Feedback reference,Submitted on,Submitted by,User type,Organisation,Organisation size,Transparency register number,Country of origin,Initiative,Additional Message
0,Consumer Technology Association (United States),CTA's comments are in the attached file.,https://ec.europa.eu/info/law/better-regulatio...,0,F551055,10 September 2020,Douglas Johnson,Business association,Consumer Technology Association,Medium (50 to 249 employees),,United States,Artificial intelligence – ethical and legal re...,CTA's comments are in the attached file.
1,Center for Democracy & Technology (United States),CDT welcomes the opportunity to provide input ...,https://ec.europa.eu/info/law/better-regulatio...,0,F551054,10 September 2020,Stan Adams,Non-governmental organisation (NGO),Center for Democracy & Technology,Small (10 to 49 employees),,United States,Artificial intelligence – ethical and legal re...,CDT welcomes the opportunity to provide input ...
2,ETNO - European Telecommunications Network Ope...,ETNO welcomes the Commission’s objective to fo...,https://ec.europa.eu/info/law/better-regulatio...,0,F551053,10 September 2020,Paolo Grassia,Business association,ETNO - European Telecommunications Network Ope...,Micro (1 to 9 employees),08957111909-85,Belgium,Artificial intelligence – ethical and legal re...,ETNO welcomes the Commission’s objective to fo...
3,EuroCommerce (Belgium),Artificial Intelligence (AI) can offer signifi...,https://ec.europa.eu/info/law/better-regulatio...,0,F551052,10 September 2020,Ena Salihovic,Business association,EuroCommerce,Small (10 to 49 employees),,Belgium,Artificial intelligence – ethical and legal re...,Artificial Intelligence (AI) can offer signifi...
4,Slovak Alliance for Innovation Economy (Slovakia),Smart government approaches to regulation will...,https://ec.europa.eu/info/law/better-regulatio...,0,F551051,10 September 2020,Michal Kardos,Business association,Slovak Alliance for Innovation Economy,Micro (1 to 9 employees),,Slovakia,Artificial intelligence – ethical and legal re...,Smart government approaches to regulation will...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,Anonymous,1. Opportunität eines Regulierungsrahmens für ...,https://ec.europa.eu/info/law/better-regulatio...,12,F540758,04 August 2020,,,,,,,Artificial intelligence – ethical and legal re...,1. Opportunität eines Regulierungsrahmens für ...
129,BusinessEurope (Belgium),BusinessEurope welcomes the positive tone of t...,https://ec.europa.eu/info/law/better-regulatio...,12,F540645,03 August 2020,patrick GRANT,Business association,BusinessEurope,Small (10 to 49 employees),3978240953-79,Belgium,Artificial intelligence – ethical and legal re...,BusinessEurope welcomes the positive tone of t...
130,Anonymous,I would like to express my support for the 2nd...,https://ec.europa.eu/info/law/better-regulatio...,13,F539970,30 July 2020,,,,,,,Artificial intelligence – ethical and legal re...,I would like to express my support for the 2nd...
131,Bart De Witte (Germany),As we had great concerns that the first releas...,https://ec.europa.eu/info/law/better-regulatio...,13,F539630,28 July 2020,Bart De Witte,EU citizen,,,,Germany,Artificial intelligence – ethical and legal re...,As we had great concerns that the first releas...


In [29]:
# Now we'll do the same but with the second round
# Set up Selenium WebDriver: a Chrome instance configured to run headless
options = webdriver.ChromeOptions()
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)

# Landing page of the feedback listing (no page parameter) ...
main_url = "https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=24212003"
# ... and the paginated variant, with a placeholder for the page number
base_url = main_url + "&page={}"

# Accumulator for the feedback records collected across all pages
all_feedback_data = list()

# Define a function to extract feedback data from a page
def extract_feedback_data(driver):
    """Collect one record per ``feedback-item`` element on the currently loaded page.

    Parameters
    ----------
    driver
        Object exposing ``find_elements(by, value)``; the caller is expected to
        have navigated it to a feedback listing page already.

    Returns
    -------
    list of dict
        One dict per feedback item with the keys ``"Organization"``,
        ``"Feedback Content"`` and ``"Feedback Link"``. When a lookup inside an
        item fails, the corresponding placeholder string is stored instead.
    """
    feedback_data = []
    feedback_elements = driver.find_elements(By.TAG_NAME, "feedback-item")
    for feedback in feedback_elements:
        # Each lookup is best-effort. Only ``Exception`` is caught (not a bare
        # ``except``) so that KeyboardInterrupt / SystemExit still stop the scrape.

        # Extract the organization name
        try:
            organization = feedback.find_element(By.CLASS_NAME, "ecl-u-type-prolonged-m").text.strip()
        except Exception:
            organization = "No organization listed"

        # Extract the feedback link (href of the first anchor inside the item)
        try:
            link_element = feedback.find_element(By.TAG_NAME, "a")
            feedback_link = link_element.get_attribute("href")
        except Exception:
            feedback_link = "No link available"

        # Extract the feedback content
        try:
            feedback_content = feedback.find_element(By.CLASS_NAME, "ecl-u-type-paragraph").text.strip()
        except Exception:
            feedback_content = "No feedback content"

        # Append the data to the list
        feedback_data.append({
            "Organization": organization,
            "Feedback Content": feedback_content,
            "Feedback Link": feedback_link
        })
    return feedback_data

# Scrape the landing page and then pages 1-31. The whole run sits inside
# try/finally so the headless browser is always shut down, even when the scrape
# is interrupted part-way (KeyboardInterrupt is not caught by `except Exception`).
try:
    # Process the main URL (without page parameter)
    print(f"Accessing main URL: {main_url}...")
    try:
        driver.get(main_url)
        time.sleep(5)  # Wait for JavaScript to load content
        feedback_data = extract_feedback_data(driver)
        for entry in feedback_data:
            entry["Page"] = 0  # Mark these as from the main URL
            all_feedback_data.append(entry)
    except Exception as e:
        print(f"Error accessing main URL: {e}")

    # Iterate through pages 1 to 31
    for page in range(1, 32):  # 1 to 31 inclusive
        print(f"Accessing page {page}...")
        try:
            # Navigate to the page
            driver.get(base_url.format(page))
            time.sleep(5)  # Wait for JavaScript to load content
            feedback_data = extract_feedback_data(driver)
            for entry in feedback_data:
                entry["Page"] = page
                all_feedback_data.append(entry)
        except Exception as e:
            # A failing page is reported and skipped so the remaining pages are still scraped
            print(f"Error accessing page {page}: {e}")
finally:
    # Close the browser
    driver.quit()

# Save the data to a CSV file
if all_feedback_data:
    df = pd.DataFrame(all_feedback_data)
    df.to_csv("../data/processed/round2_feedback_data.csv", index=False)
    print("Data saved to round2_feedback_data.csv")
else:
    print("No data to save.")

Accessing main URL: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=24212003...
Accessing page 1...
Accessing page 2...
Accessing page 3...
Accessing page 4...
Accessing page 5...
Accessing page 6...
Accessing page 7...
Accessing page 8...
Accessing page 9...
Accessing page 10...
Accessing page 11...
Accessing page 12...
Accessing page 13...
Accessing page 14...
Accessing page 15...
Accessing page 16...
Accessing page 17...
Accessing page 18...
Accessing page 19...
Accessing page 20...
Accessing page 21...
Accessing page 22...
Accessing page 23...
Accessing page 24...
Accessing page 25...
Accessing page 26...
Accessing page 27...
Accessing page 28...
Accessing page 29...
Accessing page 30...
Accessing page 31...
Data saved to round2_feedback_data.csv


In [33]:
# Set up Selenium WebDriver: a Chrome instance configured to run headless.
# NOTE(review): the enrichment cell that follows builds its own driver and only
# quits that one, so the instance created here appears to be left running --
# consider dropping this cell.
options = webdriver.ChromeOptions()
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)

In [34]:
# Read the round-2 listing-level scrape back from disk
input_file = "../data/processed/round2_feedback_data.csv"
data = pd.read_csv(input_file)

# Fields to be filled in from each feedback detail page
columns = [
    "Feedback reference",
    "Submitted on",
    "Submitted by",
    "User type",
    "Organisation",
    "Organisation size",
    "Transparency register number",
    "Country of origin",
    "Initiative",
    "Additional Message",
]
# Create every field that the frame does not have yet as an empty-string column
for col in [name for name in columns if name not in data.columns]:
    data[col] = ""

# Define the extraction function
def extract_fields(driver, feedback_link, wait_seconds=5):
    """Load one feedback detail page and scrape its labelled fields.

    Parameters
    ----------
    driver
        Object exposing ``get(url)`` and ``find_element(by, value)``.
    feedback_link
        URL of the feedback detail page to open.
    wait_seconds : int or float, default 5
        Pause after navigation before the page is queried. The default keeps
        the previous fixed five-second wait.

    Returns
    -------
    dict
        Maps each labelled field name plus ``"Additional Message"`` to the
        scraped text. A field that cannot be located is ``""``; if loading the
        page itself fails, the error is printed and every value stays ``""``.
    """
    # Labels as they are matched on the page; they double as the result keys.
    labelled_fields = (
        "Feedback reference",
        "Submitted on",
        "Submitted by",
        "User type",
        "Organisation",
        "Organisation size",
        "Transparency register number",
        "Country of origin",
        "Initiative",
    )
    extracted_data = {field: "" for field in labelled_fields}
    extracted_data["Additional Message"] = ""

    def safe_extract_field(label_text):
        """Return the text of the div following the div whose text equals ``label_text``, or ""."""
        try:
            # Find the label matching the field and then get its sibling value
            label = driver.find_element(By.XPATH, f"//div[text()='{label_text}']")
            return label.find_element(By.XPATH, "./following-sibling::div").text.strip()
        except Exception:
            return ""  # Return empty string if label or value is missing

    try:
        driver.get(feedback_link)
        time.sleep(wait_seconds)  # Wait for the page to load

        # Extract each field using its label
        for field in labelled_fields:
            extracted_data[field] = safe_extract_field(field)

        # Extract additional message content
        try:
            extracted_data["Additional Message"] = driver.find_element(
                By.CSS_SELECTOR, "#feedback__messages > div:nth-child(1) > p"
            ).text.strip()
        except Exception:
            extracted_data["Additional Message"] = ""

    except Exception as e:
        print(f"Error accessing link {feedback_link}: {e}")

    return extracted_data

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode for efficiency
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)

# The loop runs inside try/finally so the browser is always closed, even when
# the run is interrupted before the last row.
try:
    # Process each row in the dataset
    for index, row in data.iterrows():
        feedback_link = row["Feedback Link"]
        # Skip rows without a usable URL: either the placeholder written by the
        # listing scrape or a missing value (e.g. an empty CSV cell read back as NaN).
        if pd.isna(feedback_link) or feedback_link == "No link available":
            continue
        print(f"Processing feedback at index {index}: {feedback_link}")
        try:
            extracted_fields = extract_fields(driver, feedback_link)
            for key, value in extracted_fields.items():
                data.at[index, key] = value  # Update the dataframe with extracted data
        except Exception as e:
            print(f"Error processing link {feedback_link}: {e}")
finally:
    # Close the browser
    driver.quit()

# Save the updated data to a new CSV file
output_file = "../data/processed/round2_feedback.csv"
data.to_csv(output_file, index=False)
print(f"Data saved to {output_file}")


Processing feedback at index 0: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F2665651_en
Processing feedback at index 1: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F2665650_en
Processing feedback at index 2: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F2665649_en
Processing feedback at index 3: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F2665648_en
Processing feedback at index 4: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/F2665647_en
Processing feedback at index 5: https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives

In [35]:
# Show the enriched frame as the cell output (the notebook renders a truncated preview)
data

Unnamed: 0,Organization,Feedback Content,Feedback Link,Page,Feedback reference,Submitted on,Submitted by,User type,Organisation,Organisation size,Transparency register number,Country of origin,Initiative,Additional Message
0,Equinet (Belgium),Equinet welcomes the opportunity to provide co...,https://ec.europa.eu/info/law/better-regulatio...,0,F2665651,06 August 2021,,Non-governmental organisation (NGO),Equinet,Micro (1 to 9 employees),,Belgium,Artificial intelligence – ethical and legal re...,Equinet welcomes the opportunity to provide co...
1,AI Austria (Austria),AI Austria welcomes the opportunity to comment...,https://ec.europa.eu/info/law/better-regulatio...,0,F2665650,06 August 2021,Jeannette Gorzala,Non-governmental organisation (NGO),AI Austria,Small (10 to 49 employees),,Austria,Artificial intelligence – ethical and legal re...,AI Austria welcomes the opportunity to comment...
2,Digitalcourage e.V. (Germany),This submission to the AIA consultation is sen...,https://ec.europa.eu/info/law/better-regulatio...,0,F2665649,06 August 2021,,Non-governmental organisation (NGO),Digitalcourage e.V.,Small (10 to 49 employees),,Germany,Artificial intelligence – ethical and legal re...,This submission to the AIA consultation is sen...
3,UC Berkeley Center for Human-Compatible AI (Un...,The EU AI Act is an important step in the righ...,https://ec.europa.eu/info/law/better-regulatio...,0,F2665648,06 August 2021,,Academic/research Institution,UC Berkeley Center for Human-Compatible AI,Small (10 to 49 employees),,United States,Artificial intelligence – ethical and legal re...,The EU AI Act is an important step in the righ...
4,CrowdStrike (United States),In response to the European Commission’s reque...,https://ec.europa.eu/info/law/better-regulatio...,0,F2665647,06 August 2021,,Company/business,CrowdStrike,Large (250 or more),,United States,Artificial intelligence – ethical and legal re...,In response to the European Commission’s reque...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,NEC Laboratories Europe GmbH (Germany),Test.,https://ec.europa.eu/info/law/better-regulatio...,29,F2256824,05 May 2021,,Academic/research Institution,NEC Laboratories Europe GmbH,Medium (50 to 249 employees),,Germany,Artificial intelligence – ethical and legal re...,Test.
300,Agence du Numérique (AdN) (Belgium),Please find below our feedback on the AI impac...,https://ec.europa.eu/info/law/better-regulatio...,30,F2256808,05 May 2021,Antoine Hublet,Company/business,Agence du Numérique (AdN),Small (10 to 49 employees),,Belgium,Artificial intelligence – ethical and legal re...,Please find below our feedback on the AI impac...
301,gauthier lasou (France),Un règlement européen qui va définir des exige...,https://ec.europa.eu/info/law/better-regulatio...,30,F2256463,28 April 2021,gauthier lasou,EU citizen,,,,France,Artificial intelligence – ethical and legal re...,Un règlement européen qui va définir des exige...
302,SB Science Management UG (haftungsbeschränkt) ...,"Unfortunately, standardisation measures are st...",https://ec.europa.eu/info/law/better-regulatio...,30,F2242340,27 April 2021,,Company/business,SB Science Management UG (haftungsbeschränkt),Micro (1 to 9 employees),,Germany,Artificial intelligence – ethical and legal re...,"Unfortunately, standardisation measures are st..."
