In [8]:
# %%
# --- Imports ---
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
import csv
import os
import re


In [9]:
# %%
# --- Setup WebDriver ---
# Command-line switches handed to Chrome, applied in the order listed.
chrome_flags = ('--headless', '--no-sandbox', '--disable-dev-shm-usage')

options = webdriver.ChromeOptions()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

# webdriver_manager resolves the chromedriver binary path that Service wraps.
chromedriver_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chromedriver_service, options=options)


In [10]:
# %%
# --- Collect Program Links from All Pages ---
base_url = "https://www.foerderdatenbank.de/SiteGlobals/FDB/Forms/Suche/Foederprogrammsuche_Formular.html?resourceId=0065e6ec-5c0a-4678-b503-b7e7ec435dfd&input_=23adddb0-dcf7-4e32-96f5-93aec5db2716&pageLocale=de&filterCategories=FundingProgram&templateQueryString=KI&submit=Suchen"
max_pages = 5  # hard cap on how many result pages are visited

all_links = []       # program URLs in first-seen order
seen_links = set()   # same URLs, kept only for O(1) duplicate checks

# try/finally guarantees the browser is shut down even when a wait times out
# or a navigation fails part-way through the pagination.
try:
    driver.get(base_url)

    for page_num in range(1, max_pages + 1):
        print(f"Scraping page {page_num}...")
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "p.card--title a"))
        )

        for element in driver.find_elements(By.CSS_SELECTOR, "p.card--title a"):
            link = element.get_attribute("href")
            if link and link not in seen_links:
                seen_links.add(link)
                all_links.append(link)

        # find_elements returns an empty list instead of raising when nothing
        # matches, so a missing button is told apart from a genuine WebDriver
        # error (which now propagates instead of being reported as "last page").
        forward_buttons = driver.find_elements(By.CSS_SELECTOR, "a.forward.button")
        if not forward_buttons:
            print("No forward button found or last page reached.")
            break

        next_page_url = forward_buttons[0].get_attribute("href")
        if not next_page_url:
            print("No further pages found. Stopping.")
            break

        driver.get(next_page_url)
        time.sleep(3)
finally:
    driver.quit()

print(f"Total links collected: {len(all_links)}")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
No forward button found or last page reached.
Total links collected: 34


In [11]:
# %%
# --- Define extraction logic ---
def _node_text(node, fallback):
    """Return the stripped text of ``node``, or ``fallback`` when ``node`` is None."""
    return node.text.strip() if node is not None else fallback


def _dd_text_after(soup, label_pattern, fallback):
    """Return the stripped text of the first <dd> after the <dt> matching ``label_pattern``.

    ``fallback`` is returned when no such <dt> exists or when no <dd> follows it.
    """
    term = soup.find("dt", string=re.compile(label_pattern))
    definition = term.find_next("dd") if term is not None else None
    return _node_text(definition, fallback)


def extract_features(url: str) -> list:
    """Download one program detail page and extract its descriptive fields.

    Args:
        url: Address of the detail page to fetch.

    Returns:
        A 7-item list ``[name, description, domain, eligibility, location,
        contact, url]``. A field whose markup is missing is replaced by its
        "... information not found." placeholder. If fetching or parsing
        raises, the error is printed and the first six items are ``"Error"``;
        the last item is still ``url`` so the failing page stays identifiable.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        name = _node_text(soup.find("h1", class_="title"), "Name information not found.")

        # Only the first paragraph of the rich-text container is used.
        desc_div = soup.find("div", class_="rich--text")
        first_paragraph = desc_div.find("p") if desc_div is not None else None
        description = _node_text(first_paragraph, "Description information not found.")

        domain = _dd_text_after(soup, "F\u00f6rderbereich", "Domain information not found.")
        eligibility = _dd_text_after(soup, "F\u00f6rderberechtigte", "Eligibility information not found.")
        location = _dd_text_after(soup, "F\u00f6rdergebiet", "Location information not found.")

        contact_dt = soup.find("dt", string=re.compile("Ansprechpunkt"))
        # A <dt> without a following <dd> is treated the same as a missing <dt>
        # rather than raising and discarding every other field of the record.
        contact_info = contact_dt.find_next("dd") if contact_dt is not None else None
        if contact_info is not None:
            contact_name = contact_info.find("span", class_="link--label")
            contact_email = contact_info.find("a", href=lambda x: x and "mailto" in x)
            contact_phone = contact_info.find("p", class_="tel")
            contact_address = contact_info.find("p", class_="locality")
            contact = (
                f"Name: {_node_text(contact_name, 'N/A')}, "
                f"Email: {_node_text(contact_email, 'N/A')}, "
                f"Phone: {_node_text(contact_phone, 'N/A')}, "
                f"Address: {_node_text(contact_address, 'N/A')}"
            )
        else:
            contact = "Contact information not found."

        return [name, description, domain, eligibility, location, contact, url]
    except Exception as e:
        # Deliberately broad: one bad page must not abort a whole scraping run.
        print(f"Error processing {url}: {e}")
        return ["Error"] * 6 + [url]


In [12]:
# %%
# --- Extract features from all links ---
# Visit every collected detail page in order and keep one feature row per link.
total_links = len(all_links)
data = []
for position, link in enumerate(all_links, start=1):
    print(f"Processing {position}/{total_links}: {link}")
    data.append(extract_features(link))
    time.sleep(1.5)  # pause between requests so the server is not hammered


Processing 1/34: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/KultStiftBund/kunst-und-ki.html
Processing 2/34: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/anwendung-ki-wirkstoffforschung.html
Processing 3/34: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Land/NRW/next-in-nrw.html
Processing 4/34: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/zukunft-wertschoepfung-deutschland.html
Processing 5/34: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/bmbf-neurobiologisch-inspirierte-ki.html
Processing 6/34: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/bmbf_mathematik_fuer_innovationen.html
Processing 7/34: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/projekte-zum-thema-6-g-in-die-anwendung-bringen.html
Processing 8/34: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/selbstaendige-forschungsgru

In [13]:
# %%
# --- Save to CSV ---
output_path = "data/funding-foerderdatenbank-data.csv"
columns = ["name", "description", "domain", "eligibility", "location", "contact", "url"]

# One row per scraped program, in the column order listed above.
df = pd.DataFrame(data, columns=columns)

# Create the folder part of output_path if it does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"✅ Data saved to {output_path}")


✅ Data saved to data/funding-foerderdatenbank-data.csv


In [None]:
# # %% [markdown]
# # # 🚀 Förderdatenbank Scraper (Improved Version)

# # %%
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from webdriver_manager.chrome import ChromeDriverManager
# from bs4 import BeautifulSoup
# import pandas as pd
# import requests
# import time
# import os
# import re
# from tqdm import tqdm

# # %%
# # --- Config ---
# BASE_URL = "https://www.foerderdatenbank.de/SiteGlobals/FDB/Forms/Suche/Foederprogrammsuche_Formular.html?resourceId=0065e6ec-5c0a-4678-b503-b7e7ec435dfd&input_=23adddb0-dcf7-4e32-96f5-93aec5db2716&pageLocale=de&filterCategories=FundingProgram&templateQueryString=KI&submit=Suchen"
# OUTPUT_DIR = "data"
# OUTPUT_FILE = os.path.join(OUTPUT_DIR, "funding-foerderdatenbank-data.csv")
# os.makedirs(OUTPUT_DIR, exist_ok=True)

# # %%
# # --- Setup WebDriver ---
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# # %%
# # --- Collect Program Links from All Pages ---
# all_links = set()
# page_num = 1
# driver.get(BASE_URL)

# while True:
#     print(f"Scraping page {page_num}...")
#     try:
#         WebDriverWait(driver, 10).until(
#             EC.presence_of_all_elements_located((By.CSS_SELECTOR, "p.card--title a"))
#         )
#         funding_elements = driver.find_elements(By.CSS_SELECTOR, "p.card--title a")
#         for element in funding_elements:
#             link = element.get_attribute("href")
#             if link:
#                 all_links.add(link)
#     except Exception as e:
#         print(f"Error on page {page_num}: {e}")
#         break

#     try:
#         next_button = driver.find_element(By.CSS_SELECTOR, "a.forward.button")
#         next_page_url = next_button.get_attribute("href")
#         if next_page_url:
#             driver.get(next_page_url)
#             page_num += 1
#             time.sleep(2)
#         else:
#             print("No further pages found. Stopping.")
#             break
#     except Exception:
#         print("No forward button found or last page reached.")
#         break

# print(f"Total links collected: {len(all_links)}")
# driver.quit()

# # %%
# # --- Define Extraction Logic ---
# def extract_features(url):
#     try:
#         response = requests.get(url, timeout=10)
#         response.raise_for_status()
#         soup = BeautifulSoup(response.text, 'lxml')

#         name = soup.find("h1", class_="title")
#         name = name.text.strip() if name else ""

#         desc_div = soup.find("div", class_="rich--text")
#         description = desc_div.find("p").text.strip() if desc_div and desc_div.find("p") else ""

#         domain_dt = soup.find("dt", string=re.compile("F\u00f6rderbereich"))
#         domain = domain_dt.find_next("dd").text.strip() if domain_dt else ""

#         eligibility_dt = soup.find("dt", string=re.compile("F\u00f6rderberechtigte"))
#         eligibility = eligibility_dt.find_next("dd").text.strip() if eligibility_dt else ""

#         location_dt = soup.find("dt", string=re.compile("F\u00f6rdergebiet"))
#         location = location_dt.find_next("dd").text.strip() if location_dt else ""

#         contact_dt = soup.find("dt", string=re.compile("Ansprechpunkt"))
#         if contact_dt:
#             contact_info = contact_dt.find_next("dd")
#             contact_name = contact_info.find("span", class_="link--label")
#             contact_email = contact_info.find("a", href=lambda x: x and "mailto" in x)
#             contact_phone = contact_info.find("p", class_="tel")
#             contact_address = contact_info.find("p", class_="locality")
#             contact = f"Name: {contact_name.text.strip() if contact_name else 'N/A'}, Email: {contact_email.text.strip() if contact_email else 'N/A'}, Phone: {contact_phone.text.strip() if contact_phone else 'N/A'}, Address: {contact_address.text.strip() if contact_address else 'N/A'}"
#         else:
#             contact = ""

#         return [name, description, domain, eligibility, location, contact, url]
#     except Exception as e:
#         print(f"Error processing {url}: {e}")
#         return [""] * 7

# # %%
# # --- Extract features from all links ---
# data = []
# all_links_list = list(all_links)
# for i, link in enumerate(tqdm(all_links_list, desc="Scraping details")):
#     features = extract_features(link)
#     data.append(features)
#     time.sleep(1.2)  # be nice to the server

# # %%
# # --- Save to CSV ---
# columns = ["name", "description", "domain", "eligibility", "location", "contact", "url"]
# df = pd.DataFrame(data, columns=columns)
# df.to_csv(OUTPUT_FILE, index=False)
# print(f"✅ Data saved to {OUTPUT_FILE}")

# # %%
# df.head()
