# 🧼 ISB Scraper (Full Serial + First Entry Fix)
Fetches all funding programs including edge entries in order.

In [None]:

# import asyncio
# from playwright.async_api import async_playwright
# from bs4 import BeautifulSoup
# import pandas as pd


In [None]:

# async def get_all_links_ordered():
#     url = "https://isb.rlp.de/service/foerderung.html"
#     async with async_playwright() as p:
#         browser = await p.chromium.launch(headless=True)
#         page = await browser.new_page()
#         await page.goto(url, timeout=60000)
#         await page.wait_for_selector(".isb-foerderfinder-list2")  # ensure list loads

#         # Scroll manually a few times (force all entries to render)
#         for _ in range(15):
#             await page.mouse.wheel(0, 1500)
#             await page.wait_for_timeout(800)

#         html = await page.content()
#         browser.close()

#         soup = BeautifulSoup(html, "html.parser")
#         anchor_tags = soup.select(".isb-foerderfinder-list2-title a")

#         links = []
#         for a in anchor_tags:
#             href = a.get("href")
#             if href and "/foerderung/" in href:
#                 full = "https://isb.rlp.de" + href if href.startswith("/") else href
#                 links.append(full)

#         return links


In [None]:

# async def scrape_funding_page(playwright, url):
#     data = {
#         "name": None, "description": None, "eligibility": None,
#         "amount": None, "procedure": None, "contact": None, "url": url
#     }

#     browser = await playwright.chromium.launch(headless=True)
#     page = await browser.new_page()
#     try:
#         await page.goto(url, timeout=60000)
#         await page.wait_for_timeout(2000)
#         html = await page.content()
#         soup = BeautifulSoup(html, "html.parser")

#         name = soup.select_one("h1.col-md-10.col-sm-12.col-xs-12")
#         if name: data["name"] = name.get_text(strip=True)

#         desc = soup.select_one("div.ce-textpic.ce-center.ce-above div.ce-bodytext")
#         if desc: data["description"] = desc.get_text(strip=True)

#         # Accordion Panels
#         for panel in soup.select("div.panel.panel-default"):
#             try:
#                 title = panel.select_one("h4.panel-title a").get_text(strip=True).lower()
#                 content = panel.select_one("div.panel-collapse .ce-bodytext").get_text(strip=True)
#                 if "wer wird" in title:
#                     data["eligibility"] = content
#                 elif "wie wird" in title or "finanziert" in title:
#                     data["amount"] = content
#                 elif "beantrag" in title or "antrag" in title:
#                     data["procedure"] = content
#             except:
#                 continue

#         # Contact info
#         contacts = []
#         for div in soup.select("div.col-xs-12.col-lg-3 div.isb-contact-item"):
#             lines = div.select("div.isb-contact__line .isb-contact__value")
#             parts = [l.get_text(strip=True) for l in lines]
#             contacts.append(" | ".join(parts))
#         if contacts:
#             data["contact"] = " || ".join(contacts)

#     except Exception as e:
#         print(f"❌ Error scraping {url}: {e}")
#     await browser.close()
#     return data


In [None]:

# async def run_full_scraper():
#     links = await get_all_links_ordered()
#     print(f"🔗 Total links found: {len(links)}")

#     all_data = []
#     async with async_playwright() as p:
#         for i, link in enumerate(links):
#             print(f"⏳ [{i+1}/{len(links)}] Scraping: {link}")
#             entry = await scrape_funding_page(p, link)
#             all_data.append(entry)

#     df = pd.DataFrame(all_data)
#     df.to_csv("funding-isb.csv", index=False)
#     print("✅ Data saved to funding-isb.csv")
#     return df


In [None]:

# # Run it
# df = await run_full_scraper()
# df.head()


In [None]:
# !pip install nest_asyncio

In [29]:
import nest_asyncio
nest_asyncio.apply()

In [30]:
# %% [markdown]
# # 🧼 ISB Scraper (Improved, Robust, Clean)

# %%
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd



In [31]:
# %%
async def get_isb_links():
    url = "https://isb.rlp.de/service/foerderung.html"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, timeout=60000)
        await page.wait_for_selector(".isb-foerderfinder-list2")
        # Scroll to load all entries
        for _ in range(15):
            await page.mouse.wheel(0, 1500)
            await page.wait_for_timeout(800)
        html = await page.content()
        await browser.close()
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for a in soup.select(".isb-foerderfinder-list2-title a"):
        href = a.get("href")
        if href and "/foerderung/" in href:
            full = "https://isb.rlp.de" + href if href.startswith("/") else href
            links.append(full)
    return links



In [32]:
# %%
async def scrape_isb_funding_page(page, url):
    data = {
        "name": None, "description": None, "eligibility": None,
        "amount": None, "procedure": None, "contact": None, "url": url
    }
    try:
        await page.goto(url, timeout=60000)
        await page.wait_for_timeout(2000)
        html = await page.content()
        soup = BeautifulSoup(html, "html.parser")

        name = soup.select_one("h1.col-md-10.col-sm-12.col-xs-12")
        if name: data["name"] = name.get_text(strip=True)

        desc = soup.select_one("div.ce-textpic.ce-center.ce-above div.ce-bodytext")
        if desc: data["description"] = desc.get_text(strip=True)

        # Accordion Panels
        for panel in soup.select("div.panel.panel-default"):
            try:
                title = panel.select_one("h4.panel-title a").get_text(strip=True).lower()
                content = panel.select_one("div.panel-collapse .ce-bodytext").get_text(strip=True)
                if "wer wird" in title:
                    data["eligibility"] = content
                elif "wie wird" in title or "finanziert" in title:
                    data["amount"] = content
                elif "beantrag" in title or "antrag" in title:
                    data["procedure"] = content
            except Exception:
                continue

        # Contact info
        contacts = []
        for div in soup.select("div.col-xs-12.col-lg-3 div.isb-contact-item"):
            lines = div.select("div.isb-contact__line .isb-contact__value")
            parts = [l.get_text(strip=True) for l in lines]
            contacts.append(" | ".join(parts))
        if contacts:
            data["contact"] = " || ".join(contacts)

    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
    return data



In [33]:
# %%
async def run_isb_scraper():
    links = await get_isb_links()
    print(f"🔗 Total links found: {len(links)}")
    all_data = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        for i, link in enumerate(links):
            print(f"⏳ [{i+1}/{len(links)}] Scraping: {link}")
            entry = await scrape_isb_funding_page(page, link)
            all_data.append(entry)
        await browser.close()
    df = pd.DataFrame(all_data)
    df.to_csv("data/funding_isb_data.csv", index=False)
    print("✅ Data saved to data/funding_isb_data.csv")
    return df



In [34]:
import asyncio
df = await run_isb_scraper()
df.head()



🔗 Total links found: 37
⏳ [1/37] Scraping: https://isb.rlp.de/foerderung/136.html
⏳ [2/37] Scraping: https://isb.rlp.de/foerderung/134.html
⏳ [3/37] Scraping: https://isb.rlp.de/foerderung/betriebsansiedlung-investservice.html
⏳ [4/37] Scraping: https://isb.rlp.de/foerderung/605.html
⏳ [5/37] Scraping: https://isb.rlp.de/foerderung/138.html
⏳ [6/37] Scraping: https://isb.rlp.de/foerderung/900.html
⏳ [7/37] Scraping: https://isb.rlp.de/foerderung/603.html
⏳ [8/37] Scraping: https://isb.rlp.de/foerderung/269.html
⏳ [9/37] Scraping: https://isb.rlp.de/foerderung/142.html
⏳ [10/37] Scraping: https://isb.rlp.de/foerderung/245i.html
⏳ [11/37] Scraping: https://isb.rlp.de/foerderung/358.html
⏳ [12/37] Scraping: https://isb.rlp.de/foerderung/361.html
⏳ [13/37] Scraping: https://isb.rlp.de/foerderung/255.html
⏳ [14/37] Scraping: https://isb.rlp.de/foerderung/665-667.html
⏳ [15/37] Scraping: https://isb.rlp.de/foerderung/119.html
⏳ [16/37] Scraping: https://isb.rlp.de/foerderung/364.html
⏳ [17/3

Unnamed: 0,name,description,eligibility,amount,procedure,contact,url
0,Beratungsprogramm für den Mittelstand,Bei dem Programm handelt es sich um einen verl...,Kleine und mittlere Unternehmen gemäß der jewe...,Die Förderung erfolgt durch die Vergabe eines ...,Anträge sind postalisch an die im Antragsformu...,Beratung Wirtschaftsförderung | 06131 6172-133...,https://isb.rlp.de/foerderung/136.html
1,Beratungsprogramm für Existenzgründung,Durch den Zuschuss sollen Existenzgründende un...,"Natürliche Personen, die eine Existenzgründung...",Nicht zurückzahlbarer Zuschuss als Anteilsfina...,Anträge müssenvor Beauftragung einer Beraterin...,Beratung Wirtschaftsförderung | 06131 6172-133...,https://isb.rlp.de/foerderung/134.html
2,"Betriebsansiedlung, Investitionsservice","Unternehmen der gewerblichen Wirtschaft, Handw...","Unternehmen der gewerblichen Wirtschaft, Handw...",Der Investorin oder dem Investor wird ein umfa...,,Tim Sandrock | 06131 6172-1226 | tim.sandrock@...,https://isb.rlp.de/foerderung/betriebsansiedlu...
3,Betriebsmittelkredit RLP,Mit dem Programm der Investitions- und Struktu...,Unternehmen der gewerblichen Wirtschaft (Handw...,Die Kreditgewährung erfolgt durch die ISB an d...,Die Antragstellung erfolgt über die Hausbank b...,Beratung Wirtschaftsförderung | 06131 6172-133...,https://isb.rlp.de/foerderung/605.html
4,BITT-Technologieberatung,Bei dem Programm handelt es sich um eine verlo...,Kleine und mittlere Unternehmen gemäß der jewe...,Die Förderung erfolgt durch die Vergabe einer ...,Anträge sind über die für das antragstellende ...,Beratung Wirtschaftsförderung | 06131 6172-133...,https://isb.rlp.de/foerderung/138.html
