# Importing packaging

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from IPython.display import display
import re

# Setting up base components for web scraping rental apartment

In [2]:
# Base URL without page number (page number will be added dynamically later)
base_url = "https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE8634&page="


# Setting up headers

The headers dictionary contains a "User-Agent" field, which is used to mimic a real browser to avoid getting blocked by the website's anti-bot mechanisms.

In [None]:
# Define headers to mimic a browser
headers = {
    "User-Agent": random.choice([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
    ])
}

# Defining functions for proper webscraping

## Defining the function "get_first_valid_match"

In [3]:
# Function to get the first valid match from multiple selectors
def get_first_valid_match(soup, selectors):
    """
    This function  attempts to find and return the first valid match from an HTML document using multiple CSS selectors. 
    If none of the selectors match an element, it returns "N/A". 
    It does that by iterating through a list of selectors.
    """
    for selector in selectors:
        element = soup.select_one(selector)
        if element:
            return element.text.strip()
    return "N/A"

## Defining the function "get_price_value"

In [None]:
# Function to extract price-related values correctly
def get_price_value(soup, selectors):
    """
    This function extracts price-related values from a HTML document using a list of CSS selectors. 
    It prioritizes extracting data from the "aria-label" attribute if present, returning it, stripped of whitespace. 
    Otherwise, it returns the visible text (element.text.strip()). 
    If no elements match any selector, it returns "N/A".
    """
    for selector in selectors:
        element = soup.select_one(selector)
        if element:
            if element.has_attr("aria-label"):  # Check if aria-label exists
                return element["aria-label"].strip()
            return element.text.strip()
    return "N/A"

## Defining the function "extract_zip_code"


In [None]:
def extract_zip_code(soup):
    """
    This function extracts a 5-digit ZIP code from an address section within a HTML document. 
    It does that by finding an HTML <div> element with the class "css-1ytyjyb", 
    which is assumed to contain the address (where the ZIP code is located). 
    If the address element is found there, 
    it uses a regular expression (re.search()) to look for a 5-digit ZIP code enclosed in parentheses (\d{5}). 
    If a ZIP code match is found, the function extracts and returns the first captured group (group(1)), 
    which contains the ZIP code.
    If no ZIP code is found or the address element is missing, it returns "N/A".
    """
    address_element = soup.find("div", class_="css-1ytyjyb")
    if address_element:
        zip_code_match = re.search(r"\((\d{5})\)", address_element.text)
        if zip_code_match:
            return zip_code_match.group(1)
    return "N/A"

## Defining the function "extract_additional_price_info"

In [None]:
def extract_additional_price_info(soup):
    """
    This function extracts the "Nettokaltmiete" (Net Cold Rent) price information 
    which sometimes stored in another location on the webpage. 
    It first finds a <div> element with the class "css-1tmr9l6". 
    If the element exists, it retrieves and strippes its text of unnecessary whitespace. 
    It looks for the term "Nettokaltmiete" followed by a properly formatted Euro amount. 
    If a match is found, it returns the extracted text. 
    If no match is found, it returns "N/A".
    """
    price_info_element = soup.find("div", class_="css-1tmr9l6")
    if price_info_element:
        price_info_text = price_info_element.get_text(strip=True)
        match = re.search(r"(Nettokaltmiete:\s*\d{1,3}(?:\.\d{3})*,\d{2}\s*EUR)", price_info_text)
        if match:
            return match.group(1)
    return "N/A"

## Defining the function "extract_heizkosten"

In [None]:
def extract_heizkosten(price_container):
    """
    Look for a block with "Heizkosten" within the 'davon' section.
    If not found, return None.
    This function first attempts to get the value from the standard container (css-1djk842)
    and, if empty, then checks the alternate container (css-u7w3u5).
    """
    heiz_value = None
    davon_container = price_container.find("div", class_="css-18s98dq")
    if davon_container:
        heiz_block = davon_container.find("div", class_="css-cxt05v")
        if heiz_block:
            # Try first container
            value_container = heiz_block.find("div", class_="css-1djk842")
            if not value_container or not value_container.get_text(strip=True):
                # Fallback: try the alternate container
                value_container = heiz_block.find("div", class_="css-u7w3u5")
            if value_container:
                heiz_value = value_container.get_text(strip=True)
    return heiz_value

## Defining the function "extract_stellplatz"

In [None]:
def extract_stellplatz(price_container):
    """
    Look for the 'Zusätzliche Kosten' section and try to retrieve the
    "Miete pro Stellplatz" value.
    Instead of relying solely on finding a parent with a specific class,
    we search for any div whose text contains the phrase.
    """
    stellplatz_value = None
    # Find a container that contains "Zusätzliche Kosten"
    zusatz_header = price_container.find(lambda tag: tag.name == "div" and "Zusätzliche Kosten" in tag.get_text())
    if zusatz_header:
        # Look for a div (anywhere after the header) with the text "Miete pro Stellplatz"
        stellplatz_block = price_container.find(lambda tag: tag.name == "div" and "Miete pro Stellplatz" in tag.get_text())
        if stellplatz_block:
            # Try the standard container for the value first
            value_elem = stellplatz_block.find("div", class_="css-1djk842")
            if not value_elem or not value_elem.get_text(strip=True):
                # Fallback: try an alternate container if available (if the structure is different)
                value_elem = stellplatz_block.find("div", class_="css-u7w3u5")
            if value_elem:
                stellplatz_value = value_elem.get_text(strip=True)
    return stellplatz_value

## Defining the function "normalize_label"

In [None]:
def normalize_label(text):
    """
    Replace multiple whitespace characters (including non-breaking spaces) with a single space,
    strip leading/trailing spaces, and convert to lowercase.
    """
    return re.sub(r'\s+', ' ', text).strip().lower()

# Defining the function "scrape_property"
This function is a web scraper that extracts real estate property details from a given URL. It uses BeautifulSoup to parse the HTML and retrieve relevant details like rental prices, location, heating costs, features, and real estate agency information.

1. Retry Mechanism
The function implements a retry mechanism in case of a network error. If all attempts fail, it skips the URL and returns None.

2. Sending an HTTP Request
It makes a GET request to the property_url using predefined headers and waits up to 15 seconds for a response.
If the status code is 200 (Success), it processes the page.

3. Extracting Property Details
It uses CSS selectors to extract specific details from the parsed BeautifulSoup object (soup).


#### It procedes the following way to extract data:
4. Basic Property Information
Title,  Warmmiete, Kaltmiete zzgl. Nebenkosten, Nebenkosten, Heizkosten_1 (Basic Info), Heizkosten_2 (Numeric Value) and Miete pro Stellplatz (Rent per parking space).

5. Address & Location
Extracts the address (css-1ytyjyb).
Extracts ZIP code using extract_zip_code(soup).

6. Property Features
Number of images (pics_number).
Rooms, surface area, and floor.
Availability status.
Merkmale (Features): It extracts up to 8 key features present on the flat advertisement default page.

7. Energy & Construction Details
Energy Efficiency Class (data-testid='aviv.CDP.Sections.Energy.Preview.EfficiencyClass').
Year of Construction.
Condition (new/old/etc.).
Heating Type & System.

8. Price Per Square Meter
Extracts price_per_sqm (Euro per square meter) using an ARIA label filter.

9. Caution Deposit & Extra Costs
Looks for caution deposits (Kaution) in different sections.

10. Real Estate Agency Info
Extracts real estate agency details from multiple potential locations.


In [4]:
# Function to scrape property details with retries
def scrape_property(property_url, max_retries=3):
    print(f"Scraping: {property_url}")

    for attempt in range(max_retries):
        try:
            response = requests.get(property_url, headers=headers, timeout=15)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")

                # Extract details
                title = get_first_valid_match(soup, ["span.css-1s5ldev"])
                rent_price = get_first_valid_match(soup, ["span.css-9wpf20"])
                kalt_o_warm = get_first_valid_match(soup, [
                    "span.css-1jte76a[data-testid='aviv.CDP.Sections.Hardfacts.Price.Informations']"
                ])

                # ---- PRICE DETAILS EXTRACTION (from Primary Component) ----
                price_details = {
                    "Warmmiete": "N/A",
                    "Kaltmiete zzgl. Nebenkosten": "N/A",
                    "Nebenkosten": "N/A",
                    "Heizkosten_1": "N/A",
                    "Heizkosten_2": "N/A",
                    "Miete pro Stellplatz": "N/A"
                }

                price_container = soup.find("div", {"data-testid": "aviv.CDP.Sections.Price.PrimaryComponent"})
                if price_container:
                    # Extract Warmmiete
                    warmmiete_container = price_container.select_one("div.css-47miec div.css-cxt05v")
                    if warmmiete_container:
                        label_elem = warmmiete_container.find("div", class_=lambda x: x and "css-2bd70b" in x)
                        if label_elem and "Warmmiete" in label_elem.get_text():
                            value_container = warmmiete_container.find_next("div", class_="css-1djk842")
                            if value_container:
                                span_val = value_container.find("span", {"aria-hidden": "true"})
                                if span_val:
                                    price_details["Warmmiete"] = span_val.get_text(strip=True)
                                else:
                                    price_details["Warmmiete"] = value_container.get_text(strip=True)
                                    
                    # Process the 'davon' section for further details
                    davon_container = price_container.find("div", class_="css-18s98dq")
                    if davon_container:
                        # Process standard blocks (usually Kaltmiete and Nebenkosten)
                        blocks = davon_container.find_all("div", class_="css-1dnagp7", recursive=False)
                        for block in blocks:
                            label_elem = block.find("div", class_="css-1s5ldev")
                            if label_elem:
                                label_text = label_elem.get_text(strip=True)
                                value_elem = block.find("div", class_="css-1djk842")
                                if value_elem:
                                    span_val = value_elem.find("span", {"aria-hidden": "true"})
                                    if span_val:
                                        value_text = span_val.get_text(strip=True)
                                    else:
                                        value_text = value_elem.get_text(strip=True)
                                else:
                                    value_text = "N/A"
                                # Only assign if the label exactly matches the expected string:
                                norm_label = normalize_label(label_text)
                                if norm_label == "kaltmiete zzgl. nebenkosten":
                                    price_details["Kaltmiete zzgl. Nebenkosten"] = value_text
                                elif norm_label == "nebenkosten":
                                    price_details["Nebenkosten"] = value_text

                        
                        # Specifically look for a Heizkosten block within 'davon'
                        heiz_value = extract_heizkosten(price_container)
                        if heiz_value:
                            # Determine placement: if value contains digits, assume numeric -> Heizkosten_2; otherwise, non-numeric -> Heizkosten_1.
                            if re.search(r'\d', heiz_value):
                                price_details["Heizkosten_2"] = heiz_value
                            else:
                                price_details["Heizkosten_1"] = heiz_value

                    # If no Heizkosten was found in 'davon', you can optionally check in an alternate location:
                    if price_details["Heizkosten_1"] == "N/A" and price_details["Heizkosten_2"] == "N/A":
                        # Optional additional check, similar to before:
                        heiz_alternate = extract_heizkosten(price_container)
                        if heiz_alternate:
                            if re.search(r'\d', heiz_alternate):
                                price_details["Heizkosten_2"] = heiz_alternate
                            else:
                                price_details["Heizkosten_1"] = heiz_alternate
                
                    # Extract 'Miete pro Stellplatz'
                    stellplatz_value = extract_stellplatz(price_container)
                    if stellplatz_value:
                        price_details["Miete pro Stellplatz"] = stellplatz_value
                # ---- END PRICE DETAILS EXTRACTION ----
                
                address = get_first_valid_match(soup, ["div.css-1ytyjyb"])
                zip_code = extract_zip_code(soup)
                pics_number = get_first_valid_match(soup, ["div.css-1f03yfv"])
                additional_price_info = extract_additional_price_info(soup)

                energy_efficiency = get_first_valid_match(soup, ["[data-testid='aviv.CDP.Sections.Energy.Preview.EfficiencyClass']"])
                year_of_construction = get_first_valid_match(soup, ["span[data-testid='aviv.CDP.Sections.Energy.Features.yearOfConstruction']"])
                condition = get_first_valid_match(soup, ["[data-testid='aviv.CDP.Sections.Energy.Features.state']"])
                heating_type = get_first_valid_match(soup, ["[data-testid='aviv.CDP.Sections.Energy.Features.energySource']"])
                heating_system = get_first_valid_match(soup, ["[data-testid='aviv.CDP.Sections.Energy.Features.heatingSystem']"])

                price_per_sqm = get_price_value(soup, ["div.css-u7w3u5[aria-label*='Euro pro Quadratmeter']"])
                
                room_elements = soup.find_all("span", class_="css-2bd70b")
                rooms_text = room_elements[0].text.strip() if len(room_elements) > 0 else "N/A"
                surface_area_text = room_elements[1].text.strip() if len(room_elements) > 1 else "N/A"
                floor = room_elements[2].text.strip() if len(room_elements) > 2 else "N/A"

                if len(floor) > 5:
                    availability = floor
                    floor = "N/A"
                else:
                    availability = room_elements[3].text.strip() if len(room_elements) > 3 else "N/A"
                            
                # Extract features (Merkmale) details
                features_container = soup.find("ul", {"data-testid": "aviv.CDP.Sections.Features.Preview"})
                features_dict = {f"Merkmale_{i+1}": "N/A" for i in range(8)}  # Initialize with N/A
                if features_container:
                    feature_items = features_container.find_all("li", recursive=False)
                    for idx, li in enumerate(feature_items):
                        span_elem = li.find("span", class_="css-1az3ztj")
                        if span_elem:
                            features_dict[f"Merkmale_{idx+1}"] = span_elem.get_text(strip=True)

                 # Extract caution details with specific selectors
                caution_1 = get_first_valid_match(
                    soup, ["div.css-wnttuu[data-testid='aviv.CDP.Sections.Price.SecondaryComponent'] div.css-33wh8u div.css-yxuej5"]
                )
                caution_2 = get_first_valid_match(
                    soup, ["div.css-wnttuu[data-testid='aviv.CDP.Sections.Price.SecondaryComponent'] div.css-33wh8u div div.css-18s98dq div.css-cxt05v div.css-1djk842 span[aria-hidden='true'].css-2bd70b"])

                     # Extract real estate information with specific selectors
                real_estate_1 = get_first_valid_match(soup, ["span.css-1kzvz8a"])
                real_estate_2 = get_first_valid_match(soup, ["div.css-ce566o[data-testid='aviv.CDP.Sections.ContactCard.Subtitle']"])
                real_estate_3 = get_first_valid_match(soup, ["span.css-2bd70b[data-testid='aviv.CDP.Contacting.ProviderSection.IntermediaryCard.Title']"])
                real_estate_4 = get_first_valid_match(soup, ["span.css-2bd70b[data-testid='aviv.CDP.Contacting.ProviderSection.ContactCard.Title']"])
                real_estate_5_subtitle = get_first_valid_match(
                    soup, ["div.css-8v8ftk div.css-ce566o[data-testid='aviv.CDP.Contacting.ContactCard.Subtitle']"]
                )
                real_estate_5_title = get_first_valid_match(
                    soup, ["div.css-8v8ftk div.css-dvw792[data-testid='aviv.CDP.Contacting.ContactCard.Title']"]
                )

                result = {
                    "Title": title,
                    "Kaltmiete zzgl. Nebenkosten": price_details["Kaltmiete zzgl. Nebenkosten"],
                    "Warmmiete": price_details["Warmmiete"],
                    "Nebenkosten": price_details["Nebenkosten"],
                    "Heizkosten_1": price_details["Heizkosten_1"],
                    "Heizkosten_2": price_details["Heizkosten_2"],
                    "Miete pro Stellplatz": price_details["Miete pro Stellplatz"],
                    "Rooms_number": rooms_text,
                    "Surface Area": surface_area_text,
                    "Floor": floor,
                    "availability": availability,
                    "Adress": address,
                    "Zip Code": zip_code,
                    "Additional Price Info": additional_price_info,
                    "pics_number": pics_number,
                    "energy_efficiency": energy_efficiency,
                    "Year of Construction": year_of_construction,
                    "Condition": condition,
                    "Heating type": heating_type,
                    "Heating system": heating_system,
                    "Price per sqm": price_per_sqm,
                    "Caution 1": caution_1,
                    "Caution 2": caution_2,
                    "Real Estate 1": real_estate_1,
                    "Real Estate 2": real_estate_2,
                    "Real Estate 3": real_estate_3,
                    "Real Estate 4": real_estate_4,
                    "Real Estate 5 Subtitle": real_estate_5_subtitle,
                    "Real Estate 5 Title": real_estate_5_title,
                    "URL": property_url
                }
                # Update the result with each extracted Merkmale
                result.update(features_dict)

                return result
                
        except requests.exceptions.RequestException as e:
            print(f"Retrying {property_url} due to: {e}")
            time.sleep(random.uniform(10, 20))

    print(f"Skipping {property_url} after {max_retries} failed attempts")
    return None

# Define the range of pages to scrape here

In [None]:
# Define the range of pages to scrape (modify these values as needed)
start_page = 133  # Change this to set the starting page
end_page = 134   # Change this to set the ending page

# Execute webscraping

In [11]:
# Store collected links
all_property_links = []

# Loop through selected pages
for page in range(start_page, end_page + 1):
    url = base_url + str(page)  # Append page number to the URL
    print(f"Scraping page {page}: {url}")

    # Send the request
    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all <a> tags
        links = soup.find_all("a", href=True)

        # Extract only links that contain "immowelt.de/expose/"
        property_links = [link["href"] for link in links if "immowelt.de/expose/" in link["href"]]

        # Add to the full list
        all_property_links.extend(property_links)
    else:
        print(f"Failed to load page {page}, status code: {response.status_code}")
        break  # Stop scraping if a page fails to load

# Remove duplicates (if the same property appears on multiple pages).
all_property_links = list(set(all_property_links))

# Display results
print(f"\nTotal properties found: {len(all_property_links)}")

# Scrape all properties
scraped_data = [scrape_property(url) for url in all_property_links if scrape_property(url) is not None]

# Save results to CSV
df = pd.DataFrame(scraped_data)
filename = f"immowelt_listings{start_page}-{end_page}.csv"
df.to_csv(filename, index=False)
print("################DONE!!####################")

Scraping page 133: https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE8634&page=133
Scraping page 134: https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE8634&page=134

Total properties found: 60
Scraping: https://www.immowelt.de/expose/f5a2480b-5833-4d9e-9e4b-c6adad39b4d6
Scraping: https://www.immowelt.de/expose/f5a2480b-5833-4d9e-9e4b-c6adad39b4d6
Scraping: https://www.immowelt.de/expose/0b07c287-ff21-46c0-9095-e31ff3b00513
Scraping: https://www.immowelt.de/expose/0b07c287-ff21-46c0-9095-e31ff3b00513
Scraping: https://www.immowelt.de/expose/30f6e626-b576-41f1-9013-e15db6281fec
Scraping: https://www.immowelt.de/expose/30f6e626-b576-41f1-9013-e15db6281fec
Scraping: https://www.immowelt.de/expose/c7b28eb2-5856-469e-b3e0-535ae14dd95b
Scraping: https://www.immowelt.de/expose/c7b28eb2-5856-469e-b3e0-535ae14dd95b
Scraping: https://www.immowelt.de/expose/370a410c-c446-426a-8fe4-4552e60ddcd