[original code](https://sharetext.me/vbn33yx45g)  

[article](https://towardsdatascience.com/housing-rental-market-in-germany-exploratory-data-analysis-with-python-3975428d07d2)

In [None]:
import time
import requests
import json
import bs4 as bs  # pip3 install beautifulsoup4 lxml
from selenium import webdriver
from typing import List, Optional
from dataclasses import dataclass
from multiprocessing import Process
import glob
import time
import datetime
import re
import os


# Site root; relative listing links found on result pages are appended to it.
base_url = "https://www.immobilienscout24.de"

# Search URLs per city. Each one ends with "pagenumber=" so that the page
# number can simply be appended. sorting=4: price low first.
# The page counts in the comments are the values noted when the script was written.

# Berlin (East): 178 pages
url_berlin = "https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-mieten?sorting=4&pagenumber="
# Dresden (East): 72 pages
url_dresden = "https://www.immobilienscout24.de/Suche/de/sachsen/dresden/wohnung-mieten?sorting=4&pagenumber="
# Munich / Muenchen (South): 90 pages
url_munchen = "https://www.immobilienscout24.de/Suche/de/bayern/muenchen/wohnung-mieten?sorting=4&pagenumber="
# Frankfurt (Center): 90 pages
url_frankfurt = "https://www.immobilienscout24.de/Suche/de/hessen/frankfurt-am-main/wohnung-mieten?sorting=4&pagenumber="
# Cologne / Koeln (West): 55 pages
url_koeln = "https://www.immobilienscout24.de/Suche/de/nordrhein-westfalen/koeln/wohnung-mieten?sorting=4&pagenumber="
# Hamburg (North): 62 pages
url_hamburg = "https://www.immobilienscout24.de/Suche/de/hamburg/hamburg/wohnung-mieten?sorting=4&pagenumber="


def page_has_loaded(driver: webdriver.Chrome):
    """ Tell whether the browser reports the current document as fully loaded """
    # document.readyState is queried inside the page via JavaScript
    return driver.execute_script('return document.readyState;') == 'complete'


def page_get(url: str, driver: webdriver.Chrome, delay_sec=1.0, cookies: Optional[List] = None):
    """ Open `url` in the browser and return the page source.

    url: address to load.
    driver: Selenium Chrome driver that performs the navigation.
    delay_sec: fixed pause (seconds) after navigation, before the first readiness check.
    cookies: accepted so that existing call sites keep working; this function
        does not use it. The default is None rather than a shared mutable list.

    Returns driver.page_source once page_has_loaded() reports the page as ready.
    There is no timeout: the call blocks for as long as the page is not ready.
    """
    driver.get(url)
    time.sleep(delay_sec)
    # Poll once per second until the document is ready
    while not page_has_loaded(driver):
        time.sleep(1)

    return driver.page_source


def page_save(file_name: str, body: str):
    """ Write `body` to `file_name` (overwriting it) and print a short report.

    file_name: path of the file to create or overwrite.
    body: text to store; it is written as UTF-8 so that non-ASCII characters
        are saved the same way on every platform.

    Note: the number in the printed message is len(body), i.e. the number of
    characters, which can differ from the number of bytes on disk.
    """
    # The context manager closes the file even if write() raises
    with open(file_name, "w", encoding="utf-8") as out_file:
        out_file.write(body)
    print(f"{len(body)} bytes saved to {file_name}")


def get_data(city_name: str, city_url: str, start_page: int, end_page: int):
    """ Scrape a range of search-result pages of one city and save the rows as CSV.

    city_name: stored in every row and used in the log file name.
    city_url: search URL ending with "pagenumber="; the page number is appended
        (page 1 is requested without a number).
    start_page: first result page to process.
    end_page: exclusive upper bound; pages start_page .. end_page - 1 are
        processed (start_page itself is always processed).

    Paging also stops at the first result page that yields no links, either
    because it is empty or because it could not be parsed in 5 attempts.
    The collected rows are written with save_log(); the CSV header is added
    only when start_page == 1.
    """
    # Chromedriver:
    # google-chrome --version
    # https://chromedriver.chromium.org/downloads

    page_num = start_page

    csv_log_lines = []

    print(f"Processing the city: {city_name}, pages {start_page}-{end_page}")

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(executable_path="./chromedriver", chrome_options=options)
    try:
        # Initial page load with a long delay; presumably this leaves time to
        # confirm the site's check manually in the browser window - TODO confirm.
        page_get(url_berlin, driver, delay_sec=30, cookies=[])
        cookies = driver.get_cookies()

        while True:
            page_url = city_url if page_num == 1 else city_url + str(page_num)
            print("Parsing page:", page_url)

            # get_page_links() returns None when the page cannot be parsed:
            # reload and retry a few times before giving up.
            links = None
            for _ in range(5):
                html_data = page_get(page_url, driver, delay_sec=5, cookies=cookies)
                links = get_page_links(html_data)
                if links is not None:
                    break

            if not links:
                if links is None:
                    print("Cannot get the links, stop at page:", page_url)
                # No listings (or unreadable page): nothing more to do
                break

            for link in links:
                print("  Parsing:", link)
                for _ in range(10):
                    try:
                        # Get the page from browser
                        html_apartment = page_get(link, driver, delay_sec=0.5, cookies=cookies)

                        apt_data = parse_apartment(html_apartment)
                        apt_data.city = city_name
                        csv_line = apt_data.to_csv()
                        print("    ", csv_line)
                        csv_log_lines.append(csv_line)
                        break
                    except Exception:
                        # Exception (not a bare except) so that Ctrl+C still
                        # interrupts the retry loop
                        print("Cannot parse data, try again after 10s")
                        time.sleep(10)

            page_num += 1
            if page_num >= end_page:
                break
    finally:
        # Always close the browser, also when an error escapes the loop
        driver.quit()

    # Save log to file
    if csv_log_lines:
        save_log(csv_log_lines, city_name, page=f"{start_page:03d}-{end_page-1:03d}", header=start_page == 1)


def get_page_links(s_html: str) -> Optional[List]:
    """ Extract the absolute listing URLs from a search-result page.

    Returns a list with one URL per result entry (the first anchor carrying a
    'data-go-to-expose-id' attribute), or None if anything goes wrong while
    parsing, e.g. when the result list is missing from the page.
    """
    try:
        result_list = bs.BeautifulSoup(s_html, "lxml").find(id="resultListItems")
        expose_urls = []
        for entry in result_list.find_all("li", {"class": "result-list__listing"}):
            # Only the first matching anchor of each entry is used
            anchor = next(
                (a for a in entry.find_all("a") if 'data-go-to-expose-id' in a.attrs),
                None,
            )
            if anchor is not None:
                expose_urls.append(base_url + anchor['href'])
        return expose_urls
    except Exception as err:
        print("get_page_links error:", err)
        return None


def value_from_str_float(price_str: str) -> Optional[float]:
    """ Convert a German-formatted number like "7.935,60 EUR" to 7935.6.

    Everything except digits, "." and "," is dropped; "." is treated as the
    thousands separator and "," as the decimal separator.
    Returns None if the remaining text is not a valid number.
    """
    kept = [ch for ch in price_str if ch in "0123456789,."]
    # 7.935,60 => 7935.60
    normalized = "".join(kept).replace(".", "").replace(",", ".")
    try:
        return float(normalized)
    except ValueError:
        return None


def value_from_str(price_str: str) -> Optional[int]:
    """ Convert a German-formatted number like "7.935,60 EUR" to the integer 7936.

    Returns None when the text cannot be parsed as a number.
    """
    try:
        # 7.935,60 => 7935.60 => 7936
        number = value_from_str_float(price_str)
        return int(round(number))
    except (ValueError, TypeError):
        # TypeError covers round(None) for unparseable input
        return None


def str_to_csv(data_str: Optional[str]) -> Optional[str]:
    """ Make a text safe for a ';'-separated CSV cell.

    ';' and line breaks are replaced by spaces and the result is stripped.
    None is passed through unchanged.
    """
    if data_str is None:
        return None
    cleaned = data_str
    for forbidden in (';', '\n'):
        cleaned = cleaned.replace(forbidden, ' ')
    return cleaned.strip()


def get_log_date() -> str:
    """ Return today's local date as 'YYYY-MM-DD'; used as the logging date of a row """
    return f"{datetime.datetime.now():%Y-%m-%d}"


def save_log(log_data: List, city: str, page: str, header=False):
    """ Append CSV rows to the file '<city>_<page>.csv'.

    log_data: rows (strings without trailing newline) to write, one per line.
    city, page: parts of the output file name.
    header: when true, the Apartment CSV header line is written before the rows.
    """
    log_filename = f'{city}_{page}.csv'
    print(f"Saving log to {log_filename}")

    # Mode "a": an existing file is extended, not overwritten
    with open(log_filename, "a", encoding="utf-8") as f_log:
        if header:
            f_log.write(Apartment.get_header() + "\n")
        f_log.writelines(row + "\n" for row in log_data)


@dataclass
class Apartment:
    """ One apartment listing that can be written as a ';'-separated CSV row """
    # The CSV column each field is written to is given in the comments
    apt_id: int = 0                    # property_id
    apt_type: Optional[str] = None     # property_type
    publisher: str = ""                # publisher
    title: str = ""                    # title
    city: Optional[str] = None         # city
    address: Optional[str] = None      # address
    region: Optional[str] = None       # region
    price_cold: Optional[int] = None   # price_cold_eur
    price_warm: Optional[int] = None   # price_warm_eur
    deposit: Optional[str] = None      # deposit_eur
    area: Optional[float] = None       # property_area
    rooms: Optional[float] = None      # num_rooms
    floor: Optional[int] = None        # floor
    floor_max: Optional[int] = None    # floors_in_building

    @staticmethod
    def get_header():
        """ Return the CSV header line; the column order matches to_csv() """
        columns = [
            "property_id", "logging_date",
            "property_area", "num_rooms", "floor", "floors_in_building",
            "price_cold_eur", "price_warm_eur", "deposit_eur", "property_type",
            "publisher", "city", "title", "address", "region",
        ]
        # Every column, including the last one, is terminated by ";"
        return "".join(f"{name};" for name in columns)

    def to_csv(self):
        """ Serialize the listing to one CSV row (no trailing newline) """
        # str_to_csv removes ";" from free-text fields to avoid CSV errors
        cells = [
            self.apt_id, get_log_date(),
            self.area, self.rooms, self.floor, self.floor_max,
            self.price_cold, self.price_warm, self.deposit, self.apt_type,
            str_to_csv(self.publisher), self.city, str_to_csv(self.title),
            str_to_csv(self.address), str_to_csv(self.region),
        ]
        return "".join(f"{cell};" for cell in cells)


def _first_text(node, tag: str, css_class: str) -> Optional[str]:
    """ Stripped text of the first `tag` with class `css_class` under `node`, or None if absent """
    found = node.find(tag, {"class": css_class})
    if found is None:
        return None
    return found.get_text().strip()


def parse_apartment(s_html: str):
    """ Parse the HTML of one listing page into an Apartment.

    s_html: page source of the listing.

    Returns an Apartment; fields that are not found on the page keep their
    defaults. `city` is not set here. For every field the first value found
    while walking the "criteriagroup" blocks is kept.

    Raises ValueError if the page has no "is24-ex-details" block at all, so
    that the caller can treat the page as not loaded and retry.
    """
    soup = bs.BeautifulSoup(s_html, "lxml")
    result = Apartment()

    # ID: " Scout-ID: 140609619 " => 140609619
    scout_text = _first_text(soup, "div", "is24-scoutid")
    if scout_text is not None:
        # The capture group tolerates any amount of whitespace after the colon
        id_match = re.search(r"Scout-ID:\s*([0-9]+)", scout_text)
        if id_match is not None:
            result.apt_id = int(id_match.group(1))

    # Title
    title = soup.find("h1", id="expose-title")
    if title is not None:
        result.title = title.get_text().strip()

    # Publisher: company name if present, otherwise "Private" when the
    # private-seller logo is on the page
    company = soup.find(attrs={"data-qa": "company-name"})
    if company is not None:
        result.publisher = company.get_text().strip(".- ")
    elif soup.find("div", {"class": "brandLogoPrivate_dnns4"}) is not None:
        result.publisher = "Private"

    # Details
    details = soup.find("div", {"class": "is24-ex-details"})
    if details is None:
        raise ValueError("Apartment details block 'is24-ex-details' not found")

    for part in details.find_all("div", {"class": "criteriagroup"}):
        # Address and region
        text = _first_text(part, "span", "block font-nowrap print-hide")
        if result.address is None and text is not None:
            result.address = text.strip(",")
        text = _first_text(part, "span", "zip-region-and-country")
        if result.region is None and text is not None:
            result.region = text

        # Price: Kaltmiete
        text = _first_text(part, "dd", "is24qa-kaltmiete")
        if result.price_cold is None and text is not None:
            result.price_cold = value_from_str(text)
        # Price: Gesamtmiete (warmmiete)
        text = _first_text(part, "dd", "is24qa-gesamtmiete")
        if result.price_warm is None and text is not None:
            result.price_warm = value_from_str(text)
        # Price: Geschätzte Gesamtmiete (warmmiete), used only as a fallback
        text = _first_text(part, "dd", "is24qa-geschaetzte-gesamtmiete")
        if result.price_warm is None and text is not None:
            result.price_warm = value_from_str(text)

        # Type
        text = _first_text(part, "dd", "is24qa-typ")
        if result.apt_type is None and text is not None:
            result.apt_type = text

        # Etage
        text = _first_text(part, "dd", "is24qa-etage")
        if result.floor is None and text is not None:
            values = text.split(" ")  # "5 von 8" => 5, 8
            if "von" in text and len(values) == 3:
                result.floor = value_from_str(values[0])
                result.floor_max = value_from_str(values[2])
            elif len(values) == 1:
                result.floor = value_from_str(values[0])

        # Surface
        text = _first_text(part, "dd", "is24qa-wohnflaeche-ca")
        if result.area is None and text is not None:
            result.area = value_from_str_float(text)

        # Rooms: kept as float ("2,5" => 2.5) to match the Optional[float]
        # field; integer rounding would lose half rooms
        text = _first_text(part, "dd", "is24qa-zimmer")
        if result.rooms is None and text is not None:
            result.rooms = value_from_str_float(text)

        # Kaution (deposit), stored as the raw text
        text = _first_text(part, "div", "is24qa-kaution-o-genossenschaftsanteile")
        if result.deposit is None and text is not None:
            result.deposit = text

    return result


if __name__ == "__main__":
    import argparse

    # Pause between two worker launches, in seconds. get_data() starts with a
    # 30 s initial page load, so the workers are started one after another.
    launch_gap_sec = 35

    # city (lower case) -> (search URL, page ranges). Every (start, end) range
    # is handled by its own worker process; see get_data() for the meaning of
    # start and end.
    city_jobs = {
        "berlin": (url_berlin, [(1, 30), (30, 60), (60, 90), (90, 120), (120, 150), (150, 185)]),
        "frankfurt": (url_frankfurt, [(1, 20), (20, 40), (40, 60), (60, 80), (80, 110)]),
        "koeln": (url_koeln, [(1, 20), (20, 40), (40, 70)]),
        "hamburg": (url_hamburg, [(1, 25), (25, 50), (50, 75)]),
        "dresden": (url_dresden, [(1, 25), (25, 50), (50, 80)]),
        "munchen": (url_munchen, [(1, 20), (20, 40), (40, 60), (60, 80), (80, 110)]),
    }

    parser = argparse.ArgumentParser()
    parser.add_argument("--city", default="", help="City to parse")
    # parse_known_args: unknown command-line arguments are ignored
    args, _ = parser.parse_known_args()

    city_name = args.city
    job = city_jobs.get(city_name.lower())
    if job is None:
        print(f"City {args.city} is not found")
        raise SystemExit(1)

    city_url, page_ranges = job
    workers = []
    for index, (start_page, end_page) in enumerate(page_ranges):
        if index > 0:
            time.sleep(launch_gap_sec)
        worker = Process(target=get_data, args=(city_name, city_url, start_page, end_page))
        worker.start()
        workers.append(worker)

    # Wait for all workers and then exit, instead of sleeping forever
    for worker in workers:
        worker.join()