In [2]:
import pandas as pd
import numpy as np
import json
import requests
import re
import matplotlib.pyplot as plt


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

import geohash2
import pyproj
import geopandas as gpd
from shapely.geometry import Point

# Immoscout mit Selenium aufrufen und Daten scrapen

In [3]:
bundesland = [
    'rheinland-pfalz'
    #'bayern',
    #'hessen'
]

region = [
    'donnersbergkreis'
    #'wuerzburg-kreis',
    #'wuerzburg'
    #'frankfurt-am-main'
]
# https://www.immobilienscout24.de/gewerbe-flaechen/de/bayern/wuerzburg-kreis/hallenproduktion-mieten/

url = 'https://www.immobilienscout24.de/gewerbe-flaechen/de/' + bundesland[0] + '/' + region[0] + '/hallenproduktion-mieten/'

# Starten Sie den Chrome-Driver mit Selenium
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Navigieren Sie zur URL und lösen Sie das Captcha manuell
driver.get(url)
input("Bitte lösen Sie das Captcha und drücken Sie Enter, um fortzufahren...")

# Extrahieren Sie den HTML-Inhalt der Seite
html_content = driver.page_source

# Schließen Sie den Chrome-Driver
driver.quit()

# Verarbeiten Sie den HTML-Inhalt mit BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')


# Relevante Daten mit BeautifulSoup filtern und in Df laden

In [4]:
script_tags = soup.find_all('script')

for script_tag in script_tags:
    script_content = script_tag.string
    if script_content and 'tilesResult' in script_content:
        tiles_result_content = script_content
        break


# Extrahieren Sie den Inhalt des tilesResult-Objekts
if tiles_result_content:
    start_index = tiles_result_content.find('{', tiles_result_content.find('tilesResult')) + 10
    end_index = tiles_result_content.find('numberOfHits') + len('numberOfHits') -18
    tiles_result_data = tiles_result_content[start_index:end_index]
    json_content = re.sub(r'"[^"]*"', lambda m: m.group(0).replace(':', ''),  tiles_result_data)
    json_content = re.sub(r'(\w+):', r'"\1":', json_content)
    df_json = pd.read_json(json_content)
    exposes_df = pd.concat([pd.json_normalize(exp) for exp in df_json['exposes']], ignore_index=True)

  df_json = pd.read_json(json_content)


In [None]:
exposes_df.to_csv('./Immobilien_Data/Wuerzburg.csv', index=False)

In [5]:
warehouse_data = exposes_df

# Die Geodaten ins CSR3035 Format transformieren und in einen Gdf laden

In [6]:
def geohash_to_csr3035(geohash):
    lat, lon = geohash2.decode(geohash)
    wgs84 = pyproj.Proj(init='epsg:4326')
    csr3035 = pyproj.Proj(init='epsg:3035')
    x_csr3035, y_csr3035 = pyproj.transform(wgs84, csr3035, lon, lat)
    return x_csr3035, y_csr3035


def convert_price_string_to_float(price_string):
  # Remove non-numeric characters
  numeric_part = re.sub(r"[^\d,.]", "", price_string)

  # Replace comma with decimal
  numeric_part = numeric_part.replace(",", ".")

  # Convert to float
  return float(numeric_part)


def extract_largest_float(area_string):
  # Remove non-numeric characters and split the string into parts
  area_parts = re.sub(r"[^\d\-\s]", "", area_string).split()

  # Handle different formats
  if len(area_parts) == 1:  # Single value format (3.000 m²)
    largest_value = float(area_parts[0])

  elif len(area_parts) == 3 and area_parts[1] == '-':  # Range format (5.000 - 15.000 m²)
    largest_value = max(float(area_parts[0]), float(area_parts[2]))

  else:  # Invalid format
    largest_value = None

  return largest_value

def extract_smallest_float(area_string):
  # Remove non-numeric characters and split the string into parts
  area_parts = re.sub(r"[^\d\-\s]", "", area_string).split()

  if len(area_parts) == 1:  # Single value format (3.000 m²)
    smallest_value = float(area_parts[0])

  elif len(area_parts) == 3 and area_parts[1] == '-':  # Range format (5.000 - 15.000 m²)
    smallest_value = min(float(area_parts[0]), float(area_parts[2]))

  else:  # Invalid format
    smallest_value = None

  return smallest_value

In [7]:
# # Read warehouse data from CSV files
# warehouse_data_wuerzburg = pd.read_csv('./Immobilien_Data/Wuerzburg.csv')  # Read data from Würzburg CSV
# warehouse_data_wuerzburg_kreis = pd.read_csv('./Immobilien_Data/Wuerzburg-Kreis.csv')  # Read data from Würzburg-Kreis CSV

# # Combine data from both CSV files
# warehouse_data = pd.concat([warehouse_data_wuerzburg, warehouse_data_wuerzburg_kreis], axis=0)  # Concatenate DataFrames

# Extract x and y coordinates from 'geoGrid' column using a function (assumed 'geohash_to_csr3035' is defined elsewhere)
warehouse_data[['x_csr3035', 'y_csr3035']] = warehouse_data['geoGrid'].apply(lambda x: pd.Series(geohash_to_csr3035(x)))

# Create a geometry column using Point objects from x and y coordinates
warehouse_data['geometry'] = warehouse_data.apply(lambda row: Point(row['x_csr3035'], row['y_csr3035']), axis=1)

# Convert the data into a GeoDataFrame with specified geometry column and CRS
warehouses_gdf = gpd.GeoDataFrame(warehouse_data, geometry='geometry', crs='EPSG:3035')

# Reset index to avoid potential issues with duplicate indices
warehouses_gdf = warehouses_gdf.reset_index(drop=True)  # Reset index and optionally drop the old column

# Clean and process 'pricePerSquareMetre' column
warehouses_gdf['pricePerSquareMetre'] = warehouses_gdf['pricePerSquareMetre'].replace('', np.nan)
warehouses_gdf['pricePerSquareMetre'] = warehouses_gdf[warehouses_gdf['pricePerSquareMetre'].notna()]['pricePerSquareMetre'].apply(convert_price_string_to_float)  # Apply function to non-NaN values to convert price format
warehouses_gdf['pricePerSquareMetre'].fillna(warehouses_gdf['pricePerSquareMetre'].median(), inplace=True)  # Fill NaN values with the mean
warehouses_gdf['pricePerSquareMetre'] = warehouses_gdf['pricePerSquareMetre'].apply(lambda x: round(x, 2))  # Round price values to 2 decimal places

# Clean and process 'floorSpace' column
warehouses_gdf['floorSpace_big'] = warehouses_gdf['floorSpace'].apply(extract_largest_float)
warehouses_gdf['floorSpace_small'] = warehouses_gdf['floorSpace'].apply(extract_smallest_float)

#Create new column with the corresponding price element
warehouses_gdf['total_price_big'] = warehouses_gdf['floorSpace_big'] * warehouses_gdf['pricePerSquareMetre'] * 12
warehouses_gdf['total_price_small'] = warehouses_gdf['floorSpace_small'] * warehouses_gdf['pricePerSquareMetre'] * 12

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  x_csr3035, y_csr3035 = pyproj.transform(wgs84, csr3035, lon, lat)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  x_csr3035, y_csr3035 = pyproj.transform(wgs84, csr3035, lon, lat)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  x_csr3035, y_csr3035 = pyproj.transform(wgs84, csr3035, lon, lat)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  x_csr3035, y_csr3035 = pyproj.transform(wgs84, csr3035, lon, lat)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  x_csr3035, y_csr3035 = pyproj.transform(wgs84, csr3035, lon, lat)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_st

In [8]:
warehouses_gdf = warehouses_gdf.drop(columns=['features', 'pictureUrls'])

In [9]:
warehouses_gdf.to_file('./Donner_Data/warehouses_donner.gpkg', driver='GPKG')