In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to fetch and parse the HTML content of a single search results page to get ad URLs
def get_ad_urls(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    ad_urls = []
    for link in soup.find_all('a', class_='detailansicht'):
        ad_url = link.get('href')
        # Ensure the URL starts with 'https://www.wg-gesucht.de'
        if ad_url.startswith('/'):
            ad_url = 'https://www.wg-gesucht.de' + ad_url
        elif not ad_url.startswith('http'):
            ad_url = 'https://www.wg-gesucht.de/' + ad_url
        ad_urls.append(ad_url)
    return ad_urls

# Function to fetch and parse the HTML content of an individual ad page
def get_ad_details(ad_url):
    response = requests.get(ad_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    try:
        title = soup.find('h1').text.strip()
        details = []
        price = ""
        size = ""
        apartment_size = ""
        availability = ""
        landlord = soup.find('div', class_='contact_info_name').text.strip() if soup.find('div', class_='contact_info_name') else ''
        
        for fact in soup.find_all('div', class_='col-xs-6 text-center'):
            detail_label = fact.find('span', class_='key_fact_detail').text.strip()
            detail_value = fact.find('b', class_='key_fact_value').text.strip()
            
            if detail_label == "Gesamtmiete":
                price = detail_value
            elif detail_label == "Größe" or detail_label == "Zimmergröße":
                size = detail_value
        
        # Extracting apartment size
        for item in soup.find_all('span', class_='section_panel_detail'):
            if "Wohnungsgröße" in item.text:
                apartment_size = item.text.split(":")[1].strip()
        
        # Extracting details section
        details_section = soup.find('ul', class_='pl15 mb15')
        if details_section:
            for item in details_section.find_all('li'):
                detail = item.text.strip()
                details.append(detail)
        details = ", ".join(details)
        
        # Extracting availability
        availability_section = soup.find('div', class_='col-xs-6')
        if availability_section:
            online_status = availability_section.find('b', class_='noprint')
            if online_status:
                availability = online_status.text.strip()
        
        return {
            'title': title,
            'details': details,
            'price': price,
            'size': size,
            'apartment_size': apartment_size,
            'availability': availability,
            'landlord': landlord,
            'url': ad_url
        }
    except AttributeError as e:
        return {}

# Base URL of the WG-Gesucht search results for Hamburg with pagination parameter
base_url = 'https://www.wg-gesucht.de/wg-zimmer-und-1-zimmer-wohnungen-und-wohnungen-in-Hamburg.55.0+1+2.1.{}.html?offer_filter=1&city_id=55&sort_order=0&noDeact=1&categories%5B%5D=0&categories%5B%5D=1&categories%5B%5D=2&pagination=1&pu='

# Initialize an empty list to store all ad details
all_ads = []

# Number of pages to scrape
total_pages = 89

# Loop through each page to get the ad URLs
for page in range(1, total_pages + 1):
    page_url = base_url.format(page)
    print(f'Scraping page {page} of {total_pages}...')
    ad_urls = get_ad_urls(page_url)
    
    # Loop through each ad URL to get the ad details
    for ad_url in ad_urls:
        print(f'Scraping ad: {ad_url}')
        ad_details = get_ad_details(ad_url)
        if ad_details:
            all_ads.append(ad_details)
        time.sleep(1)  # Adding delay to avoid overloading the server

# Convert all ad details to a DataFrame
df = pd.DataFrame(all_ads)

# Save DataFrame to an Excel file
df.to_excel('hamburg_apartment_listings.xlsx', index=False)

print('Data has been saved to hamburg_apartment_listings.xlsx')

Scraping page 1 of 89...
Scraping page 2 of 89...
Scraping page 3 of 89...
Scraping page 4 of 89...
Scraping page 5 of 89...
Scraping page 6 of 89...
Scraping page 7 of 89...
Scraping page 8 of 89...
Scraping page 9 of 89...
Scraping page 10 of 89...
Scraping page 11 of 89...
Scraping page 12 of 89...
Scraping page 13 of 89...
Scraping page 14 of 89...
Scraping page 15 of 89...
Scraping page 16 of 89...
Scraping page 17 of 89...
Scraping page 18 of 89...
Scraping page 19 of 89...
Scraping page 20 of 89...
Scraping page 21 of 89...
Scraping page 22 of 89...
Scraping page 23 of 89...
Scraping page 24 of 89...
Scraping page 25 of 89...
Scraping page 26 of 89...
Scraping page 27 of 89...
Scraping page 28 of 89...
Scraping page 29 of 89...
Scraping page 30 of 89...
Scraping page 31 of 89...
Scraping page 32 of 89...
Scraping page 33 of 89...
Scraping page 34 of 89...
Scraping page 35 of 89...
Scraping page 36 of 89...
Scraping page 37 of 89...
Scraping page 38 of 89...
Scraping page 39 of 8