In [1]:
import requests
import unidecode
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Accumulator for the scraped listings: the crawl loop appends one dict per
# apartment here, and the DataFrame is built from it afterwards.
house_data = list()

# Starting page of the crawl (apartment rentals, city of Zurich). The crawl
# loop overwrites `url` with each "next page" link and finally sets it to None.
url = 'https://www.homegate.ch/rent/apartment/city-zurich/matching-list'

In [3]:
# Crawl the result pages one by one: fetch the page at `url`, append one
# record per listing to `house_data`, then follow the "next page" link.
# The loop ends when a page has no such link (`url` becomes None).

# Seconds to wait for the server before giving up. `requests` has no default
# timeout, so without this a stalled connection would hang the cell forever.
REQUEST_TIMEOUT_S = 30

while url:
    # GET request; raise on 4xx/5xx instead of silently parsing an error page
    response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    html_content = response.text

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Select the container element that holds all the listings of this page
    body = soup.select_one('div[data-test="result-list"]')
    if body is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise RuntimeError(
            'No result list found on {}; the page layout may have changed '
            'or the request was blocked.'.format(url)
        )

    # Direct child *tags* only: unlike `.contents`, this skips whitespace/text
    # nodes, which have no usable `.find()` for tags.
    houses = body.find_all(True, recursive=False)

    # Loop through all the listings
    for house in houses:
        # Each listing is expected to be wrapped in a link whose first <div>
        # contains the <p> elements with the details.
        link = house.find('a', href=True)
        div = link.find('div') if link is not None else None
        if div is None:
            # Child without the expected link/div structure: not a listing
            # this parser understands, so skip it instead of crashing.
            continue
        p_contents = div.find_all('p')

        # NOTE: the class names below carry build-specific suffixes and will
        # presumably change when the site is redeployed; a field whose
        # selector no longer matches is stored as None.

        # Price info (thousands separator and trailing ".–" removed)
        try:
            price_info = p_contents[0].select_one(".ListItemPrice_price_1o0i3").find_all("span")[1].text
            price_info = price_info.replace(',', '').replace('.–', '')
        except (AttributeError, IndexError):
            price_info = None

        # Living space: keep the text in front of the first "m" of the unit
        try:
            living_space_info = p_contents[0].select_one(".ListItemLivingSpace_value_2zFir").text.split('m')[0]
        except (AttributeError, IndexError):
            living_space_info = None

        # Rooms info: keep the text in front of the first "r" of the label
        try:
            rooms_info = p_contents[0].select_one(".ListItemRoomNumber_value_Hpn8O").text.split('r')[0]
        except (AttributeError, IndexError):
            rooms_info = None

        # Location info
        try:
            # unidecode approximates Unicode characters with their ASCII
            # representation, so the CSV contains only ASCII characters and
            # encoding problems are avoided when importing it into MySQL.
            location_info = unidecode.unidecode(p_contents[1].select_one("p span").text)
        except (AttributeError, IndexError):
            location_info = None

        # Save the listing URL and its ID; the ID is the last part of the
        # listing's address on the website.
        house_url = 'https://www.homegate.ch' + link['href']
        house_id = link['href'].replace('/rent/', '')

        # Collect everything in a dictionary; the list of dictionaries is
        # turned into a DataFrame later.
        house_data.append({
            'House_ID': house_id,
            'Price': price_info,
            'Area': living_space_info,
            'Rooms': rooms_info,
            'Location': location_info,
            'More information': house_url
        })

    # Get the URL for the next page; None ends the loop
    next_page_element = soup.find("a", {"aria-label": "Go to next page"})
    if next_page_element:
        url = 'https://www.homegate.ch' + next_page_element["href"]
    else:
        url = None
    

In [4]:
# One row per scraped listing; the columns come from the keys of each record.
df = pd.DataFrame(data=house_data)

In [5]:
# Number the rows 1..N instead of the default 0..N-1.
df.index = np.arange(len(df)) + 1

In [6]:
# Preview the first five rows to sanity-check the scraped fields.
df.head(n=5)

Unnamed: 0,House_ID,Price,Area,Rooms,Location,More information
1,3002472265,5190,102,4.5,"Spitalgasse 2, 8001 Zurich",https://www.homegate.ch/rent/3002472265
2,3002552074,5480,112,2.5,"Spitalgasse 2, 8001 Zurich",https://www.homegate.ch/rent/3002552074
3,3002567931,3910,75,3.0,"Engelstrasse 62, 8004 Zurich",https://www.homegate.ch/rent/3002567931
4,3002184266,3950,70,2.5,"Ankerstrasse, 8004 Zurich",https://www.homegate.ch/rent/3002184266
5,3002336810,1450,26,1.0,"Langstrasse 213, 8005 Zurich",https://www.homegate.ch/rent/3002336810


In [7]:
# Write the listings to disk. index=False leaves the row index out of the
# file, so only the scraped columns are exported.
df.to_csv(path_or_buf='apartments_zurich_feb.csv', index=False)