In [3]:
import re
import time
import logging

# Rebind the module-level logging.info to print, so every logging.info(...) call in this
# notebook is written straight to the cell output.
# NOTE(review): this patches the shared `logging` module object for the whole kernel, and
# print() does not apply logging-style '%s' argument formatting.
logging.info = print

import numpy as np
import pandas as pd
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select

In [9]:
class Immobilienscout24Scraper():
    """
    Scrape advert listings from immobilienscout24.at with a Selenium-driven Chrome browser.

    The scraper opens the start page, submits a search for a location and an
    offer type, and then walks the result pages by appending ``/seite-<n>`` to
    the search URL, collecting one dict per advert card.

    NOTE(review): the class names used as locators (e.g. 'WgMl2', 'eTh4Y') look
    auto-generated and will presumably change when the site is redeployed.
    NOTE(review): the ``find_element_by_*`` helpers used throughout belong to the
    Selenium 3 locator API; confirm the installed Selenium version provides them.
    """

    def __init__(self, path_to_chromedriver: str = '', headless: bool = True, sleep_time: float = 1):
        """
        Parameters
        ----------
        path_to_chromedriver: str
            Path to the chromedriver executable. When empty, Selenium is left
            to locate the driver on its own.
        headless: bool
            Run the browser headless
        sleep_time: float
            Sleep time in seconds between requests
        """
        self.BASE_URL = 'https://www.immobilienscout24.at/'
        self.AVAILABLE_OPTIONS = ['BUY', 'RENT', 'TEMPORARY_RENT']
        self.PATH_TO_CHROMEDRIVER = path_to_chromedriver
        self.HEADLESS = headless
        self.SLEEP_TIME = sleep_time

    def _accept_cookies(self, browser: "webdriver.Chrome") -> None:
        """
        Accept cookies if the consent form pops up.

        This is deliberately best effort: when the consent container is not
        present, or the interaction fails, the error is ignored.

        Parameters
        ----------
        browser: webdriver
            Instance of webdriver
        """
        try:
            root = browser.find_element_by_id('usercentrics-root')
            if 'Alle akzeptieren' in root.text:
                ac = ActionChains(browser)
                ac.move_to_element(root)\
                  .pause(self.SLEEP_TIME)\
                  .click()\
                  .perform()
                # Presumably 7 TAB presses move keyboard focus onto the
                # "Alle akzeptieren" button -- TODO confirm against the live form.
                for _ in range(7):
                    ac.send_keys(Keys.TAB).perform()
                ac.send_keys(Keys.ENTER).perform()
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
            # still propagate so a running scrape can be stopped.
            pass

    def _search(self, browser: "webdriver.Chrome", location: str, offer_type: str = 'BUY') -> None:
        """
        Write location and offer_type into the search fields and click the search button

        Parameters
        ----------
        browser: webdriver
            Instance of webdriver
        location: str
            Wanted location
        offer_type: str
            Wanted offer type
        """
        location_element = browser.find_element_by_id('downshift-0-input')
        location_element.clear()
        location_element.send_keys(location)
        location_element.send_keys(Keys.ENTER)

        select_element = Select(browser.find_element_by_class_name('pT3G8'))
        select_element.select_by_value(offer_type)

        browser.find_element_by_class_name('aZs4H').click()
        # Give the result page time to load before the caller reads it.
        time.sleep(self.SLEEP_TIME)

    def _load_url(self, browser: "webdriver.Chrome", url: str, max_retries: int = 1) -> None:
        """
        Load url, retrying with a doubling pause when the load fails

        Parameters
        ----------
        browser: webdriver
            Instance of webdriver

        url: str
            Url to load

        max_retries: int
            Maximum number of load attempts in total (the default of 1 means
            a single attempt without any retry)

        Raises
        ------
        Exception
            The error of the last attempt, once max_retries attempts have failed
        """
        logging.info(f'Loading: {url}')

        sleep_time = self.SLEEP_TIME
        attempts = 1
        while True:
            try:
                browser.get(url)
                break
            except Exception:
                if attempts >= max_retries:
                    raise
                # Back off: every failed attempt doubles the pause.
                sleep_time *= 2
                attempts += 1
                time.sleep(sleep_time)

        time.sleep(self.SLEEP_TIME)
        browser.maximize_window()

    def _init_browser(self) -> "webdriver.Chrome":
        """
        Initialize browser

        Returns
        -------
        webdriver.Chrome
            Chrome driver configured with the scraper's options
        """
        chrome_options = Options()
        chrome_options.add_argument("--disable-popup-blocking")
        chrome_options.add_argument("--incognito")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        if self.HEADLESS:
            chrome_options.add_argument("--headless")

        if not self.PATH_TO_CHROMEDRIVER:
            return webdriver.Chrome(options=chrome_options)

        return webdriver.Chrome(self.PATH_TO_CHROMEDRIVER, options=chrome_options)

    @staticmethod
    def _text_of(element, flatten: bool = False):
        """
        Return the stripped text of element, or NaN when element is None.

        With flatten=True, newlines inside the text are replaced by spaces.
        """
        if element is None:
            return np.nan
        text = element.text.strip()
        return text.replace('\n', ' ') if flatten else text

    def _parse_advert(self, advert, url_class: str, title_class: str, location_class: str) -> dict:
        """
        Build the record of a single advert card; missing pieces become NaN.
        """
        def first(class_name):
            # First matching child element, or None when there is no match.
            found = advert.find_elements_by_class_name(class_name)
            return found[0] if found else None

        link = first(url_class)
        # The first 'TGYEW' element is stored as rooms_text, the second as area_text.
        details = advert.find_elements_by_class_name('TGYEW')
        return {
            'url': link.get_attribute('href') if link is not None else np.nan,
            'title': self._text_of(first(title_class)),
            'location': self._text_of(first(location_class)),
            'rooms_text': self._text_of(details[0], flatten=True) if details else np.nan,
            'area_text': self._text_of(details[1], flatten=True) if len(details) > 1 else np.nan,
            'price_text': self._text_of(first('spDjI')),
            'contact': self._text_of(first('OA93m'), flatten=True),
        }

    def _collect_adverts(self, browser, card_class: str, url_class: str,
                         title_class: str, location_class: str) -> list:
        """
        Parse every advert card with class card_class on the current page.
        """
        new_data = []
        for advert in browser.find_elements_by_class_name(card_class):
            # The consent form is re-checked before every card is read.
            self._accept_cookies(browser)
            new_data.append(self._parse_advert(advert, url_class, title_class, location_class))
        return new_data

    def _get_adverts_v1(self, browser) -> list:
        """
        Gets the data from adverts version 1

        Parameters
        ----------
        browser: webdriver
            Instance of webdriver

        Returns
        -------
        list
            One dict per advert card found on the current page
        """
        return self._collect_adverts(browser, 'WgMl2', 'Skhfr', 'YOneO', 'gOIzK')

    def _get_adverts_v2(self, browser) -> list:
        """
        Gets the data from adverts version 2

        Parameters
        ----------
        browser: webdriver
            Instance of webdriver

        Returns
        -------
        list
            One dict per advert card found on the current page
        """
        # In this layout the same element supplies both the link and the title.
        return self._collect_adverts(browser, 'eTh4Y', 'NCjTr', 'NCjTr', 'u2szU')

    def scrape_adverts(self, location: str, offer_type: str, page_limit: int = 10):
        """
        Scrape adverts for required location and offer type

        Parameters
        ----------
        location: str
            Wanted location
        offer_type: str
            Wanted offer type, one of AVAILABLE_OPTIONS
        page_limit: int
            Maximum number of result pages to scrape

        Returns
        -------
        pd.DataFrame
            One row per advert plus an 'offer_type' column, or an empty
            DataFrame when nothing was found

        Raises
        ------
        ValueError
            If offer_type is not one of AVAILABLE_OPTIONS
        """
        if offer_type not in self.AVAILABLE_OPTIONS:
            raise ValueError(
                f'Offer type {offer_type} not supported! Please choose one from: '
                + ','.join(self.AVAILABLE_OPTIONS)
            )

        data = []
        with self._init_browser() as browser:
            self._load_url(browser, self.BASE_URL)
            self._accept_cookies(browser)
            self._search(browser, location, offer_type)
            self._accept_cookies(browser)

            search_url = browser.current_url
            for page_num in range(1, page_limit + 1):
                # Page 1 is already open after the search; later pages are
                # addressed by appending '/seite-<n>' to the search URL.
                if page_num > 1:
                    self._load_url(browser, search_url + f'/seite-{page_num}')

                new_data = (
                    self._get_adverts_v1(browser)
                    + self._get_adverts_v2(browser)
                )
                # A page without adverts marks the end of the results.
                if not new_data:
                    break

                # Store the page before the limit is checked, so the last
                # page within page_limit is not dropped.
                data += new_data

        if data:
            df = pd.DataFrame(data)
            df['offer_type'] = offer_type
            return df

        return pd.DataFrame()
            

In [10]:
# Run with a visible (non-headless) browser and a 5 second pause between requests.
scraper = Immobilienscout24Scraper(headless=False, sleep_time=5)

In [11]:
# Offer types that scrape_adverts accepts; any other value is rejected with an exception.
scraper.AVAILABLE_OPTIONS

['BUY', 'RENT', 'TEMPORARY_RENT']

In [14]:
# Searches to run, as (location, offer_type, page_limit) tuples that are unpacked
# positionally into scraper.scrape_adverts.
to_scrape = [
    ('Wien','BUY', 11),
    ('Wien','RENT', 11)
]

In [15]:
# Run every configured search and keep one result frame per search.
data = []
for search_args in to_scrape:
    scraped_frame = scraper.scrape_adverts(*search_args)
    data.append(scraped_frame)

Loading: https://www.immobilienscout24.at/
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-2
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-3
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-4
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-5
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-6
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-7
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-8
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-9
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-10
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-11
Loading: https://www.immobilienscout24.at/
Loading: https://www.immobilienscout24.at/regional/wien/wien/wohnung-mie

In [16]:
# Stack the per-search frames. Each frame carries its own 0..n-1 index, so the index is
# rebuilt here to keep row labels unique in the combined frame.
df_scraped = pd.concat(data, ignore_index=True)

In [20]:
# Preview the first rows of the combined scrape result.
df_scraped.head()

Unnamed: 0,url,title,location,rooms_text,area_text,price_text,contact,offer_type
0,https://www.immobilienscout24.at/expose/613b30...,Lemonie,"Albrechtskreithgasse 32, 1160 Wien",2 – 3 Zimmer,"42,41 – 46,04 m² Fläche","ab 229.193,39 €",Mario Häring 3SI Makler GmbH,BUY
1,https://www.immobilienscout24.at/expose/6319f8...,AM MÜHLWASSER - Lobaugasse 116 - PROVISIONSFREI,1220 Wien,2 – 3 Zimmer,"49,4 – 105,1 m² Fläche",ab 299.000 €,Dennis Bernard WOLF & SOHN Immobilien GmbH,BUY
2,https://www.immobilienscout24.at/expose/627249...,PARK & BLOOM - Exklusives Neubauprojekt am Ran...,1180 Wien,2 – 4 Zimmer,"47,86 – 189,23 m² Fläche",Preis auf Anfrage,Ing. Dominic Lorenz Lorenz Real Construct Immo...,BUY
3,https://www.immobilienscout24.at/expose/627379...,Garten–Städtchen–22 - Wohnen am Fliedergarten,"Marlen-Haushofer-Weg 2, 1220 Wien",3 – 4 Zimmer,"78,55 – 92,56 m² Fläche",ab 382.000 €,Mischek Bauträger Service GmbH Mischek Bauträg...,BUY
4,https://www.immobilienscout24.at/expose/627248...,"Purer Wohntraum im 15.Bezirk! Exklusive, perfe...",1150 Wien,2 – 3 Zimmer,"62,02 – 70,02 m² Fläche",ab 469.900 €,Ehsan Karimian Immocity Real Estate GmbH,BUY


In [28]:
# Unique advert locations to geocode. The scraper stores a missing location as NaN, and
# NaN != '' is True, so NaN has to be dropped explicitly before the empty-string filter.
locations = [loc for loc in df_scraped['location'].dropna().unique().tolist() if loc != '']

In [36]:
import requests 

class GeocodingApi:
    """
    Minimal client for the positionstack forward-geocoding endpoint.

    Parameters
    ----------
    key: str
        Access key sent as the ``access_key`` query parameter
    sleep_time: int
        Pause in seconds after every request
    timeout: float
        Seconds to wait for a response before the request is given up
    """

    def __init__(self, key: str, sleep_time: int = 1, timeout: float = 30):
        self.base_url = 'http://api.positionstack.com/v1/forward'
        self.key = key
        self.sleep_time = sleep_time
        self.timeout = timeout

    def _most_confident_result(self, response):
        """
        Return the candidate with the highest 'confidence' from the parsed response, or None.
        """
        payload = response.get('data') if isinstance(response, dict) else None
        if not isinstance(payload, list):
            return None
        # Keep only dict candidates: anything else (e.g. an empty list placeholder)
        # cannot be ranked by confidence.
        candidates = [item for item in payload if isinstance(item, dict)]
        if not candidates:
            return None
        # max() returns the first of equally confident candidates, like a
        # stable descending sort followed by [0].
        return max(candidates, key=lambda item: item.get('confidence') or 0)

    def get_geocoding_data(self, locations: list) -> pd.DataFrame:
        """
        Get geocoding data from the locations list

        Locations whose request fails, returns a non-200 status, or yields no
        usable candidate are logged/skipped and do not appear in the result.

        Parameters
        ----------
        locations: list
            List with all locations to geocode

        Returns
        -------
        pd.DataFrame
            One row per geocoded location (the most confident candidate) with
            the queried string in 'requested_location'; empty when nothing
            could be geocoded
        """
        data = []
        for location in locations:
            logging.info(f'Getting goecoding data for: {location}')
            params = {
                'access_key': self.key,
                'query': location
            }

            try:
                r = requests.get(self.base_url, params=params, timeout=self.timeout)
            except requests.RequestException as error:
                # One failed request must not throw away the results so far.
                logging.info(f'Something went wrong... Request failed: {error}')
                time.sleep(self.sleep_time)
                continue

            if r.status_code != 200:
                logging.info(f'Something went wrong... Status code is:{r.status_code}')
            else:
                try:
                    best = self._most_confident_result(r.json())
                except ValueError:
                    # The body was not valid JSON.
                    best = None
                if best is not None:
                    # Copy so the parsed response itself is left untouched.
                    record = dict(best)
                    record['requested_location'] = location
                    data.append(record)

            # Pause after every request, failed ones included, to respect the
            # configured request rate.
            time.sleep(self.sleep_time)

        if data:
            return pd.DataFrame(data)

        return pd.DataFrame()

In [37]:
import os

# Read the positionstack access key from the POSITIONSTACK_API_KEY environment variable so a
# real key never has to be stored in the notebook; the placeholder is only a fallback.
geo_api = GeocodingApi(os.environ.get('POSITIONSTACK_API_KEY', 'my_super_secret_key'))

In [38]:
# Geocode every unique location: one request per location, keeping the most confident
# candidate of each response.
df_geo = geo_api.get_geocoding_data(locations)

Getting goecoding data for: Albrechtskreithgasse 32, 1160 Wien
Getting goecoding data for: 1220 Wien
Getting goecoding data for: 1180 Wien
Getting goecoding data for: Marlen-Haushofer-Weg 2, 1220 Wien
Getting goecoding data for: 1150 Wien
Getting goecoding data for: Nordbahnanlage 4, 1210 Wien
Getting goecoding data for: Langobardenstraße 3, 1220 Wien
Getting goecoding data for: Schlachthausgasse, 1030 Wien
Getting goecoding data for: 1030 Wien
Getting goecoding data for: Einsiedlergasse 27, 1050 Wien
Getting goecoding data for: Hauptstraße 104, 1140 Wien
Getting goecoding data for: Donaufelder Straße 205, 1220 Wien
Getting goecoding data for: Maximilian-Reich-Weg 4, 1210 Wien
Getting goecoding data for: Attemsgasse 44, 1220 Wien
Getting goecoding data for: Schweizertalstraße, 1130 Wien
Getting goecoding data for: Donaufelder Straße 207, 1220 Wien
Getting goecoding data for: Dißlergasse 8, 1030 Wien
Getting goecoding data for: Petzvalgasse 4, 1040 Wien
Getting goecoding data for: 1140 

In [45]:
# Attach the geocoding columns to each advert by its location string. The left join keeps
# adverts without a geocoding result (their geo columns stay empty).
# NOTE(review): in the preview below several different locations share the same
# latitude/longitude (48.198674, 16.348388) -- confirm the geocoder resolved them correctly.
df_final = df_scraped.merge(df_geo, left_on='location', right_on='requested_location', how='left')

In [48]:
# Preview the adverts together with their geocoding columns.
df_final.head()

Unnamed: 0,url,title,location,rooms_text,area_text,price_text,contact,offer_type,latitude,longitude,...,region_code,county,locality,administrative_area,neighbourhood,country,country_code,continent,label,requested_location
0,https://www.immobilienscout24.at/expose/613b30...,Lemonie,"Albrechtskreithgasse 32, 1160 Wien",2 – 3 Zimmer,"42,41 – 46,04 m² Fläche","ab 229.193,39 €",Mario Häring 3SI Makler GmbH,BUY,48.220629,16.316295,...,WI,,Vienna,Wien,Dornbach,Austria,AUT,Europe,"Albrechtskreithgasse 32, Vienna, WI, Austria","Albrechtskreithgasse 32, 1160 Wien"
1,https://www.immobilienscout24.at/expose/6319f8...,AM MÜHLWASSER - Lobaugasse 116 - PROVISIONSFREI,1220 Wien,2 – 3 Zimmer,"49,4 – 105,1 m² Fläche",ab 299.000 €,Dennis Bernard WOLF & SOHN Immobilien GmbH,BUY,48.198674,16.348388,...,WI,,Vienna,Wien,,Austria,AUT,Europe,"Vienna, WI, Austria",1220 Wien
2,https://www.immobilienscout24.at/expose/627249...,PARK & BLOOM - Exklusives Neubauprojekt am Ran...,1180 Wien,2 – 4 Zimmer,"47,86 – 189,23 m² Fläche",Preis auf Anfrage,Ing. Dominic Lorenz Lorenz Real Construct Immo...,BUY,48.198674,16.348388,...,WI,,Vienna,Wien,,Austria,AUT,Europe,"Vienna, WI, Austria",1180 Wien
3,https://www.immobilienscout24.at/expose/627379...,Garten–Städtchen–22 - Wohnen am Fliedergarten,"Marlen-Haushofer-Weg 2, 1220 Wien",3 – 4 Zimmer,"78,55 – 92,56 m² Fläche",ab 382.000 €,Mischek Bauträger Service GmbH Mischek Bauträg...,BUY,48.198674,16.348388,...,WI,,Vienna,Wien,,Austria,AUT,Europe,"Vienna, WI, Austria","Marlen-Haushofer-Weg 2, 1220 Wien"
4,https://www.immobilienscout24.at/expose/627248...,"Purer Wohntraum im 15.Bezirk! Exklusive, perfe...",1150 Wien,2 – 3 Zimmer,"62,02 – 70,02 m² Fläche",ab 469.900 €,Ehsan Karimian Immocity Real Estate GmbH,BUY,48.198674,16.348388,...,WI,,Vienna,Wien,,Austria,AUT,Europe,"Vienna, WI, Austria",1150 Wien


In [49]:
# Write the merged result to an Excel file in the current working directory, without the index.
# NOTE(review): writing .xlsx presumably needs an Excel engine such as openpyxl installed -- confirm.
df_final.to_excel('immobilienscout_scraped.xlsx', index=False)