In [None]:
import time
import logging

logging.info = print

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

class SRealityScraper:
    """
    Selenium-based scraper for property adverts listed on sreality.cz.

    Typical usage is ``SRealityScraper().get_adverts(search_url)``: the advert
    links are first collected from every page of the search result and each
    advert is then opened to read its title, location, price and description.

    NOTE(review): elements are located with the ``find_element_by_*`` helpers
    and the chromedriver path is passed positionally to ``webdriver.Chrome``.
    Both look like the Selenium 3 API -- confirm the pinned Selenium version
    before upgrading the dependency.
    """

    # Column order of the frame returned by ``get_adverts_details``; also used
    # for the empty frame so that callers can always select these columns.
    _DETAIL_COLUMNS = ['url', 'title', 'location', 'price', 'description']

    def __init__(self, path_to_chromedriver: str = '', headless: bool = True, sleep_time: float = 3):
        """
        Parameters
        ----------
        path_to_chromedriver: str
            Path to the chromedriver executable. When empty, ``webdriver.Chrome``
            is created without an explicit path.
        headless: bool
            Run the browser in headless mode
        sleep_time: float
            Sleep time in seconds between requests
        """
        self.PATH_TO_CHROMEDRIVER = path_to_chromedriver
        self.HEADLESS = headless
        self.SLEEP_TIME = sleep_time

    # The ``browser`` annotations below are quoted on purpose: ``webdriver`` is
    # a module, the concrete type is ``webdriver.Chrome``, and a string
    # annotation is not evaluated when the class is defined.
    def _load_url(self, browser: 'webdriver.Chrome', url: str, max_retries: int = 1) -> None:
        """
        Load url, retrying with a doubling pause when loading fails

        Parameters
        ----------
        browser: webdriver.Chrome
            Instance of webdriver

        url: str
            Url to load

        max_retries: int
            Total number of load attempts. With the default of 1 the first
            failure is re-raised immediately.

        Raises
        ------
        Exception
            Whatever ``browser.get`` raised on the last allowed attempt
        """
        # Pre-formatted on purpose: this file rebinds ``logging.info`` to
        # ``print``, so lazy %-style arguments would be printed verbatim.
        logging.info(f'Loading: {url}')

        backoff = self.SLEEP_TIME
        attempt = 1
        while True:
            try:
                browser.get(url)
                break
            except Exception:
                # ``>=`` (not ``==``) so that max_retries < 1 cannot loop forever.
                if attempt >= max_retries:
                    raise
                backoff *= 2
                attempt += 1
                time.sleep(backoff)

        # Pause after the load before touching the page.
        time.sleep(self.SLEEP_TIME)
        browser.maximize_window()

    def _accept_cookies(self, browser: 'webdriver.Chrome') -> None:
        """
        Accept cookies if form pops up

        Best effort: any failure (typically the dialog not being present) is
        logged at debug level and otherwise ignored.

        Parameters
        ----------
        browser: webdriver.Chrome
            Instance of webdriver
        """
        try:
            ac = ActionChains(browser)
            ac.move_to_element(browser.find_element_by_class_name('szn-cmp-dialog-container'))\
              .pause(3)\
              .click()\
              .perform()
            # Keyboard navigation: nine TABs followed by ENTER.
            # NOTE(review): presumably this lands on the consent button --
            # the count depends on the dialog layout, verify if it changes.
            for _ in range(9):
                ac.send_keys(Keys.TAB).perform()
            ac.send_keys(Keys.ENTER).perform()
        except Exception as exc:
            logging.debug('Cookie dialog was not handled: %s', exc)

    def _init_browser(self) -> 'webdriver.Chrome':
        """
        Initialize browser

        Returns
        -------
        webdriver.Chrome
            Chrome webdriver configured with the scraper's options
        """
        chrome_options = Options()
        chrome_options.add_argument("--disable-popup-blocking")
        chrome_options.add_argument("--incognito")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--deny-permission-prompts')

        if self.HEADLESS:
            chrome_options.add_argument("--headless")

        if self.PATH_TO_CHROMEDRIVER == '':
            return webdriver.Chrome(options=chrome_options)

        return webdriver.Chrome(self.PATH_TO_CHROMEDRIVER, options=chrome_options)

    def _next_page(self, browser: 'webdriver.Chrome') -> bool:
        """
        Click on the next page button if exists

        Parameters
        ----------
        browser: webdriver.Chrome
            Instance of webdriver

        Returns
        -------
        bool
            True when an enabled next-page button was found and clicked,
            otherwise False (also when the lookup or the click fails)
        """
        try:
            next_btn = browser.find_element_by_class_name('paging-next')

            if 'disabled' not in next_btn.get_attribute('class'):
                next_btn.click()
                time.sleep(self.SLEEP_TIME)
                return True
        except Exception as exc:
            # A missing button ends the paging; it is not an error.
            logging.debug('No usable next-page button: %s', exc)

        return False

    @staticmethod
    def _text_or_nan(browser: 'webdriver.Chrome', class_name: str):
        """
        Return the stripped text of the first element with the given class

        A single ``find_elements`` lookup is used, so a missing element gives
        ``np.nan`` instead of raising.
        """
        matches = browser.find_elements_by_class_name(class_name)
        return matches[0].text.strip() if matches else np.nan

    def get_adverts_urls(self, url: str) -> list[str]:
        """
        Returns urls of the adverts found on all pages of a search result

        Parameters
        ----------
        url: str
            Url with adverts to load

        Returns
        -------
        list
            Unique advert urls in the order in which they were first seen
        """
        urls = []
        with self._init_browser() as browser:
            self._load_url(browser, url)
            self._accept_cookies(browser)

            while True:
                for card in browser.find_elements_by_class_name('property'):
                    links = card.find_elements_by_tag_name('a')
                    if links:
                        urls.append(links[0].get_attribute('href'))

                if not self._next_page(browser):
                    break

                time.sleep(self.SLEEP_TIME)

        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(urls))

    def get_adverts_details(self, urls: list[str]) -> pd.DataFrame:
        """
        Open every advert and read its details

        Parameters
        ----------
        urls: list
            Urls of the adverts to load

        Returns
        -------
        DataFrame
            One row per url with the columns url, title, location, price and
            description. Fields missing on the page are NaN. The columns are
            present even when ``urls`` is empty.
        """
        if not urls:
            # Nothing to visit: do not start a browser.
            return pd.DataFrame(columns=self._DETAIL_COLUMNS)

        rows = []
        with self._init_browser() as browser:
            for url in urls:
                self._load_url(browser, url)
                self._accept_cookies(browser)
                rows.append(
                    {
                        'url': url,
                        'title': self._text_or_nan(browser, 'name'),
                        'location': self._text_or_nan(browser, 'location'),
                        'price': self._text_or_nan(browser, 'norm-price'),
                        'description': self._text_or_nan(browser, 'description'),
                    }
                )

        return pd.DataFrame(rows, columns=self._DETAIL_COLUMNS)

    def get_adverts(self, url: str) -> pd.DataFrame:
        """
        Scrape all the adverts of a search result

        Parameters
        ----------
        url: str
            Url with adverts to load

        Returns
        -------
        DataFrame
            DataFrame with scraped data
        """
        urls = self.get_adverts_urls(url)

        return self.get_adverts_details(urls)

In [None]:
# Create the scraper with a visible browser window (headless=False); the
# chromedriver path and the sleep time keep their defaults.
scraper = SRealityScraper(headless=False)

In [None]:
# Search-result page used as the starting point for the scrape.
# NOTE(review): path and query presumably select houses for sale
# ("prodej/domy") in the municipality of Ivančice -- confirm in a browser.
url = 'https://www.sreality.cz/hledani/prodej/domy?region=obec%20Ivančice&region-id=5762&region-typ=municipality&vzdalenost=1'

In [None]:
# Collect the advert links from the search result, then visit each advert;
# the result is a DataFrame with one row per advert.
df = scraper.get_adverts(url)

In [None]:
# Show only the adverts for which every scraped field was found. The trailing
# commented-out call would export that filtered frame to an Excel file.
df.dropna(subset=['title','location','price','description'])#.to_excel('sreality.xlsx', index=False)