In [4]:
import datetime
import logging
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


# Timeout handed to every WebDriverWait in analyse_url (seconds, per Selenium's API).
SCRAPING_TIMEOUT = 30
# Text file opened by read_urls(); it is read line by line, one URL per line.
URL_FILE_PATH = 'C:/dev/re_kz/bi_urls.txt'
# NOTE(review): 'WDM' is presumably webdriver_manager's logger. NOTSET on a
# non-root logger means "defer to the parent logger's level", it does not mute
# the logger — confirm whether silencing the output was the intent.
logging.getLogger('WDM').setLevel(logging.NOTSET)

def get_selenium_scraping_options():
    """Build the Chrome ``Options`` object used for each scraping session.

    The browser is started headless with the sandbox and /dev/shm usage
    disabled, plus a set of switches that presumably make the session look
    less like an automated one (TODO confirm they are still needed).

    Returns:
        Options: a freshly configured Chrome options instance.
    """
    chrome_options = Options()

    # Command-line switches, applied in this exact order.
    command_line_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        'start-maximized',
    )
    for flag in command_line_flags:
        chrome_options.add_argument(flag)

    # Experimental ChromeDriver options, applied after the switches.
    experimental_settings = (
        ('excludeSwitches', ['enable-automation']),
        ('useAutomationExtension', False),
    )
    for option_name, option_value in experimental_settings:
        chrome_options.add_experimental_option(option_name, option_value)

    return chrome_options


def get_webdriver():
    """Launch a Chrome WebDriver configured for scraping.

    The chromedriver binary path comes from ``ChromeDriverManager().install()``,
    the browser options come from ``get_selenium_scraping_options()``, and the
    user agent is overridden through the Chrome DevTools Protocol.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for shutting it down.
    """
    chrome_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(
        service=chrome_service,
        options=get_selenium_scraping_options(),
    )
    # Replace the default user agent with a fixed desktop Chrome string.
    browser.execute_cdp_cmd(
        'Network.setUserAgentOverride',
        {
            'userAgent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36',
        },
    )
    return browser

def _wait_for_field_text(driver, label):
    """Return the text of the first ``div`` following the ``div`` containing ``label``.

    Waits up to ``SCRAPING_TIMEOUT`` for the element to be present in the DOM;
    ``WebDriverWait.until`` raises ``TimeoutException`` if it never appears.
    """
    locator = (
        By.XPATH,
        "//div[contains(text(),'{}')]//following::div[1]".format(label),
    )
    element = WebDriverWait(driver, SCRAPING_TIMEOUT).until(
        EC.presence_of_element_located(locator)
    )
    return element.text


def analyse_url(url):
    """Scrape one listing page and return its attributes as a one-row DataFrame.

    Args:
        url: address of the listing page to open.

    Returns:
        pandas.DataFrame with a single row and the columns
        ``Floor``, ``Surface``, ``Price``, ``Entrance`` and ``Link``.
        ``Price`` is an ``int``; the other scraped values are strings.

    Raises:
        selenium.common.exceptions.TimeoutException: a field did not appear
            within ``SCRAPING_TIMEOUT``.
        ValueError: the price text could not be converted to ``int``.
    """
    driver = get_webdriver()
    try:
        driver.get(url)
        # Price text carries a currency suffix and thousands separators.
        price_text = _wait_for_field_text(driver, 'Стоимость')
        price = int(price_text.replace(' ₸', '').replace(",", ""))
        etage = _wait_for_field_text(driver, 'Этаж')
        surface = _wait_for_field_text(driver, 'Площадь').replace(' м²', '')
        entrance = _wait_for_field_text(driver, 'Подъезд')
    finally:
        # Always shut the browser down: a new driver is started per URL, so
        # without this every call would leave a Chrome process running.
        driver.quit()
    return pd.DataFrame(
        [[etage, surface, price, entrance, url]],
        columns=['Floor', 'Surface', 'Price', 'Entrance', 'Link'],
    )

def read_urls(path=None):
    """Read the listing URLs, one per line, from a text file.

    Surrounding whitespace (including the line terminator) is stripped from
    each line and blank lines are skipped, so the last URL stays intact even
    when the file does not end with a newline.

    Args:
        path: file to read. Defaults to ``URL_FILE_PATH`` when omitted.

    Returns:
        list[str]: the URLs in file order.
    """
    if path is None:
        path = URL_FILE_PATH
    with open(path) as url_file:
        stripped_lines = (line.strip() for line in url_file)
        return [url for url in stripped_lines if url]

def analyze_urls():
    """Scrape every URL from ``read_urls()`` and save the combined table as CSV.

    Each URL is processed by ``analyse_url``; the one-row results are
    concatenated once at the end instead of growing a DataFrame inside the
    loop. The table is written to ``C:/dev/re_kz/<YYYY-MM-DD>bi_analysis.csv``.

    Returns:
        pandas.DataFrame with the columns ``Floor``, ``Surface``, ``Price``,
        ``Entrance`` and ``Link`` and a fresh 0..n-1 index. It is empty, with
        the same columns, when there are no URLs.
    """
    columns = ['Floor', 'Surface', 'Price', 'Entrance', 'Link']
    frames = [analyse_url(url) for url in read_urls()]
    if frames:
        result = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list, so build the empty table directly.
        result = pd.DataFrame(columns=columns)
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    # The date is joined to the file name without a separator; kept as-is so
    # the names of produced files do not change.
    result.to_csv('C:/dev/re_kz/' + today + 'bi_analysis.csv')
    return result

In [3]:
# Display the list of URLs read from URL_FILE_PATH.
read_urls()

['https://bi.group/ru/flats?placementUUID=28c044c0-5978-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=0996a343-5978-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=b113b433-614e-11ec-a81f-001dd8b72708&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=9db21e85-614e-11ec-a81f-001dd8b72708&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=21d2558c-5980-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=19de357c-5980-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=ebf66800-597f-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=078a1225-5980-11ec-a829-001

In [5]:
# Scrape all URLs (this also writes the CSV file) and display the resulting table.
result = analyze_urls()
result

















Unnamed: 0,Floor,Surface,Price,Entrance,Link
0,7 из 12,40.91,25314739,5,https://bi.group/ru/flats?placementUUID=28c044...
1,4 из 12,40.91,25314739,5,https://bi.group/ru/flats?placementUUID=0996a3...
2,6 из 12,41.39,25611759,3,https://bi.group/ru/flats?placementUUID=b113b4...
3,4 из 12,41.39,25611759,3,https://bi.group/ru/flats?placementUUID=9db21e...
4,7 из 9,46.22,26229850,6,https://bi.group/ru/flats?placementUUID=21d255...
5,6 из 9,46.22,26229850,6,https://bi.group/ru/flats?placementUUID=19de35...
6,6 из 9,42.82,26272340,6,https://bi.group/ru/flats?placementUUID=ebf668...
7,4 из 9,46.22,26484060,6,https://bi.group/ru/flats?placementUUID=078a12...
8,5 из 9,46.24,26495520,7,https://bi.group/ru/flats?placementUUID=339491...
9,7 из 9,45.81,28106863,4,https://bi.group/ru/flats?placementUUID=cc66d4...


In [3]:
# Same call as the earlier read_urls() cell: display the URLs read from URL_FILE_PATH.
read_urls()

['https://bi.group/ru/flats?placementUUID=28c044c0-5978-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=0996a343-5978-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=b113b433-614e-11ec-a81f-001dd8b72708&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=9db21e85-614e-11ec-a81f-001dd8b72708&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=21d2558c-5980-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=19de357c-5980-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=ebf66800-597f-11ec-a829-001dd8b726aa&realEstateUUID=c68f11e7-48d1-11eb-a83d-00155d106579',
 'https://bi.group/ru/flats?placementUUID=078a1225-5980-11ec-a829-001

In [5]:
# Scratch check: current month number plus one.
# NOTE(review): this evaluates to 13 when the current month is December —
# confirm whether a "next month" value with wrap-around was intended.
today = datetime.datetime.today()
today.month+1

10