In [1]:
import ast
import re
from datetime import datetime

import undetected_chromedriver as uc


In [2]:
# Address of the aruodas.lt listing page that this notebook scrapes.
url = 'https://www.aruodas.lt/butai-vilniuje-santariskese-dangerucio-g-naujas-pilnai-irengtas-butas-i-kuri-gali-1-3305167/'

In [3]:
def get_html(url):
    """
    Load ``url`` in a headless Chrome session and return the page source.

    The driver is used as a context manager, so its cleanup is left to the
    driver's own ``__exit__`` once the page source has been read.
    """
    chrome_options = uc.ChromeOptions()
    chrome_options.add_argument('--headless')
    with uc.Chrome(options=chrome_options) as browser:
        browser.get(url)
        return browser.page_source

In [4]:
def extract_element(tree, class_name, index=0):
    """
    Extract the leading text of an HTML element with the given class name.

    Args:
        tree: Parsed document exposing ``tree.body.find_class``.
        class_name: Class name to look up.
        index: Which of the matching elements to read (default: the first).
            Negative values count from the end, as with list indexing.

    Returns:
        The element's ``.text`` with surrounding whitespace removed, or an
        empty string when no element matches, when ``index`` is out of range,
        or when the element has no leading text.
    """
    elements = tree.body.find_class(class_name)
    try:
        text = elements[index].text
    except IndexError:
        # Covers both "no match at all" and "fewer matches than index".
        return ""
    # lxml reports ``.text`` as None when an element starts with a child
    # node (or is empty), so guard before stripping.
    return (text or "").strip()

def extract_table(tree):
    """
    Extract the details table (first ``obj-details`` element) from the HTML tree.

    Names come from the ``dt`` children (full text content, stripped, empty
    names dropped); the i-th remaining name is paired with the i-th ``dd``
    child, whose leading text is stripped and used as the value.

    Returns:
        A dictionary mapping each name to its value. Returns an empty
        dictionary when the page has no ``obj-details`` element. Names
        without a matching ``dd`` are ignored.
    """
    detail_blocks = tree.body.find_class('obj-details')
    if not detail_blocks:
        return {}
    details = detail_blocks[0]

    stripped_names = (dt.text_content().strip() for dt in details.findall('dt'))
    table_names = [name for name in stripped_names if name != '']
    table_elements = details.findall('dd')

    # ``dd.text`` is None when the cell starts with a child element, so
    # fall back to an empty string instead of crashing on ``.strip()``.
    return {
        name: (dd.text or '').strip()
        for name, dd in zip(table_names, table_elements)
    }

def extract_ad_stats(tree):
    """
    Read the ad statistics from the first ``obj-stats simple`` element.

    The ``dt`` entries of its ``dl`` child supply the keys and the ``dd``
    entries supply the values, paired by position.
    Returns the resulting dictionary.
    """
    stats_list = tree.find_class('obj-stats simple')[0].find('dl')
    labels = text_strip_list(stats_list.findall('dt'))
    figures = text_strip_list(stats_list.findall('dd'))
    return {label: figures[position] for position, label in enumerate(labels)}


def text_strip_list(element_list):
    """Return the whitespace-stripped ``text_content()`` of each element, in order."""
    stripped_texts = []
    for node in element_list:
        stripped_texts.append(node.text_content().strip())
    return stripped_texts


def while_replace(string):
    """Collapse every run of consecutive spaces in ``string`` into a single space."""
    collapsed = string.replace('  ', ' ')
    # Keep halving runs of spaces until a pass changes nothing.
    while collapsed != string:
        string = collapsed
        collapsed = string.replace('  ', ' ')
    return string

def extract_thumbs(tree):
    """
    Collect the distinct ``href`` values of all ``link-obj-thumb`` elements.

    Elements without an ``href`` attribute are left out.
    Returns a set of the remaining values.
    """
    hrefs = {node.get('href') for node in tree.find_class('link-obj-thumb')}
    return hrefs - {None}

def extract_photos(tree):
    """
    Return the thumbnail links that point at photos.

    A link counts as a photo when its URL contains ``'img.dgn'``.

    Returns:
        A list of photo URLs in sorted order. ``extract_thumbs`` yields a
        set, whose iteration order is not stable between interpreter runs,
        so the result is sorted to keep re-runs of the notebook reproducible.
    """
    return sorted(url for url in extract_thumbs(tree) if 'img.dgn' in url)


def extract_coordinates(tree):
    """
    Extract the coordinates embedded in the listing's Google Maps link.

    Looks through the thumbnail links for one containing ``'maps.google'``
    and parses the text after the second ``'='`` in that URL.

    Returns:
        The parsed literal (presumably a ``(lat, lon)`` tuple - confirm
        against a live page), or None when no maps link is present.

    Raises:
        IndexError: if the maps URL contains fewer than two ``'='``.
        ValueError, SyntaxError: if the text is not a plain Python literal.
    """
    for url in extract_thumbs(tree):
        if 'maps.google' in url:
            # The URL is scraped from a web page, i.e. untrusted input.
            # ``ast.literal_eval`` only accepts literals (numbers, tuples, ...)
            # and cannot execute code, unlike the ``eval`` used previously.
            return ast.literal_eval(url.split('=')[2])
    return None
        

def extract_number(tree):
    """
    Extract the contact phone number and the broker flag.

    The ``phone_item_0`` element is tried first; when it yields a number the
    flag is True. Otherwise the ``phone`` element is used and the flag is
    False.

    ``extract_element`` returns an empty string (rather than raising) for a
    missing element, so an empty result is treated the same way as a lookup
    error: both trigger the fallback.

    Returns:
        A ``(phone, broker)`` tuple.
    """
    try:
        phone = extract_element(tree, 'phone_item_0')
    except Exception as e:
        # Best-effort: report the problem and fall through to the fallback.
        print(e)
        phone = ""

    if phone:
        return phone, True
    return extract_element(tree, 'phone'), False

def extract_address(tree):
    """
    Extract the address from the listing header.

    Reads the text of the first ``obj-header-text`` element and drops
    everything from the ``", <number> kamb"`` suffix onwards.

    Returns:
        The address string, or an empty string when the header is missing.
    """
    headers = tree.find_class('obj-header-text')
    if not headers:
        return ""
    header_text = headers[0].text_content().strip()
    # Raw string: ``\d`` in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.split(r', \d+ kamb', header_text)[0]



In [5]:
# Fetch the listing's HTML with get_html, which drives a headless Chrome session.
source = get_html(url)

In [6]:
# parse html with lxml
from lxml import html

In [7]:
tree = html.fromstring(source)

# Look the phone number up once; the same call yields both the number and
# the broker flag, so calling it per field would repeat the work (and any
# error output) for no benefit.
phone, is_broker = extract_number(tree)

property_info = {
    'Price': extract_element(tree, 'price-eur'),
    'Address': extract_address(tree),
    'Phone': phone,
    'Broker': is_broker,
    'Coordinates': extract_coordinates(tree),
    'Date_scraped': datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
    'Description': extract_element(tree, 'collapsedText'),
    'Misc': text_strip_list(tree.body.find_class('special-comma')),
    'Photos': extract_photos(tree)
}

ad_table = extract_table(tree)
ad_stats = extract_ad_stats(tree)

# Table and stats entries are merged last, so they win on any key clash.
property_info.update(ad_table)
property_info.update(ad_stats)

# Drop table rows that are not wanted in the final record.
# NOTE(review): presumably their contents are already captured under
# 'Misc' via the 'special-comma' elements - confirm.
for unwanted_key in ('Ypatybės:', 'Papildomos patalpos:', 'Papildoma įranga:', 'Apsauga:'):
    property_info.pop(unwanted_key, None)


''