In [1]:
import undetected_chromedriver as uc
import re
from datetime import datetime
from lxml import html
from bs4 import BeautifulSoup

In [2]:
from scraping_tools import get_html, scrape_property
from scraping_tools import get_property_links


In [3]:
import os

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# SECURITY: the connection string (user name + password) must never be stored
# in the notebook. It is read from the MONGODB_URI environment variable, e.g.
#   mongodb+srv://<user>:<password>@<host>/?retryWrites=true&w=majority
# NOTE(review): a password was previously committed in this cell; it should be
# rotated, because it remains visible in the notebook's history.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string"
    )

# Create a new client that talks to the server using server API version '1'
client = MongoClient(uri, server_api=ServerApi('1'))

# Database and collection handles used by the rest of the notebook
db = client['Real-Estate']
collection = db['butai/kaune']

In [4]:
from typing import Tuple, Dict, Any, List

def preprocess_property(property: Dict) -> Dict:
    """Normalise one scraped listing: English field names and typed values.

    A new dict is built and returned; the input dict itself is not modified.

    Processing steps:
      * Keys listed in ``rename_dict`` are renamed to their English names,
        every other key is kept unchanged.
      * 'Price' (e.g. ``'245 300 €'``) becomes an ``int``.
      * 'Area' (e.g. ``'54,3 m²'``) becomes a ``float``.
      * 'Number_of_floors', 'Number_of_rooms', 'Saved' and
        'Distance_to_water_reservoir' become ``int``.
      * 'Year' becomes an ``int``. When the value is not a plain integer, the
        first 4-digit number in it is used as 'Year' and, if present, the
        second one is stored as 'Renovation_year'. When no 4-digit number is
        found, 'Year' is set to ``None``.
      * 'Viewed' keeps only the number before the first ``'/'``, as ``int``.
      * 'Uploaded', 'Edited' and 'Active_until' (``YYYY-MM-DD``) and
        'Date_scraped' (``DD/MM/YYYY HH:MM:SS``) become ``datetime`` objects.
      * 'City' is added as the part of 'Address' before the first comma.
      * 'Reserved' becomes ``True`` for any non-empty value, else ``False``.

    Args:
        property: Raw listing as produced by the scraper; only the keys that
            are present are converted.

    Returns:
        The renamed and converted listing.

    Raises:
        ValueError: If a present value is not in the format expected by its
            conversion (``int``, ``float`` or ``datetime.strptime`` failure).
    """
    rename_dict = {
        'Namo_numeris': 'House_number',
        'Plotas': 'Area',
        'Kambarių_sk.': 'Number_of_rooms',
        'Aukštas': 'Floor',
        'Aukštų_sk.': 'Number_of_floors',
        'Metai': 'Year',
        'Pastato_tipas': 'Building_type',
        'Šildymas': 'Heating',
        'Įrengimas': 'Furnishing',
        'Pastato_energijos_suvartojimo_klasė': 'Energy_consumption_class',
        'Nuoroda': 'Link',
        'Įdėtas': 'Uploaded',
        'Redaguotas': 'Edited',
        'Aktyvus_iki': 'Active_until',
        'Įsiminė': 'Saved',
        'Peržiūrėjo': 'Viewed',
        'Sklypo_plotas': 'Plot_area',
        'Namo_tipas': 'House_type',
        'Artimiausias_vandens_telkinys': 'Nearest_water_reservoir',
        'Iki_vandens_telkinio_(m)': 'Distance_to_water_reservoir',
    }
    # Rename keys found in rename_dict, keep all other keys as they are.
    property = {rename_dict.get(key, key): value for key, value in property.items()}

    # Strip the currency symbol and thousands separators, convert to int.
    if 'Price' in property:
        property['Price'] = int(property['Price'].replace(' €', '').replace(' ', ''))

    # Strip the unit and use '.' as decimal separator, convert to float.
    if 'Area' in property:
        property['Area'] = float(property['Area'].replace(' m²', '').replace(',', '.'))

    # Fields whose value is already a plain integer literal.
    for int_field in ('Number_of_floors', 'Number_of_rooms'):
        if int_field in property:
            property[int_field] = int(property[int_field])

    if 'Year' in property:
        try:
            property['Year'] = int(property['Year'])
        except ValueError:
            # Not a plain integer: pick the 4-digit numbers out of the text.
            years = re.findall(r'\d{4}', property['Year'])
            # First match is the build year; None when there is no match at all.
            property['Year'] = int(years[0]) if years else None
            # A second match is optional, so only index it when it exists.
            if len(years) > 1:
                property['Renovation_year'] = int(years[1])

    # Only the part before the first '/' is kept.
    if 'Viewed' in property:
        property['Viewed'] = int(property['Viewed'].split('/')[0])

    if 'Saved' in property:
        property['Saved'] = int(property['Saved'])

    # Convert ISO-style dates to datetime.
    for date_field in ('Uploaded', 'Edited', 'Active_until'):
        if date_field in property:
            property[date_field] = datetime.strptime(property[date_field], '%Y-%m-%d')

    if 'Date_scraped' in property:
        property['Date_scraped'] = datetime.strptime(property['Date_scraped'], '%d/%m/%Y %H:%M:%S')

    # The address starts with the city / municipality, followed by a comma.
    if 'Address' in property:
        property['City'] = property['Address'].split(',')[0]

    # Any non-empty marker text means the listing is reserved.
    if 'Reserved' in property:
        property['Reserved'] = property['Reserved'] != ''

    if 'Distance_to_water_reservoir' in property:
        property['Distance_to_water_reservoir'] = int(property['Distance_to_water_reservoir'])

    return property

def get_thumbnail_links(url: str) -> List[str]:
    """Collect the thumbnail image sources found on a listing page.

    The page at ``url`` is fetched with ``get_html`` and parsed; for every
    ``<div class="list-photo-v2">`` the ``src`` attribute of its first
    ``<img>`` is taken.

    Args:
        url: Address of the listing page.

    Returns:
        The ``src`` values, one per matching ``div``, in the order the
        parser returns the ``div`` elements.
    """
    page_source = get_html(url)
    parsed_page = BeautifulSoup(page_source, 'html.parser')

    links = []
    for container in parsed_page.find_all('div', class_='list-photo-v2'):
        image = container.find('img')
        links.append(image['src'])
    return links

def get_max_page(url: str) -> int:
    """Return the highest page number offered by the pagination of a page.

    The page at ``url`` is fetched with ``get_html`` and every
    ``<a class="page-bt">`` element is inspected. Only buttons whose text,
    after trimming whitespace, consists purely of digits are treated as page
    numbers; other buttons (arrows, ellipses, mixed labels) are ignored
    instead of making ``int()`` fail.

    Args:
        url: Address of a listing page that may contain pagination buttons.

    Returns:
        The largest page number found, or ``1`` when the page has no numbered
        pagination buttons (for example a single-page result).
    """
    page_source = get_html(url)
    soup = BeautifulSoup(page_source, 'html.parser')

    page_numbers = []
    for button in soup.find_all('a', class_='page-bt'):
        label = button.text.strip()
        # Accept purely numeric labels only.
        if re.fullmatch(r'\d+', label):
            page_numbers.append(int(label))

    # default=1 avoids the ValueError that max() raises on an empty sequence.
    return max(page_numbers, default=1)


In [5]:
import re

def extract_ad_id(string: str) -> str:
    """Find the first ad id inside ``string``.

    An ad id is one digit, a hyphen and seven digits (for example
    ``2-1502898``).

    Args:
        string: Text to search, typically a listing link.

    Returns:
        The first id found, or ``None`` when the text contains no id.
    """
    first_match = re.search(r"\d-\d{7}", string)
    if first_match is None:
        return None
    return first_match.group(0)
    


In [6]:
def save_property(property: Dict) -> None:
    """Placeholder for persisting a listing; currently it only prints it.

    TODO: upload the listing to MongoDB and return the id of the uploaded
    property.

    Args:
        property: The listing to store.
    """
    print(property)


from typing import Set
def get_scraped_properties() -> Set:
    """Return the identifiers of listings that were already scraped.

    Placeholder: nothing is read from MongoDB yet, so a fixed set of dummy
    values is returned. Every call builds a new set.

    Returns:
        A set containing the placeholder values ``'test'`` and ``'testas'``.
    """
    # TODO: get all properties from MongoDB and return them
    placeholder_ids = {'test', 'testas'}
    return placeholder_ids

class Scraper:
    """Walks the listing pages of a category and processes every new ad.

    Attributes:
        max_page: Page count of the category resolved most recently
            ('butai' right after construction).
        scraped_ids: Ids that were already handled; ads with these ids are
            skipped and every processed ad id is added to it.
    """

    def __init__(self):
        self.max_page = get_max_page('https://www.aruodas.lt/butai/')
        # Page counts per category, so each category is looked up only once.
        self._max_pages = {'butai': self.max_page}
        self.scraped_ids = get_scraped_properties()

    def _max_page_for(self, ad_type: str) -> int:
        """Return the page count of ``ad_type``, fetching it on first use."""
        if ad_type not in self._max_pages:
            self._max_pages[ad_type] = get_max_page(f'https://www.aruodas.lt/{ad_type}/')
        return self._max_pages[ad_type]

    def scrape(self, ad_type: str = 'butai'):
        """Scrape, preprocess and save every not-yet-seen ad of a category.

        Args:
            ad_type: Category path segment used in the listing URLs.
        """
        # The page count differs per category, so it must not be reused from
        # the 'butai' lookup done in __init__.
        self.max_page = self._max_page_for(ad_type)

        for page in range(1, self.max_page + 1):
            print(f'Page {page}/{self.max_page}')
            page_url = f'https://www.aruodas.lt/{ad_type}/puslapis/{page}/'
            property_links = list(get_property_links(page_url))
            property_ids = [extract_ad_id(link) for link in property_links]
            property_thumbs = get_thumbnail_links(page_url) # might be useful to combine into one function

            # Thumbnails are paired with ads by position, which is only
            # meaningful when both lists have the same length.
            thumbs_match = len(property_thumbs) == len(property_ids)
            if not thumbs_match:
                print(f'Page {page}: Thumbs and ids do not match')

            for i, (link, property_id) in enumerate(zip(property_links, property_ids)):
                # extract_ad_id returns None for links without an ad id;
                # skip them instead of requesting '.../None/'. The entry is
                # kept in the list so thumbnail positions stay aligned.
                if property_id is None:
                    print(f'Page {page}: no ad id found in {link}')
                    continue

                if property_id in self.scraped_ids:
                    continue

                url = f'https://www.aruodas.lt/{property_id}/'
                print(f'Scraping: {url}')

                if thumbs_match:
                    listing = scrape_property(url, Thumbnail=property_thumbs[i])
                else:
                    listing = scrape_property(url)

                listing = preprocess_property(listing)
                save_property(listing)

                self.scraped_ids.add(property_id)


In [7]:
# Listing categories, used as the path segment of the listing URLs.
types = [
    'butai',
    'namai',
    'butu-nuoma',
    'namu-nuoma',
    'patalpos',
    'patalpu-nuoma',
]

scraper = Scraper()

# Crawl the second category in the list ('namai').
scraper.scrape(types[1])

Page 1/302
Scraping: https://www.aruodas.lt/2-1502898/
{'Price': 245300, 'Address': 'Vilniaus r. sav., Balandiškių k., Gervių g., mūrinis sublokuotas namas', 'Phone': '+370 661 02882', 'Broker': True, 'Coordinates': (54.733656, 25.150875), 'Reserved': False, 'Date_scraped': datetime.datetime(2023, 6, 18, 15, 22, 54), 'Description': '', 'Misc': ['Kraštinis sklypas', 'Greta miško', 'Asfaltuotas privažiavimas', 'Elektra', 'Rūbinė', 'Terasa', 'Šildomos grindys', 'Plastikiniai vamzdžiai', 'Aptverta teritorija', 'Šarvuotos durys'], 'Photos': ['https://aruodas-img.dgn.lt/object_62_110064807/vilniaus-r-sav-balandiskiu-k-gerviu-g.jpg', 'https://aruodas-img.dgn.lt/object_62_109886313/vilniaus-r-sav-balandiskiu-k-gerviu-g.jpg', 'https://aruodas-img.dgn.lt/object_62_110064805/vilniaus-r-sav-balandiskiu-k-gerviu-g.jpg', 'https://aruodas-img.dgn.lt/object_62_109886301/vilniaus-r-sav-balandiskiu-k-gerviu-g.jpg', 'https://aruodas-img.dgn.lt/object_62_110064831/vilniaus-r-sav-balandiskiu-k-gerviu-g.jpg

KeyboardInterrupt: 