In [14]:
import chromedriver_autoinstaller
from selenium import webdriver
from bs4 import BeautifulSoup
import bs4

In [15]:
chromedriver_autoinstaller.install()



In [16]:
webdriver_options = webdriver.ChromeOptions()
webdriver_options.add_argument('--headless')
webdriver_options.add_argument('--no-sandbox')
webdriver_options.add_argument('--disable-dev-shm-usage')
webdriver_options.add_argument('--disable-gpu')
webdriver_options.add_argument('--disable-extentions')

In [17]:
from datetime import date
from datetime import datetime
import os
import urllib

city = "utrecht"
url = f"https://www.pararius.com/apartments/{city}/0-1750/1-bedrooms/furnished/50m2"
date_now = date.today().strftime("%Y-%m-%d")
ts_now = datetime.now().timestamp()

DATA_OUTPUTS_DIR = "data/apartments-list/city={city}/date={date}"
os.makedirs(DATA_OUTPUTS_DIR.format(city=city, date=date_now), exist_ok=True)

base_url = urllib.parse.urlparse(url).scheme + "://" + urllib.parse.urlparse(url).netloc
base_url

'https://www.pararius.com'

In [21]:
driver = webdriver.Chrome(options=webdriver_options)
driver.get(url)

In [22]:
html = driver.page_source

with open(f"{DATA_OUTPUTS_DIR.format(city=city, date=date_now)}/{ts_now}.html", "w") as f:
    f.write(html)

In [23]:
driver.close()

In [8]:
from typing import Any

def extract_info_from_listing(listing:bs4.element.Tag) -> dict[str, Any]:
    """
    Extracts information from a singular listing.
    """
    title = listing.find("a", class_="listing-search-item__link listing-search-item__link--title").text.strip()
    address = listing.find("div", class_="listing-search-item__sub-title'").text.strip()
    price_text = listing.find("div", class_="listing-search-item__price").text.strip()
    features_item = listing.find("div", class_="listing-search-item__features")
    surface_area = features_item.find("li", class_="illustrated-features__item illustrated-features__item--surface-area")
    n_rooms = features_item.find("li", class_="illustrated-features__item illustrated-features__item--number-of-rooms")
    interior_type = features_item.find("li", class_="illustrated-features__item illustrated-features__item--interior")
    listing_info_item = listing.find("div", class_="listing-search-item__info").find("a", class_="listing-search-item__link")
    real_estate_company = listing_info_item.text.strip()
    real_estate_company_url = listing_info_item.get("href")
    listing_url = listing.find("a", class_="listing-search-item__link listing-search-item__link--title").get("href")
    listing_thumbnail = listing.find("img").get("src")
    return {
        "title": title,
        "address": address,
        "price_text": price_text,
        "url": base_url + listing_url,
        "thumbnail": listing_thumbnail,
        "real_estate_company": real_estate_company,
        "real_estate_company_url": base_url + real_estate_company_url,
        "features":{
            "surface_area": surface_area.text.strip(),
            "n_rooms": n_rooms.text.strip(),
            "interior_type": interior_type.text.strip()
        }
    }

In [9]:
# Read the HTML text with BeautifulSoup:
soup = BeautifulSoup(html, "html.parser")
# Retrieve the element at: body > div.page__content > div.page__results > div:nth-child(3) > div > ul > li:nth-child(18)
# Get all the elements of the unordered list that have the 'search-list__item--listing' class:
listings = soup.find_all("li", class_="search-list__item--listing")
print(len(listings))

13


In [10]:
listings_collection = []
for listing in listings:
    listings_collection.append(extract_info_from_listing(listing))
len(listings_collection)

13

In [11]:
import polars as pl

df= pl.DataFrame(listings_collection).unnest("features")
df

title,address,price_text,url,thumbnail,real_estate_company,real_estate_company_url,surface_area,n_rooms,interior_type
str,str,str,str,str,str,str,str,str,str
"""Flat Aziëlaan""","""3526 SN Utrech…","""€1,595 per mon…","""https://www.pa…","""https://casco-…","""B&S Rental Ser…","""https://www.pa…","""72 m²""","""2 rooms""","""Furnished"""
"""House Minahass…","""3531 KW Utrech…","""€1,470 per mon…","""https://www.pa…","""https://casco-…","""Lemoo""","""https://www.pa…","""60 m²""","""2 rooms""","""Part-furnished…"
"""Flat Wittevrou…","""3572 CA Utrech…","""€1,200 per mon…","""https://www.pa…","""https://casco-…","""Home by Theres…","""https://www.pa…","""60 m²""","""2 rooms""","""Furnished"""
"""Flat Nieuwe Pi…","""3513 XT Utrech…","""€1,650 per mon…","""https://www.pa…","""https://casco-…","""Matton Residen…","""https://www.pa…","""55 m²""","""2 rooms""","""Furnished"""
"""Flat Pieter Ni…","""3514 HG Utrech…","""€1,550 per mon…","""https://www.pa…","""https://casco-…","""Your-House""","""https://www.pa…","""58 m²""","""2 rooms""","""Furnished"""
"""Flat Flamingos…","""3582 SW Utrech…","""€1,395 per mon…","""https://www.pa…","""https://casco-…","""Amstel Housing…","""https://www.pa…","""58 m²""","""3 rooms""","""Furnished"""
"""Flat Herenstra…","""3512 KC Utrech…","""€1,725 per mon…","""https://www.pa…","""https://casco-…","""Covers Wonen""","""https://www.pa…","""58 m²""","""3 rooms""","""Furnished"""
"""Flat Willem Ba…","""3572 PB Utrech…","""€1,400 per mon…","""https://www.pa…","""https://casco-…","""Rotsvast Utrec…","""https://www.pa…","""71 m²""","""2 rooms""","""Furnished"""
"""Flat Kanaalstr…","""3531 CH Utrech…","""€1,750 per mon…","""https://www.pa…","""https://casco-…","""EU-Makelaardij…","""https://www.pa…","""55 m²""","""2 rooms""","""Furnished"""
"""Flat Niasstraa…","""3531 WR Utrech…","""€1,635 per mon…","""https://www.pa…","""https://casco-…","""123Wonen Utrec…","""https://www.pa…","""85 m²""","""3 rooms""","""Furnished"""


In [12]:
parsed_price_text = df["price_text"].str.extract_groups(r"€(?<price>\d+,?\d+) per (?<period>\w+)")
parsed_surface_area = df["surface_area"].str.extract_groups(r"(?<amount>\d+)\s*(?<unit>.*)")
parsed_df = df.with_columns(
    city=pl.lit(city),
    website=pl.lit(base_url),
    price=parsed_price_text.struct.field("price").str.replace(",", "").cast(pl.Float64),
    price_period=parsed_price_text.struct.field("period"),
    surface_area_amount=parsed_surface_area.struct.field("amount").cast(pl.Float64),
    surface_area_unit=parsed_surface_area.struct.field("unit"),
    n_rooms=df["n_rooms"].str.extract(r"(\d+) rooms").cast(pl.Float64),
    date=pl.lit(date_now),
    extracted_at=pl.from_epoch(pl.lit(ts_now)),
).select(pl.exclude("price_text"))
parsed_df

title,address,url,thumbnail,real_estate_company,real_estate_company_url,surface_area,n_rooms,interior_type,city,website,price,price_period,surface_area_amount,surface_area_unit,date,extracted_at
str,str,str,str,str,str,str,f64,str,str,str,f64,str,f64,str,str,datetime[μs]
"""Flat Aziëlaan""","""3526 SN Utrech…","""https://www.pa…","""https://casco-…","""B&S Rental Ser…","""https://www.pa…","""72 m²""",2.0,"""Furnished""","""utrecht""","""https://www.pa…",1595.0,"""month""",72.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""House Minahass…","""3531 KW Utrech…","""https://www.pa…","""https://casco-…","""Lemoo""","""https://www.pa…","""60 m²""",2.0,"""Part-furnished…","""utrecht""","""https://www.pa…",1470.0,"""month""",60.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""Flat Wittevrou…","""3572 CA Utrech…","""https://www.pa…","""https://casco-…","""Home by Theres…","""https://www.pa…","""60 m²""",2.0,"""Furnished""","""utrecht""","""https://www.pa…",1200.0,"""month""",60.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""Flat Nieuwe Pi…","""3513 XT Utrech…","""https://www.pa…","""https://casco-…","""Matton Residen…","""https://www.pa…","""55 m²""",2.0,"""Furnished""","""utrecht""","""https://www.pa…",1650.0,"""month""",55.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""Flat Pieter Ni…","""3514 HG Utrech…","""https://www.pa…","""https://casco-…","""Your-House""","""https://www.pa…","""58 m²""",2.0,"""Furnished""","""utrecht""","""https://www.pa…",1550.0,"""month""",58.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""Flat Flamingos…","""3582 SW Utrech…","""https://www.pa…","""https://casco-…","""Amstel Housing…","""https://www.pa…","""58 m²""",3.0,"""Furnished""","""utrecht""","""https://www.pa…",1395.0,"""month""",58.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""Flat Herenstra…","""3512 KC Utrech…","""https://www.pa…","""https://casco-…","""Covers Wonen""","""https://www.pa…","""58 m²""",3.0,"""Furnished""","""utrecht""","""https://www.pa…",1725.0,"""month""",58.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""Flat Willem Ba…","""3572 PB Utrech…","""https://www.pa…","""https://casco-…","""Rotsvast Utrec…","""https://www.pa…","""71 m²""",2.0,"""Furnished""","""utrecht""","""https://www.pa…",1400.0,"""month""",71.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""Flat Kanaalstr…","""3531 CH Utrech…","""https://www.pa…","""https://casco-…","""EU-Makelaardij…","""https://www.pa…","""55 m²""",2.0,"""Furnished""","""utrecht""","""https://www.pa…",1750.0,"""month""",55.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08
"""Flat Niasstraa…","""3531 WR Utrech…","""https://www.pa…","""https://casco-…","""123Wonen Utrec…","""https://www.pa…","""85 m²""",3.0,"""Furnished""","""utrecht""","""https://www.pa…",1635.0,"""month""",85.0,"""m²""","""2023-11-25""",2023-11-25 19:09:08


In [13]:
parsed_df.write_parquet("data/apartments-list/")

IsADirectoryError: expected a file path; 'data/apartments-list/' is a directory

In [None]:
parsed_df["extracted_at"][0].timestamp()

1700930459.0

In [None]:
parsed_df.write_parquet(f"s3://my-bucket/{DATA_OUTPUTS_DIR.format(city=city, date=date_now)}/{ts_now}.parquet")

thread '<unnamed>' panicked at py-polars/src/dataframe.rs:804:46:
called `Result::unwrap()` on an `Err` value: Os { code: 2, kind: NotFound, message: "No such file or directory" }
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: called `Result::unwrap()` on an `Err` value: Os { code: 2, kind: NotFound, message: "No such file or directory" }