In [12]:
import requests, random, re, json
from bs4 import BeautifulSoup
import pandas as pd
import pickle
import numpy as np
import statsmodels.api as sm
from datetime import datetime
import ast
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from IPython.display import display
import warnings
warnings.filterwarnings("ignore", message="Could not find the number of physical cores")


#### Imports

In [17]:
## importing model
# Load the pre-trained model object from `model.pkl` (path is relative to the
# notebook's working directory). It is later passed to `predict_from_url`,
# which calls `model.predict(...)` on the feature frame.
# NOTE(review): `pickle.load` can execute arbitrary code embedded in the file;
# only load a `model.pkl` that you produced yourself or otherwise trust.
with open("model.pkl", "rb") as f:
    model = pickle.load(f)


# URL template for the paginated search-results pages; `{page}` is a
# `str.format` placeholder. It is not referenced by any code visible in this
# part of the notebook.
BASE = "https://www.aruodas.lt/butu-nuoma/vilniuje/puslapis/{page}/"

# A small pool of real-world browser User-Agent strings.
# `scrape_listing` picks one at random for every request.
# (Adjacent string literals are concatenated by Python, so each entry below
# is a single User-Agent string split over three source lines.)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
      "AppleWebKit/537.36 (KHTML, like Gecko) "
      "Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
      "AppleWebKit/605.1.15 (KHTML, like Gecko) "
      "Version/14.1.2 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) "
      "AppleWebKit/537.36 (KHTML, like Gecko) "
      "Chrome/114.0.0.0 Safari/537.36",
]

# Extended headers to mimic a real browser.
# `make_session` installs these as session-wide defaults, so they are sent
# with every request made through that session.
COMMON_HEADERS = {
    "Accept":                    "text/html,application/xhtml+xml,application/xml;"
                                 "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language":           "en-GB,en;q=0.9",
    "Accept-Encoding":           "gzip, deflate, br",
    "Upgrade-Insecure-Requests": "1",
    "Referer":                   "https://www.aruodas.lt/",
    "Connection":                "keep-alive",
    "Sec-Fetch-Site":            "same-origin",
    "Sec-Fetch-Mode":            "navigate",
    "Sec-Fetch-Dest":            "document",
}

def make_session():
    """
    Create a `requests.Session` pre-loaded with browser-like headers.

    The session gets `COMMON_HEADERS` plus a User-Agent drawn at random from
    `USER_AGENTS`, and then performs one "priming" GET against the listing
    index so that any cookies the site sets are stored on the session.

    Returns:
        requests.Session: the primed session (caller is responsible for
        closing it).

    Raises:
        requests.RequestException: if the priming request fails (e.g. timeout).
        The half-built session is closed before the exception propagates.
    """
    sess = requests.Session()
    sess.headers.update(COMMON_HEADERS)
    # Set a browser User-Agent *before* priming; otherwise the very first
    # request would go out with the default python-requests User-Agent,
    # which contradicts the browser-like headers above.
    sess.headers["User-Agent"] = random.choice(USER_AGENTS)
    try:
        # Prime cookies or JS challenges
        sess.get("https://www.aruodas.lt/butu-nuoma/vilniuje/", timeout=5)
    except Exception:
        # Do not leak the connection pool if priming fails.
        sess.close()
        raise
    return sess

def _parse_dl_block(dl):
    """
    Extract <dt>/<dd> pairs from a <dl> block.
    Returns a dict of {key: [values]}.
    """
    out = {}
    if not dl:
        return out
    for dt in dl.find_all("dt"):
        key = dt.get_text(strip=True).rstrip(":")
        dd  = dt.find_next_sibling("dd")
        if not dd:
            continue

        spans = [s.get_text(strip=True) for s in dd.find_all("span") if s.get_text(strip=True)]
        if spans:
            out[key] = spans
        else:
            text = dd.get_text(strip=True)
            out[key] = [text] if text else []
    return out


def add_primary_heating_dummies(df, source_col="Šildymas"):
    """
    Add one-hot columns for the primary heating type of each row.

    Each cell of ``df[source_col]`` may be a list/tuple/set of heating types,
    or a string holding such a list as JSON or as a Python literal. The
    primary type is the first token (up to the first space or comma) of the
    first entry. Three dummies are produced:

      - heat_Centrinis
      - heat_Dujinis
      - heat_Elektra

    'Kita' is the reference category (all three dummies are 0). Missing,
    empty or unparseable cells are treated as 'Kita'.

    Args:
        df: input DataFrame; it is not modified.
        source_col: name of the column holding the heating types.

    Returns:
        A copy of ``df`` with the three integer (0/1) dummy columns added.

    Raises:
        KeyError: if ``source_col`` is not a column of ``df``.
    """

    def get_primary(s):
        # Containers are checked *before* any NaN test: pd.isna(list) returns
        # an array, and using that array in an `if` raises for lists with
        # more than one element.
        if isinstance(s, (list, tuple, set)):
            items = list(s)
        elif isinstance(s, (str, bytes, bytearray)):
            try:
                items = json.loads(s)
            except Exception:
                try:
                    items = ast.literal_eval(str(s))
                except Exception:
                    return "Kita"
        else:
            # None, NaN and any other scalar carry no heating information.
            return "Kita"

        if isinstance(items, set):
            items = list(items)
        # A parsed value that is not a non-empty sequence (number, dict,
        # bare string, ...) cannot provide a "first entry".
        if not isinstance(items, (list, tuple)) or not items:
            return "Kita"

        first = str(items[0])
        # grab the first token before space or comma
        m = re.match(r"^([^ ,]+)", first)
        return m.group(1) if m else "Kita"

    # 1) build a Series of the primary heating type
    prim = df[source_col].map(get_primary)

    # 2) manually create dummies for the three modelled categories
    df = df.copy()
    df["heat_Centrinis"] = (prim == "Centrinis").astype(int)
    df["heat_Dujinis"]   = (prim == "Dujinis").astype(int)
    df["heat_Elektra"]   = (prim == "Elektra").astype(int)

    # 'Kita' is the implicit case when all three are 0
    return df

def add_window_orientation_dummies(df, source_col="Langų orientacija"):
    """
    Add one-hot columns for the primary window orientation of each row.

    Each cell of ``df[source_col]`` may be a list/tuple/set of orientations,
    or a string holding such a list as JSON or as a Python literal (the same
    input forms `add_primary_heating_dummies` accepts). The primary
    orientation is the first token (up to the first space or comma) of the
    first entry (Pietūs, Vakarai, Rytai, Šiaurė, etc.). Three dummies are
    produced:

      - orient_Pietus
      - orient_Vakarai
      - orient_Rytai

    'Šiaurė' is the implicit reference (all zeros). Missing, empty or
    unparseable cells are treated as 'Šiaurė'.

    Args:
        df: input DataFrame; it is not modified.
        source_col: name of the column holding the orientations.

    Returns:
        A copy of ``df`` with the three integer (0/1) dummy columns added.

    Raises:
        KeyError: if ``source_col`` is not a column of ``df``.
    """
    def get_primary_orient(s):
        # Containers first: pd.isna(list) yields an array whose truth value
        # is ambiguous for lists with more than one element.
        if isinstance(s, (list, tuple, set)):
            items = list(s)
        elif isinstance(s, (str, bytes, bytearray)):
            try:
                items = json.loads(s)
            except Exception:
                try:
                    items = ast.literal_eval(str(s))
                except Exception:
                    return "Šiaurė"
        else:
            # None, NaN and any other scalar carry no orientation.
            return "Šiaurė"

        if isinstance(items, set):
            items = list(items)
        if not isinstance(items, (list, tuple)) or not items:
            return "Šiaurė"

        # grab the first token before space or comma
        m = re.match(r"^([^ ,]+)", str(items[0]))
        return m.group(1) if m else "Šiaurė"

    prim = df[source_col].map(get_primary_orient)

    out = df.copy()
    # Column names are ASCII ("Pietus"); the matched value keeps its
    # diacritic ("Pietūs").
    out["orient_Pietus"] = (prim == "Pietūs").astype(int)
    out["orient_Vakarai"] = (prim == "Vakarai").astype(int)
    out["orient_Rytai"]  = (prim == "Rytai").astype(int)
    # Šiaurė is the reference (when all three dummies are zero)
    return out


### Defining new functions

In [25]:
def _first_value(d, col):
    """Return the first value for `col` (handles list/tuple/str)."""
    v = d.get(col, pd.Series([None])).iloc[0]
    if isinstance(v, (list, tuple, set)):
        v = list(v)[0] if len(v) else None
    return v

def _parse_number(text):
    """Return float from text like '129 m²' or '45,5 m²'; NaN if none."""
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return np.nan
    s = str(text).replace("\u00A0", " ").replace(",", ".")
    m = re.search(r"[-+]?\d+(?:\.\d+)?", s)
    return float(m.group(0)) if m else np.nan

def _extract_location_from_title(soup):
    """
    Parse <h1 class="obj-header-text">Vilnius, Žirmūnai, Olimpiečių g., ...</h1>
    → return (city, district, street).
    """
    h1 = soup.select_one("h1.obj-header-text")
    if not h1:
        return None, None, None

    # Get clean text
    txt = h1.get_text(" ", strip=True)

    # Keep only the part before 'buto nuoma' (or similar trailing phrase)
    # e.g. "Vilnius, Žirmūnai, Olimpiečių g."
    head = re.split(r"\b(buto|būsto)\s+nuoma\b", txt, flags=re.IGNORECASE)[0]

    # Split by commas and strip
    parts = [p.strip(" ,") for p in head.split(",") if p.strip(" ,")]

    # Expect at least 3 parts: city, district, street
    city     = parts[0] if len(parts) >= 1 else None
    district = parts[1] if len(parts) >= 2 else None
    street   = parts[2] if len(parts) >= 3 else None
    return city, district, street


# def scrape_listing(url, session=None):
#     if session is None:
#         session = make_session()
#     session.headers['User-Agent'] = random.choice(USER_AGENTS)
#     resp = session.get(url, timeout=10)
#     resp.raise_for_status()
#     soup = BeautifulSoup(resp.text, "html.parser")

#     # --- parse main obj-details ---
#     details = _parse_dl_block(soup.find("dl", class_="obj-details"))

#     # --- parse obj-stats (Įdėtas, Redaguotas, Aktyvus iki, etc.) ---
#     stats = _parse_dl_block(soup.find("div", class_="obj-stats").find("dl"))

#     # merge them
#     details.update(stats)

#     out = {"url": url}
#     out.update(details)
#     return out

def scrape_listing(url, session=None):
    """
    Download one listing page and return its details as a dict.

    Args:
        url: address of the listing page.
        session: optional `requests.Session` to reuse. When omitted, a new
            session is created with `make_session()` and closed again before
            returning. A caller-supplied session is left open; note that its
            'User-Agent' header is overwritten with a random entry of
            `USER_AGENTS`.

    Returns:
        dict with key "url" plus every key parsed from the page's
        `dl.obj-details` block and (if present) the <dl> inside
        `div.obj-stats`. Parsed values are lists of strings. When the page
        header yields them, "city", "district" and "street" are added as
        one-element lists.

    Raises:
        requests.RequestException: on network errors, timeouts, or an HTTP
        error status (via `raise_for_status`).
    """
    owns_session = session is None
    if owns_session:
        session = make_session()
    try:
        session.headers['User-Agent'] = random.choice(USER_AGENTS)
        resp = session.get(url, timeout=10)
        resp.raise_for_status()
        html = resp.text
    finally:
        # Only close what this function created; never a caller's session.
        if owns_session:
            session.close()

    soup = BeautifulSoup(html, "html.parser")

    # Parse both <dl> blocks; stats keys override detail keys on collision.
    details = _parse_dl_block(soup.find("dl", class_="obj-details"))
    stats_div = soup.find("div", class_="obj-stats")
    if stats_div:
        details.update(_parse_dl_block(stats_div.find("dl")))

    # city / district / street from header
    city, district, street = _extract_location_from_title(soup)
    if city:
        details["city"] = [city]
    if district:
        details["district"] = [district]
    if street:
        details["street"] = [street]

    out = {"url": url}
    out.update(details)
    return out

# centre of Vilnius, as a (latitude, longitude) pair.
# `featurise` measures the geodesic distance from each listing to this point.
city_center = (54.6872, 25.2797)

# single shared geocoder + a tiny cache
# `_geocoder` is the one Nominatim client reused by `_geocode_addr`
# (10-second timeout per lookup).
_geocoder = Nominatim(user_agent="rent_model_geocoder", timeout=10)
# Maps an address string to the (lat, lon) tuple previously resolved for it;
# lives only in kernel memory, so it is empty again after a kernel restart.
_GEOCODE_CACHE = {}

def _geocode_addr(addr: str):
    if not addr:
        return None, None
    if addr in _GEOCODE_CACHE:
        return _GEOCODE_CACHE[addr]
    try:
        loc = _geocoder.geocode(addr)
        if loc:
            _GEOCODE_CACHE[addr] = (loc.latitude, loc.longitude)
            return loc.latitude, loc.longitude
    except Exception:
        pass
    _GEOCODE_CACHE[addr] = (None, None)
    return None, None


def featurise(raw_dict):
    """
    Take one scraped listing (dict) and transform it into model-ready features.

    Builds a one-row DataFrame from `raw_dict`, derives every column listed in
    `numeric_feats` below, and tolerates missing source fields (they end up as
    NaN, or 0 for the has_* / heat_* flags).

    Args:
        raw_dict: mapping of field name -> value for a single listing, in the
            shape produced by `scrape_listing` (values are lists of strings).

    Returns:
        One-row DataFrame containing exactly the `numeric_feats` columns, in
        that order.

    Side effects:
        - Renders a styled Feature/Value table through IPython `display`.
        - Calls `_geocode_addr` (up to four lookups) when `raw_dict` carries
          no usable 'latitude'/'longitude'.
        - 'age_days' is computed against today's date, so the result changes
          from day to day.
    """
    df = pd.DataFrame([raw_dict])

    # ---- Ensure columns exist (create with safe defaults) ----
    # columns that are list-like in training
    listlike_defaults = {
        "Šildymas": [],
        "Ypatybės": [],
        "Papildomos patalpos": [],
        # "Langų orientacija": []   # keep only if you still use it
    }
    for col, default in listlike_defaults.items():
        if col not in df.columns:
            df[col] = [json.dumps(default)]  # downstream code uses json.loads
        else:
            # normalise: if it's already a list, turn into JSON string for consistency
            # (ensure_ascii=False keeps the Lithuanian letters readable)
            if isinstance(df.at[df.index[0], col], (list, tuple, set)):
                df[col] = df[col].map(lambda v: json.dumps(list(v), ensure_ascii=False))

    # ---- Clean/convert columns used by the model ----
    # 'Metai': keep the first 4-digit run of the stringified cell as the year;
    # a missing column or no match gives NaN.
    df['Metai'] = (
        df.get('Metai', pd.Series([None]))
          .astype(str).str.extract(r'(\d{4})', expand=False).astype(float)
    )
    # Year expressed relative to 2000.
    df['year_centered'] = df['Metai'] - 2000

    # Heating dummies (heat_Centrinis / heat_Dujinis / heat_Elektra)
    df = add_primary_heating_dummies(df)

    # If window orientation is kept, wrap it with a missing-column guard
    # df = add_window_orientation_dummies(df)  # only if still in the feature set

    # has_lift: 1 when the 'Ypatybės' list contains the exact entry 'Yra liftas'
    df['has_lift'] = (
        df['Ypatybės']
          .map(lambda s: 'Yra liftas' in (json.loads(s) if pd.notnull(s) else []))
          .astype(int)
    )

    # has_balcony_terrace: 1 when 'Papildomos patalpos' lists 'Balkonas' or 'Terasa'
    df['has_balcony_terrace'] = (
        df['Papildomos patalpos']
          .map(lambda s: any(x in {'Balkonas','Terasa'} for x in (json.loads(s) if pd.notnull(s) else [])))
          .astype(int)
    )

    # has_parking_spot: 1 when 'Papildomos patalpos' lists 'Vieta automobiliui'
    df['has_parking_spot'] = (
        df['Papildomos patalpos']
          .map(lambda s: 'Vieta automobiliui' in (json.loads(s) if pd.notnull(s) else []))
          .astype(int)
    )

    # --- Build address → geocode → distance ---
    # City falls back to "Vilnius" when the listing did not provide one.
    city     = _first_value(df, "city") or "Vilnius"
    district = _first_value(df, "district")
    street   = _first_value(df, "street")
    house    = _first_value(df, "Namo numeris")  

    # Coordinates already present in the input take precedence over geocoding.
    lat = df.get('latitude', pd.Series([None])).iloc[0]
    lon = df.get('longitude', pd.Series([None])).iloc[0]

    if pd.isna(lat) or pd.isna(lon):
        # 1. First try with district
        parts_with = [p for p in [city, district, street] if p]
        base_with = ", ".join(parts_with)
        # The house number, when known, is appended after a space.
        addr_with = f"{base_with} {house}" if (base_with and house) else base_with

        lat, lon = _geocode_addr(addr_with)
        # Retry without the house number if the full address was not found.
        if (lat is None or lon is None) and house:
            lat, lon = _geocode_addr(base_with)

        # 2. If still nothing, retry WITHOUT district
        if (lat is None or lon is None):
            parts_no = [p for p in [city, street] if p]
            base_no = ", ".join(parts_no)
            addr_no = f"{base_no} {house}" if (base_no and house) else base_no

            lat, lon = _geocode_addr(addr_no)
            if (lat is None or lon is None) and house:
                lat, lon = _geocode_addr(base_no)

    df["latitude"]  = lat
    df["longitude"] = lon
    # Geodesic distance in km to `city_center`; NaN when no coordinates exist.
    df["dist_to_center_km"] = (
        geodesic((lat, lon), city_center).km
        if pd.notnull(lat) and pd.notnull(lon) else np.nan)

    # --- Įdėtas → age_days ---
    posted_txt = _first_value(df, "Įdėtas")   # e.g. "2025-08-18"
    
    try:
        # errors="coerce" turns unparseable text (including "None") into NaT.
        posted_date = pd.to_datetime(str(posted_txt), errors="coerce")
        if pd.notnull(posted_date):
            today = pd.to_datetime(datetime.today().date())
            # Scalar Timedelta here, so `.days` is used directly (no `.dt`).
            df["age_days"] = (today - posted_date).days   # <-- no .dt
        else:
            df["age_days"] = np.nan
    except Exception:
        df["age_days"] = np.nan

    # --- Plotas → area_m2 ---
    pl_txt = _first_value(df, "Plotas")   # e.g. "129 m²" or "45,5 m²"
    df["area_m2"] = _parse_number(pl_txt)

    # --- Aukštų sk. → floor_total ---
    ft_txt = _first_value(df, "Aukštų sk.")   # e.g. [5] or "5"
    df["floor_total"] = _parse_number(ft_txt)

    # --- Aukštas → floor_current ---
    fc_txt = _first_value(df, "Aukštas")   # e.g. [3] or "3"
    df["floor_current"] = _parse_number(fc_txt)

    # --- Kambarių sk. → rooms ---
    rm_txt = _first_value(df, "Kambarių sk.")   # e.g. [3] or "3"
    df["rooms"] = _parse_number(rm_txt)


    # Final feature list (without orientation).
    # NOTE(review): the order presumably has to match the column order the
    # model was trained with — confirm against the training notebook.
    numeric_feats = [
        'rooms','floor_current','floor_total','age_days','area_m2',
        'year_centered','dist_to_center_km',
        'heat_Centrinis','heat_Dujinis','heat_Elektra',
        'has_lift','has_balcony_terrace','has_parking_spot'
    ]

    # Ensure presence: any feature column still missing is created as NaN
    # (the model/pipeline is expected to handle NaN, or fill it here).
    for col in numeric_feats:
        if col not in df.columns:
            df[col] = np.nan
    

    # Transpose the single row into a two-column Feature/Value table for display.
    nice = (
        df[numeric_feats]
          .T.rename_axis("Feature")
          .reset_index()
          .rename(columns={0: "Value"})
    )
    
    display(
        nice.style
            .format({"Value": "{:.2f}"})
            .set_properties(subset=["Feature"], **{"font-weight": "600"})
            .hide(axis="index")  # removes the default 0..N index
    )
    
    return df[numeric_feats]


def predict_from_url(url, model, session=None):
    """
    Scrape a listing, build its features and predict the rent.

    Args:
        url: address of the listing page.
        model: fitted estimator exposing `predict(X)`; its first prediction
            is used as the price per square metre.
        session: optional session forwarded to `scrape_listing`.

    Returns:
        (price_per_m2, total_price). `total_price` is the per-m² prediction
        multiplied by the listing's `area_m2`, or None when the area is
        missing.
    """
    listing = scrape_listing(url, session=session)
    features = featurise(listing)

    # The feature frame is passed to the model as-is (no constant column).
    price_per_m2 = model.predict(features)[0]

    area_m2 = features["area_m2"].iloc[0]
    if pd.notnull(area_m2):
        return price_per_m2, price_per_m2 * area_m2
    return price_per_m2, None

In [26]:
# Display options for this notebook session.
pd.set_option("display.max_columns", None)   # show all columns
pd.set_option("display.max_rows", None)      # show all rows (careful with large dfs)
pd.set_option("display.width", 1000)         # widen the printout so it doesn't wrap

url = "https://www.aruodas.lt/butu-nuoma-vilniuje-zirmunuose-olimpieciu-g-ypatingai-patogioje-vietoje-karaliaus-4-1304811/?search_pos=3"

pred_price_pm2, total_price = predict_from_url(url, model)

print(f"Predicted price/m²: {pred_price_pm2:.2f} EUR/m²")
# predict_from_url returns None for the total when the listing's area could
# not be parsed; formatting None with ':.0f' would raise a TypeError.
if total_price is None:
    print("Predicted total monthly rent: n/a (listing area unavailable)")
else:
    print(f"Predicted total monthly rent: {total_price:.0f} EUR")


Feature,Value
rooms,3.0
floor_current,3.0
floor_total,5.0
age_days,1.0
area_m2,129.0
year_centered,6.0
dist_to_center_km,0.68
heat_Centrinis,1.0
heat_Dujinis,0.0
heat_Elektra,0.0


Predicted price/m²: 15.74 EUR/m²
Predicted total monthly rent: 2030 EUR


In [30]:
url = "https://www.aruodas.lt/butu-nuoma-vilniuje-baltupiuose-baltupio-g-isnuomojamas-sviesus-ir-siltas-ju-kambariu-4-1275570/?search_pos=12"
pred_price_pm2, total_price = predict_from_url(url, model)

print(f"Predicted price/m²: {pred_price_pm2:.2f} EUR/m²")
# predict_from_url returns None for the total when the listing's area could
# not be parsed; formatting None with ':.0f' would raise a TypeError.
if total_price is None:
    print("Predicted total monthly rent: n/a (listing area unavailable)")
else:
    print(f"Predicted total monthly rent: {total_price:.0f} EUR")


Feature,Value
rooms,2.0
floor_current,2.0
floor_total,5.0
age_days,547.0
area_m2,55.0
year_centered,-4.0
dist_to_center_km,5.1
heat_Centrinis,1.0
heat_Dujinis,0.0
heat_Elektra,0.0


Predicted price/m²: 9.74 EUR/m²
Predicted total monthly rent: 536 EUR
