<a href="https://colab.research.google.com/github/MitjaGo/YT/blob/main/booking_scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1️⃣ Install packages (if not done already)
# Notebook shell escape ("!"): runs pip in the kernel's environment to install
# the browser-automation (selenium, webdriver-manager), HTML-parsing
# (beautifulsoup4, lxml) and display (pandas, tabulate, ipywidgets) packages
# imported below. Only valid inside IPython/Jupyter, not in a plain .py script.
!pip install --quiet selenium webdriver-manager beautifulsoup4 lxml pandas tabulate ipywidgets

# 2️⃣ Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import pandas as pd
from tabulate import tabulate
import ipywidgets as widgets
from IPython.display import display
import datetime
import re

# 3️⃣ Configure headless Chrome
# Command-line switches handed to Chrome: no visible window, sandbox and
# /dev/shm usage disabled, and a fixed desktop-sized viewport.
# NOTE(review): the sandbox/shm switches are presumably there for the Colab
# container this notebook targets — confirm before running elsewhere.
chrome_options = Options()
for chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--window-size=1920,1080",
):
    chrome_options.add_argument(chrome_flag)

# webdriver-manager returns the path of a chromedriver binary; the resulting
# `driver` is the module-level browser session used by get_booking_data().
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# 4️⃣ Currency conversion table (static rates for simplicity)
# If symbol unknown, will leave as N/A
# Every marker the parser can recognise has an entry here, so ISO codes and
# symbols convert the same way.
currency_rates = {
    "€": 1,       # EUR
    "EUR": 1,
    "$": 0.93,    # USD → EUR (example)
    "USD": 0.93,
    "£": 1.13,    # GBP → EUR (example)
    "GBP": 1.13,
}

# Currency marker (symbol or ISO code) anywhere in the price text.
_CURRENCY_RE = re.compile(r"[€$£]|EUR|GBP|USD")
# First run of digits, optionally containing ',' / '.' separators; always
# starts and ends with a digit so trailing punctuation is not captured.
_NUMBER_RE = re.compile(r"\d(?:[\d.,]*\d)?")


def _parse_amount(number):
    """Convert a digit string with optional ',' / '.' separators to a float.

    Heuristic for telling decimal and thousands separators apart:
    - both ',' and '.' present: the one occurring last is the decimal mark;
    - a single kind of separator that repeats, or is followed by exactly
      three digits, is a thousands separator ("1,234" -> 1234.0);
    - otherwise the separator is the decimal mark ("12,50" -> 12.5).
    """
    last_comma = number.rfind(",")
    last_dot = number.rfind(".")

    if last_comma == -1 and last_dot == -1:
        return float(number)

    if last_comma != -1 and last_dot != -1:
        decimal_sep = "," if last_comma > last_dot else "."
    else:
        sep = "," if last_comma != -1 else "."
        tail = number.rpartition(sep)[2]
        is_grouping = number.count(sep) > 1 or len(tail) == 3
        decimal_sep = None if is_grouping else sep

    if decimal_sep is None:
        whole, fraction = number, ""
    else:
        whole, _, fraction = number.rpartition(decimal_sep)

    # Whatever separators remain in the integer part are grouping marks.
    digits = whole.replace(",", "").replace(".", "")
    return float(f"{digits}.{fraction}" if fraction else digits)


# Function to normalize price string to float in EUR
def normalize_price(price_str):
    """Convert a scraped price string to a "€<amount>" string in euros.

    Args:
        price_str: Raw price text such as "€ 120", "US$1,234.50" or
            "1.234,56 EUR". May be None or empty.

    Returns:
        "€" followed by the amount converted with ``currency_rates`` and
        rounded to two decimals (e.g. "€120.0"), or "N/A" when the text is
        empty, contains no number, or carries no known currency marker.
    """
    if not price_str or not price_str.strip():
        return "N/A"

    # Drop (narrow) non-breaking spaces used between symbol, groups and digits.
    cleaned = price_str.replace("\xa0", "").replace("\u202f", "").strip()

    currency = _CURRENCY_RE.search(cleaned)
    number = _NUMBER_RE.search(cleaned)
    if currency is None or number is None:
        return "N/A"

    rate = currency_rates.get(currency.group())
    if rate is None:
        return "N/A"

    price_eur = round(_parse_amount(number.group()) * rate, 2)
    return f"€{price_eur}"

# 5️⃣ Robust scraper function
def _with_stay_dates(url, checkin_fmt, checkout_fmt):
    """Return *url* with its checkin/checkout query parameters set to the given dates."""
    # Split off any #fragment first: parameters appended after it would be
    # part of the fragment and never reach the server.
    base, hash_mark, fragment = url.partition("#")
    for key, value in (("checkin", checkin_fmt), ("checkout", checkout_fmt)):
        # Match the parameter only at a real boundary so keys that merely
        # contain the word (e.g. "checkin_year") are left untouched.
        existing = re.compile(rf"(?<=[?&;]){key}=[^&;]*")
        if existing.search(base):
            base = existing.sub(f"{key}={value}", base)
        else:
            joiner = "&" if "?" in base else "?"
            base = f"{base}{joiner}{key}={value}"
    return f"{base}{hash_mark}{fragment}"


def get_booking_data(url, checkin, checkout):
    """Load a property page for the given stay and extract room names and prices.

    Uses the module-level ``driver`` to load the page and ``normalize_price``
    to convert each price text.

    Args:
        url: Property page URL. Existing ``checkin``/``checkout`` query
            parameters are overwritten so the page always matches the
            requested stay.
        checkin: Check-in date as "DD-MM-YYYY".
        checkout: Check-out date as "DD-MM-YYYY".

    Returns:
        A list of ``{"room": str, "price": str}`` dicts, one per room element
        found. When no room element is found, a single
        ``{"room": "N/A", "price": "N/A"}`` entry is returned.

    Raises:
        ValueError: If ``checkin`` or ``checkout`` is not in "DD-MM-YYYY" form.
    """
    # Imported here so this function brings its own dependency into scope.
    from selenium.common.exceptions import TimeoutException

    # Format dates for Booking.com (YYYY-MM-DD)
    checkin_fmt = datetime.datetime.strptime(checkin, "%d-%m-%Y").strftime("%Y-%m-%d")
    checkout_fmt = datetime.datetime.strptime(checkout, "%d-%m-%Y").strftime("%Y-%m-%d")

    url = _with_stay_dates(url, checkin_fmt, checkout_fmt)

    print(f"🔗 Loading: {url}")
    driver.get(url)

    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='room-name']"))
        )
    except TimeoutException:
        # Best effort: fall through and parse whatever did load. Only the
        # wait timing out is tolerated; Ctrl-C and other errors propagate.
        print("⚠️ Room elements not found. Page may have loaded differently.")

    soup = BeautifulSoup(driver.page_source, "lxml")

    # Find all room blocks; the class-based selector is the fallback used
    # when no data-testid markers are present.
    room_blocks = soup.select("[data-testid='room-name']")
    if not room_blocks:
        room_blocks = soup.select(".hprt-roomtype-icon-link")

    if not room_blocks:
        return [{"room": "N/A", "price": "N/A"}]

    rooms = []
    for room in room_blocks:
        name = room.get_text(strip=True) or "N/A"

        # NOTE(review): find_next searches forward through the rest of the
        # document, so a room without its own price element may pick up a
        # later room's price — confirm against the live page structure.
        price_el = room.find_next("span", attrs={"data-testid": "price-and-discounted-price"})
        if price_el is None:
            price_el = room.find_next(["span", "div"], class_=["bui-price-display__value"])

        if price_el is None:
            price = "N/A"
        else:
            price = normalize_price(price_el.get_text(strip=True))
        rooms.append({"room": name, "price": price})

    return rooms

# 6️⃣ Interactive widgets
# Three free-text inputs, one per property URL, shown in order.
print("Enter 3 Booking.com property URLs:")
url_widgets = []
for i in range(3):
    url_widgets.append(widgets.Text(description=f"URL {i+1}", placeholder="https://"))
for url_field in url_widgets:
    display(url_field)

# Date pickers for multiple check-in/check-out
# Two stays can be entered; the picker at index k in each list belongs to stay k.
checkin_widgets = []
checkout_widgets = []
for i in range(2):
    checkin_widgets.append(widgets.DatePicker(description=f"Check-in {i+1}"))
    checkout_widgets.append(widgets.DatePicker(description=f"Check-out {i+1}"))
print("\nEnter multiple check-in and check-out dates:")
for stay_index in range(len(checkin_widgets)):
    display(checkin_widgets[stay_index], checkout_widgets[stay_index])

# Button to start scraping
# `output` is the area the click handler prints its progress and results into.
button = widgets.Button(description="Scrape Data", button_style="success")
output = widgets.Output()
display(button, output)

# 7️⃣ Scrape on button click
# True once the handler has quit the shared browser session, so the next
# click starts a fresh session instead of reusing a dead driver.
_driver_closed = False


def on_button_clicked(b):
    """Scrape every entered URL for every entered stay and print a results table.

    Reads the URL and date widgets, calls ``get_booking_data`` for each
    URL/stay combination, and prints the collected rows with ``tabulate``
    inside the ``output`` widget. The browser session is always closed when
    scraping ends (even on error) and is recreated on the next click.

    Args:
        b: The Button instance passed by ipywidgets; unused.
    """
    global driver, _driver_closed

    with output:
        output.clear_output()

        # Collect URLs, ignoring fields that are empty or whitespace-only.
        urls = [w.value.strip() for w in url_widgets if w.value and w.value.strip()]
        if not urls:
            print("⚠️ Please enter at least one property URL.")
            return

        # Collect dates; only fully filled, chronologically valid pairs count.
        date_pairs = []
        for ci, co in zip(checkin_widgets, checkout_widgets):
            if not (ci.value and co.value):
                continue
            if co.value <= ci.value:
                print(f"⚠️ Skipping {ci.value} → {co.value}: check-out must be after check-in.")
                continue
            date_pairs.append((ci.value.strftime("%d-%m-%Y"), co.value.strftime("%d-%m-%Y")))

        if not date_pairs:
            print("⚠️ Please enter at least one check-in/check-out pair.")
            return

        # A previous click quit the browser; rebind the module-level driver
        # that get_booking_data() reads to a new session.
        if _driver_closed:
            driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options,
            )
            _driver_closed = False

        # Scrape all combinations
        results = []
        try:
            for i, url in enumerate(urls):
                for checkin, checkout in date_pairs:
                    print(f"\n🔍 Scraping property {i+1} for {checkin} → {checkout}")
                    data = get_booking_data(url, checkin, checkout)
                    for item in data:
                        results.append({
                            "Property": f"Property {i+1}",
                            "Room Type": item.get("room", "N/A"),
                            "Price (EUR)": item.get("price", "N/A"),
                            "Check-in": checkin,
                            "Check-out": checkout,
                            "URL": url
                        })
        finally:
            # Release the browser even when scraping raised.
            driver.quit()
            _driver_closed = True

        if results:
            df = pd.DataFrame(results)
            df = df[["Property", "Room Type", "Price (EUR)", "Check-in", "Check-out", "URL"]]
            print("\n✅ Scraping completed!\n")
            print(tabulate(df, headers='keys', tablefmt='fancy_grid', showindex=False))
        else:
            print("\n⚠️ No data found. Try increasing wait time or check your URLs.")


button.on_click(on_button_clicked)
