In [None]:
#@markdown # Connect to Google Drive
# Mount Google Drive at /content/drive. The scraping cell later writes its Excel
# report under /content/drive/MyDrive, so this cell has to run before that one.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@markdown # Install requirements
# --- Setup: Install ChromeDriver and required packages ---
!apt-get update
!apt-get install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium pandas openpyxl


In [1]:
#@markdown # Fetch the information
import time
from datetime import date
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

# Browser options for running Chrome without a display.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run headless
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Pass the options by keyword only. Recent Selenium 4 releases no longer accept
# the driver path as the first positional argument (that slot is now `options`),
# so the former `webdriver.Chrome("chromedriver", options=...)` call raises
# "got multiple values for argument 'options'". Without an explicit path the
# driver binary is resolved from PATH, which is what "chromedriver" meant before.
driver = webdriver.Chrome(options=chrome_options)

# ----------------- CONFIGURATION -----------------
# The trailing `#@param` markers are left exactly as written on the assignment
# lines (presumably Colab form fields; confirm before reformatting them).
# Define the location to search for (change as needed); this text is typed
# into the site's search box.
location_search = "Lévis" #@param{type:"string"}
# Homepage URL for Centris (French); this is the first page the browser opens.
url = "https://www.centris.ca/fr"
# Maximum number of properties to collect and process; 0 means no limit.
property_limit = 0 #@param{type:"integer"}

# ----------------- SETUP WEBDRIVER -----------------
# `wait` is the shared 15-second explicit wait reused by the later steps.
driver.maximize_window()
wait = WebDriverWait(driver, 15)

# ----------------- STEP 1: LOAD THE HOMEPAGE -----------------
driver.get(url)
time.sleep(3)  # Fixed pause so the landing page can finish loading

# ----------------- STEP 2: CLOSE PRIVACY CONSENT POPUP -----------------
# Best effort: any failure here (banner absent, not clickable within 5 s, ...)
# is reported and the run continues.
consent_locator = (By.ID, "didomi-notice-agree-button")
try:
    WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable(consent_locator)
    ).click()
    print("Privacy consent popup closed.")
    time.sleep(2)  # Let the banner disappear before touching the page
except Exception:
    print("No privacy popup detected, continuing...")


# ----------------- STEP 3: INPUT THE SEARCH LOCATION -----------------
# Locators for the collapsed search widget and for the text field it reveals.
search_box_locator = (By.XPATH, "//span[contains(@class, 'select2-selection')]")
search_field_locator = (By.CLASS_NAME, "select2-search__field")

try:
    short_wait = WebDriverWait(driver, 5)

    # Open the search widget so that its text field becomes available
    short_wait.until(EC.element_to_be_clickable(search_box_locator)).click()
    time.sleep(1)  # Allow dropdown to appear

    # Type the location into the revealed field and confirm with Enter
    location_field = short_wait.until(EC.element_to_be_clickable(search_field_locator))
    location_field.clear()
    location_field.send_keys(location_search)
    time.sleep(1)  # Allow time for search results to appear
    location_field.send_keys(Keys.ENTER)
    print(f"Searched for location: {location_search}")

except Exception as e:
    # Without a location there is nothing to scrape: release the browser and abort.
    driver.quit()
    raise Exception("Failed to interact with search bar. Check the XPath or class selectors.") from e


# ----------------- STEP 4: CLICK THE SUBMIT BUTTON -----------------
# Best effort: a failure is reported and the run continues with whatever page
# is currently loaded.
try:
    time.sleep(1)
    # Wait for the button instead of looking it up once: a single find_element
    # fails immediately if the page has not rendered it yet. This matches the
    # explicit 5-second waits used for the consent popup and the search field.
    submit_button = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "button.js-trigger-search"))
    )
    # Use JavaScript to click the button
    driver.execute_script("arguments[0].click();", submit_button)
    print("Search submitted successfully using JavaScript click.")
    time.sleep(3)  # Allow the page to load results
except Exception as e:
    print("Failed to click the submit button. Error:", e)




# ----------------- STEP 5: COLLECT PROPERTY LINKS FROM ALL SEARCH RESULT PAGES -----------------
# `property_links` keeps the listing URLs in first-seen order; `seen_links`
# mirrors it as a set so the duplicate check does not rescan the whole list.
property_links = []
seen_links = set()

while True:
    # Wait until the container of results is present
    try:
        wait.until(EC.presence_of_element_located((By.ID, "divMainResult")))
    except Exception:
        print("Results container not found.")
        break

    # Collect all property cards on the current page
    property_cards = driver.find_elements(
        By.CSS_SELECTOR, "div.property-thumbnail-item a.property-thumbnail-summary-link"
    )
    for card in property_cards:
        link = card.get_attribute("href")
        if link and link not in seen_links:
            seen_links.add(link)
            property_links.append(link)

    print(f"Collected {len(property_links)} property links so far.")

    # property_limit == 0 means "no limit". Stop as soon as the limit is reached
    # (>=, not >): with a strict comparison, landing exactly on the limit caused
    # one more page to be fetched for links that are never processed.
    if property_limit != 0 and len(property_links) >= property_limit:
        break

    # Check if a next page exists by examining the <li class="next"> element.
    try:
        # Get the <li> element that wraps the next button.
        next_li = driver.find_element(By.CSS_SELECTOR, "li.next")
        li_classes = next_li.get_attribute("class")

        # If the next button's container has 'inactive', we're on the last page.
        if "inactive" in li_classes:
            print("Reached the last page. Exiting pagination loop.")
            break

        # Otherwise, click the next button
        next_button = next_li.find_element(By.TAG_NAME, "a")
        if next_button.is_displayed() and next_button.is_enabled():
            print("Clicking the next page button...")
            next_button.click()
            time.sleep(3)  # Adjust sleep time as needed for the new page to load
        else:
            print("Next button is not clickable. Exiting pagination loop.")
            break
    except Exception:
        # Any failure while locating or clicking the control ends pagination.
        print("No next page button found. Pagination complete.")
        break

print(f"Total property links collected: {len(property_links)}")



# ----------------- STEP 6: ITERATE OVER EACH PROPERTY & EXTRACT DATA -----------------
# Titles of the characteristic rows to capture; they double as output column names.
_CARAC_TITLES = (
    "Style de bâtiment",
    "Année de construction",
    "Superficie habitable",
    "Superficie du terrain",
)


def _wait_for_text(locator, label, link):
    """Return the stripped text of the element at `locator`, or "N/A" on failure.

    Uses the shared `wait`. Any failure (timeout included) is reported as
    "<label> not found on <link>" and replaced by the "N/A" placeholder.
    """
    try:
        return wait.until(EC.presence_of_element_located(locator)).text.strip()
    except Exception:
        print(f"{label} not found on {link}")
        return "N/A"


def _child_text(parent, css_selector):
    """Return the stripped text of the first match under `parent`, or "#N/A"."""
    try:
        return parent.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except Exception:
        return "#N/A"


def _read_teaser(link):
    """Return (rooms, bedrooms, bathrooms) read from the teaser row of the current page.

    Each value falls back to "#N/A" on its own; if the teaser row itself cannot
    be located, a message is printed and all three are "#N/A".
    """
    try:
        teaser = driver.find_element(By.CSS_SELECTOR, "div.row.teaser")
    except Exception:
        print(f"Teaser info not found on {link}")
        return "#N/A", "#N/A", "#N/A"
    return (
        _child_text(teaser, "div.piece"),  # rooms
        _child_text(teaser, "div.cac"),    # bedrooms
        _child_text(teaser, "div.sdb"),    # bathrooms
    )


def _read_characteristics(link):
    """Return a dict mapping each title in `_CARAC_TITLES` to its value on the current page.

    Titles that are not present keep the "#N/A" placeholder. Containers whose
    title or value cannot be read are skipped.
    """
    characteristics = dict.fromkeys(_CARAC_TITLES, "#N/A")
    try:
        containers = driver.find_elements(By.CSS_SELECTOR, "div.carac-container")
    except Exception:
        print(f"Carac data not found on {link}")
        return characteristics
    for container in containers:
        try:
            title = container.find_element(By.CSS_SELECTOR, "div.carac-title").text.strip()
            value = container.find_element(By.CSS_SELECTOR, "div.carac-value span").text.strip()
        except Exception:
            continue
        if title in characteristics:
            characteristics[title] = value
    return characteristics


# Apply the limit before iterating so the progress message only announces
# properties that are actually processed. 0 means "no limit"; a negative limit
# processes nothing, as it did before.
if property_limit != 0:
    links_to_process = property_links[:max(property_limit, 0)]
else:
    links_to_process = property_links

properties_data = []
for idx, link in enumerate(links_to_process, start=1):
    print(f"Processing property {idx} of {len(links_to_process)}")
    try:
        driver.get(link)
        time.sleep(3)  # Allow the property detail page to load

        # NOTE(review): address and price fall back to "N/A" while every other
        # field uses "#N/A"; kept as-is in case the spreadsheet's readers rely on it.
        address = _wait_for_text((By.XPATH, "//h2[@itemprop='address']"), "Address", link)
        price = _wait_for_text((By.ID, "BuyPrice"), "Price", link)
        rooms, bedrooms, bathrooms = _read_teaser(link)
        characteristics = _read_characteristics(link)

        # Save the data with the current page link
        properties_data.append({
            "Addresse": address,
            "Prix": price,
            "Pieces": rooms,
            "Chambres a coucher": bedrooms,
            "Salles de bain": bathrooms,
            "Style de bâtiment": characteristics["Style de bâtiment"],
            "Année de construction": characteristics["Année de construction"],
            "Superficie habitable": characteristics["Superficie habitable"],
            "Superficie du terrain": characteristics["Superficie du terrain"],
            "URL": link
        })
    except Exception as e:
        # One broken page must not abort the whole run.
        print(f"Failed to process property at {link}: {e}")


# ----------------- STEP 7: SAVE DATA TO EXCEL -----------------
# Write one row per scraped property to a dated workbook on the mounted Drive.
try:
    df = pd.DataFrame(properties_data)
    excel_filename = f"/content/drive/MyDrive/properties-{date.today()}.xlsx"
    df.to_excel(excel_filename, index=False)
    print(f"Data saved to {excel_filename}")
finally:
    # Release the browser even when the export fails (for example when the
    # target folder is unavailable); previously a failed write skipped quit().
    driver.quit()


ModuleNotFoundError: No module named 'webdriver_manager'