In [3]:
import os 
import logging
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.options import Options

In [4]:
# Set up logging: configure the root logger at INFO level and create the
# module-level logger used by the scraping helper for progress/error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
def extract_listing_for_url(url):
    """
    Scrape the listing for the given URL.

    Loads the page in headless Chrome, dismisses the "Close" pop-up when one
    is present, clicks the "See More" button when it is found, and extracts
    the text of the listing container from the rendered HTML.

    Args:
        url: Address of the listing page to load.

    Returns:
        The container text joined with single spaces, '' when the container
        element is not found, or None when any scraping step raises.
    """
    browser = None
    try:
        logger.info(f"Processing URL: {url}")

        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        browser = Browser('chrome', options=chrome_options)
        browser.visit(url)

        # Handle pop-ups
        if browser.is_element_present_by_css('div[aria-label="Close"]', wait_time=5):
            logger.info("Closing pop-up")
            browser.find_by_css('div[aria-label="Close"]').first.click()

        browser.execute_script("window.scrollTo(0, 400);")

        # Click the "See More" button if it exists. Failure here is not fatal:
        # the page is still parsed with whatever text is visible.
        # NOTE(review): the class list below looks auto-generated and may
        # change between page versions -- confirm it still matches.
        try:
            see_more_button = browser.find_by_css('div[role="button"] span.x193iq5w.xeuugli.x13faqbe.x1vvkbs.x1xmvt09.x6prxxf.xvq8zen.x1s688f.xzsf02u')
            if see_more_button:
                see_more_button.first.click()
            else:
                logger.info("See more button not found")
        except Exception as e:
            logger.warning(f"See more button error: {e}")

        # Parse the HTML
        html = browser.html
        market_soup = soup(html, 'html.parser')

        # Extract relevant data
        div_element = market_soup.find('div', class_='xckqwgs x26u7qi x2j4hbs x78zum5 xnp8db0 x5yr21d x1n2onr6 xh8yej3 xzepove x1stjdt1')
        div_text = div_element.get_text(separator=' ', strip=True) if div_element else ''

    except Exception as e:
        logger.error(f"Error processing {url}: {e}")
        return None

    finally:
        # Always release the browser, including on the error path, so a failed
        # scrape does not leave a headless Chrome process running.
        if browser is not None:
            try:
                browser.quit()
            except Exception as e:
                # A failed shutdown should not discard text already scraped.
                logger.warning(f"Failed to close browser for {url}: {e}")

    print(div_text)
    return div_text

In [6]:
# Scrape one Marketplace listing; the helper returns None if scraping failed.
description = extract_listing_for_url("https://www.facebook.com/marketplace/item/1312770963176477/")

INFO:__main__:Processing URL: https://www.facebook.com/marketplace/item/1312770963176477/
INFO:__main__:Closing pop-up


Private room for rent $800 / Month Rentals Salem, MA Listed 14 hours ago Message Message Save Share Save Share Unit Details Apartment 3 beds · 1 bath Unfurnished 2 persons live here Rental Location Salem, MA Location is approximate Description Private room for rent available May 1st!
$800 rent month to month plus roughly $150 for utilities (WiFi/cable/gas/electric)
Spacious apartment with a front and back porch and washer/dryer in the basement (coin-op) 

There is a great landlord that is very on top of managing the house.
Shared driveway parking and street parking 

About the current roomie: In early 30s and enjoys yoga, plants, cooking, movie nights, going out dancing and being out in nature. Very quiet and clean and looking for someone similar! 

Please message with what you do for work and your rental history See less Getting Around Provided by Walk Score®︎ Walk Score®︎ 86 out of 100 Most errands can be accomplished on foot. Transit Score®︎ 30 out of 100 A few nearby public transpo

In [7]:
import configparser
from snowflake.snowpark import Session
from snowflake.cortex import complete

def load_snowflake_config(config_path='configuration.properties'):
    """Load Snowflake connection parameters from config file.

    Args:
        config_path: Path of the INI-style file to read. Defaults to
            'configuration.properties' in the current working directory.

    Returns:
        dict with the keys "user", "password", "account", "warehouse",
        "database" and "schema", taken from the file's [snowflake] section.

    Raises:
        FileNotFoundError: If the file is missing or could not be opened.
        KeyError: If the [snowflake] section or any required key is absent.
    """
    required_keys = ("user", "password", "account", "warehouse", "database", "schema")

    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse, so an empty result means nothing was loaded.
    if not config.read(config_path):
        raise FileNotFoundError(
            f"Snowflake config file not found or unreadable: {config_path!r}"
        )

    if not config.has_section('snowflake'):
        raise KeyError(f"Missing [snowflake] section in {config_path!r}")

    # Values pass through configparser's default interpolation, so a literal
    # '%' in the file (e.g. inside a password) must be written as '%%'.
    section = config['snowflake']
    missing = [key for key in required_keys if key not in section]
    if missing:
        raise KeyError(
            f"Missing key(s) {missing} in [snowflake] section of {config_path!r}"
        )

    return {key: section[key] for key in required_keys}

In [8]:
# Read the Snowflake connection settings from configuration.properties.
connection_params = load_snowflake_config()

In [9]:
# LLM prompt: instructs the model to extract structured fields from a listing
# and reply as JSON. The listing text itself is appended when the model is called.
# NOTE(review): the field list spells the tag LAUNDARY_AVAILABLE while the JSON
# template uses "laundry_available", and "contact" is declared as a number but
# shown quoted. Left untouched here because editing the prompt text changes
# what is sent to the model -- confirm before correcting.
prompt = """
I will provide a property listing with multiple attributes. 
Please extract the following details from the listing and return them in JSON format. If a field is missing or not found, leave the field blank or set it to `null`.

- **Room Count**: The number of rooms/beds (ROOM_COUNT).
- **Bathroom Count**: The number of bathrooms (BATH_COUNT).
- **People Count**: The number of people living in the property (PEOPLE_COUNT).
- **Description**: The short description summary of the property. Translate to English language wherever necessary (DESCRIPTION_summary).
- **Contact**: The contact number for the listing (CONTACT).
- **Laundry Availability**: if laundry available then 1 or else 0 (LAUNDARY_AVAILABLE).
- **Room Type**: Type of room (e.g., Private or Shared) (ROOM_TYPE).
- **Other Details**: Any additional details in a dictionary format (OTHER_details).

Ensure the JSON response follows this structure:
{
    "room_count": number <room_count or null>,
    "bath_count": number <bath_count or null>,
    "people_count": number <people_count or null>,
    "description": string "<description_summary or null>",
    "contact": number "<contact_number or null>",
    "laundry_available": integer (0/1)<laundry_available or null>,
    "room_type": string "<room_type or null>",
    "other_details": dict <other_details_dict or null>
}
"""

In [10]:
# Open a Snowpark session with the loaded connection parameters.
session = Session.builder.configs(connection_params).create()

INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.14.0, Python Version: 3.10.0, Platform: Windows-10-10.0.26100-SP0
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
INFO:snowflake.connector.connection:This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.
INFO:snowflake.snowpark.session:Snowpark Session information: 
"version" : 1.29.1,
"python.version" : 3.10.0,
"python.connector.version" : 3.14.0,
"python.connector.session.id" : 1079046110701910,
"os.name" : Windows



In [11]:
# Refuse to call the model without listing text: the scraper returns None on
# failure (and '' when the listing container is not found), and interpolating
# that into the prompt would ask the LLM to extract fields from "None".
if not description:
    raise ValueError(
        "No listing description available; scraping failed or returned no text."
    )

# Send the extraction prompt followed by the scraped listing text.
response = complete(
                "claude-3-5-sonnet",
                # "mistral-7b",
                f"{prompt} \n\n listing description: \n {description}",
                session=session
            )

In [12]:
# Display the raw text returned by the model.
response

'{\n    "room_count": 3,\n    "bath_count": 1,\n    "people_count": 2,\n    "description": "Private room for rent available May 1st. Spacious apartment with front and back porch. Located in Salem, MA. Current roommate is in early 30s, enjoys yoga, plants, cooking, movie nights, dancing and nature. Looking for quiet and clean roommate.",\n    "contact": null,\n    "laundry_available": 1,\n    "room_type": "Private",\n    "other_details": {\n        "rent": "$800/month",\n        "utilities": "$150/month",\n        "amenities": [\n            "front porch",\n            "back porch",\n            "coin-op washer/dryer",\n            "shared driveway parking",\n            "street parking"\n        ],\n        "furnished": "Unfurnished",\n        "walk_score": 86,\n        "transit_score": 30,\n        "bike_score": 70\n    }\n}'