In [17]:
# Web scraper that collects listing data from mls.foreclosure.com
import requests
from bs4 import BeautifulSoup
import json

# Search-results page to scrape (listings for Austin, TX).
url = 'https://mls.foreclosure.com/listing/search.html?ci=austin&st=tx'

# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Fetch the page. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() surfaces HTTP errors here instead of
# silently parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# The listing data is embedded as JSON inside <script type="application/ld+json"> tags
scripts = soup.find_all('script', type='application/ld+json')

# One dict of basic details per listing
properties = []

for script in scripts:
    # script.string is None when the tag is empty or has several children;
    # skip it rather than let json.loads(None) raise an uncaught TypeError.
    if not script.string:
        continue
    try:
        data = json.loads(script.string)
    except json.JSONDecodeError:
        # Not valid JSON: move on to the next script tag
        continue

    # A JSON-LD block may hold a single object or a list of objects.
    for entry in (data if isinstance(data, list) else [data]):
        if not isinstance(entry, dict):
            continue

        # NOTE(review): the recorded run shows the address fields coming back
        # as None while the nested geo values are populated, which suggests
        # they live under an 'address' object (schema.org PostalAddress) --
        # confirm against the live payload. The top-level key is kept as a
        # fallback so the previous behaviour is preserved.
        address = entry.get('address')
        if not isinstance(address, dict):
            address = {}
        # Guard against "geo": null, which would break a chained .get()
        geo = entry.get('geo')
        if not isinstance(geo, dict):
            geo = {}

        properties.append({
            'street_address': address.get('streetAddress', entry.get('streetAddress')),
            'locality': address.get('addressLocality', entry.get('addressLocality')),
            'region': address.get('addressRegion', entry.get('addressRegion')),
            'postal_code': address.get('postalCode', entry.get('postalCode')),
            'latitude': geo.get('latitude'),
            'longitude': geo.get('longitude'),
            'details_url': entry.get('url'),
            # If more details like bedrooms are consistently formatted, add them here
        })

print('Number of properties:', len(properties))

Number of properties: 25


In [20]:
from tomlkit import key


# Inspect the detail page of the first scraped listing.
# (Named `first_property` so the builtin `property` is not shadowed.)
first_property = properties[0]
details_url = first_property['details_url']

# Fetch the detail page; fail fast on a stalled connection or an HTTP error
# instead of parsing an error page.
response = requests.get(details_url, headers=headers, timeout=30)
response.raise_for_status()

# Tip: print(response.text) shows the raw HTML if the page structure needs inspecting.
soup = BeautifulSoup(response.text, 'html.parser')

# Maps each attribute label on the page to its displayed value
property_info = {}

# Each attribute is rendered as an <li class="lci"> element
info_items = soup.find_all('li', class_='lci')

for item in info_items:
    # The label is contained within a span with class 'attr-label'
    label_span = item.find('span', class_='attr-label')
    # The value is the text of an <a> tag inside the item; in the recorded run
    # every value was the placeholder 'Join now for details'.
    value_a = item.find('a')

    label = label_span.get_text(strip=True) if label_span else None
    value = value_a.get_text(strip=True) if value_a else 'Details require login'

    # Items without a label cannot be keyed, so they are dropped
    if label:
        property_info[label] = value

print(property_info)

{'Day(s) On Site:': 'Join now for details', 'Trustee Name:': 'Join now for details', 'Trustee Address:': 'Join now for details', 'Trustee City:': 'Join now for details', 'Trustee State:': 'Join now for details', 'Trustee Zip:': 'Join now for details', 'Trustee Phone:': 'Join now for details', 'Court Name:': 'Join now for details', 'Court Address:': 'Join now for details', 'Attorney Law Firm:': 'Join now for details', 'Attorney Name:': 'Join now for details', 'Attorney Address:': 'Join now for details', 'Attorney City:': 'Join now for details', 'Attorney State:': 'Join now for details', 'Attorney Zip:': 'Join now for details', 'Case Number:': 'Join now for details', 'Filing Date:': 'Join now for details', 'Filing Type:': 'Join now for details', 'Debtor Name:': 'Join now for details', 'Record Type:': 'Join now for details', 'County Site:': 'Join now for details', 'County Appraiser Site:': 'Join now for details'}


In [3]:
# Not working for listing photos: in the recorded run the static HTML held in
# `soup` only exposed site logo images. The Selenium cell further down reads
# the photo URLs from the rendered page instead.

# Collect the 'src' of every <img> tag, skipping tags whose src is missing or empty
image_urls = [
    img_tag.get('src')
    for img_tag in soup.find_all('img')
    if img_tag.get('src')
]

# The loop variable is `image_url` rather than `url` so that it does not
# overwrite the notebook-level `url` that holds the search page address.
for image_url in image_urls:
    print(image_url)

//dlvp94zy6vayf.cloudfront.net/static/css/cobrands/mls/mls_logo-78f50000b9bad0793cf75e320e633cb5.png
//dlvp94zy6vayf.cloudfront.net/static/css/cobrands/fdc/img/big-fdc-logo-print-f8a6264f13df20b3ebda402031729639.png
//dlvp94zy6vayf.cloudfront.net/static/css/img/FDC_logo_109x30-93d8fad80ef25c8ba28ca11651b9cba4.png


In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

# Set up the Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in headless mode, without a UI
options.add_argument('--disable-gpu')  # Disable GPU hardware acceleration
options.add_argument('--no-sandbox')  # Disable the sandbox for the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# List to store image source URLs
img_sources = []

# try/finally guarantees the browser is shut down even if navigation or the
# element lookup raises; otherwise a headless Chrome process is left running.
try:
    # Navigate to the page URL
    driver.get(details_url)

    # Wait for JavaScript to load
    time.sleep(5)  # Adjust the sleep time as necessary

    # Find the image tags with the class 'img-responsive'
    img_tags = driver.find_elements(By.CLASS_NAME, 'img-responsive')

    # Read the attributes while the browser is still open: element handles
    # cannot be queried after driver.quit().
    for img in img_tags:
        img_src = img.get_attribute('src')
        if img_src:  # Only add if the src is not None or empty
            img_sources.append(img_src)
finally:
    # Quit the browser
    driver.quit()

# Print out the image source URLs
for src in img_sources:
    print(src)


https://dlvp94zy6vayf.cloudfront.net/listingphoto/GJYFSQ3XPFGUK6SYHBPXSSLDGB4XA5DNGU2WC422NQZTSUDSIRVVIWRQOFMWESDVOJZGG===.jpg
https://dlvp94zy6vayf.cloudfront.net/listingphoto/L5HFGZTMKRSS25SOKFNDOSTHNE3XMUDMJU2UOTJWLJLDC3CRONMFIWRQOFMWESDVOJZGG===.jpg
https://dlvp94zy6vayf.cloudfront.net/listingphoto/KVYGSMDQGF2WUY2TKRJFAYLOINSVGTCVNBLXC4DKGZVDM33MNN3VIWRQOFMWESDVOJZGG===.jpg
https://dlvp94zy6vayf.cloudfront.net/listingphoto/JRJG66SUMQ2WCS3FKV2TGWKLLBRHCLLMGYYGU327IRLUOV2MOJUFIWRQOFMWESDVOJZGG===.jpg
https://dlvp94zy6vayf.cloudfront.net/listingphoto/JI3HMT2RGBIDGTSCOJVHKMDVKRCTOVDYJI3UUUJTNEWVOWTTNBGQ====.jpg
https://dlvp94zy6vayf.cloudfront.net/listingphoto/KVJUQZTDPIYDK5DPPFHTOTSGGM2G65DQOZTFMV2GLF3VKMCTMZQVIWRQOFMWESDVOJZGG===.jpg
https://dlvp94zy6vayf.cloudfront.net/listingphoto/KI2HSRSYGRHFKZKUGF2W4X3GKZBTARLCKBLXQMSONJTDI3TYFVUVIWRQOFMWESDVOJZGG===.jpg
https://dlvp94zy6vayf.cloudfront.net/listingphoto/ORAWI3SXOE2DC6DDGZGC2QSONEZXG3RYKVRDSQLRJJDEUV2VJ5RFIWRQOFMWESDVOJZGG===.jpg


In [21]:
import requests
from bs4 import BeautifulSoup
import json

from urllib.parse import urljoin

# Base URL used to resolve relative detail links
base_url = 'https://mls.foreclosure.com'

# Search-results page to scrape
search_url = 'https://mls.foreclosure.com/listing/search.html?ci=austin&st=tx'

# Browser-like User-Agent header sent with every request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Fetch the search page; fail fast on a stalled connection or an HTTP error.
response = requests.get(search_url, headers=headers, timeout=30)
response.raise_for_status()

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# The listing data is embedded as JSON inside <script type="application/ld+json"> tags
scripts = soup.find_all('script', type='application/ld+json')

# One dict per listing: basic details first, detail-page attributes merged in below
properties = []

# --- Step 1: basic details from the embedded JSON ---------------------------
for script in scripts:
    # script.string is None when the tag is empty or has several children;
    # skip it rather than let json.loads(None) raise an uncaught TypeError.
    if not script.string:
        continue
    try:
        data = json.loads(script.string)
    except json.JSONDecodeError:
        continue

    # A JSON-LD block may hold a single object or a list of objects.
    for entry in (data if isinstance(data, list) else [data]):
        if not isinstance(entry, dict):
            continue

        # NOTE(review): the recorded run shows the address fields coming back
        # as None while the nested geo values are populated, which suggests
        # they live under an 'address' object (schema.org PostalAddress) --
        # confirm against the live payload. The top-level key is kept as a
        # fallback so the previous behaviour is preserved.
        address = entry.get('address')
        if not isinstance(address, dict):
            address = {}
        # Guard against "geo": null, which would break a chained .get()
        geo = entry.get('geo')
        if not isinstance(geo, dict):
            geo = {}

        properties.append({
            'street_address': address.get('streetAddress', entry.get('streetAddress')),
            'locality': address.get('addressLocality', entry.get('addressLocality')),
            'region': address.get('addressRegion', entry.get('addressRegion')),
            'postal_code': address.get('postalCode', entry.get('postalCode')),
            'latitude': geo.get('latitude'),
            'longitude': geo.get('longitude'),
            'details_url': entry.get('url'),
        })

# --- Step 2: detail-page attributes for every listing ------------------------
# (Loop variable is `listing` so the builtin `property` is not shadowed.)
for listing in properties:
    details_url = listing['details_url']
    if not details_url:
        # No link in the JSON for this listing: nothing to fetch
        continue
    # urljoin leaves absolute URLs untouched and resolves relative ones
    # correctly whether or not they start with a slash.
    details_url = urljoin(base_url, details_url)

    try:
        response = requests.get(details_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable page should not discard the listings already fetched
        print(f'Skipping {details_url}: {exc}')
        continue
    soup = BeautifulSoup(response.text, 'html.parser')

    # Maps each attribute label on the page to its displayed value
    property_info = {}

    # Each attribute is rendered as an <li class="lci"> element
    info_items = soup.find_all('li', class_='lci')
    for item in info_items:
        label_span = item.find('span', class_='attr-label')
        value_a = item.find('a')
        label = label_span.get_text(strip=True) if label_span else None
        value = value_a.get_text(strip=True) if value_a else 'Details require login'
        if label:
            property_info[label] = value

    # Merge the detail attributes into the listing's record
    listing.update(property_info)

# Now, `properties` list will contain all the property details
for listing in properties:
    print(listing)


{'street_address': None, 'locality': None, 'region': None, 'postal_code': None, 'latitude': '30.209', 'longitude': '-97.793', 'details_url': 'https://mls.foreclosure.com/address/Homespun-Rd-Austin-TX-78745/61749800_lid', 'Day(s) On Site:': 'Join now for details', 'Trustee Name:': 'Join now for details', 'Trustee Address:': 'Join now for details', 'Trustee City:': 'Join now for details', 'Trustee State:': 'Join now for details', 'Trustee Zip:': 'Join now for details', 'Trustee Phone:': 'Join now for details', 'Court Name:': 'Join now for details', 'Court Address:': 'Join now for details', 'Attorney Law Firm:': 'Join now for details', 'Attorney Name:': 'Join now for details', 'Attorney Address:': 'Join now for details', 'Attorney City:': 'Join now for details', 'Attorney State:': 'Join now for details', 'Attorney Zip:': 'Join now for details', 'Case Number:': 'Join now for details', 'Filing Date:': 'Join now for details', 'Filing Type:': 'Join now for details', 'Debtor Name:': 'Join now 

In [29]:
# Print every populated field of every listing as "label: value".
# (Loop variable is `listing` so the builtin `property` is not shadowed.)
for listing in properties:
    for key, value in listing.items():
        # Identity check is the correct test for None
        if value is None:
            continue
        # Labels scraped from the detail page already end with ':'; strip it
        # so the line does not read "Label:: value".
        print(f"{key.rstrip(':')}: {value}")

latitude: 30.209
longitude: -97.793
details_url: https://mls.foreclosure.com/address/Homespun-Rd-Austin-TX-78745/61749800_lid
Day(s) On Site:: Join now for details
Trustee Name:: Join now for details
Trustee Address:: Join now for details
Trustee City:: Join now for details
Trustee State:: Join now for details
Trustee Zip:: Join now for details
Trustee Phone:: Join now for details
Court Name:: Join now for details
Court Address:: Join now for details
Attorney Law Firm:: Join now for details
Attorney Name:: Join now for details
Attorney Address:: Join now for details
Attorney City:: Join now for details
Attorney State:: Join now for details
Attorney Zip:: Join now for details
Case Number:: Join now for details
Filing Date:: Join now for details
Filing Type:: Join now for details
Debtor Name:: Join now for details
Record Type:: Join now for details
County Site:: Join now for details
County Appraiser Site:: Join now for details
latitude: 30.302
longitude: -97.932
details_url: https://mls.

In [24]:
# test: run the detail-page extraction against one hard-coded listing URL.
# In the recorded run this page produced an empty dict, so the raw HTML is
# printed as well for inspection.

details_url = 'https://mls.foreclosure.com/address/Ed-Bluestein-Blvd-Austin-TX-78721/61742859_lid'

# Download the page and build the parse tree
response = requests.get(details_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# Every attribute row is an <li class="lci">
info_items = soup.find_all('li', class_='lci')

# label text -> value text
property_info = {}

for row in info_items:
    # Rows without an 'attr-label' span (or with an empty label) are skipped
    label_tag = row.find('span', class_='attr-label')
    if not label_tag:
        continue
    label = label_tag.get_text(strip=True)
    if not label:
        continue

    # The value sits in the row's <a> tag; fall back to a placeholder when absent
    link_tag = row.find('a')
    if link_tag:
        value = link_tag.get_text(strip=True)
    else:
        value = 'Details require login'

    property_info[label] = value

print(property_info)
print(response.text)

{}
<!doctype html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>
Ed Bluestein Blvd Austin Texas 78721 | mls.foreclosure.com </title>
<style><!--
//-->
</style>
<!--<link rel="stylesheet" type="text/css" href="//dlvp94zy6vayf.cloudfront.net/static/css/printstyle-612c7575c94c32eb6e97a2da48e8ca8e.css" media="print" />-->
<meta name="description" content="This is a preforeclosure property located at in Austin, . View all the latest property details for homes in Austin, to get a feel for real estate in the neighborhood and/or surrounding area(s).">
<meta name="keywords" content="Austin preforeclosure, Travis preforeclosure home, Travis County TX preforeclosure, preforeclosure property, Austin new preforeclosure">
<meta property="og:type" content="foreclosure:home"/>
<meta property="og:latitude" content="30.273388"/>
<meta property="og:longitude" content="-97.662539"/>
<meta property="og:street-address" content="Ed Bluestein Blvd"/>
<meta property