In [2]:
import logging
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.options import Options

logging.basicConfig(level=logging.INFO)

In [5]:
# Launch Edge with a custom user-agent, open the Marketplace property
# listings (sorted newest first per the URL's sortBy parameter), scroll the
# page, and capture the resulting HTML in `html_content`.
edge_options = Options()
user_agent = "Edge/122.0.2365.92"
edge_options.add_argument(f'user-agent={user_agent}')
url = "https://www.facebook.com/marketplace/104146386288393/propertyforsale/?sortBy=creation_time_descend"
driver = webdriver.Edge(options=edge_options)
try:
    driver.get(url)
    time.sleep(3)  # fixed pause after navigation before interacting with the page

    # Send PAGE_DOWN ten times, pausing between presses — presumably to let
    # lazily loaded listings appear before the page source is captured.
    for _ in range(10):
        # <body> is looked up again on every pass rather than cached.
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
        time.sleep(1)

    html_content = driver.page_source
finally:
    # Always close the browser, even when navigation or scrolling raised,
    # so a failed run does not leave an orphaned Edge/driver process behind.
    driver.quit()

# Parse the captured page with the built-in HTML parser.
soup = BeautifulSoup(html_content, features='html.parser')

# Tag names whose elements are stripped from the tree before extraction.
unwanted_tags = ['script', 'style', 'button', 'input', 'img', 'video', 'head', 'svg']

# Go through the tag names in list order and drop every matching element.
for tag_name in unwanted_tags:
    for node in soup.find_all(name=tag_name):
        node.decompose()  # removes the element from the soup and destroys it


In [18]:
# Locate the results container, then the individual listing cards inside it.
# Both lookups rely on hard-coded class strings — presumably copied from the
# live page markup, so they may stop matching if the site changes.
section = soup.find("div", class_="x8gbvx8 x78zum5 x1q0g3np x1a02dak x1nhvcw1 x1rdy4ex xcud41i x4vbgl9 x139jcc6")
if section is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.find_all`.
    raise RuntimeError(
        "Marketplace results container not found in the captured page; "
        "the page may not have loaded or its markup/class names changed."
    )
listings = section.find_all("div", class_="x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24")

In [33]:
# Build one record per listing card and collect them into `df`.
# Any field whose element is not found in a card is stored as None.
data = []

for listing in listings:

    # Extracting the link
    link_element = listing.find('a', class_='x1i10hfl')
    # .get() returns None instead of raising KeyError when the anchor has no href.
    link_href = link_element.get('href') if link_element else None
    # urljoin resolves site-relative paths against the host and leaves an
    # already-absolute URL unchanged (plain prefixing would corrupt it).
    link_href = urljoin("https://www.facebook.com", link_href) if link_href else None

    # Extracting the price
    price_element = listing.find('span', class_='x193iq5w')
    price = price_element.get_text() if price_element else None

    # Extracting the property details: the first span whose inline style
    # contains '-webkit-line-clamp: 2'
    details_element = listing.find('span', style=lambda value: value and '-webkit-line-clamp: 2' in value)
    details = details_element.get_text() if details_element else None

    # Extracting the address
    address_element = listing.find('span', class_='x1nxh6w3')
    address = address_element.get_text() if address_element else None

    data.append({
        "Price": price,
        "Details": details,
        "Address": address,
        "Link": link_href
    })

# Pin the columns so the frame has the expected schema even when no listings
# were found (an empty record list would otherwise yield a column-less frame).
df = pd.DataFrame(data, columns=["Price", "Details", "Address", "Link"])

In [38]:
# Show the extracted listings table as this cell's output.
df

Unnamed: 0,Price,Details,Address,Link
0,"$15,000",1 Bed 1 Bath - House,"Worcester, MA",https://www.facebook.com/marketplace/item/9618...
1,"$629,900",4 Beds 2 Baths House,"Tyngsboro, MA",https://www.facebook.com/marketplace/item/2607...
2,"$950,000",4 Beds 2.5 Baths - House,"Hampstead, NH",https://www.facebook.com/marketplace/item/7921...
3,$750,2 Beds 1 Bath - House,"Roxbury, MA",https://www.facebook.com/marketplace/item/1370...
4,$300,3 Drawer Night Stand,"Pawtucket, RI",https://www.facebook.com/marketplace/item/7195...
5,$620,Drawer Chest,"Pawtucket, RI",https://www.facebook.com/marketplace/item/9418...
6,"$689,900",4 Beds 2.5 Baths - House,"East Bridgewater, MA",https://www.facebook.com/marketplace/item/7501...
7,"$489,900",3 Beds 2 Baths - Townhouse,"Whitman, MA",https://www.facebook.com/marketplace/item/3768...
8,"$15,000",1 Bed 1 Bath - House,"Worcester, MA",https://www.facebook.com/marketplace/item/1899...
9,"$1,379,000",4 Beds 3.5 Baths House,"Acton, MA",https://www.facebook.com/marketplace/item/3501...


In [39]:
# Filter the scraped listings:
# 1. Remove entries where Link is None
# 2. Remove entries with price < $100,000
# - Not done here: entries located outside MA could also be removed
#   (NOTE(review): original note read "based out of MA" — confirm intent).
df_filtered = df[df['Link'].notna()]

# Strip "$" and "," from the price text and parse it as a number. Values that
# still are not numeric become NaN, which fails the >= comparison and is
# therefore dropped instead of raising.
price_numeric = pd.to_numeric(
    df_filtered['Price'].replace(r'[$,]', '', regex=True),
    errors='coerce',
)
df_filtered = df_filtered[price_numeric >= 100000]
df_filtered

Unnamed: 0,Price,Details,Address,Link
1,"$629,900",4 Beds 2 Baths House,"Tyngsboro, MA",https://www.facebook.com/marketplace/item/2607...
2,"$950,000",4 Beds 2.5 Baths - House,"Hampstead, NH",https://www.facebook.com/marketplace/item/7921...
6,"$689,900",4 Beds 2.5 Baths - House,"East Bridgewater, MA",https://www.facebook.com/marketplace/item/7501...
7,"$489,900",3 Beds 2 Baths - Townhouse,"Whitman, MA",https://www.facebook.com/marketplace/item/3768...
9,"$1,379,000",4 Beds 3.5 Baths House,"Acton, MA",https://www.facebook.com/marketplace/item/3501...
10,"$950,000",3 Beds 2 Baths Apartment,"South Boston, MA",https://www.facebook.com/marketplace/item/2738...
11,"$479,000",2 Beds 3 Baths Townhouse,"Tewksbury, MA",https://www.facebook.com/marketplace/item/1363...
12,"$159,000",2 Beds 1 Bath - House,"Middleboro, MA",https://www.facebook.com/marketplace/item/8048...
13,"$829,900",2 Beds 2 Baths House,"Swampscott, MA",https://www.facebook.com/marketplace/item/6166...
15,"$724,900",2 Beds 2 Baths - House,"Rowley, MA",https://www.facebook.com/marketplace/item/1134...


In [40]:
# Write the DataFrame to disk twice: once as CSV and once as an Excel
# workbook, both without the index column.
# NOTE(review): this saves the unfiltered `df`, not `df_filtered` — confirm
# that is the intended output.
csv_file_path = 'fb_listings.csv'
excel_file_path = 'fb_listings.xlsx'

for out_path, write in ((csv_file_path, df.to_csv), (excel_file_path, df.to_excel)):
    write(out_path, index=False)
    print(f'Saved extracted information to {out_path}')

Saved extracted information to fb_listings.csv
Saved extracted information to fb_listings.xlsx
