# House Pricing Data Scraper
This notebook demonstrates how to scrape house pricing data from multiple websites dynamically. The configuration for each website is stored in a separate JSON file to keep the main code clean and professional.

In [1]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from pathlib import Path
from time import sleep
import logging

# Configure the root logger: report INFO and above, each record prefixed
# with its timestamp and severity level
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

## Load Configuration
The configuration file contains the details required for scraping each website, such as base URLs and labels for the required data fields.


In [2]:
# Load configuration
config_path = Path("../config.json")
# Decode explicitly as UTF-8 (the standard JSON encoding) rather than relying
# on the platform default, which can mis-read non-ASCII values on some systems.
with config_path.open(encoding="utf-8") as config_file:
    config = json.load(config_file)

logging.info("Configuration loaded successfully.")

2024-07-27 18:00:27,723 - INFO - Configuration loaded successfully.


## Functions for Scraping
We define functions to collect property links from a listing page and to extract data from an individual property page.


In [3]:
# Function to get property links from a page
def get_property_links(base_url, page_num, listing_div_class, timeout=30):
    """Return the property links found on one page of search results.

    Args:
        base_url: Listing URL prefix; the page number is appended to it.
        page_num: Page number to fetch.
        listing_div_class: CSS class of the ``div`` that wraps each listing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list holding the ``href`` of the first link (with an ``href``)
        inside each listing div. Divs without such a link are skipped. The
        list is empty when the page has no listings or the server answers
        with an error status.
    """
    url = f"{base_url}{page_num}"
    logging.info(f"Fetching property links from page {page_num}...")
    # Without a timeout a stalled connection would block the run indefinitely.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # An error page holds no listings; report it instead of parsing it.
        logging.warning(
            f"Page {page_num} returned HTTP {response.status_code}; "
            "no links collected."
        )
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    links = []
    for div in soup.find_all("div", class_=listing_div_class):
        # href=True ignores anchors lacking an href, which would otherwise
        # raise KeyError on the subscript below.
        anchor = div.find("a", href=True)
        if anchor is not None:
            links.append(anchor["href"])
    logging.info(f"Found {len(links)} property links on page {page_num}.")
    return links


# Helper: read the value displayed next to a text label on a property page
def _text_after_label(soup, label_text):
    """Return the text of the span following the span whose text is *label_text*.

    Returns "N/A" when the label span, or a span after it, cannot be found.
    """
    # `string=` is the current name of the deprecated `text=` filter.
    label_elem = soup.find("span", string=label_text)
    if label_elem is None:
        return "N/A"
    value_elem = label_elem.find_next("span")
    if value_elem is None:
        return "N/A"
    return value_elem.text.strip()


# Function to extract data from a single property page
def extract_property_data(url, base_domain, labels, neighborhoods, timeout=30):
    """Fetch one property page and extract its listing fields.

    Args:
        url: Link to the property page. Absolute http(s) links are used
            as-is; anything else is appended to *base_domain*.
        base_domain: Site prefix prepended to non-absolute links.
        labels: Field lookup settings. Reads ``labels["price"]["aria_label"]``,
            ``labels["location"]["aria_label"]`` and the ``"text_label"`` of
            ``"area"``, ``"bedrooms"`` and ``"bathrooms"``.
        neighborhoods: Iterable of neighborhood names; a location is kept
            only if it contains at least one of them.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``price``, ``area``, ``bedrooms``, ``bathrooms``
        and ``location``. Any value that cannot be found is "N/A"; the
        location is also "N/A" when it matches none of *neighborhoods*.
    """
    # Listing hrefs may already be absolute; prefixing those would corrupt them.
    if url.startswith(("http://", "https://")):
        full_url = url
    else:
        full_url = base_domain + url
    logging.info("Fetching property data from page...")
    # Without a timeout a stalled connection would block the run indefinitely.
    response = requests.get(full_url, timeout=timeout)
    if not response.ok:
        # Parsing continues and simply yields "N/A" fields for an error page.
        logging.warning(f"Property page returned HTTP {response.status_code}.")
    soup = BeautifulSoup(response.content, "html.parser")
    data = {}

    # Extract price
    price_elem = soup.find("span", {"aria-label": labels["price"]["aria_label"]})
    data["price"] = price_elem.text.strip() if price_elem is not None else "N/A"

    # Extract area, bedrooms and bathrooms (value span follows a label span)
    for field in ("area", "bedrooms", "bathrooms"):
        data[field] = _text_after_label(soup, labels[field]["text_label"])

    # Extract location
    location_elem = soup.find("span", {"aria-label": labels["location"]["aria_label"]})
    location = location_elem.text.strip() if location_elem is not None else "N/A"

    # Keep only locations in a known neighborhood, tagging them with the city
    if any(neighborhood in location for neighborhood in neighborhoods):
        if "Alexandria" not in location:
            location += ", Alexandria"
    else:
        location = "N/A"

    data["location"] = location

    logging.info(f"Extracted data: {data}")
    return data

## Scraping Data
We define a function to manage the entire scraping process for each website, collect the data, and store it in a DataFrame.


In [4]:
# Scrape data from a given website configuration
def scrape_website(config, max_pages=40, delay=1):
    """Scrape every listing of one website into a DataFrame.

    Args:
        config: Website settings. Reads the keys ``BASE_DOMAIN``,
            ``BASE_PATH``, ``LISTING_DIV_CLASS``, ``FIELDS`` and
            ``NEIGHBORHOODS``.
        max_pages: Maximum number of result pages to walk through.
        delay: Seconds to pause after each request, to avoid getting blocked
            by the website.

    Returns:
        A DataFrame with the columns ``price``, ``area``, ``bedrooms``,
        ``bathrooms`` and ``location``, one row per listing whose location
        is not "N/A". The columns are present even when no listing is found.
    """
    columns = ["price", "area", "bedrooms", "bathrooms", "location"]
    all_links = []
    base_url = config["BASE_DOMAIN"] + config["BASE_PATH"]

    # Collect property links from up to `max_pages` result pages
    for page in range(1, max_pages + 1):
        logging.info(f"Collecting links from page {page}...")
        links = get_property_links(base_url, page, config["LISTING_DIV_CLASS"])
        if not links:
            logging.info("No more links found, stopping.")
            break
        all_links.extend(links)
        sleep(delay)  # To avoid getting blocked by the website

    # The same listing can show up on several pages; fetch each link once,
    # keeping first-seen order.
    unique_links = list(dict.fromkeys(all_links))

    # Collect detailed data from each property link
    listings = []
    for link in unique_links:
        logging.info("Extracting data from a property link...")
        data = extract_property_data(
            link, config["BASE_DOMAIN"], config["FIELDS"], config["NEIGHBORHOODS"]
        )
        if data["location"] != "N/A":
            listings.append(data)
        sleep(delay)  # To avoid getting blocked by the website

    # Explicit columns keep the schema stable when `listings` is empty.
    return pd.DataFrame(listings, columns=columns)

## Main Scraping Process
We loop through each website in the configuration, scrape the data, and combine it into a single DataFrame.


In [None]:
# Scrape data for each website in the configuration
all_data = []
for website_config in config["websites"].values():
    logging.info("Scraping data from a website configuration...")
    all_data.append(scrape_website(website_config))

# Combine all data into a single DataFrame
# (pd.concat raises ValueError on an empty list, e.g. when no website is configured)
if all_data:
    all_data_df = pd.concat(all_data, ignore_index=True)
else:
    all_data_df = pd.DataFrame()
logging.info("Data scraping completed successfully.")

## Save and Display Data
We save the combined data to a CSV file and display the first few rows for verification.


In [None]:
# Save the data to a CSV file
output_path = Path("../data/house_pricing_data.csv")
# to_csv does not create missing directories, so make sure the target exists
output_path.parent.mkdir(parents=True, exist_ok=True)
all_data_df.to_csv(output_path, index=False)
logging.info(f"Data saved to {output_path}")

# Display the first few rows of the data
all_data_df.head()

## Load Existing Data and Check for New Entries
We load the existing dataset, check for new entries, and add only the new entries to the dataset.

In [None]:
# Columns that together identify a listing when looking for duplicates
dedup_columns = ["price", "area", "bedrooms", "bathrooms", "location"]

# Load existing data
existing_data_path = Path("../data/house_pricing_data.csv")
existing_data_df = pd.DataFrame()
if existing_data_path.exists():
    try:
        # Scraped values are strings (including the "N/A" placeholder). Read
        # the CSV the same way: type inference would turn "3" into 3 and
        # "N/A" into NaN, so stored rows would never match new ones.
        existing_data_df = pd.read_csv(
            existing_data_path, dtype=str, keep_default_na=False
        )
    except pd.errors.EmptyDataError:
        # A previous run saved an empty dataset; start from scratch.
        logging.warning(f"Existing data file is empty: {existing_data_path}")

# Scrape data for each website in the configuration
all_data = []
for website_config in config["websites"].values():
    logging.info("Scraping data from a website configuration...")
    all_data.append(scrape_website(website_config))

# Combine all new data into a single DataFrame
# (pd.concat raises ValueError on an empty list)
new_data_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()

# Ensure that "Alexandria" is added to locations if missing
# (skipped for an empty scrape, which may not have a "location" column)
if not new_data_df.empty:
    new_data_df["location"] = new_data_df["location"].apply(
        lambda x: x if "Alexandria" in x else x + ", Alexandria"
    )

# Keep only entries that are not already present; existing rows come first
# so that keep="first" favors them over freshly scraped duplicates.
frames = [df for df in (existing_data_df, new_data_df) if not df.empty]
if frames:
    combined_df = pd.concat(frames, ignore_index=True).drop_duplicates(
        subset=dedup_columns, keep="first"
    )
else:
    combined_df = new_data_df

# Save the combined data to a CSV file
existing_data_path.parent.mkdir(parents=True, exist_ok=True)
combined_df.to_csv(existing_data_path, index=False)
logging.info(f"Data saved to CSV file: {existing_data_path}")

# Display the first few rows of the data
combined_df.head()

## Remove Duplicates from the Dataset
We check for and remove duplicate rows from the dataset, then save the cleaned dataset back to the CSV file.


In [None]:
# Load the dataset
file_path = "../data/house_pricing_data.csv"
# Read every column as text and keep "N/A" literal. With the default type
# inference, saving the file back would rewrite values ("N/A" becomes blank,
# integers in columns with gaps become "3.0").
house_pricing_data = pd.read_csv(file_path, dtype=str, keep_default_na=False)

# Check for duplicate rows (every occurrence after the first one)
duplicates = house_pricing_data[house_pricing_data.duplicated()]

# Display the number of duplicate rows and the duplicate rows themselves
num_duplicates = duplicates.shape[0]
print(f"Number of duplicate rows: {num_duplicates}")
print("Duplicate rows:")
print(duplicates)

# Remove duplicates from the dataset
house_pricing_data_cleaned = house_pricing_data.drop_duplicates()

# Save the cleaned dataset to the original CSV file
cleaned_file_path = file_path
house_pricing_data_cleaned.to_csv(cleaned_file_path, index=False)
logging.info(f"Cleaned data saved to {cleaned_file_path}")

# Display the first few rows of the cleaned data
house_pricing_data_cleaned.head()

## Summary and Next Steps
The notebook scrapes house pricing data from multiple websites, keeps only the listings located in the configured neighborhoods, and removes duplicate entries. The resulting data is saved to a CSV file for further analysis and for building deep learning models to predict house prices.
