In [31]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time
import os

# Create the output folder for the per-page CSV files.
# exist_ok=True replaces the separate exists() check: it removes the
# check-then-create race, keeps the cell safe to re-run, and still raises
# FileExistsError if 'database' exists but is a regular file.
os.makedirs('database', exist_ok=True)

class RightmoveScraper:
    """Scrape Rightmove house-price result pages into per-page CSV files.

    Each results page is loaded in a headless Chrome session; the address,
    price and property type of every listing card are written to
    ``<output_dir>/rightmove_data_page_<n>.csv``. ``combine_files`` merges
    those per-page files into ``<output_dir>/combined_data.csv``.

    Args:
        base_url: Results URL for page 1. Later pages append ``?page=<n>``.
            Defaults to the class-level ``base_url``.
        output_dir: Folder that receives the CSV files. Defaults to the
            class-level ``output_dir`` (``'database'``).
    """

    # Class-level defaults; __init__ overrides them per instance when asked.
    base_url = "https://www.rightmove.co.uk/house-prices/southwark-85215.html"
    output_dir = 'database'

    # Naming scheme shared by scrape_page (writer) and combine_files (reader).
    PAGE_FILE_PREFIX = 'rightmove_data_page_'
    PAGE_FILE_SUFFIX = '.csv'
    COMBINED_FILE_NAME = 'combined_data.csv'

    # Seconds to wait for the cookie banner before assuming there is none.
    COOKIE_WAIT_SECONDS = 5

    def __init__(self, base_url=None, output_dir=None):
        if base_url is not None:
            self.base_url = base_url
        if output_dir is not None:
            self.output_dir = output_dir

        # Setup Chrome driver
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, 10)

    def handle_cookies(self):
        """Reject the cookie banner if one shows up (best effort, never raises).

        The button may not be in the DOM yet when ``driver.get`` returns, so
        wait briefly for it to become clickable instead of looking it up once.
        """
        try:
            cookie_button = WebDriverWait(self.driver, self.COOKIE_WAIT_SECONDS).until(
                EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))
            )
            cookie_button.click()
            time.sleep(1)
        except (NoSuchElementException, TimeoutException):
            # No banner appeared: nothing to dismiss.
            pass
        except Exception as e:
            # Still best effort, but no longer silent, and KeyboardInterrupt /
            # SystemExit now propagate instead of being swallowed.
            print(f"Could not dismiss cookie banner: {str(e)}")

    def get_page_url(self, page):
        """Return the URL for results page ``page`` (1-based)."""
        return self.base_url if page == 1 else f"{self.base_url}?page={page}"

    def _extract_listing(self, listing):
        """Return one listing card as a dict, or None if a field cannot be read."""
        try:
            address = listing.find_element(By.CSS_SELECTOR, 'a.title.clickable').text.strip()
            price = listing.find_element(By.CSS_SELECTOR, 'td.price').text.strip()
            property_type = listing.find_element(By.CSS_SELECTOR, 'span.propertyType').text.strip()
        except Exception:
            # Deliberately broad so one bad card never loses the whole page;
            # unlike a bare except it lets a kernel interrupt through.
            return None

        return {
            'address': address,
            'price': price,
            'property_type': property_type
        }

    def scrape_page(self, page):
        """Scrape one results page and save its listings to a CSV file.

        Errors are reported on stdout rather than raised so that a failing
        page does not abort the remaining pages in ``run``.
        """
        try:
            # Load the page
            self.driver.get(self.get_page_url(page))

            # Handle cookies on first page
            if page == 1:
                self.handle_cookies()

            # Wait for listings to load
            listings = self.wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//div[@data-testid="propertyCard"]')
                )
            )

            # Extract data from each listing, counting the ones that fail so
            # selector breakage is visible instead of silently yielding no file.
            page_data = []
            skipped = 0
            for listing in listings:
                row = self._extract_listing(listing)
                if row is None:
                    skipped += 1
                else:
                    page_data.append(row)

            if skipped:
                print(f"Page {page}: skipped {skipped} of {len(listings)} listings with unreadable fields")

            # Save the data
            if page_data:
                # Do not rely on module-level code having created the folder.
                os.makedirs(self.output_dir, exist_ok=True)
                filename = os.path.join(
                    self.output_dir,
                    f'{self.PAGE_FILE_PREFIX}{page}{self.PAGE_FILE_SUFFIX}',
                )
                pd.DataFrame(page_data).to_csv(filename, index=False)
                print(f"Saved page {page} data - {len(page_data)} listings")

            time.sleep(2)  # Basic delay between pages

        except Exception as e:
            print(f"Error on page {page}: {str(e)}")

    def _page_sort_key(self, filename):
        """Sort key placing per-page files in numeric page order.

        Plain string sorting would put page 10 before page 2. Names whose
        page part is not an integer sort after all numbered pages.
        """
        stem = filename[len(self.PAGE_FILE_PREFIX):-len(self.PAGE_FILE_SUFFIX)]
        try:
            return (0, int(stem), filename)
        except ValueError:
            return (1, 0, filename)

    def combine_files(self):
        """Merge all per-page CSV files into ``combined_data.csv``.

        Files are read in page-number order so the combined rows follow the
        site's ordering; ``os.listdir`` alone returns names in arbitrary order.
        Errors are reported on stdout rather than raised.
        """
        try:
            # Get all per-page CSV files
            all_files = sorted(
                (
                    f for f in os.listdir(self.output_dir)
                    if f.startswith(self.PAGE_FILE_PREFIX) and f.endswith(self.PAGE_FILE_SUFFIX)
                ),
                key=self._page_sort_key,
            )

            # Combine all files
            all_data = [pd.read_csv(os.path.join(self.output_dir, f)) for f in all_files]

            # Save combined data
            if all_data:
                combined_df = pd.concat(all_data, ignore_index=True)
                combined_df.to_csv(
                    os.path.join(self.output_dir, self.COMBINED_FILE_NAME), index=False
                )
                print("Created combined data file")
        except Exception as e:
            print(f"Error combining files: {str(e)}")

    def run(self, start_page=1, end_page=40):
        """Scrape pages ``start_page``..``end_page`` inclusive, then combine them.

        The browser is always closed, even if scraping is interrupted.
        """
        try:
            for page in range(start_page, end_page + 1):
                self.scrape_page(page)
            self.combine_files()
        finally:
            self.driver.quit()

def main():
    """Build a scraper and run it over its default page range."""
    RightmoveScraper().run()


# Only start scraping when executed directly (a notebook cell or a script run),
# not when this code is imported as a module.
if __name__ == "__main__":
    main()

Error scraping page 1: HTTPConnectionPool(host='localhost', port=54405): Read timed out. (read timeout=120)
Scraping page 2...
Data saved to rightmove_data_page_2.csv
Page 2 scraped successfully.
Scraping page 3...
Data saved to rightmove_data_page_3.csv
Page 3 scraped successfully.
Scraping page 4...
Data saved to rightmove_data_page_4.csv
Page 4 scraped successfully.
Scraping page 5...
Data saved to rightmove_data_page_5.csv
Page 5 scraped successfully.
Scraping page 6...
Data saved to rightmove_data_page_6.csv
Page 6 scraped successfully.
Scraping page 7...
Data saved to rightmove_data_page_7.csv
Page 7 scraped successfully.
Scraping page 8...
Data saved to rightmove_data_page_8.csv
Page 8 scraped successfully.
Scraping page 9...
Data saved to rightmove_data_page_9.csv
Page 9 scraped successfully.
Scraping page 10...
Data saved to rightmove_data_page_10.csv
Page 10 scraped successfully.
Error scraping page 11: HTTPConnectionPool(host='localhost', port=54923): Read timed out. (read t

In [1]:
import os
import re

import pandas as pd

# Path to the folder containing CSV files
folder_path = 'database'

# Aggregate outputs that must never be fed back in as inputs.
# 'combined_data.csv' is written into the same folder by
# RightmoveScraper.combine_files; reading it here would duplicate every row.
AGGREGATE_FILES = frozenset({'combined_data.csv', 'combined_database.csv'})


def natural_sort_key(file_name):
    """Sort key that orders embedded numbers numerically (page_2 before page_10)."""
    # With a capturing group, re.split alternates text, digits, text, ...
    # so every odd index holds a run of digits.
    parts = re.split(r'(\d+)', file_name)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


def load_csv_folder(folder, skip=AGGREGATE_FILES):
    """Read every CSV in ``folder`` except the names in ``skip``.

    Returns a list of DataFrames in natural file-name order, or an empty list
    when the folder is missing or holds no matching files.
    """
    if not os.path.isdir(folder):
        return []
    file_names = sorted(
        (name for name in os.listdir(folder) if name.endswith('.csv') and name not in skip),
        key=natural_sort_key,
    )
    return [pd.read_csv(os.path.join(folder, name)) for name in file_names]


# List holding each CSV file's data
csv_data = load_csv_folder(folder_path)

if csv_data:
    # Concatenate all CSV data into a single DataFrame
    combined_df = pd.concat(csv_data, ignore_index=True)

    # Save the combined data to a new CSV file
    combined_df.to_csv('combined_database.csv', index=False)

    print("All CSV files have been combined into 'combined_database.csv'")
else:
    # pd.concat raises on an empty list; report instead and keep the name defined.
    combined_df = pd.DataFrame()
    print(f"No CSV files found in '{folder_path}'; nothing to combine")


All CSV files have been combined into 'combined_database.csv'
