In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
import jmespath
import asyncio
from httpx import AsyncClient
from parsel import Selector
import nest_asyncio

class RightmoveRentalScraper:
    def __init__(self):
        """Create the HTTP clients and the in-memory result list.

        ``session`` is a blocking ``requests`` session (used for the search
        result pages), ``httpx_client`` an async client (used for the
        individual listing pages) and ``properties`` the list that collects
        one dict per scraped listing.
        """
        sync_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        async_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }

        self.session = requests.Session()
        self.session.headers.update(sync_headers)
        self.properties = []
        self.httpx_client = AsyncClient(
            headers=async_headers,
            follow_redirects=True,
            http2=False,
            timeout=15,
        )
    
    def find_json_objects(self, text: str):
        """Yield every JSON object that can be decoded from *text*.

        The text is scanned left to right for ``{``. Each candidate position
        is handed to the JSON decoder: on success the decoded object is
        yielded and scanning resumes right after it, otherwise scanning moves
        on by one character.

        Args:
            text: Arbitrary text (for example a ``<script>`` body) that may
                embed JSON objects. ``None`` or an empty string yields nothing.

        Yields:
            The decoded objects (``dict`` instances) in order of appearance.
        """
        if not text:
            return

        decoder = json.JSONDecoder()
        pos = 0
        while True:
            start = text.find("{", pos)
            if start == -1:
                break
            try:
                # Decode in place from ``start``; the returned end index is
                # absolute, so the remaining text is never copied.
                obj, end = decoder.raw_decode(text, start)
            except (ValueError, RecursionError):
                # Not valid JSON at this brace: try the next one.
                pos = start + 1
                continue
            # Yield outside the ``try`` so that GeneratorExit (raised at the
            # yield when the consumer stops early) is never swallowed.
            yield obj
            pos = end

    def extract_property_id(self, url):
        """Return the digits that follow ``/properties/`` in *url*, or ``'N/A'``."""
        found = re.search(r'/properties/(\d+)', url)
        if found is None:
            return 'N/A'
        return found.group(1)
    
    def format_address_and_postcode(self, address_data):
        """Split a listing's address object into a display address and postcode.

        Args:
            address_data: The ``address`` object from the page JSON. Only the
                ``displayAddress``, ``outcode`` and ``incode`` keys are read.
                Anything that is not a dict is treated as missing.

        Returns:
            An ``(address, postcode)`` tuple of strings; either element is
            ``'N/A'`` when it cannot be derived.
        """
        # Always hand back a 2-tuple: callers unpack the result, and the
        # address may be absent (None or a placeholder string).
        if not isinstance(address_data, dict):
            return 'N/A', 'N/A'

        clean_address = 'N/A'
        display_address = address_data.get('displayAddress', '')
        if display_address:
            # Line breaks become comma separators, then doubled commas and
            # runs of whitespace are collapsed.
            clean_address = re.sub(r'\r\n|\r|\n', ', ', display_address)
            clean_address = re.sub(r',\s*,', ',', clean_address)
            clean_address = re.sub(r'\s+', ' ', clean_address).strip()

            # Drop a trailing postcode; it is reported separately below.
            postcode_pattern = r',?\s*[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\s*$'
            clean_address = re.sub(postcode_pattern, '', clean_address, flags=re.IGNORECASE)
            # An address that consisted only of a postcode is now empty.
            clean_address = clean_address.rstrip(', ') or 'N/A'

        # The postcode is only reported when both halves are present.
        outcode = address_data.get('outcode', '')
        incode = address_data.get('incode', '')
        postcode = f"{outcode} {incode}" if outcode and incode else 'N/A'

        return clean_address, postcode
    
    def extract_tenure_details(self, json_data):
        """Return ``(tenure_type, lease_years_remaining)`` for a listing.

        Args:
            json_data: The listing's property-data dict; only its top-level
                ``tenure`` entry is read. May be ``None`` or empty.

        Returns:
            A 2-tuple. ``tenure_type`` is the ``tenureType`` value (with
            `` - <message>`` appended when the tenure object carries a
            message), the tenure value itself when it is a plain string, or
            ``'N/A'``. ``lease_years_remaining`` is the
            ``yearsRemainingOnLease`` value, or ``'N/A'`` when absent or null.
        """
        tenure_type = 'N/A'
        lease_years_remaining = 'N/A'

        # A single top-level key needs no query language: a plain lookup gives
        # the same value for dict input and None for anything else.
        tenure_info = json_data.get('tenure') if isinstance(json_data, dict) else None

        if isinstance(tenure_info, dict):
            # ``or`` also covers an explicit null, which ``.get(key, default)``
            # would pass through as None.
            tenure_type = tenure_info.get('tenureType') or 'N/A'

            # ``is not None`` keeps a legitimate 0 years remaining.
            years_remaining = tenure_info.get('yearsRemainingOnLease')
            if years_remaining is not None:
                lease_years_remaining = years_remaining

            message = tenure_info.get('message')
            if message:
                tenure_type = f"{tenure_type} - {message}"
        elif isinstance(tenure_info, str):
            tenure_type = tenure_info

        return tenure_type, lease_years_remaining

    async def extract_property_json_data(self, property_url):
        """Fetch a listing page and return its embedded ``propertyData`` value.

        The page is requested with the shared async client, the ``<script>``
        whose text contains ``PAGE_MODEL = `` is located, and the JSON objects
        decoded from it are searched for a ``propertyData`` key.

        Args:
            property_url: URL of the listing page.

        Returns:
            The ``propertyData`` value of the first decoded object that has
            one, or ``None`` when the request fails, the script is missing,
            or no decoded object carries that key.
        """
        try:
            response = await self.httpx_client.get(property_url)
            script_text = Selector(response.text).xpath(
                "//script[contains(.,'PAGE_MODEL = ')]/text()"
            ).get()
            if not script_text:
                return None

            # Materialise the generator so it always runs to completion
            # before any early return below.
            for candidate in list(self.find_json_objects(script_text)):
                if isinstance(candidate, dict) and "propertyData" in candidate:
                    return candidate["propertyData"]
            return None
        except Exception as exc:
            # ``Exception`` rather than a bare ``except`` so that task
            # cancellation and KeyboardInterrupt still propagate.
            print(f"JSON extraction error for {property_url}: {exc}")
            return None

    

    def get_property_links(self, search_url, max_pages=1):
        """Collect listing URLs from consecutive pages of a search.

        Args:
            search_url: Search-results URL. ``&index=<offset>`` is appended to
                it, so it must already contain a query string.
            max_pages: Upper bound on the number of result pages requested.

        Returns:
            Listing URLs in first-seen order without duplicates. Collection
            stops early when a request fails or a page has no listing links.
        """
        property_links = []
        seen = set()

        for page in range(max_pages):
            page_url = f"{search_url}&index={page * 24}"
            try:
                response = self.session.get(page_url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as exc:
                # Report the failure instead of stopping silently.
                print(f"Page {page + 1}: request failed ({exc}); stopping")
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            links = [
                f"https://www.rightmove.co.uk{anchor.get('href')}"
                for anchor in soup.find_all('a', class_='propertyCard-link')
                if anchor.get('href')
            ]
            if not links:
                # An empty page adds nothing; stop rather than spend the
                # remaining requests of a large ``max_pages``.
                print(f"Page {page + 1}: no properties found; stopping")
                break

            # Keep only URLs not seen on this or an earlier page.
            fresh = [link for link in dict.fromkeys(links) if link not in seen]
            seen.update(fresh)
            property_links.extend(fresh)

            print(f"Page {page + 1}: {len(links)} properties")
            time.sleep(0.5)

        return property_links

    async def scrape_property_details_fast(self, property_url):
        """Scrape one listing page into a flat dict of fields.

        The page is fetched once. Bedrooms, bathrooms, key features, the
        detail list (parking, garden, council tax, accessibility, furnishing)
        and sizes are read from the parsed HTML; stations, images, floorplans,
        type, tenure, description, location, address and price come from the
        ``propertyData`` JSON embedded in the same page.

        Args:
            property_url: URL of the listing page.

        Returns:
            A dict keyed by field name, holding ``'N/A'`` (or ``0`` for
            counts) where a value was not found. If anything raises, a dict
            with only ``url`` and ``error`` is returned instead.
        """
        try:
            # One request serves both the HTML fields and the embedded JSON.
            response = await self.httpx_client.get(property_url)
            json_data = self._parse_embedded_property_json(response.text)

            soup = BeautifulSoup(response.content, 'html.parser')
            all_text = soup.get_text()  # extracted once, reused by every regex

            # The property ID from the URL doubles as the title.
            property_data = {
                'url': property_url,
                'title': self.extract_property_id(property_url),
            }

            # Basic property info: first "<n> bed" / "<n> bath" in the page text.
            bed_match = re.search(r'(\d+)\s*bed', all_text, re.I)
            bath_match = re.search(r'(\d+)\s*bath', all_text, re.I)
            property_data['bedrooms'] = bed_match.group(1) if bed_match else 'N/A'
            property_data['bathrooms'] = bath_match.group(1) if bath_match else 'N/A'

            # Key features: the first list that follows a matching heading.
            property_data['key_features'] = 'N/A'
            feature_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'key features|features|amenities', re.I))
            for header in feature_headers:
                sibling = header.find_next_sibling()
                while sibling:
                    if sibling.name in ['ul', 'ol']:
                        features = [li.get_text().strip() for li in sibling.find_all('li')]
                        if features:
                            property_data['key_features'] = '; '.join(features)
                        break
                    sibling = sibling.find_next_sibling()
                if property_data['key_features'] != 'N/A':
                    break

            # Defaults for the HTML-derived detail fields.
            property_data.update({
                'parking': 'N/A',
                'garden': 'N/A',
                'council_tax': 'N/A',
                'accessibility': 'N/A',
                'size_sqft': 'N/A',
                'size_sqm': 'N/A',
                # Key was previously misspelled 'furish_status'.
                'furnish_status': 'N/A',
            })

            # Detail list: each <dt> label is matched by keyword and the
            # following <dd> supplies the value.
            detail_keywords = (
                ('PARKING', 'parking'),
                ('GARDEN', 'garden'),
                ('COUNCIL TAX', 'council_tax'),
                ('ACCESSIBILITY', 'accessibility'),
            )
            for section in soup.find_all('dt', class_='_17A0LehXZKxGHbPeiLQ1BI'):
                section_text = section.get_text().strip().upper()
                value_element = section.find_next_sibling('dd')
                value_text = value_element.get_text().strip() if value_element else 'N/A'
                for keyword, field in detail_keywords:
                    if keyword in section_text:
                        property_data[field] = value_text

            furnish_dt = soup.find('dt', string=re.compile(r'^Furnish type:\s*$', re.I))
            if furnish_dt:
                furnish_dd = furnish_dt.find_next_sibling('dd')
                if furnish_dd:
                    property_data['furnish_status'] = furnish_dd.get_text().strip()

            # Size extraction from the full page text.
            sqft_match = re.search(r'([\d,]+)\s*sq\s*ft', all_text, re.I)
            sqm_match = re.search(r'([\d,]+)\s*sq\s*m', all_text, re.I)
            property_data['size_sqft'] = sqft_match.group(1).replace(',', '') if sqft_match else 'N/A'
            property_data['size_sqm'] = sqm_match.group(1).replace(',', '') if sqm_match else 'N/A'

            if json_data:
                stations = jmespath.search("nearestStations[*].{name: name, distance: distance}", json_data) or []
                photos = jmespath.search("images[*].{url: url, caption: caption}", json_data) or []
                floorplan = jmespath.search("floorplans[*].{url: url, caption: caption}", json_data) or []
                property_type = jmespath.search("propertySubType", json_data) or jmespath.search("propertyType", json_data) or 'N/A'
                description = jmespath.search("text.description", json_data) or 'N/A'
                address_data = jmespath.search("address", json_data)
                latitude = jmespath.search("location.latitude", json_data) or 'N/A'
                longitude = jmespath.search("location.longitude", json_data) or 'N/A'
                price = jmespath.search("prices.primaryPrice", json_data) or 'N/A'

                # Tenure: treat explicit nulls like missing values.
                tenure_info = jmespath.search("tenure", json_data)
                if isinstance(tenure_info, dict):
                    tenure_type = tenure_info.get('tenureType') or 'N/A'
                    years_remaining = tenure_info.get('yearsRemainingOnLease')
                    lease_years_remaining = 'N/A' if years_remaining is None else years_remaining
                else:
                    tenure_type = tenure_info or 'N/A'
                    lease_years_remaining = 'N/A'

                # Guard the unpack in case no (address, postcode) pair comes
                # back for a missing address.
                address_result = self.format_address_and_postcode(address_data)
                formatted_address, postcode = address_result or ('N/A', 'N/A')

                property_data.update({
                    # Null names/urls must not break ``join`` and discard the row.
                    'nearest_stations': '; '.join([s['name'] or 'N/A' for s in stations]) or 'N/A',
                    'station_distances': '; '.join([f"{s['distance']} miles" for s in stations]) or 'N/A',
                    'station_count': len(stations),
                    'image_count': len(photos),
                    'image_urls': '; '.join([p['url'] for p in photos if p['url']]) or 'N/A',
                    'floorplan_count': len(floorplan),
                    'floorplan_urls': '; '.join([f['url'] for f in floorplan if f['url']]) or 'N/A',
                    'property_type': property_type,
                    'tenure_type': tenure_type,
                    'lease_years_remaining': lease_years_remaining,
                    'description': description,
                    'latitude': latitude,
                    'longitude': longitude,
                    'address': formatted_address,
                    'postcode': postcode,
                    'price': price
                })
            else:
                # HTML fallback for property type if the JSON is unavailable.
                property_type_fallback = 'N/A'
                type_element = soup.find(string=re.compile(r'(flat|house|apartment|studio|maisonette)', re.I))
                if type_element:
                    property_type_fallback = type_element.strip()

                property_data.update({
                    'nearest_stations': 'N/A',
                    'station_distances': 'N/A',
                    'station_count': 0,
                    'image_count': 0,
                    'image_urls': 'N/A',
                    'floorplan_count': 0,
                    'floorplan_urls': 'N/A',
                    'property_type': property_type_fallback,
                    'description': 'N/A',
                    'tenure_type': 'N/A',
                    'lease_years_remaining': 'N/A',
                    'latitude': 'N/A',
                    'longitude': 'N/A',
                    'address': 'N/A',
                    'postcode': 'N/A',
                    'price': 'N/A'
                })

            return property_data

        except Exception as e:
            return {'url': property_url, 'error': str(e)}

    def _parse_embedded_property_json(self, html):
        """Return the ``propertyData`` value embedded in listing *html*, or None."""
        try:
            script_text = Selector(html).xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
            if not script_text:
                return None
            # Materialise the generator so it always runs to completion.
            for candidate in list(self.find_json_objects(script_text)):
                if isinstance(candidate, dict) and "propertyData" in candidate:
                    return candidate["propertyData"]
        except Exception:
            # The JSON is optional: on any parsing problem the caller falls
            # back to HTML-only fields.
            return None
        return None

    async def scrape_all_properties_fast(self, search_url, max_pages=2, max_properties=None):
        property_links = self.get_property_links(search_url, max_pages)
        property_links = property_links[:max_properties] if max_properties else property_links
        
        print(f"Scraping {len(property_links)} properties...")
        
        batch_size = 5
        for i in range(0, len(property_links), batch_size):
            batch = property_links[i:i+batch_size]
            tasks = [self.scrape_property_details_fast(url) for url in batch]
            results = await asyncio.gather(*tasks, return_exceptions=True)
            
            for result in results:
                self.properties.append(result if isinstance(result, dict) else {'error': str(result)})
            
            print(f"Completed {min(i+batch_size, len(property_links))}/{len(property_links)} properties")
            await asyncio.sleep(0.5)

    def save_to_csv(self, filename='rightmove_housing price_test.csv'):
        df = pd.DataFrame(self.properties)
        if 'description' in df.columns:
            df['description'] = df['description'].astype(str).apply(
                lambda x: re.sub(r'<[^>]+>', ' ', x) if x != 'N/A' else x
            ).apply(
                lambda x: re.sub(r'\s+', ' ', x).strip() if x != 'N/A' else x
            )
        df.to_csv(filename, index=False)
        print(f"Saved {len(self.properties)} properties to {filename}")
        return df

async def run_complete_scraper():
    """Run the configured searches until the target count is reached, then save.

    Each search URL is scraped in turn (at most 500 result pages and 10,000
    listings per search) until 50,000 rows have been collected. Whatever has
    been collected is written to ``rightmove_london_50k_properties.csv`` and
    the async client is closed even when the run is interrupted or fails.

    Returns:
        The list of scraped property dicts.
    """
    scraper = RightmoveRentalScraper()

    # NOTE(review): every URL below carries the same locationIdentifier
    # (REGION%5E87490); only ``searchLocation`` and ``radius`` differ.
    # Confirm the searches are not largely returning the same listings.
    london_search_urls = [
        # Greater London (main search)
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=0.0&_includeSSTC=on",

        # Central London
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=Central+London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=0.0&_includeSSTC=on",

        # North London
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=North+London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=0.0&_includeSSTC=on",

        # South London
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=South+London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=0.0&_includeSSTC=on",

        # East London
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=East+London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=0.0&_includeSSTC=on",

        # West London
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=West+London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=0.0&_includeSSTC=on",

        # Include surrounding areas within London commuter belt
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=Greater+London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=5.0&_includeSSTC=on",

        # Extend radius to 10 miles from London center
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=10.0&_includeSSTC=on",

        # Extend radius to 15 miles from London center
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=15.0&_includeSSTC=on",

        # Include all property types and price ranges
        "https://www.rightmove.co.uk/property-for-sale/find.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&buy=For+sale&radius=20.0&_includeSSTC=on&includeSSTC=true"
    ]

    # Log label for each radius marker that can appear in a search URL.
    radius_labels = {
        "radius=0.0": "London Core",
        "radius=5.0": "London +5 miles",
        "radius=10.0": "London +10 miles",
        "radius=15.0": "London +15 miles",
        "radius=20.0": "London +20 miles",
    }

    target_properties = 50000
    start_time = time.time()

    try:
        for position, search_url in enumerate(london_search_urls, start=1):
            search_type = next(
                (label for marker, label in radius_labels.items() if marker in search_url),
                f"London Area {position}",
            )

            print(f"\n🏙️ Scraping {search_type}")
            print(f"Current total: {len(scraper.properties)} properties")

            # Calculate how many more properties we need
            remaining_needed = target_properties - len(scraper.properties)
            if remaining_needed <= 0:
                print(f"✅ Target of {target_properties} properties reached!")
                break

            # Don't take more than 10k per search
            max_for_this_search = min(remaining_needed, 10000)

            await scraper.scrape_all_properties_fast(
                search_url,
                max_pages=500,
                max_properties=max_for_this_search
            )

            print(f"✅ {search_type} completed. Total properties: {len(scraper.properties)}")

            # Break if we've reached our target
            if len(scraper.properties) >= target_properties:
                print(f"🎯 Target reached! Total: {len(scraper.properties)} properties")
                break
    finally:
        # Runs on interruption too, so a long scrape is never lost and the
        # client is always closed.
        end_time = time.time()

        # Trim to exactly the target if we got more
        if len(scraper.properties) > target_properties:
            scraper.properties = scraper.properties[:target_properties]
            print(f"✂️ Trimmed to exactly {target_properties} properties")

        try:
            scraper.save_to_csv('rightmove_london_50k_properties.csv')
        finally:
            await scraper.httpx_client.aclose()

    print(f"⚡ Completed in {end_time - start_time:.2f} seconds")
    print(f"📊 Final count: {len(scraper.properties)} London properties")
    return scraper.properties

if __name__ == "__main__":
    # nest_asyncio is applied before asyncio.run(); presumably so the call
    # also works where an event loop is already running (e.g. a notebook
    # kernel) — confirm if this is ever run as a plain script only.
    nest_asyncio.apply()
    # Run the whole scrape; ``results`` is the list of property dicts.
    results = asyncio.run(run_complete_scraper())


🏙️ Scraping London Core
Current total: 0 properties
Page 1: 25 properties
Page 2: 25 properties
Page 3: 25 properties
Page 4: 25 properties
Page 5: 25 properties
Page 6: 25 properties
Page 7: 25 properties
Page 8: 25 properties
Page 9: 25 properties
Page 10: 25 properties
Page 11: 25 properties
Page 12: 25 properties
Page 13: 25 properties
Page 14: 25 properties
Page 15: 25 properties
Page 16: 25 properties
Page 17: 25 properties
Page 18: 25 properties
Page 19: 25 properties
Page 20: 25 properties
Page 21: 25 properties
Page 22: 25 properties
Page 23: 25 properties
Page 24: 25 properties
Page 25: 25 properties
Page 26: 25 properties
Page 27: 25 properties
Page 28: 25 properties
Page 29: 25 properties
Page 30: 25 properties
Page 31: 25 properties
Page 32: 25 properties
Page 33: 25 properties
Page 34: 25 properties
Page 35: 25 properties
Page 36: 25 properties
Page 37: 25 properties
Page 38: 25 properties
Page 39: 25 properties
Page 40: 25 properties
Page 41: 25 properties
Page 42: 25 p

KeyboardInterrupt: 

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
import jmespath
import asyncio
from httpx import AsyncClient
from parsel import Selector
import nest_asyncio

class RightmoveRentalScraper:
    def __init__(self):
        """Create the HTTP clients and the bookkeeping containers.

        ``session`` is a blocking ``requests`` session, ``httpx_client`` an
        async client; both send the same User-Agent. ``properties`` collects
        the scraped rows and ``scraped_urls`` the URLs already processed.
        """
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

        session = requests.Session()
        session.headers.update({'User-Agent': user_agent})
        self.session = session

        self.properties = []
        # Track scraped URLs to avoid duplicates
        self.scraped_urls = set()

        client_options = dict(
            headers={
                "User-Agent": user_agent,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            },
            follow_redirects=True,
            http2=False,
            timeout=30,
        )
        self.httpx_client = AsyncClient(**client_options)
    
    def find_json_objects(self, text: str):
        """Yield each JSON object that can be decoded from *text*.

        Scans for ``{`` from left to right. A brace that starts valid JSON
        yields the decoded object and the scan continues after it; any other
        brace is skipped.

        Args:
            text: Text that may contain embedded JSON objects. ``None`` or an
                empty string yields nothing.

        Yields:
            The decoded objects (``dict`` instances) in order of appearance.
        """
        if not text:
            return

        decoder = json.JSONDecoder()
        cursor = 0
        while True:
            brace = text.find("{", cursor)
            if brace == -1:
                return
            try:
                # ``raw_decode`` reads from ``brace`` without slicing and
                # reports the absolute index where the object ended.
                value, stop = decoder.raw_decode(text, brace)
            except (ValueError, RecursionError):
                cursor = brace + 1
            else:
                # The yield is kept out of the ``try`` so an early ``close()``
                # by the consumer (GeneratorExit) is not swallowed.
                yield value
                cursor = stop

    def extract_property_id(self, url):
        """Return the numeric ID found after ``/properties/`` in *url*, else ``'N/A'``."""
        id_match = re.search(r'/properties/(\d+)', url)
        if not id_match:
            return 'N/A'
        property_id = id_match.group(1)
        return property_id
    
    def format_address_and_postcode(self, address_data):
        """Return ``(address, postcode)`` derived from a JSON address object.

        The address is the ``displayAddress`` with line breaks turned into
        commas, whitespace normalised and a trailing postcode removed. The
        postcode is ``"<outcode> <incode>"`` when both parts are present.
        ``'N/A'`` stands in for a missing display address or postcode, and
        for both when *address_data* is not a dict.
        """
        if not isinstance(address_data, dict):
            return 'N/A', 'N/A'

        address = 'N/A'
        raw_address = address_data.get('displayAddress', '')
        if raw_address:
            address = raw_address
            # Applied in order: line breaks -> ", ", doubled commas -> ",",
            # whitespace runs -> single space.
            for pattern, replacement in (
                (r'\r\n|\r|\n', ', '),
                (r',\s*,', ','),
                (r'\s+', ' '),
            ):
                address = re.sub(pattern, replacement, address)
            address = address.strip()
            # Strip a postcode sitting at the very end of the address.
            address = re.sub(
                r',?\s*[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\s*$',
                '',
                address,
                flags=re.IGNORECASE,
            )
            address = address.rstrip(', ')

        out_part = address_data.get('outcode', '')
        in_part = address_data.get('incode', '')
        if out_part and in_part:
            return address, f"{out_part} {in_part}"
        return address, 'N/A'
    
    def extract_tenure_details(self, json_data):
        """Return ``(tenure_type, lease_years_remaining)`` for a listing.

        Args:
            json_data: The listing's property-data dict; only its top-level
                ``tenure`` entry is read. May be ``None`` or empty.

        Returns:
            A 2-tuple. ``tenure_type`` is the ``tenureType`` value (with
            `` - <message>`` appended when the tenure object carries a
            message), the tenure value itself when it is a plain string, or
            ``'N/A'``. ``lease_years_remaining`` is the
            ``yearsRemainingOnLease`` value, or ``'N/A'`` when absent or null.
        """
        # A single top-level key needs no query language: a plain lookup gives
        # the same value for dict input and None for anything else.
        tenure_info = json_data.get('tenure') if isinstance(json_data, dict) else None

        if isinstance(tenure_info, str):
            return tenure_info, 'N/A'
        if not isinstance(tenure_info, dict):
            return 'N/A', 'N/A'

        # ``or`` also maps an explicit null to the placeholder; a
        # ``.get(key, default)`` would return None for it.
        tenure_type = tenure_info.get('tenureType') or 'N/A'
        message = tenure_info.get('message')
        if message:
            tenure_type = f"{tenure_type} - {message}"

        # ``is None`` keeps a legitimate 0 years remaining.
        years_remaining = tenure_info.get('yearsRemainingOnLease')
        lease_years_remaining = 'N/A' if years_remaining is None else years_remaining

        return tenure_type, lease_years_remaining

    async def extract_property_json_data(self, property_url):
        """Fetch *property_url* and return the page's embedded ``propertyData``.

        The script whose text contains ``PAGE_MODEL = `` is located and every
        JSON object decoded from it is checked for a ``propertyData`` key.
        Returns ``None`` when the script or the key is missing; on any
        exception the error is printed and ``None`` is returned.
        """
        try:
            page = await self.httpx_client.get(property_url)
            script_text = Selector(page.text).xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
            if not script_text:
                return None
            # All objects are decoded first, then searched in order.
            decoded = list(self.find_json_objects(script_text))
            return next(
                (entry["propertyData"] for entry in decoded if "propertyData" in entry),
                None,
            )
        except Exception as e:
            print(f"JSON extraction error for {property_url}: {e}")
            return None

    def get_property_links(self, search_url, max_pages=42):
        """Collect unique listing URLs from up to *max_pages* search pages.

        Each page is requested at ``&index=<page * 24>``. Several CSS
        selectors are tried in order and the first one that matches supplies
        the links. Paging stops after three consecutive pages that are empty
        or raise. The result keeps first-seen order without duplicates.
        """
        css_candidates = (
            'a.propertyCard-link',
            'a[href*="/properties/"]',
            '.propertyCard a',
            '[data-test="property-card"] a',
        )
        collected = []
        empty_streak = 0

        for page_index in range(max_pages):
            page_number = page_index + 1
            page_url = f"{search_url}&index={page_index * 24}"
            try:
                response = self.session.get(page_url, timeout=15)
                soup = BeautifulSoup(response.content, 'html.parser')

                # First selector with any match wins; later ones are not run.
                anchors = next(
                    (hits for hits in map(soup.select, css_candidates) if hits),
                    [],
                )
                links = [
                    f"https://www.rightmove.co.uk{anchor.get('href')}"
                    for anchor in anchors
                    if anchor.get('href')
                ]

                if links:
                    empty_streak = 0
                    collected.extend(links)
                    print(f"Page {page_number}: {len(links)} properties found")
                else:
                    empty_streak += 1
                    if empty_streak >= 3:  # Stop if 3 consecutive empty pages
                        print(f"No more results found after page {page_number}")
                        break

                time.sleep(1)  # Be respectful

            except Exception as e:
                print(f"Error on page {page_number}: {e}")
                empty_streak += 1
                if empty_streak >= 3:
                    break

        # dict keys are unique and keep insertion order, so this removes
        # duplicates while preserving first-seen order.
        return list(dict.fromkeys(collected))

    async def scrape_property_details_fast(self, property_url):
        # Skip if already scraped
        if property_url in self.scraped_urls:
            return None
            
        try:
            html_task = asyncio.create_task(self.httpx_client.get(property_url))
            json_data = await self.extract_property_json_data(property_url)
            response = await html_task
            
            soup = BeautifulSoup(response.content, 'html.parser')
            property_data = {'url': property_url}
            
            # Mark as scraped
            self.scraped_urls.add(property_url)
            
            # Extract property ID
            property_id = self.extract_property_id(property_url)
            property_data['title'] = property_id
            
            # Basic property info with enhanced extraction
            all_text = soup.get_text()
            
            # Enhanced bedroom extraction
            bed_patterns = [
                r'(\d+)\s*bedroom',
                r'(\d+)\s*bed(?:room)?s?',
                r'(\d+)\s*-?\s*bed'
            ]
            property_data['bedrooms'] = 'N/A'
            for pattern in bed_patterns:
                match = re.search(pattern, all_text, re.I)
                if match:
                    property_data['bedrooms'] = match.group(1)
                    break
            
            # Enhanced bathroom extraction
            bath_patterns = [
                r'(\d+)\s*bathroom',
                r'(\d+)\s*bath(?:room)?s?',
                r'(\d+)\s*-?\s*bath'
            ]
            property_data['bathrooms'] = 'N/A'
            for pattern in bath_patterns:
                match = re.search(pattern, all_text, re.I)
                if match:
                    property_data['bathrooms'] = match.group(1)
                    break

            # Key features extraction with multiple methods
            property_data['key_features'] = 'N/A'
            
            # Method 1: Look for feature lists
            feature_selectors = [
                'ul.lIhZ24u1NHlVy5_W9V__6 li',
                '.key-features li',
                '[data-test="key-features"] li',
                'h2:contains("Key features") + ul li',
                'h3:contains("Key features") + ul li'
            ]
            
            for selector in feature_selectors:
                features = soup.select(selector)
                if features:
                    feature_texts = [f.get_text().strip() for f in features]
                    property_data['key_features'] = '; '.join(feature_texts)
                    break
            
            # Method 2: Look for feature headers and following lists
            if property_data['key_features'] == 'N/A':
                feature_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'key features|features|amenities', re.I))
                for header in feature_headers:
                    next_element = header.find_next_sibling()
                    while next_element:
                        if next_element.name in ['ul', 'ol']:
                            features = [li.get_text().strip() for li in next_element.find_all('li')]
                            if features:
                                property_data['key_features'] = '; '.join(features)
                                break
                        next_element = next_element.find_next_sibling()
                    if property_data['key_features'] != 'N/A':
                        break
            
            # Property details extraction
            property_data.update({
                'parking': 'N/A',
                'garden': 'N/A',
                'council_tax': 'N/A',
                'accessibility': 'N/A',
                'size_sqft': 'N/A',
                'size_sqm': 'N/A',
                'furnish_status': 'N/A',
            })
           
            # Extract details from various selectors
            detail_selectors = [
                'dt._17A0LehXZKxGHbPeiLQ1BI',
                'dt[class*="detail"]',
                '.property-details dt',
                'dl dt'
            ]
            
            for selector in detail_selectors:
                detail_sections = soup.select(selector)
                if detail_sections:
                    for section in detail_sections:
                        section_text = section.get_text().strip().upper()
                        value_element = section.find_next_sibling(['dd', 'span', 'div'])
                        value_text = value_element.get_text().strip() if value_element else 'N/A'
                        
                        if 'PARKING' in section_text:
                            property_data['parking'] = value_text
                        elif 'GARDEN' in section_text:
                            property_data['garden'] = value_text
                        elif 'COUNCIL TAX' in section_text:
                            property_data['council_tax'] = value_text
                        elif 'ACCESSIBILITY' in section_text:
                            property_data['accessibility'] = value_text
                        elif 'FURNISH' in section_text:
                            property_data['furnish_status'] = value_text
                    break

            # Size extraction
            size_patterns = [
                r'([0-9,]+)\s*sq\.?\s*ft',
                r'([0-9,]+)\s*sqft',
                r'([0-9,]+)\s*square\s*feet'
            ]
            for pattern in size_patterns:
                match = re.search(pattern, all_text, re.I)
                if match:
                    property_data['size_sqft'] = match.group(1).replace(',', '')
                    break
            
            sqm_patterns = [
                r'([0-9,]+)\s*sq\.?\s*m',
                r'([0-9,]+)\s*sqm',
                r'([0-9,]+)\s*square\s*metres?'
            ]
            for pattern in sqm_patterns:
                match = re.search(pattern, all_text, re.I)
                if match:
                    property_data['size_sqm'] = match.group(1).replace(',', '')
                    break

            # Extract data from JSON if available
            if json_data:
                try:
                    stations = jmespath.search("nearestStations[*].{name: name, distance: distance}", json_data) or []
                    photos = jmespath.search("images[*].{url: url, caption: caption}", json_data) or []
                    floorplans = jmespath.search("floorplans[*].{url: url, caption: caption}", json_data) or []
                    property_type = (jmespath.search("propertySubType", json_data) or 
                                   jmespath.search("propertyType", json_data) or 'N/A')
                    description = jmespath.search("text.description", json_data) or 'N/A'
                    address_data = jmespath.search("address", json_data)
                    latitude = jmespath.search("location.latitude", json_data) or 'N/A'
                    longitude = jmespath.search("location.longitude", json_data) or 'N/A'
                    price = jmespath.search("prices.primaryPrice", json_data) or 'N/A'
                    
                    # Tenure information
                    tenure_info = jmespath.search("tenure", json_data) or {}
                    tenure_type = 'N/A'
                    lease_years_remaining = 'N/A'
                    
                    if isinstance(tenure_info, dict):
                        tenure_type = tenure_info.get('tenureType', 'N/A')
                        lease_years_remaining = tenure_info.get('yearsRemainingOnLease', 'N/A')
                    elif isinstance(tenure_info, str):
                        tenure_type = tenure_info
                    
                    formatted_address, postcode = self.format_address_and_postcode(address_data)
                    
                    property_data.update({
                        'nearest_stations': '; '.join([s['name'] for s in stations]) or 'N/A',
                        'station_distances': '; '.join([f"{s['distance']} miles" for s in stations]) or 'N/A',
                        'station_count': len(stations),
                        'image_count': len(photos),
                        'image_urls': '; '.join([p['url'] for p in photos]) or 'N/A',
                        'floorplan_count': len(floorplans),
                        'floorplan_urls': '; '.join([f['url'] for f in floorplans]) or 'N/A',
                        'property_type': property_type,
                        'tenure_type': tenure_type,
                        'lease_years_remaining': lease_years_remaining,
                        'description': description,
                        'latitude': latitude,
                        'longitude': longitude,
                        'address': formatted_address,
                        'postcode': postcode,
                        'price': price
                    })
                except Exception as e:
                    print(f"Error extracting JSON data: {e}")
            else:
                # HTML fallback
                property_type_fallback = 'N/A'
                type_element = soup.find(string=re.compile(r'(flat|house|apartment|studio|maisonette)', re.I))
                if type_element:
                    property_type_fallback = type_element.strip()
                
                property_data.update({
                    'nearest_stations': 'N/A',
                    'station_distances': 'N/A', 
                    'station_count': 0,
                    'image_count': 0,
                    'image_urls': 'N/A',
                    'floorplan_count': 0,
                    'floorplan_urls': 'N/A',
                    'property_type': property_type_fallback,
                    'description': 'N/A',
                    'tenure_type': 'N/A',
                    'lease_years_remaining': 'N/A',
                    'latitude': 'N/A',
                    'longitude': 'N/A',
                    'address': 'N/A',
                    'postcode': 'N/A',
                    'price': 'N/A'
                })
            
            return property_data
            
        except Exception as e:
            print(f"Error scraping {property_url}: {e}")
            return {'url': property_url, 'error': str(e)}

    async def scrape_all_properties_fast(self, search_url, max_pages=42, max_properties=None):
        """Collect listing links for a search and scrape them in small concurrent batches.

        Successful records are appended to ``self.properties``; nothing is returned.

        Args:
            search_url: Search-results URL passed to ``get_property_links``.
            max_pages: Maximum number of result pages to walk.
            max_properties: Optional cap on the number of links to scrape
                (a falsy value means no cap).
        """
        print(f"Getting property links from up to {max_pages} pages...")
        property_links = self.get_property_links(search_url, max_pages)

        if max_properties:
            property_links = property_links[:max_properties]

        print(f"Found {len(property_links)} unique properties to scrape...")

        if not property_links:
            print("No property links found.")
            return

        # The dedup set is created lazily so instances whose __init__ did not
        # define ``scraped_urls`` do not fail with AttributeError here.
        scraped_urls = getattr(self, 'scraped_urls', None)
        if scraped_urls is None:
            scraped_urls = self.scraped_urls = set()

        # Filter out already scraped properties
        new_links = [link for link in property_links if link not in scraped_urls]
        print(f"New properties to scrape: {len(new_links)}")

        batch_size = 3  # Conservative batch size
        successful = 0

        for i in range(0, len(new_links), batch_size):
            batch = new_links[i:i+batch_size]
            tasks = [self.scrape_property_details_fast(url) for url in batch]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            for url, result in zip(batch, results):
                if isinstance(result, BaseException):
                    # gather(return_exceptions=True) hands raised exceptions back as results
                    print(f"Error scraping {url}: {result}")
                elif isinstance(result, dict):
                    if 'error' in result:
                        print(f"Error in result: {result['error']}")
                    else:
                        self.properties.append(result)
                        successful += 1
                # Any other value (e.g. None for an already-scraped URL) is ignored

            completed = min(i+batch_size, len(new_links))
            print(f"Completed {completed}/{len(new_links)} properties (Success: {successful})")
            if completed < len(new_links):
                await asyncio.sleep(1)  # Rate limiting between batches only

    def save_to_csv(self, filename='rightmove_all_properties.csv'):
        """Write the collected properties to a CSV file and print a short summary.

        Descriptions are stripped of HTML tags and collapsed to single spaces,
        rows are de-duplicated by ``url`` (first occurrence wins) and the
        result is written without the index column.

        Args:
            filename: Destination path of the CSV file.

        Returns:
            pandas.DataFrame | None: The de-duplicated frame that was written,
            or ``None`` when ``self.properties`` is empty.
        """
        if not self.properties:
            print("No properties to save!")
            return None

        df = pd.DataFrame(self.properties)

        def _clean_description(value):
            # Records without a description appear as None/NaN in the frame;
            # store the file's usual 'N/A' marker instead of the string 'nan'.
            if value is None or (isinstance(value, float) and pd.isna(value)):
                return 'N/A'
            text = str(value)
            if text == 'N/A':
                return text
            text = re.sub(r'<[^>]+>', ' ', text)
            return re.sub(r'\s+', ' ', text).strip()

        # Clean description field
        if 'description' in df.columns:
            df['description'] = df['description'].apply(_clean_description)

        # Remove duplicate properties by URL
        df_clean = df.drop_duplicates(subset=['url'], keep='first')

        df_clean.to_csv(filename, index=False)
        print(f"Saved {len(df_clean)} unique properties to {filename}")

        def _count_present(column, missing):
            # A column may be absent when no record carried that field
            if column not in df_clean.columns:
                return 0
            values = df_clean[column]
            return int((values.notna() & (values != missing)).sum())

        if len(df_clean) > 0:
            print(f"\nData Summary:")
            print(f"Properties with prices: {_count_present('price', 'N/A')}")
            print(f"Properties with addresses: {_count_present('address', 'N/A')}")
            print(f"Properties with images: {_count_present('image_count', 0)}")

        return df_clean

        # ...existing code...

    # ...existing code...

async def run_complete_scraper():
    """Scrape Rightmove sale listings for the configured London areas and save them to CSV.

    One search URL is built per (area, price band) pair. For each search the
    listing links are collected, links already seen in this run are dropped,
    and the remainder is scraped in concurrent batches of five until
    ``target_properties`` records have been gathered. The records are then
    de-duplicated by URL, capped at the target and written to
    ``rightmove_london_50k_comprehensive.csv``. The async HTTP client is
    closed even if the run is interrupted.

    Returns:
        list[dict]: The property records that were saved.
    """
    scraper = RightmoveRentalScraper()

    # London Boroughs with their proper location identifiers
    london_boroughs = [
        # Inner London Boroughs
        {"name": "City of London", "id": "REGION%5E92824"},

    ]

    # Enhanced price ranges to segment large result sets
    price_ranges = [
        "",  # All prices (no price filter)

        # Lower price ranges
        "&maxPrice=230000",
        "&minPrice=230000&maxPrice=240000",
        "&minPrice=240000&maxPrice=250000",
        "&minPrice=250000&maxPrice=260000",
        "&minPrice=260000&maxPrice=270000",
        "&minPrice=270000&maxPrice=280000",
        "&minPrice=280000&maxPrice=290000",
        "&minPrice=290000&maxPrice=300000",
        "&minPrice=300000&maxPrice=325000",
        "&minPrice=325000&maxPrice=350000",
        "&minPrice=350000&maxPrice=375000",
        "&minPrice=375000&maxPrice=400000",
        "&minPrice=400000&maxPrice=425000",
        "&minPrice=425000&maxPrice=450000",
        "&minPrice=450000&maxPrice=475000",
        "&minPrice=475000&maxPrice=500000",

        # Mid-range prices
        "&minPrice=500000&maxPrice=550000",
        "&minPrice=550000&maxPrice=600000",
        "&minPrice=600000&maxPrice=650000",
        "&minPrice=650000&maxPrice=700000",
        "&minPrice=700000&maxPrice=800000",
        "&minPrice=800000&maxPrice=900000",
        "&minPrice=900000&maxPrice=1000000",

        # Higher price ranges
        "&minPrice=1000000&maxPrice=1250000",
        "&minPrice=1250000&maxPrice=1500000",
        "&minPrice=1500000&maxPrice=1750000",
        "&minPrice=1750000&maxPrice=2000000",
        "&minPrice=2000000&maxPrice=2500000",
        "&minPrice=2500000&maxPrice=3000000",
        "&minPrice=3000000&maxPrice=4000000",
        "&minPrice=4000000&maxPrice=5000000",
        "&minPrice=5000000&maxPrice=7500000",
        "&minPrice=7500000&maxPrice=10000000",
        "&minPrice=10000000&maxPrice=15000000",
        "&minPrice=15000000&maxPrice=20000000",

        # Ultra-high end (no maximum)
        "&minPrice=20000000"
    ]

    # Generate all search URL combinations with proper format
    london_search_urls = []

    for borough in london_boroughs:
        for price_range in price_ranges:
            # Sale properties with proper URL format
            sale_url = f"https://www.rightmove.co.uk/property-for-sale/find.html?useLocationIdentifier=true&locationIdentifier={borough['id']}&buy=For+sale&radius=0.0&_includeSSTC=on&index=0&sortType=2&channel=BUY&transactionType=BUY&displayLocationIdentifier={borough['name'].replace(' ', '-')}.html{price_range}"
            london_search_urls.append({
                'url': sale_url,
                'name': borough['name'],
                'type': 'sale',
                'price_range': price_range.replace('&', '') if price_range else 'all_prices'
            })

    print(f"Generated {len(london_search_urls)} London search combinations")

    start_time = time.time()
    target_properties = 50000
    scraped_urls = set()  # Track to avoid duplicates

    try:
        for search_item in london_search_urls:
            search_url = search_item['url']
            borough_name = search_item['name']
            search_type = search_item['type']
            price_range = search_item['price_range']

            print(f"\n🏙️ Scraping {borough_name} ({search_type}) - {price_range}")
            print(f"Current total: {len(scraper.properties)} properties")

            # Calculate how many more properties we need
            remaining_needed = target_properties - len(scraper.properties)
            if remaining_needed <= 0:
                print(f"✅ Target of {target_properties} properties reached!")
                break

            # Set max_properties for this search (limit per search combination)
            max_for_this_search = min(remaining_needed, 500)  # Max 500 per search combination

            try:
                # Get property links first to check for duplicates
                property_links = scraper.get_property_links(search_url, max_pages=42)

                # Filter out already scraped URLs
                new_links = [link for link in property_links if link not in scraped_urls]

                if new_links:
                    # Limit to max_for_this_search
                    new_links = new_links[:max_for_this_search]

                    # Add to scraped set
                    scraped_urls.update(new_links)

                    print(f"Found {len(new_links)} new properties to scrape")

                    # Scrape the new properties
                    batch_size = 5
                    for j in range(0, len(new_links), batch_size):
                        batch = new_links[j:j+batch_size]
                        tasks = [scraper.scrape_property_details_fast(url) for url in batch]
                        results = await asyncio.gather(*tasks, return_exceptions=True)

                        # Keep only successful records (dicts without an 'error' key)
                        for result in results:
                            if isinstance(result, dict) and 'error' not in result:
                                scraper.properties.append(result)

                        print(f"Completed {min(j+batch_size, len(new_links))}/{len(new_links)} properties from {borough_name}")
                        await asyncio.sleep(0.5)
                else:
                    print(f"No new properties found in {borough_name} - {price_range}")

            except Exception as e:
                # A failed search must not abort the remaining searches
                print(f"Error scraping {borough_name}: {e}")
                continue

            print(f"✅ {borough_name} ({price_range}) completed. Total properties: {len(scraper.properties)}")

            # Break if we've reached our target
            if len(scraper.properties) >= target_properties:
                print(f"🎯 Target reached! Total: {len(scraper.properties)} properties")
                break

            # Small delay between searches
            await asyncio.sleep(1)

        end_time = time.time()

        # Remove duplicate properties by URL *before* trimming, so the cap is
        # applied to unique records and cannot discard rows needlessly.
        seen_urls = set()
        unique_properties = []
        for prop in scraper.properties:
            if prop.get('url') not in seen_urls:
                seen_urls.add(prop.get('url'))
                unique_properties.append(prop)

        # Trim to exactly the target if we got more
        if len(unique_properties) > target_properties:
            unique_properties = unique_properties[:target_properties]
            print(f"✂️ Trimmed to exactly {target_properties} properties")

        scraper.properties = unique_properties

        scraper.save_to_csv('rightmove_london_50k_comprehensive.csv')
    finally:
        # Always release the async client's connections, even on interruption
        await scraper.httpx_client.aclose()

    print(f"⚡ Completed in {end_time - start_time:.2f} seconds")
    print(f"📊 Final count: {len(scraper.properties)} unique London properties")
    print(f"🏘️ Covered all {len(london_boroughs)} London boroughs/areas")
    return scraper.properties



if __name__ == "__main__":
    # nest_asyncio is applied before asyncio.run(); presumably because this
    # cell runs inside a notebook where an event loop is already active — confirm.
    nest_asyncio.apply()
    # Run the full scrape; `results` holds the list returned by run_complete_scraper()
    results = asyncio.run(run_complete_scraper())

Generated 37 London search combinations

🏙️ Scraping City of London (sale) - all_prices
Current total: 0 properties
Page 1: 25 properties found
Page 2: 25 properties found
Page 3: 25 properties found
Page 4: 25 properties found
Page 5: 25 properties found
Page 6: 25 properties found
Page 7: 25 properties found
Page 8: 25 properties found
Page 9: 25 properties found
Page 10: 25 properties found
Page 11: 25 properties found
Page 12: 25 properties found
Page 13: 25 properties found
Page 14: 25 properties found
Page 15: 25 properties found
Page 16: 25 properties found
Page 17: 25 properties found
Page 18: 25 properties found
Page 19: 25 properties found
Page 20: 25 properties found
Page 21: 25 properties found
Page 22: 25 properties found
Page 23: 25 properties found
Page 24: 25 properties found
Page 25: 25 properties found
Page 26: 25 properties found
Page 27: 25 properties found
Page 28: 25 properties found
Page 29: 25 properties found
Page 30: 25 properties found
Page 31: 25 properties 

KeyboardInterrupt: 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
import jmespath
import asyncio
from httpx import AsyncClient
from parsel import Selector
import nest_asyncio

class RightmoveRentalScraper:
    def __init__(self):
        """Create the blocking and async HTTP clients and the result containers."""
        browser_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

        # Accumulated property records and the set of listing URLs already handled
        self.properties = []
        self.scraped_urls = set()  # Track scraped URLs to avoid duplicates

        # Blocking session (used by get_property_links for the search pages)
        blocking_session = requests.Session()
        blocking_session.headers.update({'User-Agent': browser_agent})
        self.session = blocking_session

        # Async client (used for the individual listing pages)
        async_headers = {
            "User-Agent": browser_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
        self.httpx_client = AsyncClient(
            headers=async_headers,
            follow_redirects=True,
            http2=False,
            timeout=30,
        )
    
    def find_json_objects(self, text: str):
        """Yield every JSON object that can be decoded from ``text``.

        The text is scanned left to right for ``{``. Each position is tried as
        the start of a JSON document; on success the decoded object is yielded
        and scanning resumes after it, otherwise scanning resumes one
        character later.

        Args:
            text: Arbitrary text that may embed JSON objects (e.g. a script body).

        Yields:
            dict: Each successfully decoded JSON object, in order of appearance.
        """
        decoder = json.JSONDecoder()
        pos = 0
        while True:
            match = text.find("{", pos)
            if match == -1:
                break
            try:
                result, index = decoder.raw_decode(text[match:])
            except (ValueError, RecursionError):
                # Not valid JSON at this brace (JSONDecodeError is a ValueError),
                # or nested too deeply to decode: try the next candidate.
                pos = match + 1
                continue
            # The yield is kept outside the try block so that closing the
            # generator (GeneratorExit) is never mistaken for a decode failure.
            yield result
            pos = match + index

    def extract_property_id(self, url):
        """Return the digits following ``/properties/`` in ``url``, or 'N/A' if absent."""
        found = re.search(r'/properties/(\d+)', url)
        if found is None:
            return 'N/A'
        return found.group(1)
    
    def format_address_and_postcode(self, address_data):
        """Build a one-line address and a postcode from a JSON address object.

        Args:
            address_data: Mapping that may carry ``displayAddress``,
                ``outcode`` and ``incode``; any non-dict value is treated as
                missing.

        Returns:
            tuple: ``(address, postcode)``; each element is 'N/A' when it
            cannot be derived.
        """
        if not isinstance(address_data, dict):
            return 'N/A', 'N/A'

        display_address = address_data.get('displayAddress', '')
        if not display_address:
            clean_address = 'N/A'
        else:
            # Line breaks become comma separators, then doubled commas and
            # runs of whitespace are collapsed.
            clean_address = re.sub(r'\r\n|\r|\n', ', ', display_address)
            clean_address = re.sub(r',\s*,', ',', clean_address)
            clean_address = re.sub(r'\s+', ' ', clean_address).strip()

            # A trailing UK-style postcode is dropped from the address text
            clean_address = re.sub(
                r',?\s*[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\s*$',
                '',
                clean_address,
                flags=re.IGNORECASE,
            )
            clean_address = clean_address.rstrip(', ')

        # The postcode is only reported when both halves are present
        outcode = address_data.get('outcode', '')
        incode = address_data.get('incode', '')
        postcode = f"{outcode} {incode}" if (outcode and incode) else 'N/A'

        return clean_address, postcode
    
    def extract_tenure_details(self, json_data):
        """Read the tenure type and remaining lease years from a property JSON object.

        Args:
            json_data: Property JSON; a falsy value yields the defaults.

        Returns:
            tuple: ``(tenure_type, lease_years_remaining)``, each 'N/A' when
            not available. When the tenure entry has a non-empty ``message``
            it is appended to the tenure type after " - ".
        """
        if not json_data:
            return 'N/A', 'N/A'

        tenure_info = jmespath.search("tenure", json_data)

        # A bare string is used directly as the tenure type
        if isinstance(tenure_info, str):
            return tenure_info, 'N/A'

        if not isinstance(tenure_info, dict):
            return 'N/A', 'N/A'

        tenure_type = tenure_info.get('tenureType', 'N/A')

        years_remaining = tenure_info.get('yearsRemainingOnLease')
        lease_years_remaining = 'N/A' if years_remaining is None else years_remaining

        message = tenure_info.get('message')
        if message:
            tenure_type = f"{tenure_type} - {message}"

        return tenure_type, lease_years_remaining

    async def extract_property_json_data(self, property_url):
        """Download a listing page and return its embedded ``propertyData`` object.

        The page's ``PAGE_MODEL`` script is scanned for JSON objects and the
        ``propertyData`` member of the first object that has one is returned.

        Args:
            property_url: URL of the listing page.

        Returns:
            The ``propertyData`` value, or ``None`` when the script or key is
            missing or any error occurs (the error is printed).
        """
        try:
            response = await self.httpx_client.get(property_url)
            script_body = Selector(response.text).xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
            if not script_body:
                return None

            # Decode every candidate first, then pick the first carrying the key
            decoded = list(self.find_json_objects(script_body))
            return next(
                (candidate["propertyData"] for candidate in decoded if "propertyData" in candidate),
                None,
            )
        except Exception as e:
            print(f"JSON extraction error for {property_url}: {e}")
            return None

    def get_property_links(self, search_url, max_pages=42):
        """Walk the search-result pages and return the unique listing URLs found.

        Pages are requested with ``&index=`` offsets in steps of 24. Walking
        stops early once two consecutive pages yield no links or fail.

        Args:
            search_url: Base search URL to which the page offset is appended.
            max_pages: Upper bound on the number of pages requested.

        Returns:
            list[str]: Listing URLs in first-seen order, without duplicates.
        """
        # Candidate CSS selectors, tried in order; the first with hits is used
        selectors = (
            'a.propertyCard-link',
            'a[href*="/properties/"]',
            '.propertyCard a',
            '[data-test="property-card"] a',
        )
        property_links = []
        consecutive_empty_pages = 0

        for page in range(max_pages):
            page_url = f"{search_url}&index={page * 24}"
            try:
                response = self.session.get(page_url, timeout=15)
                soup = BeautifulSoup(response.content, 'html.parser')

                links = []
                for selector in selectors:
                    found_links = soup.select(selector)
                    if not found_links:
                        continue
                    links = [f"https://www.rightmove.co.uk{link.get('href')}"
                             for link in found_links if link.get('href')]
                    break

                if links:
                    consecutive_empty_pages = 0
                    property_links.extend(links)
                    print(f"Page {page + 1}: {len(links)} properties found")
                else:
                    consecutive_empty_pages += 1
                    print(f"Page {page + 1}: 0 properties found")
                    # Two empty pages in a row are taken as the end of the results
                    if consecutive_empty_pages >= 2:
                        print(f"No more results found after page {page + 1}")
                        break

                time.sleep(0.5)  # Short pause between page requests

            except Exception as e:
                # A failed page counts towards the empty-page streak
                print(f"Error on page {page + 1}: {e}")
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= 2:
                    break

        # dict keys keep insertion order, giving an order-preserving de-duplication
        return list(dict.fromkeys(property_links))

    async def scrape_property_details_fast(self, property_url):
        """Scrape one listing page and return its fields as a flat dict.

        The page is downloaded once. Bedrooms, bathrooms, key features, detail
        rows and sizes are read from the HTML; stations, images, floorplans,
        property type, tenure, description, coordinates, address and price are
        read from the embedded ``PAGE_MODEL`` JSON when it can be parsed.
        Every JSON-derived key is always present, defaulting to 'N/A' (or 0
        for the counts).

        Args:
            property_url: URL of the listing page.

        Returns:
            dict | None: ``None`` when ``property_url`` is already in
            ``self.scraped_urls``; ``{'url': ..., 'error': ...}`` when
            scraping raised; otherwise the populated record.
        """
        # Skip if already scraped
        if property_url in self.scraped_urls:
            return None

        try:
            # Fetch the page once; the HTML scrape and the embedded JSON model
            # are both read from this single response.
            response = await self.httpx_client.get(property_url)
            json_data = self._property_json_from_html(response.text, property_url)

            soup = BeautifulSoup(response.content, 'html.parser')
            property_data = {'url': property_url}

            # Mark as scraped
            self.scraped_urls.add(property_url)

            # Extract property ID and use as title
            property_id = self.extract_property_id(property_url)
            property_data['title'] = property_id

            # All regex-based fields below search the page's visible text
            all_text = soup.get_text()

            # Bedroom count: first pattern that matches anywhere in the text wins
            bed_patterns = [
                r'(\d+)\s*bedroom',
                r'(\d+)\s*bed(?:room)?s?',
                r'(\d+)\s*-?\s*bed'
            ]
            property_data['bedrooms'] = 'N/A'
            for pattern in bed_patterns:
                match = re.search(pattern, all_text, re.I)
                if match:
                    property_data['bedrooms'] = match.group(1)
                    break

            # Bathroom count, same approach
            bath_patterns = [
                r'(\d+)\s*bathroom',
                r'(\d+)\s*bath(?:room)?s?',
                r'(\d+)\s*-?\s*bath'
            ]
            property_data['bathrooms'] = 'N/A'
            for pattern in bath_patterns:
                match = re.search(pattern, all_text, re.I)
                if match:
                    property_data['bathrooms'] = match.group(1)
                    break

            # Key features extraction with multiple methods
            property_data['key_features'] = 'N/A'

            # Method 1: Look for feature lists
            # NOTE(review): newer soupsieve releases reportedly deprecate
            # `:contains` in favour of `:-soup-contains` — confirm before changing.
            feature_selectors = [
                'ul.lIhZ24u1NHlVy5_W9V__6 li',
                '.key-features li',
                '[data-test="key-features"] li',
                'h2:contains("Key features") + ul li',
                'h3:contains("Key features") + ul li'
            ]

            for selector in feature_selectors:
                features = soup.select(selector)
                if features:
                    feature_texts = [f.get_text().strip() for f in features]
                    property_data['key_features'] = '; '.join(feature_texts)
                    break

            # Method 2: Look for feature headers and following lists
            if property_data['key_features'] == 'N/A':
                feature_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'key features|features|amenities', re.I))
                for header in feature_headers:
                    next_element = header.find_next_sibling()
                    while next_element:
                        if next_element.name in ['ul', 'ol']:
                            features = [li.get_text().strip() for li in next_element.find_all('li')]
                            if features:
                                property_data['key_features'] = '; '.join(features)
                                break
                        next_element = next_element.find_next_sibling()
                    if property_data['key_features'] != 'N/A':
                        break

            # Property details extraction
            property_data.update({
                'parking': 'N/A',
                'garden': 'N/A',
                'council_tax': 'N/A',
                'accessibility': 'N/A',
                'size_sqft': 'N/A',
                'size_sqm': 'N/A',
                'furnish_status': 'N/A',
            })

            # Extract details from various selectors; only the first selector
            # that matches any <dt> is used.
            detail_selectors = [
                'dt._17A0LehXZKxGHbPeiLQ1BI',
                'dt[class*="detail"]',
                '.property-details dt',
                'dl dt'
            ]

            for selector in detail_selectors:
                detail_sections = soup.select(selector)
                if detail_sections:
                    for section in detail_sections:
                        section_text = section.get_text().strip().upper()
                        value_element = section.find_next_sibling(['dd', 'span', 'div'])
                        value_text = value_element.get_text().strip() if value_element else 'N/A'

                        if 'PARKING' in section_text:
                            property_data['parking'] = value_text
                        elif 'GARDEN' in section_text:
                            property_data['garden'] = value_text
                        elif 'COUNCIL TAX' in section_text:
                            property_data['council_tax'] = value_text
                        elif 'ACCESSIBILITY' in section_text:
                            property_data['accessibility'] = value_text
                        elif 'FURNISH' in section_text:
                            property_data['furnish_status'] = value_text
                    break

            # Additional furnish status extraction
            furnish_dt = soup.find('dt', string=re.compile(r'^Furnish type:\s*$', re.I))
            if furnish_dt:
                furnish_dd = furnish_dt.find_next_sibling('dd')
                if furnish_dd:
                    property_data['furnish_status'] = furnish_dd.get_text().strip()

            # Size extraction from full text
            sqft_patterns = [
                r'([0-9,]+)\s*sq\.?\s*ft',
                r'([0-9,]+)\s*sqft',
                r'([0-9,]+)\s*square\s*feet'
            ]
            for pattern in sqft_patterns:
                match = re.search(pattern, all_text, re.I)
                if match:
                    property_data['size_sqft'] = match.group(1).replace(',', '')
                    break

            sqm_patterns = [
                r'([0-9,]+)\s*sq\.?\s*m',
                r'([0-9,]+)\s*sqm',
                r'([0-9,]+)\s*square\s*metres?'
            ]
            for pattern in sqm_patterns:
                match = re.search(pattern, all_text, re.I)
                if match:
                    property_data['size_sqm'] = match.group(1).replace(',', '')
                    break

            # Defaults for every JSON-derived field, so each record carries the
            # same keys even when the JSON is missing or its extraction fails.
            property_data.update({
                'nearest_stations': 'N/A',
                'station_distances': 'N/A',
                'station_count': 0,
                'image_count': 0,
                'image_urls': 'N/A',
                'floorplan_count': 0,
                'floorplan_urls': 'N/A',
                'property_type': 'N/A',
                'tenure_type': 'N/A',
                'lease_years_remaining': 'N/A',
                'description': 'N/A',
                'latitude': 'N/A',
                'longitude': 'N/A',
                'address': 'N/A',
                'postcode': 'N/A',
                'price': 'N/A'
            })

            # Extract data from JSON if available
            if json_data:
                try:
                    stations = jmespath.search("nearestStations[*].{name: name, distance: distance}", json_data) or []
                    photos = jmespath.search("images[*].{url: url, caption: caption}", json_data) or []
                    floorplans = jmespath.search("floorplans[*].{url: url, caption: caption}", json_data) or []
                    property_type = (jmespath.search("propertySubType", json_data) or
                                   jmespath.search("propertyType", json_data) or 'N/A')
                    description = jmespath.search("text.description", json_data) or 'N/A'
                    address_data = jmespath.search("address", json_data)
                    # Compared against None below so a 0.0 coordinate is kept
                    latitude = jmespath.search("location.latitude", json_data)
                    longitude = jmespath.search("location.longitude", json_data)
                    price = jmespath.search("prices.primaryPrice", json_data) or 'N/A'

                    # Tenure information
                    tenure_info = jmespath.search("tenure", json_data) or {}
                    tenure_type = 'N/A'
                    lease_years_remaining = 'N/A'

                    if isinstance(tenure_info, dict):
                        # Null values in the JSON fall back to 'N/A' instead of None
                        tenure_type = tenure_info.get('tenureType') or 'N/A'
                        years_remaining = tenure_info.get('yearsRemainingOnLease')
                        if years_remaining is not None:
                            lease_years_remaining = years_remaining
                    elif isinstance(tenure_info, str):
                        tenure_type = tenure_info

                    formatted_address, postcode = self.format_address_and_postcode(address_data)

                    property_data.update({
                        'nearest_stations': '; '.join([s['name'] for s in stations]) or 'N/A',
                        'station_distances': '; '.join([f"{s['distance']} miles" for s in stations]) or 'N/A',
                        'station_count': len(stations),
                        'image_count': len(photos),
                        'image_urls': '; '.join([p['url'] for p in photos]) or 'N/A',
                        'floorplan_count': len(floorplans),
                        'floorplan_urls': '; '.join([f['url'] for f in floorplans]) or 'N/A',
                        'property_type': property_type,
                        'tenure_type': tenure_type,
                        'lease_years_remaining': lease_years_remaining,
                        'description': description,
                        'latitude': 'N/A' if latitude is None else latitude,
                        'longitude': 'N/A' if longitude is None else longitude,
                        'address': formatted_address,
                        'postcode': postcode,
                        'price': price
                    })
                except Exception as e:
                    # The defaults set above remain in place for this record
                    print(f"Error extracting JSON data: {e}")
            else:
                # HTML fallback for property type if JSON fails
                type_element = soup.find(string=re.compile(r'(flat|house|apartment|studio|maisonette)', re.I))
                if type_element:
                    property_data['property_type'] = type_element.strip()

            return property_data

        except Exception as e:
            print(f"Error scraping {property_url}: {e}")
            return {'url': property_url, 'error': str(e)}

    def _property_json_from_html(self, html_text, property_url):
        """Return the ``propertyData`` object from a page's PAGE_MODEL script, or None."""
        try:
            data = Selector(html_text).xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
            if data:
                # Decode all candidates up front, then pick the first with the key
                for obj in list(self.find_json_objects(data)):
                    if "propertyData" in obj:
                        return obj["propertyData"]
            return None
        except Exception as e:
            print(f"JSON extraction error for {property_url}: {e}")
            return None

    async def scrape_all_properties_fast(self, search_url, max_pages=42, max_properties=None):
        """Collect the listing links of one search and scrape every new listing.

        Successfully scraped listings are appended to ``self.properties``;
        nothing is returned.

        Args:
            search_url: Search-results URL handed to ``get_property_links``.
            max_pages: Upper bound on the number of result pages to walk.
            max_properties: Maximum number of not-yet-scraped listings to fetch
                from this search. ``None`` means no limit.
        """
        print(f"Getting property links (will stop early if no results found)...")
        property_links = self.get_property_links(search_url, max_pages)

        print(f"Found {len(property_links)} unique properties to scrape...")

        if not property_links:
            print("No property links found.")
            return

        # Drop already-scraped listings *before* applying the cap, so the cap
        # counts new listings instead of being used up by duplicates.
        new_links = [link for link in property_links if link not in self.scraped_urls]
        if max_properties is not None:
            new_links = new_links[:max_properties]
        print(f"New properties to scrape: {len(new_links)}")

        if not new_links:
            print("All properties already scraped.")
            return

        batch_size = 3  # Conservative batch size
        successful = 0

        for i in range(0, len(new_links), batch_size):
            batch = new_links[i:i+batch_size]
            results = await asyncio.gather(
                *(self.scrape_property_details_fast(url) for url in batch),
                return_exceptions=True,
            )

            for url, result in zip(batch, results):
                if isinstance(result, BaseException):
                    # With return_exceptions=True a raised exception comes back
                    # as the result; report it instead of dropping it silently.
                    print(f"Error scraping {url}: {result!r}")
                elif isinstance(result, dict):
                    if 'error' in result:
                        print(f"Error in result: {result['error']}")
                    else:
                        self.properties.append(result)
                        successful += 1
                # Any other result (None) means the listing was skipped.

            print(f"Completed {min(i+batch_size, len(new_links))}/{len(new_links)} properties (Success: {successful})")

            # Pause between batches only; no delay is needed after the last one.
            if i + batch_size < len(new_links):
                await asyncio.sleep(0.5)

    def save_to_csv(self, filename='rightmove_london_50k_comprehensive.csv'):
        """Write the collected properties to a CSV file and print a summary.

        Args:
            filename: Destination path of the CSV file.

        Returns:
            The de-duplicated DataFrame that was written, or ``None`` when
            there are no properties to save.
        """
        if not self.properties:
            print("No properties to save!")
            return None

        df = pd.DataFrame(self.properties)

        # Clean description field: strip HTML tags, then collapse whitespace.
        # Missing descriptions are filled first so they do not turn into 'nan'.
        if 'description' in df.columns:
            df['description'] = df['description'].fillna('N/A').astype(str).apply(
                lambda x: re.sub(r'<[^>]+>', ' ', x) if x != 'N/A' else x
            ).apply(
                lambda x: re.sub(r'\s+', ' ', x).strip() if x != 'N/A' else x
            )

        # Remove duplicate properties by URL
        df_clean = df.drop_duplicates(subset=['url'], keep='first')

        df_clean.to_csv(filename, index=False)
        print(f"Saved {len(df_clean)} unique properties to {filename}")

        def count_present(column, placeholder):
            # Records can lack a column entirely; a missing column or a NaN
            # cell must count as "absent" rather than raise or inflate the total.
            if column not in df_clean.columns:
                return 0
            values = df_clean[column]
            return int((values.notna() & (values != placeholder)).sum())

        if len(df_clean) > 0:
            print(f"\nData Summary:")
            print(f"Properties with prices: {count_present('price', 'N/A')}")
            print(f"Properties with addresses: {count_present('address', 'N/A')}")
            print(f"Properties with images: {count_present('image_count', 0)}")

        return df_clean

async def run_complete_scraper():
    """Scrape for-sale listings for every borough/price-band search and save them.

    One search URL is built per (borough, price band) pair. Searches run
    sequentially until ``target_properties`` listings have been collected or
    the searches are exhausted; the results are then de-duplicated by URL,
    trimmed to the target and written to CSV.

    Returns:
        The list of unique property dicts that was saved.
    """
    scraper = RightmoveRentalScraper()

    # Boroughs to search (location identifiers are already URL-encoded)
    london_boroughs = [
        {"name": "City of London", "id": "REGION%5E61224"},
    ]

    # Narrow price bands keep the number of result pages per search small
    price_ranges = [
        # £100k increments up to £1m
        "&minPrice=200000&maxPrice=300000",
        "&minPrice=300000&maxPrice=400000",
        "&minPrice=400000&maxPrice=500000",
        "&minPrice=500000&maxPrice=600000",
        "&minPrice=600000&maxPrice=700000",
        "&minPrice=700000&maxPrice=800000",
        "&minPrice=800000&maxPrice=900000",
        "&minPrice=900000&maxPrice=1000000",

        # £100k increments for higher range
        "&minPrice=1000000&maxPrice=1100000",
        "&minPrice=1100000&maxPrice=1200000",
        "&minPrice=1200000&maxPrice=1300000",
        "&minPrice=1300000&maxPrice=1400000",
        "&minPrice=1400000&maxPrice=1500000",
        "&minPrice=1500000&maxPrice=1600000",
        "&minPrice=1600000&maxPrice=1700000",
        "&minPrice=1700000&maxPrice=1800000",
        "&minPrice=1800000&maxPrice=1900000",
        "&minPrice=1900000&maxPrice=2000000",

        # £250k increments for high range
        "&minPrice=2000000&maxPrice=2250000",
        "&minPrice=2250000&maxPrice=2500000",
        "&minPrice=2500000&maxPrice=2750000",
        "&minPrice=2750000&maxPrice=3000000",
        "&minPrice=3000000&maxPrice=3500000",
        "&minPrice=3500000&maxPrice=4000000",
        "&minPrice=4000000&maxPrice=4500000",
        "&minPrice=4500000&maxPrice=5000000",

        # Larger increments for ultra-high end
        "&minPrice=5000000&maxPrice=6000000",
        "&minPrice=6000000&maxPrice=7000000",
        "&minPrice=7000000&maxPrice=8000000",
        "&minPrice=8000000&maxPrice=9000000",
        "&minPrice=9000000&maxPrice=10000000",
        "&minPrice=10000000&maxPrice=12000000",
        "&minPrice=12000000&maxPrice=15000000",
        "&minPrice=15000000&maxPrice=20000000",

        # Ultra-high end (no maximum)
        "&minPrice=20000000",
    ]

    # Generate all search URL combinations
    london_search_urls = []
    for borough in london_boroughs:
        for price_range in price_ranges:
            # No fixed "index" parameter here: the pagination offset is
            # expected to be appended per page by get_property_links, and a
            # second copy in the query string would make the offset ambiguous.
            sale_url = (
                "https://www.rightmove.co.uk/property-for-sale/find.html"
                "?useLocationIdentifier=true"
                f"&locationIdentifier={borough['id']}"
                "&buy=For+sale&radius=0.0&_includeSSTC=on&sortType=2"
                f"&channel=BUY&transactionType=BUY{price_range}"
            )
            london_search_urls.append({
                'url': sale_url,
                'name': borough['name'],
                'type': 'sale',
                # Readable label, e.g. "minPrice=200000 maxPrice=300000"
                'price_range': price_range.strip('&').replace('&', ' ') if price_range else 'all_prices'
            })

    print(f"Generated {len(london_search_urls)} London search combinations")

    start_time = time.time()
    target_properties = 50000

    try:
        for search_item in london_search_urls:
            search_url = search_item['url']
            borough_name = search_item['name']
            search_type = search_item['type']
            price_range = search_item['price_range']

            print(f"\n🏙️ Scraping {borough_name} ({search_type}) - {price_range}")
            print(f"Current total: {len(scraper.properties)} properties")

            # Calculate how many more properties we need
            remaining_needed = target_properties - len(scraper.properties)
            if remaining_needed <= 0:
                print(f"✅ Target of {target_properties} properties reached!")
                break

            # Limit per search combination
            max_for_this_search = min(remaining_needed, 300)

            try:
                await scraper.scrape_all_properties_fast(
                    search_url,
                    max_pages=42,  # Max possible, but will stop early
                    max_properties=max_for_this_search
                )
            except Exception as e:
                # One failed search must not abort the remaining ones
                print(f"Error scraping {borough_name}: {e}")
                continue

            print(f"✅ {borough_name} ({price_range}) completed. Total properties: {len(scraper.properties)}")

            # Break if we've reached our target
            if len(scraper.properties) >= target_properties:
                print(f"🎯 Target reached! Total: {len(scraper.properties)} properties")
                break

            # Small delay between searches
            await asyncio.sleep(0.5)

        end_time = time.time()

        # Remove duplicate properties by URL first, so trimming afterwards
        # cannot leave fewer unique rows than were actually collected.
        seen_urls = set()
        unique_properties = []
        for prop in scraper.properties:
            url = prop.get('url')
            if url not in seen_urls:
                seen_urls.add(url)
                unique_properties.append(prop)

        # Trim to exactly the target if we got more
        if len(unique_properties) > target_properties:
            unique_properties = unique_properties[:target_properties]
            print(f"✂️ Trimmed to exactly {target_properties} properties")

        scraper.properties = unique_properties

        scraper.save_to_csv('rightmove_london_50k_comprehensive.csv')
    finally:
        # Always release the async HTTP client, even when the run fails
        await scraper.httpx_client.aclose()

    print(f"⚡ Completed in {end_time - start_time:.2f} seconds")
    print(f"📊 Final count: {len(scraper.properties)} unique London properties")
    print(f"🏘️ Covered all {len(london_boroughs)} London boroughs/areas")
    return scraper.properties

# Entry point: run the whole scrape and keep the result in `results`.
if __name__ == "__main__":
    # Applied before asyncio.run(); presumably needed because the notebook
    # kernel already has an event loop running — TODO confirm.
    nest_asyncio.apply()
    results = asyncio.run(run_complete_scraper())

Generated 35 London search combinations

🏙️ Scraping City of London (sale) - minPrice=200000maxPrice=300000
Current total: 0 properties
Getting property links (will stop early if no results found)...
Page 1: 1 properties found
Page 2: 1 properties found
Page 3: 1 properties found
Page 4: 1 properties found
Page 5: 1 properties found
Page 6: 1 properties found
Page 7: 1 properties found
Page 8: 1 properties found


Task exception was never retrieved
future: <Task finished name='Task-3598' coro=<run_complete_scraper() done, defined at C:\Users\Jc\AppData\Local\Temp\ipykernel_5792\3331485428.py:457> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "c:\Users\Jc\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Jc\AppData\Local\Temp\ipykernel_5792\3331485428.py", line 629, in <module>
    results = asyncio.run(run_complete_scraper())
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Jc\anaconda3\Lib\site-packages\nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Jc\anaconda3\Lib\site-packages\nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "c:\Users\Jc\anaconda3\Lib\site-packages\nest_asyncio.py", line 133, in _run_once
    handle._run()
  File 

Page 9: 1 properties found
Page 10: 1 properties found
Page 11: 1 properties found
Page 12: 1 properties found
Page 13: 1 properties found
Page 14: 1 properties found
Page 15: 1 properties found
Page 16: 1 properties found
Page 17: 1 properties found
Page 18: 1 properties found
Page 19: 1 properties found
Page 20: 1 properties found
Page 21: 1 properties found
Page 22: 1 properties found
Page 23: 1 properties found
Page 24: 1 properties found
Page 25: 1 properties found
Page 26: 1 properties found
Page 27: 1 properties found
Page 28: 1 properties found
Page 29: 1 properties found
Page 30: 1 properties found
Page 31: 1 properties found
Page 32: 1 properties found


KeyboardInterrupt: 

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
import jmespath
import asyncio
from httpx import AsyncClient
from parsel import Selector
import nest_asyncio

class RightmoveRentalScraper:
    def __init__(self):
        """Create the HTTP clients and the empty in-memory scrape state."""
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

        # Blocking client (used by get_property_links for the result pages)
        session = requests.Session()
        session.headers.update({'User-Agent': user_agent})
        self.session = session

        # Scraped records and the URLs they came from
        self.properties = []
        self.scraped_urls = set()  # Track scraped URLs to avoid duplicates

        # Async client (used for the individual listing pages)
        async_headers = {
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
        self.httpx_client = AsyncClient(
            headers=async_headers,
            timeout=30,
            http2=False,
            follow_redirects=True,
        )
    
    def find_json_objects(self, text: str):
        """Yield every JSON object embedded in *text*, in order of appearance.

        Each ``{`` is tried as the start of a JSON document. When decoding
        succeeds the object is yielded and scanning resumes right after it;
        otherwise scanning resumes at the next character.

        Args:
            text: Arbitrary text (e.g. a script body) that may contain JSON.

        Yields:
            The decoded objects.
        """
        decoder = json.JSONDecoder()
        pos = 0
        while True:
            start = text.find("{", pos)
            if start == -1:
                return
            try:
                # Decode in place from `start`; avoids copying the tail of the
                # text for every candidate brace.
                result, end = decoder.raw_decode(text, start)
            except (ValueError, RecursionError):
                # Not valid JSON here (or nested too deeply): keep scanning.
                pos = start + 1
                continue
            pos = end
            # The yield sits outside the try block so that closing the
            # generator is never swallowed by the error handling.
            yield result

    def extract_property_id(self, url):
        """Return the digits following ``/properties/`` in *url*, or 'N/A' if absent."""
        found = re.search(r'/properties/(\d+)', url)
        if found is None:
            return 'N/A'
        return found.group(1)
    
    def format_address_and_postcode(self, address_data):
        """Build a one-line address and a postcode from a JSON address object.

        Returns an ``(address, postcode)`` tuple. Either element is 'N/A' when
        it cannot be derived; both are 'N/A' when *address_data* is not a dict.
        """
        if not isinstance(address_data, dict):
            return 'N/A', 'N/A'

        cleaned = 'N/A'
        raw_address = address_data.get('displayAddress', '')
        if raw_address:
            cleaned = raw_address
            # Line breaks -> commas, collapse doubled commas, normalise spaces
            for pattern, replacement in (
                (r'\r\n|\r|\n', ', '),
                (r',\s*,', ','),
                (r'\s+', ' '),
            ):
                cleaned = re.sub(pattern, replacement, cleaned)
            cleaned = cleaned.strip()

            # Drop a postcode sitting at the very end of the address
            cleaned = re.sub(
                r',?\s*[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\s*$',
                '',
                cleaned,
                flags=re.IGNORECASE,
            )
            cleaned = cleaned.rstrip(', ')

        # The postcode needs both halves (outcode + incode)
        outcode = address_data.get('outcode', '')
        incode = address_data.get('incode', '')
        postcode = f"{outcode} {incode}" if (outcode and incode) else 'N/A'

        return cleaned, postcode
    
    def extract_tenure_details(self, json_data):
        """Extract tenure type and lease years remaining from JSON data.

        Args:
            json_data: The listing's property JSON (a dict), or a falsy value.

        Returns:
            A ``(tenure_type, lease_years_remaining)`` tuple; each element is
            'N/A' when the information is missing or null.
        """
        tenure_type = 'N/A'
        lease_years_remaining = 'N/A'

        # Plain top-level key lookup; anything that is not a dict has no tenure
        tenure_info = json_data.get('tenure') if isinstance(json_data, dict) else None

        if isinstance(tenure_info, dict):
            # `or` (not a .get default) so an explicit null also becomes 'N/A'
            tenure_type = tenure_info.get('tenureType') or 'N/A'

            # 0 is a legitimate value here, so only None counts as missing
            years_remaining = tenure_info.get('yearsRemainingOnLease')
            if years_remaining is not None:
                lease_years_remaining = years_remaining

            # Append the free-text message when one is provided
            message = tenure_info.get('message')
            if message:
                tenure_type = f"{tenure_type} - {message}"

        elif isinstance(tenure_info, str):
            # If it's just a string, use it as tenure type
            tenure_type = tenure_info or 'N/A'

        return tenure_type, lease_years_remaining

    async def extract_property_json_data(self, property_url):
        """Fetch a listing page and return its embedded ``propertyData`` object.

        Looks at the script whose text contains ``PAGE_MODEL = `` and returns
        the ``propertyData`` value of the first JSON object in it that has one.
        Returns ``None`` when nothing is found or when any step fails (the
        failure is printed).
        """
        try:
            page = await self.httpx_client.get(property_url)
            script_text = Selector(page.text).xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
            if not script_text:
                return None

            # Decode every candidate up front, then pick the first match
            candidates = list(self.find_json_objects(script_text))
            return next(
                (item["propertyData"] for item in candidates if "propertyData" in item),
                None,
            )
        except Exception as e:
            print(f"JSON extraction error for {property_url}: {e}")
            return None

    def get_property_links(self, search_url, max_pages=42):
        """Collect unique listing URLs from the pages of one search.

        Pages are requested by appending ``&index=<page * 24>`` to
        *search_url*. Paging stops early after two consecutive pages that
        fail or contribute no new link: either an empty page, or one that
        only repeats listings already collected.

        Args:
            search_url: Search-results URL without a pagination offset.
            max_pages: Upper bound on the number of pages to request.

        Returns:
            Listing URLs in first-seen order, without duplicates.
        """
        # Local import so this method does not depend on the file's import block
        from urllib.parse import urljoin

        # Tried in order; the first selector that matches anything wins
        selectors = [
            'a.propertyCard-link',
            'a[href*="/properties/"]',
            '.propertyCard a',
            '[data-test="property-card"] a',
            'a[href*="/property-for-sale/"]'
        ]

        unique_links = []
        seen = set()
        consecutive_empty_pages = 0

        for page in range(max_pages):
            page_url = f"{search_url}&index={page * 24}"
            try:
                response = self.session.get(page_url, timeout=15)
                soup = BeautifulSoup(response.content, 'html.parser')

                links = []
                for selector in selectors:
                    found_links = soup.select(selector)
                    if found_links:
                        hrefs = (link.get('href') for link in found_links)
                        # urljoin handles relative and already-absolute hrefs
                        links = [urljoin("https://www.rightmove.co.uk", href)
                                 for href in hrefs if href and '/properties/' in href]
                        break

                # Keep only links not collected on this or an earlier page
                new_links = []
                for link in links:
                    if link not in seen:
                        seen.add(link)
                        new_links.append(link)

                if new_links:
                    consecutive_empty_pages = 0
                    unique_links.extend(new_links)
                    print(f"Page {page + 1}: {len(links)} properties found")
                else:
                    # A page that only repeats known listings is as good as
                    # empty; without this the loop runs to max_pages whenever
                    # every offset returns the same results.
                    consecutive_empty_pages += 1
                    if links:
                        print(f"Page {page + 1}: {len(links)} properties found (all already seen)")
                    else:
                        print(f"Page {page + 1}: 0 properties found")
                    # Stop after 2 consecutive empty pages
                    if consecutive_empty_pages >= 2:
                        print(f"No more results found after page {page + 1}. Stopping early.")
                        break

                # Add a small delay to be respectful
                time.sleep(0.5)

            except Exception as e:
                print(f"Error on page {page + 1}: {e}")
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= 2:
                    print("Too many consecutive errors. Stopping page search.")
                    break

        print(f"Total unique property links found: {len(unique_links)}")
        return unique_links

    async def scrape_property_details_fast(self, property_url):
        """Scrape one listing page into a flat dict of fields.

        The page is downloaded once; the same response feeds both the HTML
        scraping and the extraction of the embedded ``PAGE_MODEL`` JSON.

        Args:
            property_url: URL of the listing page.

        Returns:
            ``None`` when the URL was already scraped, a dict of fields on
            success, or ``{'url': ..., 'error': ...}`` when the scrape fails.
        """
        # Skip if already scraped
        if property_url in self.scraped_urls:
            return None

        try:
            response = await self.httpx_client.get(property_url)
            json_data = self._property_json_from_html(response.text, property_url)

            soup = BeautifulSoup(response.content, 'html.parser')
            property_data = {'url': property_url}

            # Mark as scraped
            self.scraped_urls.add(property_url)

            # Extract property ID and use as title
            property_data['title'] = self.extract_property_id(property_url)

            # Basic property info, taken from the page's visible text
            all_text = soup.get_text()

            bed_patterns = [
                r'(\d+)\s*bedroom',
                r'(\d+)\s*bed(?:room)?s?',
                r'(\d+)\s*-?\s*bed'
            ]
            property_data['bedrooms'] = self._first_group(bed_patterns, all_text) or 'N/A'

            bath_patterns = [
                r'(\d+)\s*bathroom',
                r'(\d+)\s*bath(?:room)?s?',
                r'(\d+)\s*-?\s*bath'
            ]
            property_data['bathrooms'] = self._first_group(bath_patterns, all_text) or 'N/A'

            # Key features extraction with multiple methods
            property_data['key_features'] = 'N/A'

            # Method 1: Look for feature lists
            feature_selectors = [
                'ul.lIhZ24u1NHlVy5_W9V__6 li',
                '.key-features li',
                '[data-test="key-features"] li',
                'h2:contains("Key features") + ul li',
                'h3:contains("Key features") + ul li'
            ]

            for selector in feature_selectors:
                features = soup.select(selector)
                if features:
                    feature_texts = [f.get_text().strip() for f in features]
                    property_data['key_features'] = '; '.join(feature_texts)
                    break

            # Method 2: Look for feature headers and following lists
            if property_data['key_features'] == 'N/A':
                feature_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'key features|features|amenities', re.I))
                for header in feature_headers:
                    next_element = header.find_next_sibling()
                    while next_element:
                        if next_element.name in ['ul', 'ol']:
                            features = [li.get_text().strip() for li in next_element.find_all('li')]
                            if features:
                                property_data['key_features'] = '; '.join(features)
                                break
                        next_element = next_element.find_next_sibling()
                    if property_data['key_features'] != 'N/A':
                        break

            # Property details extraction
            property_data.update({
                'parking': 'N/A',
                'garden': 'N/A',
                'council_tax': 'N/A',
                'accessibility': 'N/A',
                'size_sqft': 'N/A',
                'size_sqm': 'N/A',
                'furnish_status': 'N/A',
            })

            # Extract details from the first selector that matches anything
            detail_selectors = [
                'dt._17A0LehXZKxGHbPeiLQ1BI',
                'dt[class*="detail"]',
                '.property-details dt',
                'dl dt'
            ]

            for selector in detail_selectors:
                detail_sections = soup.select(selector)
                if detail_sections:
                    for section in detail_sections:
                        section_text = section.get_text().strip().upper()
                        value_element = section.find_next_sibling(['dd', 'span', 'div'])
                        value_text = value_element.get_text().strip() if value_element else 'N/A'

                        if 'PARKING' in section_text:
                            property_data['parking'] = value_text
                        elif 'GARDEN' in section_text:
                            property_data['garden'] = value_text
                        elif 'COUNCIL TAX' in section_text:
                            property_data['council_tax'] = value_text
                        elif 'ACCESSIBILITY' in section_text:
                            property_data['accessibility'] = value_text
                        elif 'FURNISH' in section_text:
                            property_data['furnish_status'] = value_text
                    break

            # Additional furnish status extraction
            furnish_dt = soup.find('dt', string=re.compile(r'^Furnish type:\s*$', re.I))
            if furnish_dt:
                furnish_dd = furnish_dt.find_next_sibling('dd')
                if furnish_dd:
                    property_data['furnish_status'] = furnish_dd.get_text().strip()

            # Size extraction from full text
            sqft_patterns = [
                r'([0-9,]+)\s*sq\.?\s*ft',
                r'([0-9,]+)\s*sqft',
                r'([0-9,]+)\s*square\s*feet'
            ]
            sqft = self._first_group(sqft_patterns, all_text)
            if sqft is not None:
                property_data['size_sqft'] = sqft.replace(',', '')

            sqm_patterns = [
                r'([0-9,]+)\s*sq\.?\s*m',
                r'([0-9,]+)\s*sqm',
                r'([0-9,]+)\s*square\s*metres?'
            ]
            sqm = self._first_group(sqm_patterns, all_text)
            if sqm is not None:
                property_data['size_sqm'] = sqm.replace(',', '')

            # JSON-derived fields get their defaults first, so every record has
            # the same keys even when the JSON is missing or its extraction
            # fails part-way through.
            property_data.update({
                'nearest_stations': 'N/A',
                'station_distances': 'N/A',
                'station_count': 0,
                'image_count': 0,
                'image_urls': 'N/A',
                'floorplan_count': 0,
                'floorplan_urls': 'N/A',
                'property_type': 'N/A',
                'tenure_type': 'N/A',
                'lease_years_remaining': 'N/A',
                'description': 'N/A',
                'latitude': 'N/A',
                'longitude': 'N/A',
                'address': 'N/A',
                'postcode': 'N/A',
                'price': 'N/A'
            })

            # Extract data from JSON if available
            if json_data:
                try:
                    stations = jmespath.search("nearestStations[*].{name: name, distance: distance}", json_data) or []
                    photos = jmespath.search("images[*].{url: url, caption: caption}", json_data) or []
                    floorplans = jmespath.search("floorplans[*].{url: url, caption: caption}", json_data) or []
                    property_type = (jmespath.search("propertySubType", json_data) or
                                   jmespath.search("propertyType", json_data) or 'N/A')
                    description = jmespath.search("text.description", json_data) or 'N/A'
                    address_data = jmespath.search("address", json_data)
                    latitude = jmespath.search("location.latitude", json_data)
                    longitude = jmespath.search("location.longitude", json_data)
                    price = jmespath.search("prices.primaryPrice", json_data) or 'N/A'

                    # Tenure information using the dedicated method
                    tenure_type, lease_years_remaining = self.extract_tenure_details(json_data)

                    formatted_address, postcode = self.format_address_and_postcode(address_data)

                    property_data.update({
                        'nearest_stations': '; '.join([s['name'] for s in stations]) or 'N/A',
                        'station_distances': '; '.join([f"{s['distance']} miles" for s in stations]) or 'N/A',
                        'station_count': len(stations),
                        'image_count': len(photos),
                        'image_urls': '; '.join([p['url'] for p in photos]) or 'N/A',
                        'floorplan_count': len(floorplans),
                        'floorplan_urls': '; '.join([f['url'] for f in floorplans]) or 'N/A',
                        'property_type': property_type,
                        'tenure_type': tenure_type,
                        'lease_years_remaining': lease_years_remaining,
                        'description': description,
                        # A coordinate of 0.0 is valid, so only None is missing
                        'latitude': 'N/A' if latitude is None else latitude,
                        'longitude': 'N/A' if longitude is None else longitude,
                        'address': formatted_address,
                        'postcode': postcode,
                        'price': price
                    })
                except Exception as e:
                    print(f"Error extracting JSON data: {e}")
            else:
                # HTML fallback for property type if JSON fails
                type_element = soup.find(string=re.compile(r'(flat|house|apartment|studio|maisonette)', re.I))
                if type_element:
                    property_data['property_type'] = type_element.strip()

            return property_data

        except Exception as e:
            print(f"Error scraping {property_url}: {e}")
            return {'url': property_url, 'error': str(e)}

    def _property_json_from_html(self, html, property_url):
        """Return the ``propertyData`` object embedded in *html*, or None."""
        try:
            script_text = Selector(html).xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
            if script_text:
                for obj in list(self.find_json_objects(script_text)):
                    if "propertyData" in obj:
                        return obj["propertyData"]
        except Exception as e:
            print(f"JSON extraction error for {property_url}: {e}")
        return None

    @staticmethod
    def _first_group(patterns, text):
        """Return group 1 of the first pattern that matches *text*, else None."""
        for pattern in patterns:
            match = re.search(pattern, text, re.I)
            if match:
                return match.group(1)
        return None

    async def scrape_all_properties_fast(self, search_url, max_pages=42, max_properties=None):
        """Collect the listing links of one search and scrape every new listing.

        Successfully scraped listings are appended to ``self.properties``;
        nothing is returned.

        Args:
            search_url: Search-results URL handed to ``get_property_links``.
            max_pages: Upper bound on the number of result pages to walk.
            max_properties: Maximum number of not-yet-scraped listings to fetch
                from this search. ``None`` means no limit.
        """
        print(f"Getting property links (will stop early if no results found)...")
        property_links = self.get_property_links(search_url, max_pages)

        print(f"Found {len(property_links)} unique properties to scrape...")

        if not property_links:
            print("No property links found.")
            return

        # Drop already-scraped listings *before* applying the cap, so the cap
        # counts new listings instead of being used up by duplicates.
        new_links = [link for link in property_links if link not in self.scraped_urls]
        if max_properties is not None:
            new_links = new_links[:max_properties]
        print(f"New properties to scrape: {len(new_links)}")

        if not new_links:
            print("All properties already scraped.")
            return

        batch_size = 3  # Conservative batch size
        successful = 0

        for i in range(0, len(new_links), batch_size):
            batch = new_links[i:i+batch_size]
            results = await asyncio.gather(
                *(self.scrape_property_details_fast(url) for url in batch),
                return_exceptions=True,
            )

            for url, result in zip(batch, results):
                if isinstance(result, BaseException):
                    # With return_exceptions=True a raised exception comes back
                    # as the result; report it instead of dropping it silently.
                    print(f"Error scraping {url}: {result!r}")
                elif isinstance(result, dict):
                    if 'error' in result:
                        print(f"Error in result: {result['error']}")
                    else:
                        self.properties.append(result)
                        successful += 1
                # Any other result (None) means the listing was skipped.

            print(f"Completed {min(i+batch_size, len(new_links))}/{len(new_links)} properties (Success: {successful})")

            # Pause between batches only; no delay is needed after the last one.
            if i + batch_size < len(new_links):
                await asyncio.sleep(0.5)

    def save_to_csv(self, filename='rightmove_london_properties.csv'):
        """Write the collected properties to a CSV file and print a summary.

        Args:
            filename: Destination path of the CSV file.

        Returns:
            The de-duplicated DataFrame that was written, or ``None`` when
            there are no properties to save.
        """
        if not self.properties:
            print("No properties to save!")
            return None

        df = pd.DataFrame(self.properties)

        # Clean description field: strip HTML tags, then collapse whitespace.
        # Missing descriptions are filled first so they do not turn into 'nan'.
        if 'description' in df.columns:
            df['description'] = df['description'].fillna('N/A').astype(str).apply(
                lambda x: re.sub(r'<[^>]+>', ' ', x) if x != 'N/A' else x
            ).apply(
                lambda x: re.sub(r'\s+', ' ', x).strip() if x != 'N/A' else x
            )

        # Remove duplicate properties by URL
        df_clean = df.drop_duplicates(subset=['url'], keep='first')

        df_clean.to_csv(filename, index=False)
        print(f"Saved {len(df_clean)} unique properties to {filename}")

        def count_present(column, placeholder):
            # Records can lack a column entirely; a missing column or a NaN
            # cell must count as "absent" rather than raise or inflate the total.
            if column not in df_clean.columns:
                return 0
            values = df_clean[column]
            return int((values.notna() & (values != placeholder)).sum())

        if len(df_clean) > 0:
            print(f"\nData Summary:")
            print(f"Properties with prices: {count_present('price', 'N/A')}")
            print(f"Properties with addresses: {count_present('address', 'N/A')}")
            print(f"Properties with images: {count_present('image_count', 0)}")

        return df_clean

async def run_complete_scraper():
    """Crawl for-sale listings for every London borough / price band pair.

    One search URL is built per (borough, price band). Each search is handed
    to ``scraper.scrape_all_properties_fast`` with a per-search cap, and the
    crawl stops once ``target_properties`` records have been collected. The
    results are de-duplicated by URL and written to
    ``rightmove_london_properties.csv``.

    Returns:
        The list of unique property dicts held by the scraper.
    """
    scraper = RightmoveRentalScraper()

    # Complete London Boroughs list
    # NOTE(review): these location identifiers differ from the ones used by a
    # later revision of this script in the same notebook - verify them against
    # the site before relying on the results.
    london_boroughs = [
        # Inner London Boroughs
        {"name": "City of London", "id": "REGION%5E92824"},
        {"name": "Camden", "id": "REGION%5E1465"},
        {"name": "Greenwich", "id": "REGION%5E1467"},
        {"name": "Hackney", "id": "REGION%5E1468"},
        {"name": "Hammersmith and Fulham", "id": "REGION%5E1469"},
        {"name": "Islington", "id": "REGION%5E1470"},
        {"name": "Kensington and Chelsea", "id": "REGION%5E1471"},
        {"name": "Lambeth", "id": "REGION%5E1472"},
        {"name": "Lewisham", "id": "REGION%5E1473"},
        {"name": "Southwark", "id": "REGION%5E1474"},
        {"name": "Tower Hamlets", "id": "REGION%5E1475"},
        {"name": "Wandsworth", "id": "REGION%5E1476"},
        {"name": "Westminster", "id": "REGION%5E1477"},

        # Outer London Boroughs
        {"name": "Barking and Dagenham", "id": "REGION%5E1478"},
        {"name": "Barnet", "id": "REGION%5E1479"},
        {"name": "Bexley", "id": "REGION%5E1480"},
        {"name": "Brent", "id": "REGION%5E1481"},
        {"name": "Bromley", "id": "REGION%5E1482"},
        {"name": "Croydon", "id": "REGION%5E1483"},
        {"name": "Ealing", "id": "REGION%5E1484"},
        {"name": "Enfield", "id": "REGION%5E1485"},
        {"name": "Haringey", "id": "REGION%5E1486"},
        {"name": "Harrow", "id": "REGION%5E1487"},
        {"name": "Havering", "id": "REGION%5E1488"},
        {"name": "Hillingdon", "id": "REGION%5E1489"},
        {"name": "Hounslow", "id": "REGION%5E1490"},
        {"name": "Kingston upon Thames", "id": "REGION%5E1491"},
        {"name": "Merton", "id": "REGION%5E1492"},
        {"name": "Newham", "id": "REGION%5E1493"},
        {"name": "Redbridge", "id": "REGION%5E1494"},
        {"name": "Richmond upon Thames", "id": "REGION%5E1495"},
        {"name": "Sutton", "id": "REGION%5E1496"},
        {"name": "Waltham Forest", "id": "REGION%5E1497"},
    ]

    # Price bands appended to the search URL; splitting by price keeps the
    # number of result pages per search small.
    price_ranges = [
        "&maxPrice=300000",
        "&minPrice=300000&maxPrice=500000",
        "&minPrice=500000&maxPrice=750000",
        "&minPrice=750000&maxPrice=1000000",
        "&minPrice=1000000&maxPrice=1500000",
        "&minPrice=1500000&maxPrice=2000000",
        "&minPrice=2000000&maxPrice=3000000",
        "&minPrice=3000000&maxPrice=5000000",
        "&minPrice=5000000",
    ]

    # Generate search URLs
    london_search_urls = []
    for borough in london_boroughs:
        for price_range in price_ranges:
            sale_url = f"https://www.rightmove.co.uk/property-for-sale/find.html?useLocationIdentifier=true&locationIdentifier={borough['id']}&buy=For+sale&radius=0.0&_includeSSTC=on&index=0&sortType=2&channel=BUY&transactionType=BUY{price_range}"
            # Human-readable label: drop the leading '&' and separate the
            # remaining parameters so they do not run together in the log.
            label = price_range.lstrip('&').replace('&', ', ') if price_range else 'all_prices'
            london_search_urls.append({
                'url': sale_url,
                'name': borough['name'],
                'type': 'sale',
                'price_range': label,
            })

    print(f"Generated {len(london_search_urls)} London search combinations")

    start_time = time.time()
    target_properties = 50000  # Overall number of properties to collect
    per_search_cap = 200       # Upper bound requested from a single search
    max_pages_per_search = 42

    try:
        for search_item in london_search_urls:
            search_url = search_item['url']
            borough_name = search_item['name']
            search_type = search_item['type']
            price_range = search_item['price_range']

            print(f"\n🏙️ Scraping {borough_name} ({search_type}) - {price_range}")
            print(f"Current total: {len(scraper.properties)} properties")

            # Calculate how many more properties we need
            remaining_needed = target_properties - len(scraper.properties)
            if remaining_needed <= 0:
                print(f"✅ Target of {target_properties} properties reached!")
                break

            try:
                await scraper.scrape_all_properties_fast(
                    search_url,
                    max_pages=max_pages_per_search,
                    max_properties=min(remaining_needed, per_search_cap),
                )
            except Exception as e:
                # One failing search must not abort the whole crawl.
                print(f"Error scraping {borough_name}: {e}")
                continue

            print(f"✅ {borough_name} ({price_range}) completed. Total properties: {len(scraper.properties)}")

            # Break if we've reached our target
            if len(scraper.properties) >= target_properties:
                print(f"🎯 Target reached! Total: {len(scraper.properties)} properties")
                break

            # Small delay between searches
            await asyncio.sleep(1)

        end_time = time.time()

        # Remove any duplicate properties by URL, keeping the first occurrence
        seen_urls = set()
        unique_properties = []
        for prop in scraper.properties:
            url = prop.get('url')
            if url not in seen_urls:
                seen_urls.add(url)
                unique_properties.append(prop)
        scraper.properties = unique_properties

        scraper.save_to_csv('rightmove_london_properties.csv')
    finally:
        # Close the HTTP client even when the crawl or the save fails.
        await scraper.httpx_client.aclose()

    print(f"⚡ Completed in {end_time - start_time:.2f} seconds")
    print(f"📊 Final count: {len(scraper.properties)} unique London properties")
    return scraper.properties

# Entry point: run the full crawl when this cell/script is executed directly.
if __name__ == "__main__":
    # nest_asyncio patches asyncio so asyncio.run() can be called while an
    # event loop is already running (this code is executed from a notebook
    # kernel, as the ipykernel traceback below shows).
    nest_asyncio.apply()
    results = asyncio.run(run_complete_scraper())

Generated 297 London search combinations

🏙️ Scraping City of London (sale) - maxPrice=300000
Current total: 0 properties
Getting property links (will stop early if no results found)...
Page 1: 25 properties found


Task exception was never retrieved
future: <Task finished name='Task-4342' coro=<run_complete_scraper() done, defined at C:\Users\Jc\AppData\Local\Temp\ipykernel_5792\3133418577.py:474> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "c:\Users\Jc\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Jc\AppData\Local\Temp\ipykernel_5792\3133418577.py", line 623, in <module>
    results = asyncio.run(run_complete_scraper())
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Jc\anaconda3\Lib\site-packages\nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Jc\anaconda3\Lib\site-packages\nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "c:\Users\Jc\anaconda3\Lib\site-packages\nest_asyncio.py", line 133, in _run_once
    handle._run()
  File 

Page 2: 25 properties found
Page 3: 25 properties found
Page 4: 25 properties found
Page 5: 25 properties found
Page 6: 25 properties found
Page 7: 25 properties found
Page 8: 25 properties found
Page 9: 25 properties found
Page 10: 25 properties found
Page 11: 25 properties found
Page 12: 25 properties found
Page 13: 25 properties found
Page 14: 25 properties found
Page 15: 25 properties found
Page 16: 25 properties found
Page 17: 25 properties found
Page 18: 25 properties found
Page 19: 25 properties found
Page 20: 25 properties found
Page 21: 25 properties found
Page 22: 25 properties found
Page 23: 25 properties found
Page 24: 25 properties found
Page 25: 25 properties found
Page 26: 25 properties found
Page 27: 25 properties found
Page 28: 25 properties found
Page 29: 25 properties found
Page 30: 25 properties found
Page 31: 25 properties found
Page 32: 25 properties found
Page 33: 25 properties found
Page 34: 25 properties found
Page 35: 25 properties found
Page 36: 25 properties

KeyboardInterrupt: 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
import jmespath
import asyncio
from httpx import AsyncClient
from parsel import Selector
import nest_asyncio

class RightmoveRentalScraper:
    """Scrape Rightmove search results and property pages into a CSV file.

    State created by ``__init__``:
        session: ``requests.Session`` used to download search-result pages.
        httpx_client: ``httpx.AsyncClient`` used to download property pages.
        properties: list with one dict per successfully scraped property.
        scraped_urls: set of property URLs that have already been processed.
    """

    # Step of the ``index`` query parameter between two result pages.
    RESULTS_PER_PAGE = 24
    # Safety limit so a misbehaving search cannot paginate forever.
    MAX_PAGES = 500
    # Give up after this many consecutive empty / failed / all-duplicate pages.
    MAX_CONSECUTIVE_MISSES = 5

    # Values used for the JSON-derived fields when the page model is missing
    # or cannot be read, so every record ends up with the same set of keys.
    _JSON_FIELD_DEFAULTS = {
        'nearest_stations': 'N/A',
        'station_distances': 'N/A',
        'station_count': 0,
        'image_count': 0,
        'image_urls': 'N/A',
        'floorplan_count': 0,
        'floorplan_urls': 'N/A',
        'property_type': 'N/A',
        'tenure_type': 'N/A',
        'lease_years_remaining': 'N/A',
        'description': 'N/A',
        'latitude': 'N/A',
        'longitude': 'N/A',
        'address': 'N/A',
        'postcode': 'N/A',
        'price': 'N/A',
    }

    # (keyword searched in the upper-cased <dt> label, record field) pairs,
    # checked in this order; the first keyword found wins.
    _DETAIL_KEYWORDS = (
        ('PARKING', 'parking'),
        ('GARDEN', 'garden'),
        ('COUNCIL TAX', 'council_tax'),
        ('ACCESSIBILITY', 'accessibility'),
        ('FURNISH', 'furnish_status'),
    )

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        self.properties = []
        self.scraped_urls = set()  # Track scraped URLs to avoid duplicates
        self.httpx_client = AsyncClient(
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            },
            follow_redirects=True,
            http2=False,
            timeout=30,
        )

    def find_json_objects(self, text: str):
        """Yield every top-level JSON object embedded in ``text``.

        The text is scanned for ``{``; each candidate is decoded in place and
        scanning resumes after a decoded object, or one character later when
        the candidate is not valid JSON.
        """
        decoder = json.JSONDecoder()
        pos = 0
        while True:
            start = text.find("{", pos)
            if start == -1:
                break
            try:
                # Decode at an offset instead of slicing to avoid copying the
                # remainder of the text for every candidate brace.
                result, end = decoder.raw_decode(text, start)
            except (ValueError, RecursionError):
                pos = start + 1
            else:
                # Yield outside the try so closing the generator early is not
                # mistaken for a decode failure.
                yield result
                pos = end

    def extract_property_id(self, url):
        """Extract property ID from URL"""
        match = re.search(r'/properties/(\d+)', url)
        return match.group(1) if match else 'N/A'

    def format_address_and_postcode(self, address_data):
        """Extract and format address and postcode from JSON address object.

        Returns:
            ``(address, postcode)``; either element is ``'N/A'`` when it
            cannot be derived, and both are when ``address_data`` is not a
            dict.
        """
        if not isinstance(address_data, dict):
            return 'N/A', 'N/A'

        display_address = address_data.get('displayAddress', '')
        if display_address:
            # Line breaks become commas, then commas/whitespace are collapsed.
            clean_address = re.sub(r'\r\n|\r|\n', ', ', display_address)
            clean_address = re.sub(r',\s*,', ',', clean_address)
            clean_address = re.sub(r'\s+', ' ', clean_address).strip()
            # Drop a trailing full postcode; it is reported separately.
            postcode_pattern = r',?\s*[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\s*$'
            clean_address = re.sub(postcode_pattern, '', clean_address, flags=re.IGNORECASE)
            clean_address = clean_address.rstrip(', ')
        else:
            clean_address = 'N/A'

        outcode = address_data.get('outcode', '')
        incode = address_data.get('incode', '')
        postcode = f"{outcode} {incode}" if outcode and incode else 'N/A'

        return clean_address, postcode

    def extract_tenure_details(self, json_data):
        """Extract tenure type and lease years remaining from JSON data.

        Returns:
            ``(tenure_type, lease_years_remaining)``, each ``'N/A'`` when the
            information is not present.
        """
        tenure_type = 'N/A'
        lease_years_remaining = 'N/A'

        if json_data:
            tenure_info = jmespath.search("tenure", json_data)

            if isinstance(tenure_info, dict):
                tenure_type = tenure_info.get('tenureType', 'N/A')
                years_remaining = tenure_info.get('yearsRemainingOnLease')
                if years_remaining is not None:
                    lease_years_remaining = years_remaining

                message = tenure_info.get('message')
                if message:
                    tenure_type = f"{tenure_type} - {message}"
            elif isinstance(tenure_info, str):
                tenure_type = tenure_info

        return tenure_type, lease_years_remaining

    def _parse_property_json(self, html_text):
        """Return the ``propertyData`` object of the page's PAGE_MODEL script, or None."""
        script = Selector(html_text).xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
        if not script:
            return None
        for obj in self.find_json_objects(script):
            if "propertyData" in obj:
                return obj["propertyData"]
        return None

    async def extract_property_json_data(self, property_url):
        """Download ``property_url`` and return its ``propertyData`` dict.

        Returns ``None`` when the page has no page model or when the download
        or parsing fails (the error is printed).
        """
        try:
            response = await self.httpx_client.get(property_url)
            return self._parse_property_json(response.text)
        except Exception as e:
            print(f"JSON extraction error for {property_url}: {e}")
            return None

    @staticmethod
    def _first_match(patterns, text):
        """Return group 1 of the first pattern that matches ``text``, else None."""
        for pattern in patterns:
            match = re.search(pattern, text, re.I)
            if match:
                return match.group(1)
        return None

    @staticmethod
    def _extract_key_features(soup):
        """Return the key features joined with '; ', or 'N/A' when none are found."""
        feature_selectors = [
            'ul.lIhZ24u1NHlVy5_W9V__6 li',
            '.key-features li',
            '[data-test="key-features"] li'
        ]
        for selector in feature_selectors:
            features = soup.select(selector)
            if features:
                return '; '.join(f.get_text().strip() for f in features)

        # Fallback: first non-empty list that follows a features-like heading.
        feature_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'key features|features|amenities', re.I))
        for header in feature_headers:
            next_element = header.find_next_sibling()
            while next_element:
                if next_element.name in ['ul', 'ol']:
                    features = [li.get_text().strip() for li in next_element.find_all('li')]
                    if features:
                        return '; '.join(features)
                next_element = next_element.find_next_sibling()
        return 'N/A'

    def _extract_detail_fields(self, soup):
        """Return the labelled detail values (parking, garden, ...) found in the page."""
        detail_selectors = [
            'dt._17A0LehXZKxGHbPeiLQ1BI',
            'dt[class*="detail"]',
            '.property-details dt',
            'dl dt'
        ]
        details = {}
        for selector in detail_selectors:
            detail_sections = soup.select(selector)
            if not detail_sections:
                continue
            for section in detail_sections:
                section_text = section.get_text().strip().upper()
                value_element = section.find_next_sibling(['dd', 'span', 'div'])
                value_text = value_element.get_text().strip() if value_element else 'N/A'
                for keyword, field in self._DETAIL_KEYWORDS:
                    if keyword in section_text:
                        details[field] = value_text
                        break
            # Only the first selector that matches anything is used.
            break
        return details

    def _fields_from_json(self, json_data):
        """Build the JSON-derived record fields from a ``propertyData`` dict."""
        stations = jmespath.search("nearestStations[*].{name: name, distance: distance}", json_data) or []
        photos = jmespath.search("images[*].{url: url, caption: caption}", json_data) or []
        floorplans = jmespath.search("floorplans[*].{url: url, caption: caption}", json_data) or []
        property_type = (jmespath.search("propertySubType", json_data) or
                         jmespath.search("propertyType", json_data) or 'N/A')
        description = jmespath.search("text.description", json_data) or 'N/A'
        address_data = jmespath.search("address", json_data)
        latitude = jmespath.search("location.latitude", json_data)
        longitude = jmespath.search("location.longitude", json_data)
        price = jmespath.search("prices.primaryPrice", json_data) or 'N/A'

        tenure_type, lease_years_remaining = self.extract_tenure_details(json_data)
        formatted_address, postcode = self.format_address_and_postcode(address_data)

        return {
            'nearest_stations': '; '.join([s['name'] for s in stations]) or 'N/A',
            'station_distances': '; '.join([f"{s['distance']} miles" for s in stations]) or 'N/A',
            'station_count': len(stations),
            'image_count': len(photos),
            'image_urls': '; '.join([p['url'] for p in photos]) or 'N/A',
            'floorplan_count': len(floorplans),
            'floorplan_urls': '; '.join([f['url'] for f in floorplans]) or 'N/A',
            'property_type': property_type,
            'tenure_type': tenure_type,
            'lease_years_remaining': lease_years_remaining,
            'description': description,
            # A coordinate of exactly 0 is a valid value, so test for None
            # rather than falsiness.
            'latitude': 'N/A' if latitude is None else latitude,
            'longitude': 'N/A' if longitude is None else longitude,
            'address': formatted_address,
            'postcode': postcode,
            'price': price
        }

    async def scrape_property_details_fast(self, property_url):
        """Download one property page and return its record.

        The page is fetched once; both the embedded page model and the HTML
        are read from that single response.

        Returns:
            ``None`` when the URL was already scraped, a record dict on
            success, or ``{'url': ..., 'error': ...}`` when scraping fails.
        """
        # Skip if already scraped
        if property_url in self.scraped_urls:
            return None

        try:
            response = await self.httpx_client.get(property_url)

            try:
                json_data = self._parse_property_json(response.text)
            except Exception as e:
                # A broken page model only disables the JSON-derived fields.
                print(f"JSON extraction error for {property_url}: {e}")
                json_data = None

            soup = BeautifulSoup(response.content, 'html.parser')
            property_data = {'url': property_url}

            # Mark as scraped
            self.scraped_urls.add(property_url)

            # Extract property ID and use as title
            property_data['title'] = self.extract_property_id(property_url)

            all_text = soup.get_text()

            bed_patterns = [
                r'(\d+)\s*bedroom',
                r'(\d+)\s*bed(?:room)?s?',
                r'(\d+)\s*-?\s*bed'
            ]
            property_data['bedrooms'] = self._first_match(bed_patterns, all_text) or 'N/A'

            bath_patterns = [
                r'(\d+)\s*bathroom',
                r'(\d+)\s*bath(?:room)?s?',
                r'(\d+)\s*-?\s*bath'
            ]
            property_data['bathrooms'] = self._first_match(bath_patterns, all_text) or 'N/A'

            property_data['key_features'] = self._extract_key_features(soup)

            property_data.update({
                'parking': 'N/A',
                'garden': 'N/A',
                'council_tax': 'N/A',
                'accessibility': 'N/A',
                'size_sqft': 'N/A',
                'size_sqm': 'N/A',
                'furnish_status': 'N/A',
            })
            property_data.update(self._extract_detail_fields(soup))

            # Size extraction
            sqft_patterns = [
                r'([0-9,]+)\s*sq\.?\s*ft',
                r'([0-9,]+)\s*sqft',
                r'([0-9,]+)\s*square\s*feet'
            ]
            sqft = self._first_match(sqft_patterns, all_text)
            if sqft is not None:
                property_data['size_sqft'] = sqft.replace(',', '')

            sqm_patterns = [
                r'([0-9,]+)\s*sq\.?\s*m',
                r'([0-9,]+)\s*sqm',
                r'([0-9,]+)\s*square\s*metres?'
            ]
            sqm = self._first_match(sqm_patterns, all_text)
            if sqm is not None:
                property_data['size_sqm'] = sqm.replace(',', '')

            # Start from the placeholders so a failure while reading the page
            # model still leaves a complete record.
            property_data.update(self._JSON_FIELD_DEFAULTS)
            if json_data:
                try:
                    property_data.update(self._fields_from_json(json_data))
                except Exception as e:
                    print(f"Error extracting JSON data: {e}")
            else:
                # HTML fallback
                type_element = soup.find(string=re.compile(r'(flat|house|apartment|studio|maisonette)', re.I))
                if type_element:
                    property_data['property_type'] = type_element.strip()

            return property_data

        except Exception as e:
            print(f"Error scraping {property_url}: {e}")
            return {'url': property_url, 'error': str(e)}

    @staticmethod
    def _page_url(search_url, index):
        """Return ``search_url`` with its ``index`` query parameter set to ``index``.

        Any ``index`` parameter already present is replaced rather than
        duplicated, so the requested offset is unambiguous.
        """
        from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

        parts = urlsplit(search_url)
        query = [
            (key, value)
            for key, value in parse_qsl(parts.query, keep_blank_values=True)
            if key != 'index'
        ]
        query.append(('index', str(index)))
        return urlunsplit(parts._replace(query=urlencode(query)))

    @staticmethod
    def _extract_listing_links(soup):
        """Return the unique property URLs of a search-result page, in page order."""
        selectors = [
            'a.propertyCard-link',
            'a[href*="/properties/"]',
            '.propertyCard a',
            '[data-test="property-card"] a'
        ]
        for selector in selectors:
            found_links = soup.select(selector)
            if found_links:
                hrefs = (link.get('href') for link in found_links)
                urls = [
                    f"https://www.rightmove.co.uk{href}"
                    for href in hrefs
                    if href and '/properties/' in href
                ]
                # A result card can expose several anchors to the same
                # property; keep one so it is not fetched twice concurrently.
                return list(dict.fromkeys(urls))
        return []

    async def scrape_search_directly(self, search_url, max_properties=None):
        """Directly scrape properties from search pages - stop when we hit repetitions.

        Result pages are walked by increasing the ``index`` parameter. New
        properties on each page are scraped in small concurrent batches and
        appended to ``self.properties``.

        Args:
            search_url: Search URL; an ``index`` parameter in it is replaced.
            max_properties: Maximum number of new properties to collect in
                this call; ``None`` (or 0) means no limit.

        Returns:
            The number of properties added to ``self.properties``.
        """
        print(f"Starting direct scrape of search URL...")

        page = 0
        consecutive_empty_pages = 0
        consecutive_duplicate_pages = 0
        new_properties_found = 0

        while True:
            page_url = self._page_url(search_url, page * self.RESULTS_PER_PAGE)

            try:
                # requests is blocking; run it in a worker thread so the event
                # loop is not stalled while the page downloads.
                response = await asyncio.to_thread(self.session.get, page_url, timeout=15)
                soup = BeautifulSoup(response.content, 'html.parser')

                links = self._extract_listing_links(soup)

                if not links:
                    consecutive_empty_pages += 1
                    print(f"Page {page + 1}: 0 properties found")
                    if consecutive_empty_pages >= self.MAX_CONSECUTIVE_MISSES:
                        print(f"No more results found after {consecutive_empty_pages} empty pages. Stopping.")
                        break
                else:
                    consecutive_empty_pages = 0

                    # Check how many are new vs duplicates
                    new_links = [link for link in links if link not in self.scraped_urls]
                    duplicate_count = len(links) - len(new_links)

                    print(f"Page {page + 1}: {len(links)} properties found, {len(new_links)} new, {duplicate_count} duplicates")

                    # Pages with nothing new suggest the results are repeating.
                    if not new_links:
                        consecutive_duplicate_pages += 1
                        if consecutive_duplicate_pages >= self.MAX_CONSECUTIVE_MISSES:
                            print(f"Too many consecutive pages with only duplicates. Stopping.")
                            break
                    else:
                        consecutive_duplicate_pages = 0

                    # Scrape the new properties directly
                    batch_size = 3
                    for i in range(0, len(new_links), batch_size):
                        batch = new_links[i:i + batch_size]
                        tasks = [self.scrape_property_details_fast(url) for url in batch]
                        results = await asyncio.gather(*tasks, return_exceptions=True)

                        for result in results:
                            if isinstance(result, dict) and 'error' not in result:
                                self.properties.append(result)
                                new_properties_found += 1

                        # The limit applies to this call, not to the running
                        # total across earlier searches.
                        if max_properties and new_properties_found >= max_properties:
                            print(f"Reached target of {max_properties} properties!")
                            return new_properties_found

                        await asyncio.sleep(0.3)  # Small delay between batches

                page += 1
                await asyncio.sleep(0.5)  # Delay between pages

                # Safety break to avoid infinite loops
                if page > self.MAX_PAGES:
                    print(f"Reached maximum page limit (500 pages). Stopping.")
                    break

            except Exception as e:
                print(f"Error on page {page + 1}: {e}")
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= self.MAX_CONSECUTIVE_MISSES:
                    break
                page += 1  # Still move to next page even after error

        return new_properties_found

    def save_to_csv(self, filename='rightmove_london_properties.csv'):
        """Write the collected properties to ``filename`` as CSV.

        HTML tags are stripped from the ``description`` column, rows are
        de-duplicated on ``url`` (first occurrence wins) and a short summary
        is printed.

        Returns:
            The de-duplicated ``pandas.DataFrame`` that was written, or
            ``None`` when ``self.properties`` is empty.
        """
        if not self.properties:
            print("No properties to save!")
            return None

        df = pd.DataFrame(self.properties)

        def _clean_description(text):
            # 'N/A' is the placeholder for "no description"; leave it alone.
            if text == 'N/A':
                return text
            without_tags = re.sub(r'<[^>]+>', ' ', text)
            return re.sub(r'\s+', ' ', without_tags).strip()

        if 'description' in df.columns:
            # Records without a description show up as NaN; map them to the
            # placeholder instead of letting astype(str) produce 'nan'.
            df['description'] = (
                df['description'].fillna('N/A').astype(str).apply(_clean_description)
            )

        # Remove duplicate properties by URL (just in case)
        df_clean = df.drop_duplicates(subset=['url'], keep='first')

        df_clean.to_csv(filename, index=False)
        print(f"Saved {len(df_clean)} unique properties to {filename}")

        def _count_present(column, missing_value):
            # Absent columns and NaN cells both mean "not available".
            if column not in df_clean.columns:
                return 0
            values = df_clean[column]
            return int((values.notna() & (values != missing_value)).sum())

        if len(df_clean) > 0:
            print(f"\nData Summary:")
            print(f"Properties with prices: {_count_present('price', 'N/A')}")
            print(f"Properties with addresses: {_count_present('address', 'N/A')}")
            print(f"Properties with images: {_count_present('image_count', 0)}")

        return df_clean

async def run_complete_scraper():
    """Crawl for-sale listings for every London borough / price band pair.

    Each (borough, price band) search is handed to
    ``scraper.scrape_search_directly`` until ``target_properties`` records
    have been collected; the results are then written to
    ``rightmove_london_properties.csv``.

    Returns:
        The list of property dicts held by the scraper.
    """
    scraper = RightmoveRentalScraper()

    # London Boroughs
    london_boroughs = [
        {"name": "City of London", "id": "REGION%5E61224"},
        {"name": "Camden", "id": "REGION%5E93941"},
        {"name": "Greenwich", "id": "REGION%5E61226"},
        {"name": "Hackney", "id": "REGION%5E93953"},
        {"name": "Hammersmith and Fulham", "id": "REGION%5E61407"},
        {"name": "Islington", "id": "REGION%5E93965"},
        {"name": "Kensington and Chelsea", "id": "REGION%5E61229"},
        {"name": "Lambeth", "id": "REGION%5E93971"},
        {"name": "Lewisham", "id": "REGION%5E61413"},
        {"name": "Southwark", "id": "REGION%5E61518"},
        {"name": "Tower Hamlets", "id": "REGION%5E61417"},
        {"name": "Wandsworth", "id": "REGION%5E93977"},
        {"name": "Westminster", "id": "REGION%5E93980"},
        {"name": "Barking and Dagenham", "id": "REGION%5E61400"},
        {"name": "Barnet", "id": "REGION%5E93929"},
        {"name": "Bexley", "id": "REGION%5E93932"},
        {"name": "Brent", "id": "REGION%5E93935"},
        {"name": "Bromley", "id": "REGION%5E93938"},
        {"name": "Croydon", "id": "REGION%5E93944"},
        {"name": "Ealing", "id": "REGION%5E93947"},
        {"name": "Enfield", "id": "REGION%5E93950"},
        {"name": "Haringey", "id": "REGION%5E61227"},
        {"name": "Harrow", "id": "REGION%5E93956"},
        {"name": "Havering", "id": "REGION%5E61228"},
        {"name": "Hillingdon", "id": "REGION%5E93959"},
        {"name": "Hounslow", "id": "REGION%5E93962"},
        {"name": "Kingston upon Thames", "id": "REGION%5E93968"},
        {"name": "Merton", "id": "REGION%5E61414"},
        {"name": "Newham", "id": "REGION%5E61231"},
        {"name": "Redbridge", "id": "REGION%5E61537"},
        {"name": "Richmond upon Thames", "id": "REGION%5E61415"},
        {"name": "Sutton", "id": "REGION%5E93974"},
        {"name": "Waltham Forest", "id": "REGION%5E61232"},
    ]

    # Price ranges
    price_ranges = [
        "&maxPrice=400000",
        "&minPrice=400000&maxPrice=600000",
        "&minPrice=600000&maxPrice=800000",
        "&minPrice=800000&maxPrice=1000000",
        "&minPrice=1000000&maxPrice=1500000",
        "&minPrice=1500000&maxPrice=2000000",
        "&minPrice=2000000&maxPrice=3000000",
        "&minPrice=3000000",
    ]

    start_time = time.time()
    target_properties = 50000

    try:
        for borough in london_boroughs:
            for price_range in price_ranges:
                # No index parameter here: the scraper adds its own page
                # offset, and a second "index" in the query string would make
                # the requested page ambiguous.
                search_url = f"https://www.rightmove.co.uk/property-for-sale/find.html?useLocationIdentifier=true&locationIdentifier={borough['id']}&buy=For+sale&radius=0.0&_includeSSTC=on&sortType=2&channel=BUY&transactionType=BUY{price_range}"

                # Keep the parameters apart in the log instead of fusing them.
                range_label = price_range.lstrip('&').replace('&', ', ')
                print(f"\n🏙️ Scraping {borough['name']} - {range_label}")
                print(f"Current total: {len(scraper.properties)} properties")

                remaining_needed = target_properties - len(scraper.properties)
                if remaining_needed <= 0:
                    print(f"✅ Target of {target_properties} properties reached!")
                    break

                try:
                    new_found = await scraper.scrape_search_directly(
                        search_url,
                        max_properties=remaining_needed
                    )
                    print(f"✅ Found {new_found} new properties. Total: {len(scraper.properties)}")
                except Exception as e:
                    # One failing search must not abort the whole crawl.
                    print(f"Error scraping {borough['name']}: {e}")
                    continue

                if len(scraper.properties) >= target_properties:
                    print(f"🎯 Target reached! Total: {len(scraper.properties)} properties")
                    break

                await asyncio.sleep(1)

            # The inner break only leaves the price loop; stop the borough
            # loop too once the target has been met.
            if len(scraper.properties) >= target_properties:
                break

        end_time = time.time()

        scraper.save_to_csv('rightmove_london_properties.csv')
    finally:
        # Close the HTTP client even when the crawl or the save fails.
        await scraper.httpx_client.aclose()

    print(f"⚡ Completed in {end_time - start_time:.2f} seconds")
    print(f"📊 Final count: {len(scraper.properties)} unique London properties")
    return scraper.properties

# Entry point: run the full crawl when this cell/script is executed directly.
if __name__ == "__main__":
    # nest_asyncio patches asyncio so asyncio.run() can be called while an
    # event loop is already running (this code is executed from a notebook
    # kernel).
    nest_asyncio.apply()
    results = asyncio.run(run_complete_scraper())


🏙️ Scraping City of London - maxPrice=400000
Current total: 0 properties
Starting direct scrape of search URL...
Page 1: 15 properties found, 15 new, 0 duplicates
Page 2: 15 properties found, 0 new, 15 duplicates
Page 3: 15 properties found, 0 new, 15 duplicates
Page 4: 15 properties found, 0 new, 15 duplicates
Too many consecutive pages with only duplicates. Stopping.
✅ Found 14 new properties. Total: 14

🏙️ Scraping City of London - minPrice=400000maxPrice=600000
Current total: 14 properties
Starting direct scrape of search URL...
Page 1: 25 properties found, 25 new, 0 duplicates
Page 2: 25 properties found, 0 new, 25 duplicates
Page 3: 25 properties found, 0 new, 25 duplicates
Page 4: 25 properties found, 0 new, 25 duplicates
Too many consecutive pages with only duplicates. Stopping.
✅ Found 24 new properties. Total: 38

🏙️ Scraping City of London - minPrice=600000maxPrice=800000
Current total: 38 properties
Starting direct scrape of search URL...
Page 1: 25 properties found, 25 ne

KeyboardInterrupt: 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import json
import jmespath
import asyncio
from httpx import AsyncClient
from parsel import Selector
import nest_asyncio

class RightmoveRentalScraper:
    def __init__(self):
        """Create the HTTP clients and the containers that hold scrape state.

        Two clients are built with the same desktop-Chrome User-Agent: a
        blocking ``requests`` session (``self.session``) and an ``httpx``
        async client (``self.httpx_client``).  Scraped rows are collected in
        ``self.properties`` and already-visited URLs in ``self.scraped_urls``.
        """
        browser_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

        # Blocking client.
        blocking_session = requests.Session()
        blocking_session.headers.update({'User-Agent': browser_agent})
        self.session = blocking_session

        # Scrape state: result rows and the URLs already handled.
        self.properties = []
        self.scraped_urls = set()

        # Async client.
        async_headers = {
            "User-Agent": browser_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
        self.httpx_client = AsyncClient(
            headers=async_headers,
            follow_redirects=True,
            http2=False,
            timeout=30,
        )
    
    def find_json_objects(self, text: str):
        """Yield every JSON object that can be decoded out of ``text``.

        The text is scanned left to right for ``{``.  At each candidate a
        decode is attempted; on success the decoded value is yielded and the
        scan resumes after the consumed span (so nested objects are not
        yielded separately), on failure the scan resumes one character later.

        Args:
            text: Arbitrary text that may embed JSON objects, e.g. the body of
                a ``<script>`` tag.

        Yields:
            Each decoded object, in order of appearance.
        """
        decoder = json.JSONDecoder()  # reusable; no need to rebuild per attempt
        pos = 0
        while True:
            start = text.find("{", pos)
            if start == -1:
                return
            try:
                result, consumed = decoder.raw_decode(text[start:])
            except ValueError:
                # json.JSONDecodeError subclasses ValueError: not JSON here,
                # retry from the next character.
                pos = start + 1
                continue
            pos = start + consumed
            # The yield is deliberately outside the try block: a handler
            # around it would swallow GeneratorExit when a consumer stops
            # early, making close() raise RuntimeError.
            yield result

    def extract_property_id(self, url):
        """Return the digits that follow ``/properties/`` in ``url``, or 'N/A' if absent."""
        found = re.search(r'/properties/(\d+)', url)
        if found is None:
            return 'N/A'
        return found.group(1)
    
    def format_address_and_postcode(self, address_data):
        """Build a one-line address and a postcode from a JSON address object.

        Args:
            address_data: Mapping that may carry ``displayAddress``,
                ``outcode`` and ``incode``; any non-dict value is treated as
                "no address".

        Returns:
            ``(address, postcode)``.  The address has line breaks turned into
            commas, whitespace and comma runs collapsed, and a trailing UK
            postcode removed.  The postcode is ``"<outcode> <incode>"`` when
            both parts are present.  Either element is ``'N/A'`` when it
            cannot be produced.
        """
        if not isinstance(address_data, dict):
            return 'N/A', 'N/A'

        clean_address = 'N/A'
        display_address = address_data.get('displayAddress', '')
        if display_address:
            cleaned = re.sub(r'\r\n|\r|\n', ', ', display_address)
            # Collapse a whole run of empty segments at once; replacing only
            # pairs leaves ", ," behind after three or more blank lines.
            cleaned = re.sub(r',(?:\s*,)+', ',', cleaned)
            cleaned = re.sub(r'\s+', ' ', cleaned).strip()
            # Drop a postcode at the very end; it is returned separately.
            postcode_pattern = r',?\s*[A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2}\s*$'
            cleaned = re.sub(postcode_pattern, '', cleaned, flags=re.IGNORECASE)
            cleaned = cleaned.strip(', ')
            # An address that consisted only of a postcode is "no address".
            if cleaned:
                clean_address = cleaned

        outcode = address_data.get('outcode', '')
        incode = address_data.get('incode', '')
        postcode = f"{outcode} {incode}" if outcode and incode else 'N/A'

        return clean_address, postcode
    
    def extract_tenure_details(self, json_data):
        """Extract the tenure type and remaining lease years from property JSON.

        Args:
            json_data: Property JSON; only its top-level ``tenure`` entry is
                read.  Non-dict input is treated as having no tenure data.

        Returns:
            ``(tenure_type, lease_years_remaining)``.  ``tenure_type`` is the
            ``tenureType`` value, with `` - <message>`` appended when the
            tenure object carries a message, or the raw value when ``tenure``
            is a plain string.  ``lease_years_remaining`` is the
            ``yearsRemainingOnLease`` value.  Either element is ``'N/A'`` when
            the value is missing or null.
        """
        tenure_type = 'N/A'
        lease_years_remaining = 'N/A'

        # A single top-level key needs no query language.
        tenure_info = json_data.get('tenure') if isinstance(json_data, dict) else None

        if isinstance(tenure_info, dict):
            # ``or`` rather than a .get() default: the key may be present with
            # a JSON null, which would otherwise surface as None / "None - ...".
            tenure_type = tenure_info.get('tenureType') or 'N/A'

            years_remaining = tenure_info.get('yearsRemainingOnLease')
            if years_remaining is not None:  # 0 is a legitimate value
                lease_years_remaining = years_remaining

            message = tenure_info.get('message')
            if message:
                tenure_type = f"{tenure_type} - {message}"
        elif isinstance(tenure_info, str):
            tenure_type = tenure_info or 'N/A'

        return tenure_type, lease_years_remaining

    async def extract_property_json_data(self, property_url):
        """Download a listing page and return its embedded ``propertyData`` object.

        The page is fetched with the async client, the ``<script>`` whose text
        contains ``PAGE_MODEL = `` is located, and the JSON objects found in it
        are searched for one holding a ``propertyData`` key.

        Returns:
            The ``propertyData`` value, or ``None`` when the script or the key
            is missing or when anything raises (the error is printed).
        """
        try:
            page = await self.httpx_client.get(property_url)
            script_text = Selector(page.text).xpath(
                "//script[contains(.,'PAGE_MODEL = ')]/text()"
            ).get()
            if not script_text:
                return None

            # Decode everything first, then pick the first matching object.
            candidates = list(self.find_json_objects(script_text))
            return next(
                (candidate["propertyData"] for candidate in candidates if "propertyData" in candidate),
                None,
            )
        except Exception as e:
            print(f"JSON extraction error for {property_url}: {e}")
            return None

    async def scrape_property_details_fast(self, property_url):
        """Scrape one listing page into a flat dict of property fields.

        The page is downloaded once; both the embedded PAGE_MODEL JSON and the
        rendered HTML are read from that single response.  HTML supplies
        bedrooms, bathrooms, key features, the detail list (parking, garden,
        council tax, accessibility, furnishing) and sizes; the JSON supplies
        stations, images, floorplans, type, tenure, description, coordinates,
        address, postcode and price.  Every JSON-derived key is pre-filled
        with a default so each returned row has the same keys.

        Args:
            property_url: Absolute URL of the listing page.

        Returns:
            ``None`` if the URL was already handled, the property dict on
            success, or ``{'url': ..., 'error': ...}`` if anything raised.
        """
        if property_url in self.scraped_urls:
            return None

        # Claim the URL before the first await: callers run several of these
        # coroutines concurrently, and a check-then-add split by an await lets
        # the same URL be scraped twice.
        self.scraped_urls.add(property_url)
        fetched = False

        try:
            response = await self.httpx_client.get(property_url)
            fetched = True

            try:
                json_data = self._parse_page_model(response.text)
            except Exception as e:
                # Best effort: fall back to HTML-only extraction.
                print(f"JSON extraction error for {property_url}: {e}")
                json_data = None

            soup = BeautifulSoup(response.content, 'html.parser')
            all_text = soup.get_text()

            property_data = {
                'url': property_url,
                # The numeric listing id doubles as the title.
                'title': self.extract_property_id(property_url),
                'bedrooms': self._first_capture(
                    [r'(\d+)\s*bedroom', r'(\d+)\s*bed(?:room)?s?', r'(\d+)\s*-?\s*bed'],
                    all_text,
                ) or 'N/A',
                'bathrooms': self._first_capture(
                    [r'(\d+)\s*bathroom', r'(\d+)\s*bath(?:room)?s?', r'(\d+)\s*-?\s*bath'],
                    all_text,
                ) or 'N/A',
                'key_features': self._extract_key_features(soup),
                'parking': 'N/A',
                'garden': 'N/A',
                'council_tax': 'N/A',
                'accessibility': 'N/A',
                'size_sqft': 'N/A',
                'size_sqm': 'N/A',
                'furnish_status': 'N/A',
            }

            # Detail list: the first selector that matches anything is used.
            # Keywords are tested in this order and the first hit wins.
            detail_fields = (
                ('PARKING', 'parking'),
                ('GARDEN', 'garden'),
                ('COUNCIL TAX', 'council_tax'),
                ('ACCESSIBILITY', 'accessibility'),
                ('FURNISH', 'furnish_status'),
            )
            detail_selectors = [
                'dt._17A0LehXZKxGHbPeiLQ1BI',
                'dt[class*="detail"]',
                '.property-details dt',
                'dl dt',
            ]
            for selector in detail_selectors:
                labels = soup.select(selector)
                if not labels:
                    continue
                for label in labels:
                    heading = label.get_text().strip().upper()
                    value_element = label.find_next_sibling(['dd', 'span', 'div'])
                    value_text = value_element.get_text().strip() if value_element else 'N/A'
                    for keyword, field in detail_fields:
                        if keyword in heading:
                            property_data[field] = value_text
                            break
                break

            # Sizes, with thousands separators removed.
            sqft = self._first_capture(
                [r'([0-9,]+)\s*sq\.?\s*ft', r'([0-9,]+)\s*sqft', r'([0-9,]+)\s*square\s*feet'],
                all_text,
            )
            if sqft is not None:
                property_data['size_sqft'] = sqft.replace(',', '')

            sqm = self._first_capture(
                [r'([0-9,]+)\s*sq\.?\s*m', r'([0-9,]+)\s*sqm', r'([0-9,]+)\s*square\s*metres?'],
                all_text,
            )
            if sqm is not None:
                property_data['size_sqm'] = sqm.replace(',', '')

            # Defaults first, so a missing or malformed PAGE_MODEL still
            # yields a row with the full set of keys.
            property_data.update({
                'nearest_stations': 'N/A',
                'station_distances': 'N/A',
                'station_count': 0,
                'image_count': 0,
                'image_urls': 'N/A',
                'floorplan_count': 0,
                'floorplan_urls': 'N/A',
                'property_type': 'N/A',
                'tenure_type': 'N/A',
                'lease_years_remaining': 'N/A',
                'description': 'N/A',
                'latitude': 'N/A',
                'longitude': 'N/A',
                'address': 'N/A',
                'postcode': 'N/A',
                'price': 'N/A',
            })

            if json_data:
                try:
                    property_data.update(self._fields_from_page_model(json_data))
                except Exception as e:
                    print(f"Error extracting JSON data: {e}")
            else:
                # HTML fallback: first text node mentioning a property type.
                type_element = soup.find(string=re.compile(r'(flat|house|apartment|studio|maisonette)', re.I))
                if type_element:
                    property_data['property_type'] = type_element.strip()

            return property_data

        except Exception as e:
            print(f"Error scraping {property_url}: {e}")
            return {'url': property_url, 'error': str(e)}
        finally:
            # A URL whose download never completed (error or cancellation)
            # stays eligible for a later attempt.
            if not fetched:
                self.scraped_urls.discard(property_url)

    def _parse_page_model(self, html):
        """Return the ``propertyData`` object from the page's PAGE_MODEL script, or None."""
        script_text = Selector(html).xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
        if not script_text:
            return None
        # Materialise the generator so it is never abandoned half-consumed.
        for obj in list(self.find_json_objects(script_text)):
            if isinstance(obj, dict) and "propertyData" in obj:
                return obj["propertyData"]
        return None

    @staticmethod
    def _first_capture(patterns, text):
        """Return group 1 of the first pattern found in ``text`` (case-insensitive), or None."""
        for pattern in patterns:
            match = re.search(pattern, text, re.I)
            if match:
                return match.group(1)
        return None

    @staticmethod
    def _extract_key_features(soup):
        """Return the key-feature bullets joined by '; ', or 'N/A' when none are found."""
        feature_selectors = [
            'ul.lIhZ24u1NHlVy5_W9V__6 li',
            '.key-features li',
            '[data-test="key-features"] li',
        ]
        for selector in feature_selectors:
            items = soup.select(selector)
            if items:
                return '; '.join(item.get_text().strip() for item in items)

        # Fallback: the first list that follows a features-style heading.
        headers = soup.find_all(['h2', 'h3'], string=re.compile(r'key features|features|amenities', re.I))
        for header in headers:
            sibling = header.find_next_sibling()
            while sibling:
                if sibling.name in ['ul', 'ol']:
                    texts = [li.get_text().strip() for li in sibling.find_all('li')]
                    if texts:
                        return '; '.join(texts)
                sibling = sibling.find_next_sibling()
        return 'N/A'

    def _fields_from_page_model(self, json_data):
        """Map the PAGE_MODEL ``propertyData`` object onto the flat output fields."""
        stations = jmespath.search("nearestStations[*].{name: name, distance: distance}", json_data) or []
        photos = jmespath.search("images[*].{url: url, caption: caption}", json_data) or []
        floorplans = jmespath.search("floorplans[*].{url: url, caption: caption}", json_data) or []
        property_type = (jmespath.search("propertySubType", json_data) or
                         jmespath.search("propertyType", json_data) or 'N/A')
        tenure_type, lease_years_remaining = self.extract_tenure_details(json_data)
        formatted_address, postcode = self.format_address_and_postcode(
            jmespath.search("address", json_data)
        )

        # Null names/URLs are tolerated here: one bad entry must not raise and
        # thereby discard every other JSON-derived field.
        return {
            'nearest_stations': '; '.join(str(s['name'] or 'N/A') for s in stations) or 'N/A',
            'station_distances': '; '.join(f"{s['distance']} miles" for s in stations) or 'N/A',
            'station_count': len(stations),
            'image_count': len(photos),
            'image_urls': '; '.join(str(p['url']) for p in photos if p['url']) or 'N/A',
            'floorplan_count': len(floorplans),
            'floorplan_urls': '; '.join(str(f['url']) for f in floorplans if f['url']) or 'N/A',
            'property_type': property_type,
            'tenure_type': tenure_type,
            'lease_years_remaining': lease_years_remaining,
            'description': jmespath.search("text.description", json_data) or 'N/A',
            'latitude': jmespath.search("location.latitude", json_data) or 'N/A',
            'longitude': jmespath.search("location.longitude", json_data) or 'N/A',
            'address': formatted_address,
            'postcode': postcode,
            'price': jmespath.search("prices.primaryPrice", json_data) or 'N/A',
        }

    async def scrape_search_directly(self, search_url, max_properties=None):
        """Walk the result pages of ``search_url`` and scrape every unseen listing.

        Pages are requested 24 results apart via the ``index`` query
        parameter.  Listing links on each page are de-duplicated, filtered
        against ``self.scraped_urls`` and scraped in concurrent batches of
        three; successful results are appended to ``self.properties``.

        The walk stops after 10 consecutive pages that were empty or errored,
        after 10 consecutive pages that held only already-seen listings,
        after the 1000-page safety limit, or once ``max_properties`` new
        listings have been collected.

        Args:
            search_url: Search results URL.  Any ``index`` parameter it
                already carries is replaced for each page.
            max_properties: Upper bound on listings to add during THIS call
                (checked after each batch, so it can be overshot by up to two).
                ``None`` or ``0`` means no bound.

        Returns:
            The number of listings appended to ``self.properties`` by this call.
        """
        print(f"Starting direct scrape of search URL...")

        page = 0
        consecutive_empty_pages = 0
        consecutive_duplicate_pages = 0
        new_properties_found = 0

        while True:
            page_url = self._search_page_url(search_url, page * 24)

            try:
                # NOTE: requests is blocking; nothing else is scheduled while a
                # search page downloads.
                response = self.session.get(page_url, timeout=15)
                soup = BeautifulSoup(response.content, 'html.parser')
                links = self._property_links(soup)

                if not links:
                    consecutive_empty_pages += 1
                    print(f"Page {page + 1}: 0 properties found")
                    if consecutive_empty_pages >= 10:
                        print(f"No more results found after {consecutive_empty_pages} empty pages. Stopping.")
                        break
                else:
                    consecutive_empty_pages = 0

                    new_links = [link for link in links if link not in self.scraped_urls]
                    duplicate_count = len(links) - len(new_links)

                    print(f"Page {page + 1}: {len(links)} properties found, {len(new_links)} new, {duplicate_count} duplicates")

                    if not new_links:
                        # Only repeats: we are probably being served content
                        # we have already seen.
                        consecutive_duplicate_pages += 1
                        if consecutive_duplicate_pages >= 10:
                            print(f"Too many consecutive pages with only duplicates. Stopping.")
                            break
                    else:
                        consecutive_duplicate_pages = 0

                    batch_size = 3
                    for i in range(0, len(new_links), batch_size):
                        batch = new_links[i:i + batch_size]
                        tasks = [self.scrape_property_details_fast(url) for url in batch]
                        results = await asyncio.gather(*tasks, return_exceptions=True)

                        for result in results:
                            # Skips None (already scraped), raised exceptions
                            # and {'error': ...} results.
                            if isinstance(result, dict) and 'error' not in result:
                                self.properties.append(result)
                                new_properties_found += 1

                        # The bound is per call, so compare against what this
                        # call added, not against the running grand total.
                        if max_properties and new_properties_found >= max_properties:
                            print(f"Reached target of {max_properties} properties!")
                            return new_properties_found

                        await asyncio.sleep(0.3)  # small delay between batches

                page += 1
                # Non-blocking pause between pages (time.sleep would stall the
                # event loop).
                await asyncio.sleep(0.5)

                # Safety break to avoid an endless walk.
                if page > 1000:
                    print(f"Reached maximum page limit (1000 pages). Stopping.")
                    break

            except Exception as e:
                print(f"Error on page {page + 1}: {e}")
                consecutive_empty_pages += 1
                if consecutive_empty_pages >= 10:
                    break
                page += 1  # still move on to the next page after an error

        return new_properties_found

    @staticmethod
    def _search_page_url(search_url, index):
        """Return ``search_url`` with its ``index`` query parameter set to ``index``.

        An existing ``index=`` parameter is removed first.  Simply appending a
        second one leaves two conflicting values in the URL, and every page
        then appears to return the same listings.
        """
        stripped = re.sub(r'(?<=[?&])index=[^&#]*&?', '', search_url).rstrip('?&')
        separator = '&' if '?' in stripped else '?'
        return f"{stripped}{separator}index={index}"

    @staticmethod
    def _property_links(soup):
        """Return the unique absolute listing URLs on a results page, in page order."""
        from urllib.parse import urljoin

        selectors = [
            'a.propertyCard-link',
            'a[href*="/properties/"]',
            '.propertyCard a',
            '[data-test="property-card"] a',
        ]
        for selector in selectors:
            anchors = soup.select(selector)
            if anchors:
                hrefs = (anchor.get('href') for anchor in anchors)
                urls = [
                    urljoin("https://www.rightmove.co.uk", href)
                    for href in hrefs
                    if href and '/properties/' in href
                ]
                # One listing card usually carries several anchors to the same
                # URL; keep the first occurrence of each.
                return list(dict.fromkeys(urls))
        return []

    def save_to_csv(self, filename='rightmove_london_properties.csv'):
        if not self.properties:
            print("No properties to save!")
            return None
            
        df = pd.DataFrame(self.properties)
        
        # Clean description field
        if 'description' in df.columns:
            df['description'] = df['description'].astype(str).apply(
                lambda x: re.sub(r'<[^>]+>', ' ', x) if x != 'N/A' else x
            ).apply(
                lambda x: re.sub(r'\s+', ' ', x).strip() if x != 'N/A' else x
            )
        
        # Remove duplicate properties by URL
        df_clean = df.drop_duplicates(subset=['url'], keep='first')
        
        df_clean.to_csv(filename, index=False)
        print(f"Saved {len(df_clean)} unique properties to {filename}")
        
        if len(df_clean) > 0:
            print(f"\nData Summary:")
            print(f"Properties with prices: {len(df_clean[df_clean['price'] != 'N/A'])}")
            print(f"Properties with addresses: {len(df_clean[df_clean['address'] != 'N/A'])}")
            print(f"Properties with images: {len(df_clean[df_clean['image_count'] != 0])}")
        
        return df_clean

async def run_complete_scraper():
    """Scrape for-sale listings for every London borough and price band.

    One search is run per (borough, price band) pair until
    ``target_properties`` listings have been collected.  Whatever has been
    gathered is written to ``rightmove_london_properties.csv`` and the async
    HTTP client is closed even when the run ends early through an error or
    an interrupt.

    Returns:
        The list of scraped property dicts.
    """
    scraper = RightmoveRentalScraper()

    # London boroughs with their location identifiers (URL-encoded).
    london_boroughs = [
        {"name": "City of London", "id": "REGION%5E61224"},
        {"name": "Camden", "id": "REGION%5E93941"},
        {"name": "Greenwich", "id": "REGION%5E61226"},
        {"name": "Hackney", "id": "REGION%5E93953"},
        {"name": "Hammersmith and Fulham", "id": "REGION%5E61407"},
        {"name": "Islington", "id": "REGION%5E93965"},
        {"name": "Kensington and Chelsea", "id": "REGION%5E61229"},
        {"name": "Lambeth", "id": "REGION%5E93971"},
        {"name": "Lewisham", "id": "REGION%5E61413"},
        {"name": "Southwark", "id": "REGION%5E61518"},
        {"name": "Tower Hamlets", "id": "REGION%5E61417"},
        {"name": "Wandsworth", "id": "REGION%5E93977"},
        {"name": "Westminster", "id": "REGION%5E93980"},
        {"name": "Barking and Dagenham", "id": "REGION%5E61400"},
        {"name": "Barnet", "id": "REGION%5E93929"},
        {"name": "Bexley", "id": "REGION%5E93932"},
        {"name": "Brent", "id": "REGION%5E93935"},
        {"name": "Bromley", "id": "REGION%5E93938"},
        {"name": "Croydon", "id": "REGION%5E93944"},
        {"name": "Ealing", "id": "REGION%5E93947"},
        {"name": "Enfield", "id": "REGION%5E93950"},
        {"name": "Haringey", "id": "REGION%5E61227"},
        {"name": "Harrow", "id": "REGION%5E93956"},
        {"name": "Havering", "id": "REGION%5E61228"},
        {"name": "Hillingdon", "id": "REGION%5E93959"},
        {"name": "Hounslow", "id": "REGION%5E93962"},
        {"name": "Kingston upon Thames", "id": "REGION%5E93968"},
        {"name": "Merton", "id": "REGION%5E61414"},
        {"name": "Newham", "id": "REGION%5E61231"},
        {"name": "Redbridge", "id": "REGION%5E61537"},
        {"name": "Richmond upon Thames", "id": "REGION%5E61415"},
        {"name": "Sutton", "id": "REGION%5E93974"},
        {"name": "Waltham Forest", "id": "REGION%5E61232"},
    ]

    # Price bands, as query-string fragments appended to the search URL.
    price_ranges = [
        "&maxPrice=400000",
        "&minPrice=400000&maxPrice=600000",
        "&minPrice=600000&maxPrice=800000",
        "&minPrice=800000&maxPrice=1000000",
        "&minPrice=1000000&maxPrice=1500000",
        "&minPrice=1500000&maxPrice=2000000",
        "&minPrice=2000000&maxPrice=3000000",
        "&minPrice=3000000",
    ]

    start_time = time.time()
    target_properties = 50000

    try:
        for borough in london_boroughs:
            for price_range in price_ranges:
                # No index parameter here: the page walker adds its own, and
                # a fixed index=0 would sit in the URL next to it.
                search_url = f"https://www.rightmove.co.uk/property-for-sale/find.html?useLocationIdentifier=true&locationIdentifier={borough['id']}&buy=For+sale&radius=0.0&_includeSSTC=on&sortType=2&channel=BUY&transactionType=BUY{price_range}"

                # Keep the band readable: "minPrice=..., maxPrice=...".
                range_label = price_range.lstrip('&').replace('&', ', ')
                print(f"\n🏙️ Scraping {borough['name']} - {range_label}")
                print(f"Current total: {len(scraper.properties)} properties")

                remaining_needed = target_properties - len(scraper.properties)
                if remaining_needed <= 0:
                    print(f"✅ Target of {target_properties} properties reached!")
                    break

                try:
                    new_found = await scraper.scrape_search_directly(
                        search_url,
                        max_properties=remaining_needed
                    )
                    print(f"✅ Found {new_found} new properties. Total: {len(scraper.properties)}")

                except Exception as e:
                    # One failing search must not abort the whole run.
                    print(f"Error scraping {borough['name']}: {e}")
                    continue

                if len(scraper.properties) >= target_properties:
                    print(f"🎯 Target reached! Total: {len(scraper.properties)} properties")
                    break

                await asyncio.sleep(1)

            if len(scraper.properties) >= target_properties:
                break
    finally:
        # Runs on normal completion and on an early exit, so partial results
        # are saved and the client is not leaked.
        # NOTE(review): presumably this also covers an interrupt from a
        # notebook kernel; confirm in that environment.
        end_time = time.time()
        try:
            scraper.save_to_csv('rightmove_london_properties.csv')
        finally:
            await scraper.httpx_client.aclose()

    print(f"⚡ Completed in {end_time - start_time:.2f} seconds")
    print(f"📊 Final count: {len(scraper.properties)} unique London properties")
    return scraper.properties

if __name__ == "__main__":
    # Patch asyncio first so that asyncio.run() below also works when an event
    # loop is already running (presumably a notebook kernel, given the cell
    # markers in this file).
    nest_asyncio.apply()
    # Drive the whole scrape to completion and keep the returned property list.
    results = asyncio.run(run_complete_scraper())


🏙️ Scraping City of London - maxPrice=400000
Current total: 0 properties
Starting direct scrape of search URL...
Page 1: 15 properties found, 15 new, 0 duplicates
Page 2: 15 properties found, 0 new, 15 duplicates
Page 3: 15 properties found, 0 new, 15 duplicates
Page 4: 15 properties found, 0 new, 15 duplicates
Page 5: 15 properties found, 0 new, 15 duplicates
Page 6: 15 properties found, 0 new, 15 duplicates
Page 7: 15 properties found, 0 new, 15 duplicates
Page 8: 15 properties found, 0 new, 15 duplicates
Page 9: 15 properties found, 0 new, 15 duplicates
Page 10: 15 properties found, 0 new, 15 duplicates
Page 11: 15 properties found, 0 new, 15 duplicates
Too many consecutive pages with only duplicates. Stopping.
✅ Found 14 new properties. Total: 14

🏙️ Scraping City of London - minPrice=400000maxPrice=600000
Current total: 14 properties
Starting direct scrape of search URL...
Page 1: 25 properties found, 25 new, 0 duplicates
Page 2: 25 properties found, 1 new, 24 duplicates
Page 3: 

KeyboardInterrupt: 