In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
import time

# Base URL of the search-results listing; the pagination loop appends
# "?page=N" to it and relative listing links are resolved against it.
base_url = "https://housing.com/in/buy/bangalore/bangalore"
# HTTP headers sent with requests that pass headers=headers.
# NOTE(review): the User-Agent value contains a literal "{}" that is never
# formatted -- looks like an unfilled placeholder; confirm it is intentional.
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; AOL 9.0; Windows NT 5.1; {}; Alawar 2.08; .NET CLR 1.0.3705)'}

# Accumulates one dict per scraped listing (appended by process_page) and is
# written out to CSV at the end of the script.
extracted_data = []

# Function to process a single page
def process_page(url):
    """Scrape one search-results page and every listing it links to.

    Fetches *url*, collects the listing title anchors, follows each link and
    appends one dict per listing to the module-level ``extracted_data`` list.
    The dict keys are the column names in ``_LISTING_FIELDS``; a field whose
    element is missing from the page gets that field's placeholder text.

    Args:
        url: Absolute URL of a search-results page.

    Returns:
        None. Results are accumulated in ``extracted_data``. A page or listing
        that cannot be fetched is reported with ``print`` and skipped.
    """
    response = _fetch(url)
    if response is None:
        return

    soup = BeautifulSoup(response.content, "html.parser")

    # Find all relevant anchor tags (listing titles on the results page)
    anchors = soup.find_all(
        "a",
        {"data-q": "title", "class": "_j31f9d _c8dlk8 _g3l52n _csbfng _frwh2y T_e4485809 _ks15vq _vv1q9c _sq1l2s T_091c165f"}
    )

    for anchor in anchors:
        href = anchor.get("href")  # Relative link
        if not href:
            continue
        full_url = urljoin(base_url, href)  # Convert to absolute URL

        # Fetch the linked page with the same headers as the results page
        linked_page_response = _fetch(full_url)
        if linked_page_response is None:
            continue

        linked_soup = BeautifulSoup(linked_page_response.content, "html.parser")
        print(f"Processing: {full_url}")

        # Build the row in column order; each field is looked up once
        data = {
            column: _text_or_default(linked_soup, tag, css_class, default)
            for column, tag, css_class, default in _LISTING_FIELDS
        }

        extracted_data.append(data)

        # Introduce a delay to avoid overloading the server
        time.sleep(1)


# Seconds to wait for a response before giving up on a request, so a stalled
# connection cannot hang the whole scrape.
_REQUEST_TIMEOUT = 30

# (column name, tag, CSS class, placeholder when the element is absent),
# listed in the order the columns are written.
# NOTE(review): "bathrooms-class" and "furnishing-class" look like placeholder
# selectors rather than real site classes -- confirm against the live markup.
_LISTING_FIELDS = (
    ("Location", "div", "css-1ty5xzi", "No Location"),
    ("Date of Listing", "div", "css-1k19e3", "No Date"),
    ("No. of Bedrooms", "div", "css-1mtnl2n", "No Bedrooms"),
    ("No. of Bathrooms", "div", "bathrooms-class", "No Bathrooms"),
    ("Area", "li", "area-header-title active css-r25t52", "No Area"),
    ("Parking Availability", "div", "T_5990d116 _1q731trj _6w1e54 _9scj1k _fycs5v _ks15vq _c81fwx _h3ftgi label", "No Parking Info"),
    # The same class is now used for both the existence check and the
    # extraction; previously the extraction searched class_="" instead.
    ("Furnishing Status", "div", "furnishing-class", "No Furnishing Info"),
    ("Apartment Name", "h1", "css-1hidc9c", "No Apartment Name"),
    ("Current Price", "div", "css-yv0jp1", "No Price"),
)


def _fetch(url):
    """Return the response for *url*, or None (after printing) on any failure."""
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        print(f"Failed to fetch {url}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None
    return response


def _text_or_default(soup, tag, css_class, default):
    """Return the stripped text of the first matching element, else *default*."""
    element = soup.find(tag, class_=css_class)
    return element.text.strip() if element else default

# Pagination loop
page_number = 1
# Highest page number to scrape. Set to None to keep going until the site
# returns a non-200 status or a "No results found" page.
max_pages = 2

while max_pages is None or page_number <= max_pages:
    # Generate the URL for the current page
    page_url = f"{base_url}?page={page_number}"
    print(f"Fetching page {page_number}: {page_url}")

    # Probe the page to detect the end of the results. process_page returns
    # nothing, so it cannot signal this itself and fetches the page again.
    # Send the same headers as process_page so both requests are treated alike.
    response = requests.get(page_url, headers=headers)
    if response.status_code != 200 or "No results found" in response.text:
        print("No more pages to process.")
        break

    process_page(page_url)
    page_number += 1

# Write every collected listing to disk as CSV, header row first.
csv_file = "scraped_data.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(
        file,
        # Column order of the output file; matches the keys of each row dict.
        fieldnames=[
            "Location",
            "Date of Listing",
            "No. of Bedrooms",
            "No. of Bathrooms",
            "Area",
            "Parking Availability",
            "Furnishing Status",
            "Apartment Name",
            "Current Price",
        ],
    )
    writer.writeheader()
    writer.writerows(extracted_data)

print(f"Data saved to {csv_file}")


Fetching page 1: https://housing.com/in/buy/bangalore/bangalore?page=1
Processing: https://housing.com/in/buy/projects/page/328018-serene-heights-by-modern-spaaces-in-kadagrahara
Processing: https://housing.com/in/buy/projects/page/318558-ma-sarada-upavan-phase-ii-by-ma-sarada-constructions-bangalore-in-bommasandra
Processing: https://housing.com/in/buy/projects/page/329608-sattva-songbird-by-sattva-group-in-aavalahalli
Processing: https://housing.com/in/buy/projects/page/336167-embassy-verde-by-embassy-group-in-mahadeva-kodigehalli
Processing: https://housing.com/in/buy/projects/page/288186-inspira-infinity-by-inspira-builders-in-carmelaram
Processing: https://housing.com/in/buy/projects/page/332912-signature-heights-by-signature-dwellings-in-neraluru
Processing: https://housing.com/in/buy/projects/page/329241-sumadhura-capitol-residences-by-sumadhura-infracon-pvt-ltd-in-whitefield
Processing: https://housing.com/in/buy/projects/page/333007-konig-north-county-by-konig-homes-in-yelahan