In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# City label stamped onto every scraped record
CITY = "Kolkata"

# One shared HTTP session so connections are reused and every request
# carries the same browser-like User-Agent header
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0"

def extract_details(detail_soup):
    """Read the address text and the quoted per-sqft price from a BUY detail page.

    Returns a dict with the keys "address" and "per_sqft". A value stays
    "N/A" when the corresponding element is not found on the page.
    """
    details = {"address": "N/A", "per_sqft": "N/A"}

    # Address: text of the link nested inside the title container.
    heading = detail_soup.find("div", class_="mb-ldp__dtls__title")
    locality = heading.find("a", class_="mb-ldp__dtls__title--link") if heading else None
    if locality:
        details["address"] = locality.get_text(strip=True)

    # Per-sqft price: only the FIRST row whose label mentions "Area" is
    # inspected ("Carpet Area" and "Super Area" both contain "Area").
    for row in detail_soup.find_all("li", class_="mb-ldp__dtls__body__list--item"):
        caption = row.find("div", class_="mb-ldp__dtls__body__list--label")
        if not caption or "Area" not in caption.get_text(strip=True):
            continue

        rate_box = row.find("div", class_="mb-ldp__dtls__body__list--size")
        rate = re.search(r'₹([\d,]+)', rate_box.get_text(strip=True)) if rate_box else None
        if rate:
            # Drop thousands separators so the value is a plain digit string.
            details["per_sqft"] = rate.group(1).replace(",", "")
        break

    return details

def fetch_details(link):
    """Fetch address and per sqft price from a BUY detail page (best effort).

    Parameters
    ----------
    link : str or None
        URL of the detail page. A falsy value skips the request entirely.

    Returns
    -------
    dict
        ``{"address": ..., "per_sqft": ...}``. Both values are "N/A" when the
        link is missing, the request fails or times out, the server answers
        with an HTTP error status, or parsing raises.
    """
    fallback = {"address": "N/A", "per_sqft": "N/A"}
    if not link:
        return fallback
    try:
        response = session.get(link, timeout=5)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        detail_soup = BeautifulSoup(response.text, "html.parser")
        return extract_details(detail_soup)
    except Exception:
        # Deliberately broad: one bad page must not abort the whole scrape.
        # `Exception` (rather than a bare except) still lets
        # KeyboardInterrupt / SystemExit propagate.
        return fallback

def parse_card(card):
    """Parse a single property card from a sale search-results page.

    Parameters
    ----------
    card : bs4 element
        Container of one listing card.

    Returns
    -------
    dict or None
        Record with the keys "name", "city", "bedroom", "price", "area",
        "furnishing" and "link"; fields that cannot be read are "N/A"
        ("link" is None). Returns None when the card has no furnishing value
        (used here as the residential filter) or no title.

    Notes
    -----
    "area" is the first number found in the carpet-area value, falling back
    to the super-area value. The unit text after the number is discarded.
    NOTE(review): listings quoted in units other than sqft would therefore
    show up as unexpectedly small numbers - confirm against the site.
    """
    value_class = "mb-srp__card__summary--value"

    def summary_value(key):
        """Return the stripped text of the card's `data-summary=key` value, or None."""
        box = card.find("div", {"data-summary": key})
        value = box.find("div", class_=value_class) if box else None
        return value.get_text(strip=True) if value else None

    def first_number(text):
        """Return the first number in `text` without thousands separators, or "N/A"."""
        # Matching the first number (instead of deleting every non-digit)
        # keeps decimals intact and does not glue several numbers together.
        found = re.search(r"\d[\d,]*(?:\.\d+)?", text or "")
        return found.group(0).replace(",", "") if found else "N/A"

    # RESIDENTIAL FILTER: cards without a furnishing value are skipped.
    fval = summary_value("furnishing")
    if fval is None or fval == "N/A":
        return None

    # NAME
    title_tag = card.find("h2", class_="mb-srp__card--title")
    if not title_tag:
        return None
    title = title_tag.get_text(strip=True)

    # BEDROOMS
    match = re.search(r"(\d+)\s*BHK", title, re.IGNORECASE)
    bedroom = match.group(1) if match else "N/A"

    # PRICE
    price_tag = card.find("div", class_="mb-srp__card__price--amount")
    if price_tag:
        price = price_tag.get_text(strip=True).replace("₹", "").replace(",", "").strip()
    else:
        price = "N/A"

    # AREA: prefer carpet area; fall back to super area when it is missing
    # or contains no number at all.
    carpet_area = first_number(summary_value("carpet-area"))
    super_area = first_number(summary_value("super-area"))
    area = carpet_area if carpet_area != "N/A" else super_area

    # DETAIL LINK: first JSON-LD object on the card that carries a "url".
    link = None
    for script in card.find_all("script", {"type": "application/ld+json"}):
        try:
            payload = json.loads(script.text)
        except (TypeError, ValueError):
            # Not valid JSON (json.JSONDecodeError is a ValueError) - try the next one.
            continue
        if isinstance(payload, dict) and "url" in payload:
            link = payload["url"]
            break

    return {
        "name": title,
        "city": CITY,
        "bedroom": bedroom,
        "price": price,
        "area": area,
        "furnishing": fval,
        "link": link
    }

# SCRAPE BUY PROPERTIES (RESIDENTIAL ONLY) — 40 pages
NUM_PAGES = 40          # result pages 1..NUM_PAGES are fetched (inclusive)
LISTING_TIMEOUT = 10    # seconds; without a timeout a stalled request blocks forever
properties_data = []

for page in range(1, NUM_PAGES + 1):
    url = f"https://www.magicbricks.com/property-for-sale/residential-real-estate?cityName=kolkata&page={page}"
    try:
        response = session.get(url, timeout=LISTING_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Keep what was already collected; report the page and move on.
        print(f"Skipping page {page}: {exc}")
        time.sleep(0.3)
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.find_all("div", class_="mb-srp__list")

    for card in cards:
        prop_data = parse_card(card)
        if prop_data:
            properties_data.append(prop_data)

    # Small pause between listing pages.
    time.sleep(0.3)

# Fetch all details concurrently
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to the position of its record so results can be
    # written in place regardless of completion order.
    future_to_idx = {
        executor.submit(fetch_details, prop["link"]): idx
        for idx, prop in enumerate(properties_data)
    }

    for future in as_completed(future_to_idx):
        idx = future_to_idx[future]
        try:
            details = future.result()
            address, per_sqft = details["address"], details["per_sqft"]
        except Exception:
            address, per_sqft = "N/A", "N/A"
        # Only these two keys are copied, so the "area" read from the card is
        # never overwritten by whatever else the detail fetch returns.
        properties_data[idx]["address"] = address
        properties_data[idx]["per_sqft"] = per_sqft

# Build final dataframe (a freshly built frame already has a 0..n-1 index,
# so no reset_index is needed)
df = pd.DataFrame({
    "Name": [p["name"] for p in properties_data],
    "City": [p["city"] for p in properties_data],
    "Address": [p["address"] for p in properties_data],
    "Bedrooms": [p["bedroom"] for p in properties_data],
    "Price": [p["price"] for p in properties_data],
    "Rent": ["N/A"] * len(properties_data),
    "Area": [p["area"] for p in properties_data],
    "Per_Sqft_Price": [p["per_sqft"] for p in properties_data],
    "Furnishing": [p["furnishing"] for p in properties_data]
})

In [28]:
# Spot-check: display five randomly chosen rows of the sale listings frame.
df.sample(5)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing
773,4 BHK House for Sale in Habra Kolkata,Kolkata,"Habra, Kolkata",4,65 Lac,,3,3010,Unfurnished
309,6 BHK House for Sale in Lake Town Kolkata,Kolkata,"Lake Town, Kolkata",6,1.20 Cr,,900,13333,Semi-Furnished
118,"2 BHK Flat for Sale in Bhawani Tower, Bhawani ...",Kolkata,"Kestopur, Kolkata",2,52 Lac,,850,6118,Furnished
356,"4 BHK House for Sale in Own Property, Barrackp...",Kolkata,,4,1.40 Cr,,2500,5600,Semi-Furnished
442,2 BHK Flat for Sale in Panchawati Lakeshore Ap...,Kolkata,"Santragachi, Kolkata",2,47 Lac,,935,5027,Semi-Furnished


In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# City label stamped onto every scraped record
CITY = "Kolkata"

# One shared HTTP session so connections are reused and every request
# carries the same browser-like User-Agent header
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0"

def extract_details(detail_soup):
    """Extract area, address, and per sqft price from a RENT detail page.

    Parameters
    ----------
    detail_soup : bs4 element
        Parsed detail page.

    Returns
    -------
    dict
        ``{"area": ..., "address": ..., "per_sqft": ...}``, all strings.
        A value stays "N/A" when the corresponding element or number is not
        found. Numbers are returned without thousands separators.

    Notes
    -----
    Only the first row whose label contains "Area" is inspected. The unit
    text that follows the area number is discarded.
    """
    area = "N/A"
    address = "N/A"
    per_sqft = "N/A"
    # First number in a text, allowing thousands separators and a decimal part.
    number = re.compile(r"\d[\d,]*(?:\.\d+)?")

    # Extract Area and Per Sqft Price
    for item in detail_soup.find_all("li", class_="mb-ldp__dtls__body__list--item"):
        label = item.find("div", class_="mb-ldp__dtls__body__list--label")
        if not label or "Area" not in label.get_text(strip=True):
            continue

        # Area number: taken from the first child of the value block. Guard
        # against an empty block or a first child that is a tag rather than
        # text, so a layout quirk here cannot cost the caller the address.
        block = item.find("div", class_="mb-ldp__dtls__body__list")
        if block and block.contents:
            head = block.contents[0]
            raw = head if isinstance(head, str) else head.get_text()
            found = number.search(raw)
            if found:
                area = found.group(0).replace(",", "")

        # Per sqft price: accept thousands separators ("₹1,200" -> "1200").
        size_div = item.find("div", class_="mb-ldp__dtls__body__list--size")
        if size_div:
            price_match = re.search(r"₹\s*([\d,]+)", size_div.get_text(strip=True))
            if price_match:
                per_sqft = price_match.group(1).replace(",", "")
        break

    # Extract Address
    title_div = detail_soup.find("div", class_="mb-ldp__dtls__title")
    if title_div:
        address_link = title_div.find("a", class_="mb-ldp__dtls__title--link")
        if address_link:
            address = address_link.get_text(strip=True)

    return {"area": area, "address": address, "per_sqft": per_sqft}

def fetch_details(link):
    """Fetch area, address, and per sqft price from a RENT detail page (best effort).

    Parameters
    ----------
    link : str or None
        URL of the detail page. A falsy value skips the request entirely.

    Returns
    -------
    dict
        ``{"area": ..., "address": ..., "per_sqft": ...}``. All values are
        "N/A" when the link is missing, the request fails or times out, the
        server answers with an HTTP error status, or parsing raises.
    """
    fallback = {"area": "N/A", "address": "N/A", "per_sqft": "N/A"}
    if not link:
        return fallback
    try:
        response = session.get(link, timeout=5)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        detail_soup = BeautifulSoup(response.text, "html.parser")
        return extract_details(detail_soup)
    except Exception:
        # Deliberately broad: one bad page must not abort the whole scrape.
        # `Exception` (rather than a bare except) still lets
        # KeyboardInterrupt / SystemExit propagate.
        return fallback

def parse_card(card):
    """Parse a single property card from a rent search-results page.

    Parameters
    ----------
    card : bs4 element
        Container of one listing card.

    Returns
    -------
    dict or None
        Record with the keys "name", "city", "bedroom", "rent", "furnishing"
        and "link"; "bedroom" and "rent" are "N/A" when they cannot be read
        and "link" is None when no JSON-LD object with a "url" is present.
        Returns None when the card has no furnishing value (used here as the
        residential filter) or no title.
    """
    # RESIDENTIAL FILTER
    furn = card.find("div", {"data-summary": "furnishing"})
    fv = furn.find("div", class_="mb-srp__card__summary--value") if furn else None
    if not fv:
        return None
    fval = fv.get_text(strip=True)

    # NAME
    title_tag = card.find("h2", class_="mb-srp__card--title")
    if not title_tag:
        return None
    title = title_tag.get_text(strip=True)

    # BEDROOMS (case-insensitive, so "2 bhk" / "2 Bhk" are recognised too)
    match = re.search(r"(\d+)\s*BHK", title, re.IGNORECASE)
    bedroom = match.group(1) if match else "N/A"

    # RENT
    price_tag = card.find("div", class_="mb-srp__card__price--amount")
    if price_tag:
        rent = price_tag.get_text(strip=True).replace("₹", "").replace(",", "").strip()
    else:
        rent = "N/A"

    # DETAIL LINK: first JSON-LD object on the card that carries a "url".
    link = None
    for script in card.find_all("script", {"type": "application/ld+json"}):
        try:
            payload = json.loads(script.text)
        except (TypeError, ValueError):
            # Not valid JSON (json.JSONDecodeError is a ValueError) - try the next one.
            continue
        if isinstance(payload, dict) and "url" in payload:
            link = payload["url"]
            break

    return {
        "name": title,
        "city": CITY,
        "bedroom": bedroom,
        "rent": rent,
        "furnishing": fval,
        "link": link
    }

# SCRAPE RENT PROPERTIES (RESIDENTIAL ONLY) — 40 pages
NUM_PAGES = 40          # result pages 1..NUM_PAGES are fetched (inclusive)
LISTING_TIMEOUT = 10    # seconds; without a timeout a stalled request blocks forever
properties_data = []

for page in range(1, NUM_PAGES + 1):
    url = f"https://www.magicbricks.com/property-for-rent/residential-real-estate?cityName=kolkata&page={page}"
    try:
        response = session.get(url, timeout=LISTING_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Keep what was already collected; report the page and move on.
        print(f"Skipping page {page}: {exc}")
        time.sleep(0.3)
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.find_all("div", class_="mb-srp__list")

    for card in cards:
        prop_data = parse_card(card)
        if prop_data:
            properties_data.append(prop_data)

    # Small pause between listing pages.
    time.sleep(0.3)

# Fetch all details concurrently
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to the position of its record so results can be
    # written in place regardless of completion order.
    future_to_idx = {
        executor.submit(fetch_details, prop["link"]): idx
        for idx, prop in enumerate(properties_data)
    }

    for future in as_completed(future_to_idx):
        idx = future_to_idx[future]
        try:
            details = future.result()
            area, address, per_sqft = details["area"], details["address"], details["per_sqft"]
        except Exception:
            area, address, per_sqft = "N/A", "N/A", "N/A"
        properties_data[idx]["area"] = area
        properties_data[idx]["address"] = address
        properties_data[idx]["per_sqft"] = per_sqft

# Build final dataframe (a freshly built frame already has a 0..n-1 index,
# so no reset_index is needed)
df2 = pd.DataFrame({
    "Name": [p["name"] for p in properties_data],
    "City": [p["city"] for p in properties_data],
    "Address": [p["address"] for p in properties_data],
    "Bedrooms": [p["bedroom"] for p in properties_data],
    "Price": ["N/A"] * len(properties_data),
    "Rent": [p["rent"] for p in properties_data],
    "Area": [p.get("area", "N/A") for p in properties_data],
    "Per_Sqft_Price": [p.get("per_sqft", "N/A") for p in properties_data],
    "Furnishing": [p["furnishing"] for p in properties_data]
})

In [None]:
# Spot-check: display five randomly chosen rows of the rental listings frame.
df2.sample(5)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing
0,"3 BHK Flat for Rent in Sherwood Estate, Sherwo...",Kolkata,"Narendrapur, Kolkata",3,,28000,1100,25,Unfurnished
1,"3 BHK Flat for Rent in Upohar The Condoville, ...",Kolkata,"EM Bypass, Kolkata",3,,57000,1966,29,Semi-Furnished
2,"3 BHK Flat for Rent in South City, South City,...",Kolkata,"Prince Anwar Shah Road, Kolkata",3,,80000,1550,52,Furnished
3,"3 BHK Flat for Rent in TATA Avenida, TATA Aven...",Kolkata,"New Town, Kolkata",3,,75000,1715,44,Furnished
4,"4 BHK Flat for Rent in South City, South City,...",Kolkata,"Prince Anwar Shah Road, Kolkata",4,,90000,2045,44,Semi-Furnished
...,...,...,...,...,...,...,...,...,...
791,1 BHK Flat for Rent in Eastern Metropolitan By...,Kolkata,"EM Bypass, Kolkata",1,,13500,448,30,Unfurnished
792,"3 BHK Flat for Rent in Netaji Nagar, Kolkata",Kolkata,"Netaji Nagar, Kolkata",3,,25000,1200,21,Semi-Furnished
793,"2 BHK Flat for Rent in Biswanath Abasan, Biswa...",Kolkata,"Biswanath Abasan, Tegharia Baguiati, Kolkata,B...",2,,17000,806,21,Furnished
794,"2 BHK Flat for Rent in Naktala, Garia, Kolkata",Kolkata,"Naktala, Kolkata",2,,18000,650,28,Semi-Furnished


In [30]:
# Stack the sale rows (df) on top of the rental rows (df2); ignore_index=True
# renumbers the combined rows 0..n-1 instead of keeping each frame's own index.
final_df = pd.concat([df, df2], ignore_index=True)


In [32]:
# Write the combined table to properties.csv (relative to the working
# directory) without the index column.
final_df.to_csv("properties.csv", index=False)