In [1]:
import csv

import requests
from bs4 import BeautifulSoup

In [2]:
def convert_price(price):
    """Convert a price label such as "1.5 Crore" into a whole number of rupees.

    Recognised magnitude suffixes are Crore (1e7), Lakh (1e5), Million (1e6),
    Arab (1e9) and Thousand (1e3). A label without any of these suffixes is
    parsed as a plain number.

    Args:
        price: Price text, e.g. "85 Lakh", "1,250 Thousand" or "45000".

    Returns:
        int: The price rounded to the nearest whole unit.

    Raises:
        ValueError: If the numeric part of the label cannot be parsed.
    """
    multipliers = (
        ("Crore", 10_000_000),
        ("Lakh", 100_000),
        ("Million", 1_000_000),
        ("Arab", 1_000_000_000),
        ("Thousand", 1_000),
    )

    # float() rejects thousands separators ("1,250"), so drop them first;
    # surrounding whitespace is removed so the suffix check is reliable.
    cleaned = price.strip().replace(",", "")

    for suffix, factor in multipliers:
        if cleaned.endswith(suffix):
            return round(float(cleaned[: -len(suffix)]) * factor)

    return round(float(cleaned))

In [3]:
def convert_size(size):
    """Convert a size label in Marla or Kanal into (marla, square feet).

    The conversion factors used are 1 Kanal = 20 Marla and 1 Marla = 225 sq ft.

    Args:
        size: Size text such as "5 Marla" or "1 Kanal". Non-string values are
            converted with ``str()`` before parsing.

    Returns:
        tuple: ``(marla, sqft)`` as floats, or ``(0, 0)`` when the unit is not
        Marla/Kanal or the quantity in front of the unit is not a number.
    """
    label = (size if isinstance(size, str) else str(size)).strip()

    # Number of marla represented by one of each supported unit.
    marla_per_unit = {"Marla": 1, "Kanal": 20}

    for unit, factor in marla_per_unit.items():
        if not label.endswith(unit):
            continue
        try:
            quantity = float(label[: -len(unit)].replace(",", ""))
        except ValueError:
            # A malformed quantity (e.g. "Marla" with no number) is treated
            # like any other unconvertible size instead of raising.
            return 0, 0
        marla = quantity * factor
        return marla, marla * 225

    # If size cannot be converted, return 0 for both marla and sqft
    return 0, 0

In [4]:
def text(tag, datatype="str"):
    """Extract the stripped text of a parsed tag, coerced according to ``datatype``.

    Args:
        tag: Object exposing a ``.text`` string attribute, or ``None`` when the
            element was not found.
        datatype: One of "str" (default), "num", "price" or "size".

    Returns:
        For a missing tag: ``0`` for "num", ``0.0`` for "price"/"size" and
        ``"N/A"`` for "str". Otherwise the stripped text, converted to ``int``
        for "num" (``0`` if it is not an integer) or passed through
        ``convert_price`` for "price".
    """
    if tag is None:
        if datatype == "num":
            return 0
        if datatype in ("price", "size"):
            return 0.0
        if datatype == "str":
            return "N/A"

    cleaned = tag.text.strip()

    if datatype == "num":
        try:
            return int(cleaned)
        except ValueError:
            return 0

    if datatype == "price":
        return convert_price(cleaned)

    # "size", "str" and any other datatype all yield the raw stripped text.
    return cleaned

In [5]:
def scrap(city, pages_range, city_id, timeout=30):
    """Scrape house listings for one city from zameen.com result pages.

    Pages are fetched in order starting at 1. Scraping stops at
    ``pages_range``, at the first page that yields no priced listing, or at
    the first failed request, whichever comes first.

    Args:
        city: City slug including the id suffix, e.g. "Abbottabad-385".
        pages_range: Maximum number of result pages to request.
        city_id: Numeric city id stored with every listing.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list[dict]: One dict per listing with the keys city, city_id, link,
        location, price, bedrooms, baths, size_marla and size_sqft.
    """
    house_info = []

    # Other city names in this notebook are written with underscores
    # ("Rahim_Yar_Khan"); a literal space would be sent percent-encoded.
    # NOTE(review): confirm the site expects underscores for multi-word cities.
    slug = city.replace(" ", "_")

    for page_number in range(1, pages_range + 1):
        url = f'https://www.zameen.com/Homes/{slug}-{page_number}.html'
        print(f"Scraping: {url}")

        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Keep what was collected so far instead of losing the whole run.
            print(f"Stopping {city}: request for {url} failed ({exc})")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        house_list = soup.select("main > div > div > div > div > ul > li")

        prev_len = len(house_info)

        for house in house_list:
            baths = house.select_one("span[aria-label='Baths']")
            beds = house.select_one("span[aria-label='Beds']")
            location = house.select_one("div[aria-label='Location']")
            price = house.select_one("span[aria-label='Price']")
            size = house.select_one("div[title]>div > div > span:nth-child(1)")
            anchor = house.select_one("a")

            # List items without a price are skipped.
            if not price:
                continue

            link = None
            if anchor is not None and anchor.get("href"):
                link = "https://www.zameen.com" + anchor["href"]

            # Fallback selector relative to the location element; only usable
            # when the location element itself was found.
            if size is None and location is not None:
                size = location.parent.select_one(
                    "div:nth-child(2) > div > span:nth-child(3)"
                )

            marla, sqft = convert_size(text(size, datatype="size"))

            house_info.append(
                {
                    "city": city.split("-")[0],
                    "city_id": city_id,
                    "link": link,
                    "location": text(location),
                    "price": text(price, datatype="price"),
                    "bedrooms": text(beds, datatype="num"),
                    "baths": text(baths, datatype="num"),
                    "size_marla": marla,
                    "size_sqft": sqft,
                }
            )

        # A page that added nothing means we ran past the last results page.
        if len(house_info) == prev_len:
            break

    return house_info

In [8]:
if __name__ == "__main__":
    # Upper bound on pages requested per city; scrap() stops earlier at the
    # first page that yields no listings.
    MAX_PAGES_PER_CITY = 500
    OUTPUT_PATH = "zameen_with_details_1.csv"
    COLUMNS = [
        "city",
        "city_id",
        "link",
        "location",
        "price",
        "bedrooms",
        "baths",
        "size_marla",
        "size_sqft",
    ]

    house_info = []

    # Define the cities with IDs
    # NOTE(review): some names contain spaces while others use underscores;
    # the name is interpolated into the request URL, so confirm the spelling.
    cities = [
        {'id': 385, 'name': 'Abbottabad'},
        {'id': 23, 'name': 'Bahawalpur'},
        {'id': 751, 'name': 'Chakwal'},
        {'id': 26, 'name': 'Dera Ghazi Khan'},
        {'id': 8244, 'name': 'Dera_Ismail_Khan'},
        {'id': 1293, 'name': 'Fateh Jang'},
        {'id': 1753, 'name': 'Gilgit'},
        {'id': 30, 'name': 'Hyderabad'},
        {'id': 19, 'name': 'Jhelum'},
        {'id': 40, 'name': 'Rahim_Yar_Khan'},
        {'id': 480, 'name': 'Sialkot'},
        # Currently disabled cities:
        # {'id': 1, 'name': 'Lahore'},
        # {'id': 2, 'name': 'Karachi'},
        # {'id': 3, 'name': 'Islamabad'},
        # {'id': 15, 'name': 'Multan'},
        # {'id': 16, 'name': 'Faisalabad'},
        # {'id': 17, 'name': 'Peshawar'},
        # {'id': 18, 'name': 'Quetta'},
        # {'id': 41, 'name': 'Rawalpindi'},
        # {'id': 36, 'name': 'Murree'},
        # {'id': 327, 'name': 'Gujranwala'},
        # {'id': 1233, 'name': 'Attock'},
        # {'id': 3234, 'name': '2_FECHS'},
    ]

    for city in cities:
        city_name = city.get('name')
        city_id = city.get('id')
        # Pass city_id explicitly so it is stored as its own column.
        house_info.extend(
            scrap(f"{city_name}-{city_id}", MAX_PAGES_PER_CITY, city_id)
        )

    # Write to CSV with separate city and city_id columns. The csv module
    # quotes/escapes commas, quotes and newlines inside any field, which the
    # previous hand-built f-string rows did not.
    with open(OUTPUT_PATH, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=COLUMNS, lineterminator="\n")
        writer.writeheader()

        for info in house_info:
            row = dict(info)
            # Flatten multi-line locations for every row, not only for
            # locations that happen to contain a hyphen.
            row["location"] = str(info.get("location")).replace("\n", " ").strip()
            writer.writerow(row)


Scraping: https://www.zameen.com/Homes/Abbottabad-385-1.html
Scraping: https://www.zameen.com/Homes/Abbottabad-385-2.html
Scraping: https://www.zameen.com/Homes/Abbottabad-385-3.html
Scraping: https://www.zameen.com/Homes/Abbottabad-385-4.html
Scraping: https://www.zameen.com/Homes/Abbottabad-385-5.html
Scraping: https://www.zameen.com/Homes/Abbottabad-385-6.html
Scraping: https://www.zameen.com/Homes/Abbottabad-385-7.html
Scraping: https://www.zameen.com/Homes/Abbottabad-385-8.html
Scraping: https://www.zameen.com/Homes/Abbottabad-385-9.html
Scraping: https://www.zameen.com/Homes/Bahawalpur-23-1.html
Scraping: https://www.zameen.com/Homes/Bahawalpur-23-2.html
Scraping: https://www.zameen.com/Homes/Bahawalpur-23-3.html
Scraping: https://www.zameen.com/Homes/Bahawalpur-23-4.html
Scraping: https://www.zameen.com/Homes/Bahawalpur-23-5.html
Scraping: https://www.zameen.com/Homes/Chakwal-751-1.html
Scraping: https://www.zameen.com/Homes/Chakwal-751-2.html
Scraping: https://www.zameen.com/Ho