In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re

In [4]:
def convert_size(size):
    """Convert a listing-area string into a whole number of square feet.

    The string is expected to be a number, optionally containing thousands
    separators (','), optionally followed by one of the unit suffixes
    'Sq. Yd.', 'Sq. M.', 'Sq. Ft.', 'Marla' or 'Kanal'. A value with no
    recognised suffix is treated as already being in square feet.

    Args:
        size: Area text such as '500,000 Sq. Yd.', '10 Marla' or '1,200'.

    Returns:
        The area in square feet, rounded to the nearest int.

    Raises:
        ValueError: If the numeric part cannot be parsed as a float.
    """
    # (unit suffix, square feet per one unit)
    sq_ft_per_unit = (
        ('Sq. Yd.', 9),
        ('Sq. M.', 10.764),
        ('Sq. Ft.', 1),
        ('Marla', 272.3),
        ('Kanal', 5445),
    )

    size = size.strip()
    for suffix, factor in sq_ft_per_unit:
        if size.endswith(suffix):
            number = size[:-len(suffix)].strip().replace(',', '')
            return round(float(number) * factor)

    # No unit suffix: the value is already in square feet. Commas are removed
    # here too, so '1,200' parses the same way as '1,200 Sq. Yd.' does.
    return round(float(size.replace(',', '')))

In [1]:
# st = '500,000 Sq. Yd.'
# st = str(st).strip()
# s = st.endswith('Sq. Yd.')
# s

In [2]:
# st[:-7].strip().replace(',','')

In [5]:
def convert_price(price):
    """Convert a price string with a magnitude word into a whole number.

    The string is expected to be a number, optionally containing thousands
    separators (','), optionally followed by one of 'Thousand', 'Lakh',
    'Million', 'Crore' or 'Arab'. A value with no recognised suffix is
    taken as-is.

    Args:
        price: Price text such as '1.5 Crore', '85 Lakh' or '45,000'.

    Returns:
        The price scaled by its magnitude word, rounded to the nearest int.

    Raises:
        ValueError: If the numeric part cannot be parsed as a float.
    """
    # (magnitude suffix, multiplier)
    multipliers = (
        ('Thousand', 1000),
        ('Lakh', 100000),
        ('Million', 1000000),
        ('Crore', 10000000),
        ('Arab', 1000000000),
    )

    price = price.strip()
    for suffix, multiplier in multipliers:
        if price.endswith(suffix):
            # Strip thousands separators so '1,250 Thousand' parses, matching
            # how convert_size treats its numeric part.
            number = price[:-len(suffix)].strip().replace(',', '')
            return round(float(number) * multiplier)

    return round(float(price.replace(',', '')))

In [6]:
def text(tag, datatype = 'str'):
    """Extract a typed value from a tag, with a per-type default for a missing tag.

    Args:
        tag: An object exposing a ``.text`` string attribute, or None when
            the element was not found.
        datatype: How to interpret the tag's stripped text:
            'num'   -> int (used for bed / bath counts); 0 if missing or
                       not parseable as an int.
            'str'   -> the stripped text itself; "" if missing.
            'price' -> result of convert_price(); 0.0 if missing.
            'size'  -> result of convert_size(); 0.0 if missing.

    Returns:
        The converted value, or None when ``datatype`` is not one of the
        four values above.
    """
    missing = tag is None

    if datatype == 'num':
        if missing:
            return 0
        try:
            return int(tag.text.strip())
        except ValueError:
            # Non-numeric count text falls back to zero.
            return 0

    if datatype == 'str':
        return "" if missing else tag.text.strip()

    if datatype == 'price':
        return 0.0 if missing else convert_price(tag.text.strip())

    if datatype == 'size':
        return 0.0 if missing else convert_size(tag.text.strip())

    # Unrecognised datatype: no conversion is defined.
    return None

In [7]:
def scrape_pages(city, pages_range, timeout=30):
    """Scrape house listings for one city from consecutive result pages.

    Pages 1..pages_range are fetched in order. Scraping stops early as soon
    as a page contributes no new listing.

    Args:
        city: URL slug placed before the page number, e.g. 'Lahore-1'.
        pages_range: Highest page number to request (inclusive).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with keys 'location', 'area', 'price',
        'num_of_bedrooms' and 'num_of_bathrooms', one per listing that has
        a price element.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    house_info = []
    for page_num in range(1, pages_range+1):
        url = f'https://www.zameen.com/Houses_Property/{city}-{page_num}.html'
        print(url)
        # Without an explicit timeout a stalled connection would block the
        # whole scrape indefinitely.
        response = requests.get(url, timeout=timeout)
        soup = bs(response.text, 'html.parser')
        house_list = soup.select("main > div > div > div > div > ul > li")

        # Remember how many records we had so an empty page can be detected.
        prev_len = len(house_info)

        for house in house_list:
            location = house.select_one("div[aria-label='Location']")
            area = house.select_one("div[title] > div > div > span:nth-child(1)")
            beds = house.select_one("span[aria-label='Beds']")
            bath = house.select_one("span[aria-label='Baths']")
            price = house.select_one("span[aria-label='Price']")

            # Only list items carrying a price element are recorded.
            if price:
                house_info.append(
                    {
                        "location": text(location),
                        "area": text(area, datatype = "size"),
                        "price": text(price, datatype = "price"),
                        "num_of_bedrooms": text(beds, datatype = "num"),
                        "num_of_bathrooms": text(bath, datatype = "num")
                    }
                )

        # A page that added nothing ends the scrape for this city.
        if len(house_info) == prev_len:
            break

    return house_info

In [8]:
if __name__ == "__main__":
    # Scrape up to 50 result pages for each city, then dump every listing
    # into a single pipe-delimited file.
    house_info = []
    # 'id' is combined with 'name' to form the URL slug passed to scrape_pages.
    cities = [
        {'id':1, 'name':'Lahore'},
        {'id':2, 'name':'Karachi'},
        {'id':3, 'name':'Islamabad'},
        {'id':15, 'name':'Multan'},
        {'id':16, 'name':'Faisalabad'},
        {'id':17, 'name':'Peshawar'},
        {'id':18, 'name':'Quetta'},
        {'id':30, 'name':'Hyderabad'},
        {'id':36, 'name':'Murree'},
        {'id':41, 'name':'Rawalpindi'},
        {'id':327, 'name':'Gujranwala'},
        {'id': 1233, 'name': 'Attock'},
    ]

    for city in cities:
        house_info.append(
            {
                "city": city.get('name'),
                "info": scrape_pages(f"{city.get('name')}-{city.get('id')}", 50)
            }
        )

    # Write with an explicit encoding so non-ASCII location text is stored the
    # same way on every platform instead of depending on the locale default.
    with open("zameen.csv", "w", encoding="utf-8") as f:
        f.write("city|location|area_sq_ft|num_of_bedrooms|num_of_bathrooms|price_in_rupees\n")
        for house in house_info:
            for info in house.get('info'):
                f.write(
                    f"{house.get('city')}|{info.get('location')}|{info.get('area')}|{info.get('num_of_bedrooms')}|{info.get('num_of_bathrooms')}|{info.get('price')}\n"
                )

https://www.zameen.com/Houses_Property/Lahore-1-1.html
https://www.zameen.com/Houses_Property/Lahore-1-2.html
https://www.zameen.com/Houses_Property/Lahore-1-3.html
https://www.zameen.com/Houses_Property/Lahore-1-4.html
https://www.zameen.com/Houses_Property/Lahore-1-5.html
https://www.zameen.com/Houses_Property/Lahore-1-6.html
https://www.zameen.com/Houses_Property/Lahore-1-7.html
https://www.zameen.com/Houses_Property/Lahore-1-8.html
https://www.zameen.com/Houses_Property/Lahore-1-9.html
https://www.zameen.com/Houses_Property/Lahore-1-10.html
https://www.zameen.com/Houses_Property/Lahore-1-11.html
https://www.zameen.com/Houses_Property/Lahore-1-12.html
https://www.zameen.com/Houses_Property/Lahore-1-13.html
https://www.zameen.com/Houses_Property/Lahore-1-14.html
https://www.zameen.com/Houses_Property/Lahore-1-15.html
https://www.zameen.com/Houses_Property/Lahore-1-16.html
https://www.zameen.com/Houses_Property/Lahore-1-17.html
https://www.zameen.com/Houses_Property/Lahore-1-18.html
h

In [9]:
# cities = [
#         {'id':1, 'name':'Lahore'},
#         {'id':2, 'name':'Karachi'}]

# for city in cities:
#     print(f"{city.get('name'), city.get('id')}")