# Working with Web Scraping

In [1]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup # parses the fetched HTML for web scraping
import unicodedata # Unicode (NFKD) normalization of text scraped from the HTML

In [2]:
# A 403 (Forbidden) response means the server refused the request. Sites commonly
# do this when they identify the client as an automated bot, e.g. from the
# default User-Agent that `requests` sends.
# The timeout keeps this cell from hanging indefinitely if the server never answers.
requests.get(
    'https://www.glassdoor.com/Reviews/index.htm?overall_rating_low=3.5&page=1&filterType=RATING_OVERALL',
    timeout=10,
)

<Response [403]>

In [3]:
# To avoid being rejected as a bot, send a browser-like User-Agent header with the request.
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
# The timeout prevents the cell from hanging forever on an unresponsive server.
response = requests.get('https://www.businesslist.pk/category/estate-agents', headers=headers, timeout=10)
# Fail loudly on an HTTP error (e.g. 403) instead of silently parsing an error page later.
response.raise_for_status()
webpage = response.text

In [4]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(markup=webpage, features='lxml')

In [5]:
# Text of the first <h4> heading on the page.
first_heading = soup.find_all('h4')[0]
first_heading.text

'Z.M Enterprises'

In [6]:
# How many <h4> headings the page contains.
h4_headings = soup.find_all('h4')
len(h4_headings)

30

In [7]:
# Print the text of every <h4> heading, one per line.
for heading in soup.find_all('h4'):
    print(heading.get_text())

Z.M Enterprises
Arcon Associates
Wadaan Enterprises
Niazi Builders
Nasar Estate
Rahman Property Dealer Timergara
Royal Business Solutions (RBS)
Sigma Properties and Marketing Pvt Ltd
The Palm Residential Apartments
Sarfaraz Hamid Properties
Arish Associates
Gillani Estate & Builders
Nouman Estate & Builders
Chanab Estate
Lodhi Builders & Estate
Mateen International
Al-Hussain Properties
Al-Hassan Estate , One of the Best Estate Real Estate in North Nazimabad, DHA karachi
BOUTIQUE HOMES ISLAMABAD
ilaan.com
Ismail Estate
Saddam Estates
Saif Group
Ali Builders & Developers
RightDeed.com | Pakistan Largest Property Portal
Sadaf Estate
Sky Marketing
Dha Realtors
Green World Real Estate
Gulistan Colony Nowshera-Mardan Road, Risalpur


In [8]:
# Select the listing containers by their exact class string.
# NOTE(review): the page has 30 <h4> headings but this selects only 27 blocks, so
# some listings (presumably ones whose class attribute differs) are skipped --
# confirm whether that is intended.
company = soup.find_all('div', attrs={'class': 'company with_img g_0'})

In [9]:
# Number of listing blocks matched by the selector above.
len(company)

27

In [12]:
# Extract the fields of every listing on the first page into parallel lists
# (one entry per listing, same order as `company`).
comp_name = []
address = []
details = []
verified = []
listed = []
for listing in company:
    comp_name.append(listing.find('h4').text.strip())
    address.append(listing.find('div', class_ = "address").text.strip())
    details.append(listing.find('div', class_ = "details").text.strip())

    # Both badges are looked up inside the "cont" block, so find it once.
    cont = listing.find('div', class_ = "cont")

    # Verification badge: without this guard a listing that has no badge would
    # raise AttributeError on `None.text`; such listings are labelled 'Not Verified'.
    badge = cont.find('u', class_ = "v")
    if badge is not None:
        verified.append(unicodedata.normalize('NFKD', badge.text.strip()))
    else:
        verified.append('Not Verified')

    # "Years with us" badge: may be absent, in which case the listing is
    # labelled 'New Company'.
    badge = cont.find('u', class_ = "v v4")
    if badge is not None:
        listed.append(unicodedata.normalize('NFKD', badge.text.strip()))
    else:
        listed.append('New Company')

In [16]:
# Assemble the per-listing lists into a DataFrame, one column per field,
# then show its (rows, columns) shape.
data = pd.DataFrame({
    'Name': comp_name,
    'Address': address,
    'Details': details,
    'Verification': verified,
    'Listed_Years': listed,
})
data.shape

(27, 5)

In [34]:
# Scrape result pages 1-10 of the category and combine them into one DataFrame.

# Browser-like User-Agent; identical for every request, so it is built once.
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}


def _badge_text(container, css_class, default):
    """Return the stripped, NFKD-normalized text of the <u> badge with `css_class`.

    `default` is returned when `container` holds no such badge.
    """
    badge = container.find('u', class_ = css_class)
    if badge is None:
        return default
    return unicodedata.normalize('NFKD', badge.text.strip())


def _scrape_page(page_number):
    """Fetch one category page and return its listings as a DataFrame.

    Raises a `requests` exception if the request times out or the server
    answers with an HTTP error status, instead of parsing an error page.
    """
    url = 'https://www.businesslist.pk/category/estate-agents/{}'.format(page_number)
    response = requests.get(url, headers = headers, timeout = 10)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.text, 'lxml')

    records = []
    for listing in page_soup.find_all('div', class_ = 'company with_img g_0'):
        # Both badges are looked up inside the "cont" block, so find it once.
        cont = listing.find('div', class_ = "cont")
        records.append({
            'Name' : listing.find('h4').text.strip(),
            'Address' : listing.find('div', class_ = "address").text.strip(),
            'Details' : listing.find('div', class_ = "details").text.strip(),
            'Verification' : _badge_text(cont, "v", 'Not Verified'),
            'Listed_Years' : _badge_text(cont, "v v4", 'New Company'),
        })
    # Explicit columns keep the frame's layout stable even for a page without listings.
    return pd.DataFrame(records, columns = ['Name', 'Address', 'Details', 'Verification', 'Listed_Years'])


# One DataFrame per page.
final = [_scrape_page(page_number) for page_number in range(1, 11)]

# Concatenate once, after all pages are collected (not on every iteration).
# Per-page indexes are kept as-is, so row labels restart at 0 on every page.
data = pd.concat(final)

In [35]:
# Display the combined table built from all scraped pages.
data

Unnamed: 0,Name,Address,Details,Verification,Listed_Years
0,Z.M Enterprises,"DHA phase 7 extension Karachi, Karachi, Sindh",Z.M Enterprises is a real estate and general t...,Verified,+4 Years with us
1,Arcon Associates,"Office No 1, First Floor Nehal Plaza KBHS Soci...",Arcon Associates Pakistan is a versatile firm ...,Verified,+7 Years with us
2,Wadaan Enterprises,Plot 29 C Lane 6 Ittehad Commercial DHA Karach...,"We are Dealing in all sort of properties Sale,...",Verified,+8 Years with us
3,Nasar Estate,"335-B, Iqbal Avenue Khyaban-e-jinah Road Near ...",We are real estate compny in Lahore establishe...,Verified,+7 Years with us
4,Rahman Property Dealer Timergara,1st FLOOR LAST BLOCK ZEB CITY CENTER TIMERGARA...,RAHMAN PROPERTY DEALER & MOTER BARGAING CENTER...,Verified,+6 Years with us
...,...,...,...,...,...
2,Cheap Plots in DHA Lahore - Great Real Estate,"2nd floor, 58-K Commercial Phase 1 DHA, Lahore...",Great Real Estate is working as a real estate ...,Verified,+4 Years with us
3,Nasar Estate,"Wapda Town Lahore, Lahore, Punjab",Nasar Estate is real estate base company estab...,Verified,+6 Years with us
4,Two Roads Abode,2024 N 1100 East Road,Two Roads Abode is a vacation rental home conv...,Verified,+8 Years with us
5,Chukhat.com,"Gulberg lahore, Lahore, Punjab",Chukhat.com by Bonjour Media (LTD) is the Paki...,Verified,+6 Years with us


In [36]:
# (rows, columns) of the combined table.
data.shape

(228, 5)

In [37]:
# Write the combined table to CSV. The row labels restart at 0 for every scraped
# page, so they carry no information and are left out rather than being written
# as an unnamed first column.
data.to_csv('pak-real-estate.csv', index=False)