In [17]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from http.client import responses
from bs4.element import Comment
import urllib.request
import json

In [18]:
# # ## urls list for testing

# 1. https://www.bayut.com/buildings/binghatti-apartments/
# 2. https://www.bayut.com/buildings/palladium-tower/?gclid=EAIaIQobChMIobnRhfyVggMVpp-DBx1Now2SEAAYASAAEgLHcfD_BwE
# 3. https://www.bayut.com/buildings/arno-a/?gad_source=1&gbraid=0AAAAADtWrTc4lLYWsy1OB9wwfzD8_CJF1&gclid=Cj0KCQjwqP2pBhDMARIsAJQ0Czo72QgLRCVBUneSwivWJEI07ZpmJKf_kmRBjNUbDO04p1mG_ED2vuUaAryuEALw_wcB

In [19]:
## Defining a function that extracts useful information from the Bayut website;

def Property_Info_Extraction(url, save_results=False, timeout=30):
    """Scrape summary information about a building from a Bayut building page.

    The result is a list of ``'Key: value'`` strings: the building name, every
    bullet of the "in a nutshell" section, and one comma-joined line each for
    nearby restaurants, supermarkets and schools.

    Parameters
    ----------
    url : str
        Address of the Bayut building page to scrape.
    save_results : bool, default False
        When True, write the key/value pairs to ``'<Name>_data.csv'`` in the
        current working directory (one ``key,value`` row per entry, no header)
        instead of returning them.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    list of str or None
        The list of ``'Key: value'`` lines when ``save_results`` is False.
        None when the results were saved to CSV, or when the server answered
        with a status other than 200 (the reason phrase is printed).

    Raises
    ------
    ValueError
        If the page has no ``<div>`` whose id starts with ``'about-'``, so the
        building name cannot be determined.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        # Report the failure and stop; there is no HTML worth parsing.
        print(responses.get(response.status_code,
                            'HTTP {}'.format(response.status_code)))
        return None

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    about_div = _find_section(soup, 'about-')
    if about_div is None:
        raise ValueError(
            "No 'about-' section found at {}; cannot determine the building name".format(url)
        )

    basic_info = ['Name: ' + about_div.text.replace('About ', '')]

    # The "in a nutshell" bullets already read as 'Key: value' lines.
    basic_info.extend(_section_items(soup, 'in-a-'))

    nearby_sections = (
        # 'rest' instead of 'restaurants-': the id on the page is misspelled
        # (see the notebook notes), so only the short prefix matches.
        ('Restaurants Nearby', 'rest'),
        ('Supermarkets Nearby', 'supermarket'),
        ('Schools Nearby', 'school'),
    )
    for label, id_prefix in nearby_sections:
        basic_info.append(label + ': ' + ", ".join(_section_items(soup, id_prefix)))

    if not save_results:
        return basic_info

    # Split each line on the FIRST ': ' only, so values that themselves
    # contain ': ' (or lines without a separator) do not break unpacking.
    data = {}
    for line in basic_info:
        key, _, value = line.partition(': ')
        data[key] = value

    # The name is scraped text; replace characters that are unsafe in file names.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', data['Name'])

    # One row per key: transpose the single-row frame before writing.
    pd.DataFrame([data]).T.to_csv('{}_data.csv'.format(safe_name), header=False)


def _find_section(soup, id_prefix):
    """Return the first <div> whose id starts with ``id_prefix``, or None."""
    return soup.find('div', id=lambda value: value and value.startswith(id_prefix))


def _section_items(soup, id_prefix):
    """Return the <li> texts of the first <ul> after the matching section div.

    Non-breaking spaces are replaced with plain spaces. An empty list is
    returned when the section or its list is missing from the page.
    """
    section = _find_section(soup, id_prefix)
    if section is None:
        return []
    listing = section.find_next('ul')
    if listing is None:
        return []
    return [li.text.replace(u'\xa0', u' ') for li in listing.find_all('li')]
    

In [20]:
## Just enter the url and run the code!
## If you come across any error, please note it so we can debug it together later on!
## With save_results=True the function writes '<Name>_data.csv' and returns nothing.

Property_Info_Extraction("https://www.bayut.com/buildings/binghatti-apartments/", save_results=True)

## NOTES:
- The reason I was getting the error `AttributeError: 'NoneType' object has no attribute 'findNext'` is that the id attribute in the webpage's HTML had a spelling error; they spelled id='restaurants-nearby' wrong! Fixed this issue by matching on the shorter id prefix 'rest' before calling 'findNext'.
- Found an APIFY app to scrape some useful data from PropertyFinder [here](https://apify.com/dhrumil/propertyfinder-scraper)
- [BrowseAI](https://www.browse.ai/?utm_source=branded&gad_source=1&gclid=Cj0KCQjwtJKqBhCaARIsAN_yS_kKOmro5osDM5y19e-_n0Zo61_DtSq5MruLo6ZP0TByV_nuhcF-TvEaAsAjEALw_wcB) could be used to scrape data from PropertyFinder, but the data on the website is presented in a very user-friendly way, which makes it not that easy to scrape using this bot.
- Tried scraping agent data from PropertyFinder but got response code 403, meaning the website has forbidden access to this data. The Bayut site gave a 200 status code!

## Steps to scrape agent data from Bayut

1. Get the agent URLs - This can be done by using BeautifulSoup to extract all URLs on the given webpage. The problem right now is that one webpage shows a maximum of 12 agents out of the total 8025, so I need to find a way to get ALL agent URLs at once. Another problem is that it does not output the full URL but only its relative HTML link (href) instead. The complete URL will then have to be created using the parent URL.
2. Extract useful information from each agent's URL - I am trying this right now, but I need to locate the information I want by some tag, and I'm working on it.

In [21]:
# Request the Dubai brokers listing page and display the HTTP status code
# to check whether Bayut serves the page to a plain `requests` client.
agent_response = requests.get("https://www.bayut.com/brokers/dubai/")
agent_response.status_code

200

In [22]:
# Parse the brokers listing HTML into a BeautifulSoup tree
# (presumably for extracting the agent links described in step 1 above).
agent_soup = BeautifulSoup(agent_response.content, "html.parser")

In [27]:
# Fetch one individual agent profile page and display its HTTP status code.
aafaq_agent = requests.get("https://www.bayut.com/brokers/aafaq-2188426.html")
aafaq_agent.status_code

200

In [28]:
# Parse the agent profile response fetched in the previous cell (`aafaq_agent`).
aafaq_agent_soup = BeautifulSoup(aafaq_agent.content, "html.parser")

In [29]:
def tag_visible(element):
    """Tell whether a text node counts as visible page text.

    A node is treated as hidden when its parent tag is one of the
    non-rendered containers listed below, or when the node itself is an
    HTML comment; every other node is visible.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    return not (element.parent.name in hidden_parents or isinstance(element, Comment))


def text_from_html(body):
    """Return the visible text of an HTML document as one space-joined string.

    Parameters
    ----------
    body : str or bytes
        Raw HTML markup to parse.

    Returns
    -------
    str
        Every text node accepted by ``tag_visible``, stripped of surrounding
        whitespace and joined with single spaces. Whitespace-only nodes are
        kept as empty strings, so runs of spaces can appear in the result.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the legacy
    # findAll(text=True); it yields every text node in the document.
    text_nodes = soup.find_all(string=True)
    return u" ".join(node.strip() for node in text_nodes if tag_visible(node))

# Download the agent profile page; the context manager closes the HTTP
# response as soon as the body has been read.
with urllib.request.urlopen('https://www.bayut.com/brokers/aafaq-2188426.html') as page:
    html = page.read()
print(text_from_html(html))

en Site settings Change Currency Change Area Unit Favourite properties View all Saved searches Log in Find my Agent Floor Plans Guides  Area Guides Building Guides School Guides Market Intelligence  TruValue™ Property Prices Dubai Transactions Trends New Projects Agent Portal Events  B3DXB 2022 Your Home Your Choice Aafaq Oxygen Property Management No reviews yet Active listings Share profile  Email Agent  Call Agent Aafaq · Oxygen Property Management Share profile  Email Agent  Call Agent purpose All location property type Any Type beds & baths Beds & Baths Price (AED) Price (AED) More Filters Showing 1 - 12 of 44 Properties sorted by Popular AED 110,000 Yearly Apartment 3 4 Area : 2,410 sqft Huge 3 BD / Duplex /City View/Khalifa Street Sheikh Khalifa Bin Zayed Street, Abu Dhabi  Call   Email  AED 82,500 Yearly Apartment 3 3 Area : 1,825 sqft Huge 3bhk /Maids room /Wardrobes /All Facilities Al Salam Street, Abu Dhabi  Call   Email  AED 41,999 Yearly Apartment Studio 1 Area : 645 sqft 

In [30]:
# Inspect the children of the first <script> tag on the agent page; the
# output below shows it holds a JSON string with schema.org agent data.
aafaq_agent_soup.find('script').contents

['{"@context":"https://schema.org","@type":"RealEstateAgent","name":"Aafaq","image":"https://bayut-production.s3.eu-central-1.amazonaws.com/image/505844090/3eb4a49987c04118bd4353850a5c2f73","url":"https://www.bayut.com/brokers/aafaq-2188426.html","address":{"@type":"PostalAddress","addressRegion":"Abu Dhabi"},"parentOrganization":{"@type":"Organization","name":"Oxygen Property Management","url":"https://www.bayut.com/companies/oxygen-property-management-102662/","address":{"@type":"PostalAddress","addressRegion":"Abu Dhabi"}},"priceRange":"41999 AED - 110000 AED","makesOffer":[{"@type":"Offer","itemOffered":{"@type":["Apartment","Product"],"name":"Huge 3 BD / Duplex /City View/Khalifa Street","image":"https://images.bayut.com/thumbnails/512139006-400x300.jpeg","url":"https://www.bayut.com/property/details-8111008.html","address":{"@type":"PostalAddress","name":"Sheikh Khalifa Bin Zayed Street Abu Dhabi"},"floorSize":{"@type":"QuantitativeValue","value":"2,410","unitText":"SQFT"},"offer

In [36]:
# Take the first child of the first <script> tag, i.e. the raw JSON string.
# NOTE(review): relies on the JSON block being the first <script> in the
# document - confirm this holds for other agent pages.
aafaq_agent_script_text = aafaq_agent_soup.select_one('script').contents[0]
aafaq_agent_script_text

'{"@context":"https://schema.org","@type":"RealEstateAgent","name":"Aafaq","image":"https://bayut-production.s3.eu-central-1.amazonaws.com/image/505844090/3eb4a49987c04118bd4353850a5c2f73","url":"https://www.bayut.com/brokers/aafaq-2188426.html","address":{"@type":"PostalAddress","addressRegion":"Abu Dhabi"},"parentOrganization":{"@type":"Organization","name":"Oxygen Property Management","url":"https://www.bayut.com/companies/oxygen-property-management-102662/","address":{"@type":"PostalAddress","addressRegion":"Abu Dhabi"}},"priceRange":"41999 AED - 110000 AED","makesOffer":[{"@type":"Offer","itemOffered":{"@type":["Apartment","Product"],"name":"Huge 3 BD / Duplex /City View/Khalifa Street","image":"https://images.bayut.com/thumbnails/512139006-400x300.jpeg","url":"https://www.bayut.com/property/details-8111008.html","address":{"@type":"PostalAddress","name":"Sheikh Khalifa Bin Zayed Street Abu Dhabi"},"floorSize":{"@type":"QuantitativeValue","value":"2,410","unitText":"SQFT"},"offers

In [37]:
# Decode the JSON string into nested Python dicts and lists.
# Note: `data` is also the name of a local dict inside Property_Info_Extraction.
data = json.loads(aafaq_agent_script_text)

In [43]:
# Display the decoded agent record (name, parent organization, offers, ...).
data

{'@context': 'https://schema.org',
 '@type': 'RealEstateAgent',
 'name': 'Aafaq',
 'image': 'https://bayut-production.s3.eu-central-1.amazonaws.com/image/505844090/3eb4a49987c04118bd4353850a5c2f73',
 'url': 'https://www.bayut.com/brokers/aafaq-2188426.html',
 'address': {'@type': 'PostalAddress', 'addressRegion': 'Abu Dhabi'},
 'parentOrganization': {'@type': 'Organization',
  'name': 'Oxygen Property Management',
  'url': 'https://www.bayut.com/companies/oxygen-property-management-102662/',
  'address': {'@type': 'PostalAddress', 'addressRegion': 'Abu Dhabi'}},
 'priceRange': '41999 AED - 110000 AED',
 'makesOffer': [{'@type': 'Offer',
   'itemOffered': {'@type': ['Apartment', 'Product'],
    'name': 'Huge 3 BD / Duplex /City View/Khalifa Street',
    'image': 'https://images.bayut.com/thumbnails/512139006-400x300.jpeg',
    'url': 'https://www.bayut.com/property/details-8111008.html',
    'address': {'@type': 'PostalAddress',
     'name': 'Sheikh Khalifa Bin Zayed Street Abu Dhabi'},

In [None]:
# Load the 'Tower (Residential)' sheet of the master workbook. Because
# sheet_name is a list, read_excel returns a dict keyed by sheet name
# rather than a single DataFrame.
# NOTE(review): the path is relative to the notebook's working directory and
# points outside the project folder - confirm it exists where this is run.
master_data = pd.read_excel("../Downloads/Master Data Sheet.xlsx", sheet_name=['Tower (Residential)'])

In [None]:
# Show every value of the 'Gps Location 1' column from the residential
# tower sheet as an array (displays the whole column, not a preview).
master_data['Tower (Residential)']['Gps Location 1'].values