# **Dubizzle scrapping**

In [1]:
# Used to send HTTP requests to fetch web pages from the internet (HTML content)
import requests
#Parses HTML or XML documents so you can extract and navigate page content easily
from bs4 import BeautifulSoup
#Powerful tool for storing, processing, and exporting data in tabular form (e.g., CSV or Excel)
import pandas as pd
#Supports regular expressions for pattern matching inside strings (e.g., extracting numbers, units)
import re  
#Provides sleep functionality to pause the script (helps avoid getting blocked during scraping)
import time 
# Built-in module to handle JSON data: reading from APIs or saving data to .json files
import json

In [2]:
base_url = 'https://www.dubizzle.com.om'
cadr_url = f"{base_url}/en/properties/properties-for-sale/"
current_link = base_url

In [3]:
r = requests.get(base_url) # will send a req for get and will save the res into a variable called 'r'
print(r)

<Response [200]>


In [4]:
print(f'Without BeautifulSoup: {r.content}') # This will print unreadable data and we cant deal with it.

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [5]:
# Here the power of BeautifulSoup comes to convert the 'r.content'to readable data.
soup = BeautifulSoup(r.content, 'html') # here what happend is we called a function 'BeautifulSoup' to parse the unreadable 'r.content' to the readable and specify what we want to parse in our setuation is html
print(soup)
# Now we can identify the element what we want to fetch by their classes, id's, elements

<!DOCTYPE html>
<html dir="rtl" itemscope="" itemtype="http://schema.org/WebPage" lang="ar"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1.0, user-scalable=0" name="viewport"/><link href="https://ll8iz711cs-dsn.algolia.net" rel="dns-prefetch"/><link href="https://www.googletagmanager.com" rel="dns-prefetch"/><link href="https://www.google-analytics.com" rel="dns-prefetch"/><link href="https://images.bayut.com" rel="dns-prefetch"/><link href="/assets/apple-touch-icon.318a683f5e331c46c23eb3742a9f3d49.png" rel="apple-touch-icon" sizes="180x180"/><link href="/assets/favicon-16x16.771c69f9ab365d2b39ca63a11a5edc57.png" rel="icon" sizes="16x16" type="image/png"/><link href="/assets/favicon-32x32.db52812416fd3c125e3502aabf54bacf.png" rel="icon" sizes="32x32" type="image/png"/><link crossorigin="use-credentials" href="/assets/413f1a7b03c790e9715b21d365d5283a.json" rel="manifest"/><link color="#28b16d" href="/assets/safari-pinned-tab.4f01bd49d15cf8e03fa01e92f1c03

In [12]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

# Dictionary to hold scraped data
dubizzle_data = {
    'property_name': [],
    'sale_price': [],
    'prop_location': [],
    'prop_area': [],
    'prop_bathrooms': [],
    'number_of_rooms': []
}

# Safely extract text from an element
def extract_text(element, default="not mentioned"):
    return element.get_text(strip=True) if element else default

# Extract relevant data from each property card
def extract_card_data(card):
    title = extract_text(card.find('h2', class_='_562a2db2'))
    price = extract_text(card.find('span', class_='ddc1b288'), "price not mentioned")
    location = extract_text(card.find('span', class_='f7d5e47e'), "no location mentioned")

    def extract_feature(label):
        section = card.find('span', attrs={'aria-label': label})
        value = section.find('span', class_='_3e1113f0') if section else None
        return extract_text(value)

    return {
        'title': title,
        'price': price,
        'location': location,
        'area': extract_feature("Area"),
        'baths': extract_feature("Bathrooms"),
        'rooms': extract_feature("Beds")  # Changed 'beds' to 'rooms'
    }

# Scrape one page and collect data
def scrape_page(session, url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }
    res = session.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'html.parser')
    listings = soup.find_all('li', attrs={'aria-label': 'Listing'})
    print(f"Found {len(listings)} listings on {url}")

    for card in listings:
        data = extract_card_data(card)
        dubizzle_data['property_name'].append(data.get('title', 'not mentioned'))
        dubizzle_data['sale_price'].append(data.get('price', 'not mentioned'))
        dubizzle_data['prop_location'].append(data.get('location', 'not mentioned'))
        dubizzle_data['prop_area'].append(data.get('area', 'not mentioned'))
        dubizzle_data['prop_bathrooms'].append(data.get('baths', 'not mentioned'))
        dubizzle_data['number_of_rooms'].append(data.get('rooms', 'not mentioned'))

# Main loop to scrape all pages
with requests.Session() as session:
    base_url = "https://www.dubizzle.com.om/en/properties/properties-for-sale/"
    for page_num in range(1, 105):  # Adjust page range as needed
        if page_num == 1:
            url = base_url
        else:
            url = f"{base_url}?page={page_num}"
        print(f"Scraping page {page_num}: {url}")
        scrape_page(session, url)
        time.sleep(15)  # polite delay to avoid blocking

print("Sale property scraping complete!")

# Export to CSV
df = pd.DataFrame(dubizzle_data)
df.to_csv("dubizzle_oman_sale_properties.csv", index=False, encoding='utf-8-sig')
print("Data saved to 'dubizzle_oman_sale_properties.csv'")


Scraping page 1: https://www.dubizzle.com.om/en/properties/properties-for-sale/
Found 45 listings on https://www.dubizzle.com.om/en/properties/properties-for-sale/
Scraping page 2: https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=2
Found 45 listings on https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=2
Scraping page 3: https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=3
Found 45 listings on https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=3
Scraping page 4: https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=4
Found 45 listings on https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=4
Scraping page 5: https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=5
Found 45 listings on https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=5
Scraping page 6: https://www.dubizzle.com.om/en/properties/properties-for-sale/?page=6
Found 45 listings on https://www.dubi

In [15]:
df

Unnamed: 0,property_name,sale_price,prop_location,prop_area,prop_bathrooms,number_of_rooms
0,Furnished 3-Bedroom Lakefront Villa for Sale i...,"OMR 249,000","Salalah, Dhofar•",573 SQM,3,3
1,Luxury Apartment|Mouj Muscat |Freehold Ownersh...,"OMR 125,000","The Wave (Almouj), Muscat•",110 SQM,2,1
2,5 BR Spacious Villa in Azaiba Nearby Amenities,"OMR 197,000","Azaiba, Muscat•",600 SQM,4,5
3,Two Bedroom Seaview Apartment at Mandarin Orie...,"OMR 405,000","Qurum, Muscat•",131 SQM,3,2
4,Villa for Sale (Garden) - Sea View,"OMR 517,000","Yiti, Muscat•",506 SQM,6,4
...,...,...,...,...,...,...
4656,شقة مفروشة مميزة للبيع في الخوض السابعة,"OMR 38,900","Al Khoud, Muscat•",76 SQM,2,2
4657,أرض الطيب موقع جميل,"OMR 2,200","Ibri, Al Dhahirah•",600 SQM,not mentioned,not mentioned
4658,all types of interlock avaialble watsapp me 95...,"OMR 2,000","Al Rusayl, Muscat•",10 SQM,not mentioned,not mentioned
4659,فرصة ذهبية بمخطط الوطن المقابل للمعبيلة 8 بكاف...,"OMR 31,000","Barka, Al Batinah•",600 SQM,not mentioned,not mentioned
