# Nawy Scraping Tool

In [None]:
import requests
import xml.etree.ElementTree as ET

all_links = []
# Loop through each sitemap file: properties-1.xml through properties-181.xml
for i in range(1, 182):
    # Construct the URL for the current sitemap file
    sitemap_url = f"https://www.nawy.com/sitemap/properties-{i}.xml"
    print(f"Fetching {sitemap_url}...")

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        response = requests.get(sitemap_url, timeout=10)
    except requests.exceptions.RequestException as e:
        # Network-level failure: report it and move on to the next sitemap
        print(f"Failed to retrieve {sitemap_url}, error: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve {sitemap_url}, status code: {response.status_code}")
        continue

    try:
        # Parse the raw bytes so the XML parser honours the document's own
        # encoding declaration instead of relying on requests' charset guess
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Failed to parse {sitemap_url}, error: {e}")
        continue

    # '{*}' is ElementTree's namespace wildcard: it matches <loc> in any
    # namespace, so the sitemap's default namespace need not be spelled out
    for loc_tag in root.findall('.//{*}loc'):
        loc_text = (loc_tag.text or "").strip()
        if loc_text:  # skip empty <loc/> elements rather than storing None
            all_links.append(loc_text)

# Preview the first few collected links
print("\nCollected Links:")
for link in all_links[:5]:
    print(link)


Fetching https://www.nawy.com/sitemap/properties-1.xml...
Fetching https://www.nawy.com/sitemap/properties-2.xml...
Fetching https://www.nawy.com/sitemap/properties-3.xml...
Fetching https://www.nawy.com/sitemap/properties-4.xml...
Fetching https://www.nawy.com/sitemap/properties-5.xml...
Fetching https://www.nawy.com/sitemap/properties-6.xml...
Fetching https://www.nawy.com/sitemap/properties-7.xml...
Fetching https://www.nawy.com/sitemap/properties-8.xml...
Fetching https://www.nawy.com/sitemap/properties-9.xml...
Fetching https://www.nawy.com/sitemap/properties-10.xml...
Fetching https://www.nawy.com/sitemap/properties-11.xml...
Fetching https://www.nawy.com/sitemap/properties-12.xml...
Fetching https://www.nawy.com/sitemap/properties-13.xml...
Fetching https://www.nawy.com/sitemap/properties-14.xml...
Fetching https://www.nawy.com/sitemap/properties-15.xml...
Fetching https://www.nawy.com/sitemap/properties-16.xml...
Fetching https://www.nawy.com/sitemap/properties-17.xml...
Fetchi

KeyboardInterrupt: 

In [2]:
# Number of links currently held in all_links
len(all_links)

181000

Saving the data stage

In [3]:
import pandas as pd

# Report the number of collected links before persisting them
print(len(all_links))

# Single-column frame holding every collected URL
df = pd.DataFrame({"Property_Link": all_links})

# Write the links to disk (no index column, UTF-8 encoded)
df.to_csv(
    "nawy_all_props_links_08142025.csv",
    index=False,
    encoding="utf-8",
)


181000


In [1]:
import pandas as pd

# Reload the previously saved links from disk
df = pd.read_csv("nawy_all_props_links_08142025.csv")

# Plain Python list of the URL column
all_links = df["Property_Link"].to_list()

# Sanity check: how many links were loaded
print(len(all_links))


181000


In [3]:
# Flag links that contain the site's /ar/ URL prefix.
# The vectorised .str accessor replaces the per-row lambda; regex=False treats
# the prefix as a literal string, and na=False flags a missing link as False
# instead of raising a TypeError.
df['is_arabic'] = df['Property_Link'].str.contains(
    'https://www.nawy.com/ar/', regex=False, na=False
)
df.head()


Unnamed: 0,Property_Link,is_arabic
0,https://www.nawy.com/compound/3-new-giza/prope...,False
1,https://www.nawy.com/ar/compound/3-نيو-جيزة/pr...,True
2,https://www.nawy.com/compound/3-new-giza/prope...,False
3,https://www.nawy.com/ar/compound/3-نيو-جيزة/pr...,True
4,https://www.nawy.com/compound/3-new-giza/prope...,False


In [4]:
# Keep only the rows not flagged as Arabic and renumber them from 0
new_data = df.loc[~df['is_arabic']].reset_index(drop=True)
new_data.head()

Unnamed: 0,Property_Link,is_arabic
0,https://www.nawy.com/compound/3-new-giza/prope...,False
1,https://www.nawy.com/compound/3-new-giza/prope...,False
2,https://www.nawy.com/compound/3-new-giza/prope...,False
3,https://www.nawy.com/compound/3-new-giza/prope...,False
4,https://www.nawy.com/compound/3-new-giza/prope...,False


In [5]:
# (rows, columns) of the filtered frame
new_data.shape

(90500, 2)

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
from tqdm import tqdm

# Fixed column order used when writing scraped records to CSV, so every
# appended row lines up with the same header.
columns = [
    # page / developer identity
    "url",
    "logo_url",
    "developer_url",
    "developer_name",
    # headline details
    "property_title",
    "property_type",
    "location",
    "status",
    # labelled detail fields
    "reference_no",
    "bedrooms",
    "bathrooms",
    "compound",
    "sale_type",
    "finishing",
    # free text and contact-form fields
    "description",
    "phone_placeholder",
    "country_flag_url",
    "message_box_placeholder",
    # pricing, media and extras
    "price",
    "image_urls",
    "payment_plan_raw",
    "payment_plan",
    "amenities",
    "delivery_in",
    "property_size",
]

def is_scraped(url, scraped_links_df):
    """Tell whether `url` already appears in the 'url' column of `scraped_links_df`.

    Args:
        url: The link to look up.
        scraped_links_df: DataFrame with a 'url' column of previously scraped links.

    Returns:
        True if at least one row's 'url' equals `url`, otherwise False.
    """
    matches = scraped_links_df['url'] == url
    return bool(matches.any())

def save_to_csv(data, csv_file):
    """Append one scraped record to `csv_file` as a single CSV row.

    Args:
        data: Dict of scraped fields; it is laid out using the module-level
            `columns` list.
        csv_file: Path of the CSV file to append to.

    The header row is written only when the file does not exist yet.
    """
    write_header = not os.path.exists(csv_file)
    row = pd.DataFrame([data], columns=columns)
    row.to_csv(csv_file, mode='a', header=write_header, index=False)

# Scrape property data

# Base used to resolve the developer link found on a property page.
_NAWY_BASE_URL = "https://www.nawy.com"

# Seconds to wait on any single HTTP request before giving up.
_REQUEST_TIMEOUT = 10

# Developer names already resolved while this kernel has been running, keyed by
# developer page URL, so listings that share a developer do not re-download
# the same page. Only successful lookups are stored.
_DEVELOPER_NAME_CACHE = {}


def _text_or_none(elem):
    """Return the whitespace-stripped text of a BeautifulSoup element, or None if it is missing."""
    return elem.get_text(strip=True) if elem is not None else None


def _attr_or_none(elem, attr):
    """Return elem[attr], or None when the element or the attribute is missing."""
    if elem is not None and elem.has_attr(attr):
        return elem[attr]
    return None


def _labelled_value(soup, label):
    """Return the text of the <div> sibling that follows the <div> whose string is `label`."""
    label_elem = soup.find("div", string=label)
    value_elem = label_elem.find_next_sibling("div") if label_elem is not None else None
    return _text_or_none(value_elem)


def _fetch_developer_name(developer_url):
    """Return the developer name shown on `developer_url`, or None if it cannot be read."""
    if developer_url in _DEVELOPER_NAME_CACHE:
        return _DEVELOPER_NAME_CACHE[developer_url]

    try:
        dev_resp = requests.get(developer_url, timeout=_REQUEST_TIMEOUT)
        dev_resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Best effort: a failed developer page must not discard the property itself
        print(f"[Developer page] Error fetching {developer_url} – {e}")
        return None

    dev_soup = BeautifulSoup(dev_resp.text, "html.parser")
    # The name is read from the <h1> inside .entity-name
    developer_name = _text_or_none(dev_soup.select_one(".entity-name h1"))
    if developer_name is not None:
        # Failures and empty results are not cached so a later listing can retry
        _DEVELOPER_NAME_CACHE[developer_url] = developer_name
    return developer_name


def scrape_property_data(url):
    """Download one property page and extract its fields into a dict.

    Args:
        url: Address of the property page to scrape.

    Returns:
        A dict with one entry per name in the module-level `columns` list
        (missing page elements are stored as None), or None when the property
        page itself could not be fetched.

    Side effects:
        Performs up to two HTTP GET requests (the property page and, unless
        its name is already cached, the developer page) and prints a message
        when either request fails.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    data = {"url": url}  # Store the URL as part of the data

    try:
        # Same timeout as the developer request, so a stalled connection
        # cannot hang the whole scraping loop
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url} - {e}")
        return None
    soup = BeautifulSoup(response.text, 'html.parser')

    # 1. Developer logo, developer page link and developer name
    data["logo_url"] = _attr_or_none(soup.select_one("a.logo-container img"), "src")

    developer_href = _attr_or_none(soup.select_one("a.logo-container"), "href")
    # urljoin gives the same result as plain concatenation for "/path" hrefs,
    # and also copes with absolute hrefs or hrefs without a leading slash
    data["developer_url"] = (
        urljoin(_NAWY_BASE_URL, developer_href) if developer_href is not None else None
    )
    data["developer_name"] = (
        _fetch_developer_name(data["developer_url"]) if data["developer_url"] else None
    )

    # 2. Property Title
    data["property_title"] = _text_or_none(soup.select_one(".entity-name h1"))

    # 3. Property Type (Apartment, etc.)
    data["property_type"] = _text_or_none(soup.select_one(".propertyDetails .header div"))

    # 4. Location
    data["location"] = _text_or_none(soup.select_one(".entity-location .text-2"))

    # 5. Property Status (Sold Out, Available, etc.)
    # NOTE(review): these look like build-generated class names and may change
    # when the site is redeployed — confirm the selector still matches
    data["status"] = _text_or_none(soup.select_one(".sc-bfc43a5-3.gmqTjv"))

    # 6-8, 10-11, 21. Fields rendered as a label <div> followed by a value <div>
    data["reference_no"] = _labelled_value(soup, "Reference No.")
    data["bedrooms"] = _labelled_value(soup, "Bedrooms")
    data["bathrooms"] = _labelled_value(soup, "Bathrooms")
    data["sale_type"] = _labelled_value(soup, "Sale Type")
    data["finishing"] = _labelled_value(soup, "Finishing")
    data["delivery_in"] = _labelled_value(soup, "Delivery In")

    # 9. Compound Name
    data["compound"] = _text_or_none(soup.select_one(".compound-link span"))

    # 12. Property Size: full text of the <div> that encloses the header <span>
    size_elem = soup.select_one(".propertyDetails .header span")
    size_parent = size_elem.find_parent("div") if size_elem is not None else None
    data["property_size"] = _text_or_none(size_parent)

    # 13. Description: every <p> inside the description block, one per line
    description_text = [p.get_text(strip=True) for p in soup.select(".description p")]
    data["description"] = "\n".join(description_text) if description_text else None

    # 14. Phone Number Placeholder
    data["phone_placeholder"] = _attr_or_none(
        soup.select_one('input[placeholder="Phone Number"]'), "placeholder"
    )

    # 15. Country Selector and Flag
    data["country_flag_url"] = _attr_or_none(soup.select_one(".select-country img"), "src")

    # 16. Message box placeholder; a textarea without the attribute yields None
    #     instead of raising KeyError
    data["message_box_placeholder"] = _attr_or_none(
        soup.select_one('textarea#message'), "placeholder"
    )

    # 17. Price
    data["price"] = _text_or_none(soup.select_one(".property-price-details"))

    # 18. Images Gallery URLs
    image_elements = soup.select(".gallery-container .image-container img")
    image_urls = [img["src"] for img in image_elements if img.has_attr("src")]
    data["image_urls"] = image_urls if image_urls else None

    # 19. Payment Plans (if available): raw text plus a structured dict per plan
    payment_elems = soup.select(".payments-container.property-plan")
    payment_plans_raw = [plan.get_text(strip=True) for plan in payment_elems]
    payment_plans = [
        {
            "installment_value": _text_or_none(plan.select_one(".installment-value")),
            "installment_periodicity": _text_or_none(plan.select_one(".installment-periodicity")),
            "downpayment_value": _text_or_none(plan.select_one(".downpayment-value")),
            "installment_years": _text_or_none(plan.select_one(".installment-years")),
        }
        for plan in payment_elems
    ]
    data["payment_plan_raw"] = payment_plans_raw if payment_plans_raw else None
    data["payment_plan"] = payment_plans if payment_plans else None

    # 20. Amenities, joined into one comma-separated string
    amenities = [
        amenity.get_text(strip=True)
        for amenity in soup.select(".amenities-wrapper .amenity .text-3")
    ]
    data["amenities"] = ', '.join(amenities) if amenities else None

    return data


In [13]:
# Collapse the filtered links into a set so each URL appears only once
all_links = set(new_data["Property_Link"].tolist())
len(all_links)

89876

In [None]:
import random
# Progress file: URLs listed here were scraped by an earlier run and are skipped.
scraped_links_file = 'scraped_links_08142025.csv'
property_data_file = 'property_data_08142025.csv'
scraped_links_df = pd.read_csv(scraped_links_file) if os.path.exists(scraped_links_file) else pd.DataFrame(columns=['url'])

# A set gives constant-time "already scraped?" checks; scanning the DataFrame
# column for every link made the loop quadratic in the number of links.
scraped_urls = set(scraped_links_df['url'])
newly_scraped = []  # URLs scraped successfully during this run, in order

all_scraped = []

try:
    # Scraping process
    for i, link in tqdm(enumerate(all_links), total=len(all_links)):

        # Skip links that were already scraped
        if link in scraped_urls:
            continue

        # Scrape the property data; None means the page could not be fetched,
        # so the link is left unrecorded and will be retried on the next run
        result = scrape_property_data(link)
        if result:
            all_scraped.append(result)
            save_to_csv(result, property_data_file)  # Save the data to CSV

            # Record progress by appending just this URL. This keeps the same
            # header-plus-one-URL-per-line layout without rewriting the whole
            # file after every page.
            # NOTE(review): assumes the progress file holds only the 'url' column.
            pd.DataFrame({'url': [link]}).to_csv(
                scraped_links_file,
                mode='a',
                header=not os.path.exists(scraped_links_file),
                index=False,
            )
            scraped_urls.add(link)
            newly_scraped.append(link)

        # Pause for a random 10-20 seconds at every 1000th link position to
        # avoid triggering rate limits
        if (i + 1) % 1000 == 0:
            print("Pausing for a longer duration (10-20 seconds) to avoid rate limit...")
            long_pause = random.randint(10, 20)
            time.sleep(long_pause)
finally:
    # Keep the in-memory frame in step with the file even if the run is
    # interrupted; built once here instead of concatenating inside the loop
    if newly_scraped:
        scraped_links_df = pd.concat(
            [scraped_links_df, pd.DataFrame({'url': newly_scraped})],
            ignore_index=True,
        )

print("Scraping process completed!")

In [None]:
# Preview the first few scraped records instead of dumping the entire list,
# which would flood the notebook output
all_scraped[:5]