In [None]:
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Browser-like headers sent with every HTTP request in this script.
# NOTE(review): the original note said these "bypass 300 status errors" --
# presumably the site rejects or redirects requests that use the default
# client User-Agent; confirm against the live site.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "https://www.google.com/"
}


# Site root; edit to target a different site. Note that the value ends with
# "/" while START_PATH begins with "/", so plain string concatenation of the
# two yields a double slash in the resulting URL.
BASE_URL = "https://www.saudiayp.com/" # change here
# Path of the first listing page of the category to scrape; edit to target a
# different category.
START_PATH = "/category/estate_agents" # change here

def get_profile_details(profile_url, timeout=15):
    """Return the website text shown on a company profile page.

    Downloads ``profile_url`` and reads the text of the first ``<a>`` inside
    the ``<div class="text weblinks">`` element.

    This is a best-effort lookup: any failure is reported on stdout and
    turned into ``None`` so that one bad profile does not abort a whole run.

    Args:
        profile_url: Absolute URL of the profile page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped anchor text, or ``None`` when the request fails, the
        server answers with an error status, or the page has no such link.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        res = requests.get(profile_url, headers=HEADERS, timeout=timeout)
        print(res)
        # Treat 4xx/5xx as a failure instead of parsing an error page.
        res.raise_for_status()

        soup = BeautifulSoup(res.content, "html.parser")
        web_block = soup.find("div", class_="text weblinks")
        if web_block is None:
            return None

        link_tag = web_block.find("a")
        if link_tag is None:
            return None

        # An anchor without visible text yields ""; report that as "no link".
        return link_tag.get_text(strip=True) or None

    except Exception as e:
        print(f"Error fetching profile: {e}")
        return None



def _text_or_none(tag):
    """Return the stripped text of *tag*, or None when the tag is missing."""
    return tag.get_text(strip=True) if tag is not None else None


def _parse_contact_info(block):
    """Return ``(phone, established)`` read from the icon rows of a listing block."""
    phone = None
    established = None
    for info in block.find_all("div", class_="s"):
        icon = info.find("i")
        span = info.find("span")
        if icon is None or span is None:
            continue
        # The icon's aria-label tells which kind of value the row carries.
        label = icon.get("aria-label")
        if label == "Phone":
            phone = span.get_text(strip=True)
        elif label == "Calendar":
            established = span.get_text(strip=True).replace("Established", "").strip()
    return phone, established


def _parse_agent_block(block):
    """Build the agent record for one listing block, including its profile lookup."""
    name_div = block.find("div", class_="company_header")
    name_tag = name_div.find("a") if name_div is not None else None
    name = _text_or_none(name_tag)

    profile_link = None
    if name_tag is not None and name_tag.has_attr("href"):
        # urljoin copes with BASE_URL ending in "/" and the href starting
        # with "/" (plain concatenation produced "//"), and with absolute hrefs.
        profile_link = urljoin(BASE_URL, name_tag["href"])

    phone, established = _parse_contact_info(block)

    verified_tag = block.find("u", class_="v")
    is_verified = (
        verified_tag is not None
        and "Verified" in verified_tag.get_text(strip=True)
    )

    # The company website is only available on the profile page.
    website = None
    if profile_link:
        website = get_profile_details(profile_link)
        time.sleep(0.5)  # Be polite to the server between profile requests

    return {
        "Name": name,
        "Profile Link": profile_link,
        "Website": website,
        "Address": _text_or_none(block.find("div", class_="address")),
        "Description": _text_or_none(block.find("div", class_="tagline")),
        "Phone": phone,
        "Established": established,
        "Rating": _text_or_none(block.find("div", class_="rate")),
        "Verified": is_verified,
    }


def get_page_data(url, block_class="company with_img g_0", timeout=15):
    """Scrape one listing page and return a record for each company on it.

    Args:
        url: Absolute URL of the listing page.
        block_class: Value of the ``class`` attribute identifying a company
            block. The default keeps only listings with an image; pass
            ``"company g_0"`` to collect the other listings instead.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        A list of dicts with the keys ``Name``, ``Profile Link``, ``Website``,
        ``Address``, ``Description``, ``Phone``, ``Established``, ``Rating``
        and ``Verified``. Fields that are missing from the page are ``None``
        (``Verified`` is ``False``). The list is empty when the server
        answers with an error status.

    Raises:
        requests.RequestException: If the listing page cannot be fetched
            (connection failure, timeout, ...).
    """
    page = requests.get(url, headers=HEADERS, timeout=timeout)
    print(page)
    if not page.ok:
        # An error page contains no listings; say so instead of silently
        # parsing it.
        print(f"Skipping {url}: HTTP {page.status_code}")
        return []

    soup = BeautifulSoup(page.content, "html.parser")
    agent_blocks = soup.find_all("div", class_=block_class)
    print(f"Found {len(agent_blocks)} listing blocks")

    return [_parse_agent_block(block) for block in agent_blocks]

# Walk the listing pages of the configured category and collect every agent.
all_agents = []
for page_num in range(1, 4):  # Scraped the first 3 pages for best quality
    # Page 1 lives at START_PATH itself; later pages append the page number.
    # The slug in START_PATH may need a different spelling on other sites or
    # categories (e.g. Estate_agents, estate-agents or Estate-agents).
    if page_num == 1:
        page_path = START_PATH
    else:
        page_path = f"{START_PATH.rstrip('/')}/{page_num}"
    # urljoin avoids the "//" that plain concatenation produces when BASE_URL
    # ends with "/" and the path starts with "/".
    url = urljoin(BASE_URL, page_path)
    print(f"Scraping page {page_num}: {url}")
    all_agents.extend(get_page_data(url))

# Save to JSON (ensure_ascii=False keeps non-ASCII text readable in the file).
with open("saudi_top_real_estate_companies.json", "w", encoding="utf-8") as f:
    json.dump(all_agents, f, ensure_ascii=False, indent=4)

print(f"Scraped {len(all_agents)} agents across pages.")
