In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time
import re

# Matches the "!8m2!3d<lat>!4d<lng>" segment of a Google Maps place URL.
# The fractional part is optional so whole-number coordinates also match.
_LAT_LONG_PATTERN = re.compile(r'!8m2!3d(-?\d+(?:\.\d+)?)!4d(-?\d+(?:\.\d+)?)')


def extract_lat_long_from_url(url):
    """Extract latitude and longitude from a Google Maps place URL.

    Args:
        url: The place URL (e.g. the ``href`` of a result link). Anything
            that is not a string, such as ``None`` for a missing ``href``,
            is treated as "no coordinates".

    Returns:
        A ``(latitude, longitude)`` tuple of floats, or ``(None, None)``
        when the URL does not contain a ``!8m2!3d...!4d...`` segment.
    """
    if not isinstance(url, str):
        return None, None

    match = _LAT_LONG_PATTERN.search(url)
    if match is None:
        return None, None

    # The captured groups are guaranteed to be valid float literals.
    return float(match.group(1)), float(match.group(2))

# A number with comma thousands separators ("1,234") or a plain digit run.
# The lookahead keeps a malformed group such as "1,2345" from being merged.
_REVIEWS_COUNT_PATTERN = re.compile(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+')


def clean_reviews_count(reviews_count):
    """Convert a scraped reviews-count label into an integer.

    Args:
        reviews_count: Raw text such as ``"(56)"`` or ``"(1,234)"``.
            ``None`` is accepted and counts as zero.

    Returns:
        The first number found in the text as an ``int`` (comma thousands
        separators removed), or ``0`` when the text contains no digits.
    """
    if reviews_count is None:
        return 0

    match = _REVIEWS_COUNT_PATTERN.search(str(reviews_count))
    if match is None:
        return 0

    # Drop the separators so "1,234" is read as 1234 rather than 1.
    return int(match.group().replace(",", ""))

# Function to clean star rating
def clean_star_rating(star_rating):
    """Convert a scraped star-rating label into a float.

    Args:
        star_rating: Raw rating text such as ``"4.5"``, or the
            ``"No Rating"`` placeholder. ``None`` is also accepted.

    Returns:
        The rating as a ``float``, or ``None`` when the value is missing,
        is the ``"No Rating"`` placeholder, or cannot be parsed as a number.
    """
    if star_rating is None or star_rating == "No Rating":
        return None

    if isinstance(star_rating, str):
        # Some locales render the rating with a decimal comma ("4,5").
        star_rating = star_rating.strip().replace(",", ".")

    try:
        return float(star_rating)
    except (TypeError, ValueError):
        return None

# Function to scrape Google Maps
def scrape_google_maps(query, max_places=300, max_idle_rounds=5):
    """Search Google Maps for *query* and collect the listed places.

    A headless Chrome session opens Google Maps, submits the query and then
    repeatedly reads the result cards and sends PAGE_DOWN to load more.

    Args:
        query: Free-text search string typed into the Maps search box.
        max_places: Stop once this many distinct places were collected.
        max_idle_rounds: Stop after this many consecutive scroll rounds that
            produced no new place, so the loop also ends when the search has
            fewer than ``max_places`` results or no results at all.

    Returns:
        A pandas DataFrame with the columns ``Name``, ``Rating`` (float or
        None), ``Reviews Count`` (int), ``Location`` (a
        ``(latitude, longitude)`` tuple) and ``URL``. It is empty when
        nothing could be scraped.
    """
    # Set up the Chrome driver with options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run Chrome in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)

    # Collected columns
    names = []
    ratings = []
    reviews_counts = []
    locations = []
    urls = []

    # Every round re-reads all cards currently on the page, so remember what
    # was already collected to avoid storing the same place repeatedly.
    seen_places = set()
    idle_rounds = 0

    # try/finally guarantees the browser is closed even if a step fails.
    try:
        # Open Google Maps
        driver.get("https://www.google.com/maps")
        wait = WebDriverWait(driver, 10)

        # Search for the query
        search_box = wait.until(
            EC.presence_of_element_located((By.ID, "searchboxinput"))
        )
        search_box.send_keys(query)
        search_box.send_keys(Keys.ENTER)

        # Scroll and extract data
        while len(names) < max_places and idle_rounds < max_idle_rounds:
            time.sleep(3)  # Wait for new data to load

            # Find all the places on the current page
            # (presumably 'Nv2PK' is the result-card class; verify if Maps changes)
            places = driver.find_elements(By.CLASS_NAME, 'Nv2PK')
            added_this_round = 0

            for place in places:
                if len(names) >= max_places:
                    break

                try:
                    # Name
                    name = place.find_element(By.CLASS_NAME, 'qBF1Pd').text
                    # Location link
                    url_element = place.find_element(By.TAG_NAME, 'a')
                    url = url_element.get_attribute('href')

                    # Skip cards that were already collected in an earlier round.
                    place_key = url or name
                    if place_key in seen_places:
                        continue

                    # Rating
                    rating_element = place.find_elements(By.CLASS_NAME, 'MW4etd')
                    rating = rating_element[0].text if rating_element else "No Rating"
                    # Reviews count
                    reviews_count_element = place.find_elements(By.CLASS_NAME, 'UY7F9')
                    reviews_count = (
                        reviews_count_element[0].text if reviews_count_element else "0"
                    )

                    print(f"Extracted URL: {url}")  # Debugging line
                    latitude, longitude = extract_lat_long_from_url(url)
                    print(f"Extracted Coordinates: {latitude}, {longitude}")  # Debugging line
                except Exception as e:
                    # Best effort: one unreadable card must not abort the run.
                    print(f"Error extracting place details: {e}")
                    continue

                # Append to lists
                seen_places.add(place_key)
                names.append(name)
                ratings.append(rating)
                reviews_counts.append(reviews_count)
                locations.append((latitude, longitude))
                urls.append(url)
                added_this_round += 1

            idle_rounds = 0 if added_this_round else idle_rounds + 1

            # Scroll down to load more places
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
    finally:
        driver.quit()

    # Create DataFrame and clean data
    df = pd.DataFrame({
        'Name': names,
        'Rating': ratings,
        'Reviews Count': reviews_counts,
        'Location': locations,
        'URL': urls
    })

    # Clean the data
    df['Reviews Count'] = df['Reviews Count'].apply(clean_reviews_count)
    df['Rating'] = df['Rating'].apply(clean_star_rating)

    return df

# Scrape data for New Cairo
new_cairo_restaurants = scrape_google_maps("restaurants and cafes in New Cairo")

# Save the data to an Excel file
# scrape_google_maps always returns a DataFrame, so a failed scrape shows up
# as an empty frame; checking for None alone would never reach the else branch.
if new_cairo_restaurants is not None and not new_cairo_restaurants.empty:
    new_cairo_restaurants.to_excel("New_Cairo_Restaurants.xlsx", index=False)
    print("Scraping completed and data saved to New_Cairo_Restaurants.xlsx")

    # Offer a browser download only when running inside Google Colab.
    # In Jupyter Notebook / JupyterLab the file stays in the working
    # directory; IPython.display.FileLink can be used to link to it.
    try:
        from google.colab import files
    except ImportError:
        files = None  # Not running in Colab

    if files is not None:
        files.download("New_Cairo_Restaurants.xlsx")
else:
    print("No data found. Please check the scraping function.")


Extracted URL: https://www.google.com/maps/place/Brunch+%26+Cake+Lake+View/data=!4m7!3m6!1s0x145823527cb1fc73:0x94dee58cd722bdb1!8m2!3d30.0243389!4d31.4537953!16s%2Fg%2F11sbslxcnz!19sChIJc_yxfFIjWBQRsb0i14zl3pQ?authuser=0&hl=en&rclk=1
Extracted Coordinates: 30.0243389, 31.4537953
Extracted URL: https://www.google.com/maps/place/Duchess/data=!4m7!3m6!1s0x145823b2ae49e0a7:0x4e4616743b19b401!8m2!3d30.0283217!4d31.4937452!16s%2Fg%2F11t808n_cs!19sChIJp-BJrrIjWBQRAbQZO3QWRk4?authuser=0&hl=en&rclk=1
Extracted Coordinates: 30.0283217, 31.4937452
Extracted URL: https://www.google.com/maps/place/Boulevard+O1/data=!4m7!3m6!1s0x145823020b13dd61:0xbbb6b15b443d9eed!8m2!3d30.0481922!4d31.475591!16s%2Fg%2F11kqzfcksx!19sChIJYd0TCwIjWBQR7Z49RFuxtrs?authuser=0&hl=en&rclk=1
Extracted Coordinates: 30.0481922, 31.475591
Extracted URL: https://www.google.com/maps/place/Venti+waterway/data=!4m7!3m6!1s0x145823c7e91f994f:0xf05e5c2d67c466d7!8m2!3d30.0408136!4d31.4753874!16s%2Fg%2F11t1jx7kvj!19sChIJT5kf6ccjWBQR12

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>