### Install these packages before running the code

pip install selenium beautifulsoup4 ndjson requests pandas

In [114]:
import time
import json
import gzip
import ndjson
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import threading


In [121]:
class GrabFoodScraper:
    """
    Scrape restaurant listings from GrabFood for a delivery address.

    The scraper drives a Chrome session with Selenium, scrolls the listing
    page until no more content loads, parses the restaurant cards with
    BeautifulSoup, and writes the result as NDJSON plus a GZip-compressed copy.

    Attributes:
        location_url: Landing page opened at the start of a run.
        restaurants: List of parsed restaurant records (one dict per card).
        base_url: Site root, kept for building detail-page URLs.
        soup: Parsed listing page, or None until it has been extracted.
    """

    # Sample delivery addresses; ``run`` falls back to the first one.
    SAMPLE_ADDRESSES = (
        "Chong Boon Dental Surgery - Block 456 Ang Mo Kio Avenue 10, #01-1574, Singapore, 560456",
        "PT Singapore - Choa Chu Kang North 6, Singapore, 689577",
    )

    def __init__(self, location_url):
        """
        Store the start URL and open the browser session.

        Args:
            location_url: URL of the GrabFood landing page to start from.
        """
        self.location_url = location_url
        self.restaurants = []
        self.base_url = "https://food.grab.com"
        self.soup = None
        self.init_webdriver()

    def init_webdriver(self):
        """
        Initialize Selenium WebDriver with Chrome options to mimic a regular browser session and avoid detection.
        """
        # Proxy, headless and Firefox variants were tried while troubleshooting
        # CloudFront blocking; only the user-agent override is kept.
        chrome_options = Options()
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
        chrome_options.add_argument(f"user-agent={user_agent}")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 10)

    def navigate_to_location(self):
        """
        Open the location URL in a WebDriver session.
        """
        self.driver.get(self.location_url)
        # Allow time for page to load
        time.sleep(5)

    def go_to_address(self, address):
        """
        Type a delivery address into the location search bar and submit it.

        Args:
            address: Free-text address to search for.

        A timeout while waiting for the search bar is reported on stdout and
        otherwise ignored.
        """
        try:
            # Wait until the location search bar is present, then reuse the
            # element the wait hands back instead of looking it up again.
            input_address = self.wait.until(
                EC.presence_of_element_located((By.ID, "location-input"))
            )
            input_address.send_keys(address)
            # Troubleshooting to prevent cloudfront to block out IP
            time.sleep(5)
            search_button = self.driver.find_element(By.CLASS_NAME, 'submitBtn___2roqB')
            search_button.click()
        except TimeoutException:
            print("Timed out waiting for page to load")

    def reach_end_of_page(self, pause_seconds=3, max_scrolls=None):
        """
        Scroll to the bottom of the page dynamically to load all restaurant listings.

        Args:
            pause_seconds: Seconds to wait after each scroll for new content.
            max_scrolls: Optional upper bound on the number of scrolls that
                load new content; None (the default) scrolls until the page
                height stops growing.
        """
        last_height = self.driver.execute_script('return document.body.scrollHeight')
        scrolls = 0

        while max_scrolls is None or scrolls < max_scrolls:
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            # Wait for the page to load new content
            time.sleep(pause_seconds)

            new_height = self.driver.execute_script('return document.body.scrollHeight')
            if new_height == last_height:
                # End of page reached
                break
            last_height = new_height
            scrolls += 1

    def extract_restaurant_data(self):
        """
        Snapshot the current page HTML into ``self.soup`` and close the browser.

        The WebDriver session is always quit, even when reading the page fails.
        """
        try:
            page_source = self.driver.page_source
            self.soup = BeautifulSoup(page_source, 'html.parser')
            print("Yay! Soup created!")
        except TimeoutException:
            print("Timed out waiting for page to load")
        finally:
            # To quit the current webdriver session
            self.driver.quit()

    @staticmethod
    def _split_time_distance(label):
        """Split a "<time> • <distance>" label; return (None, None) if it has another shape."""
        parts = label.replace('\xa0', ' ').split('•')
        if len(parts) != 2:
            return None, None
        return parts[0].strip(), parts[1].strip()

    @staticmethod
    def _text_or_none(tag):
        """Return the text of a parsed tag, or None when the tag was not found."""
        return tag.text if tag is not None else None

    def _card_to_record(self, card):
        """Build one restaurant record from a listing card; missing elements become None."""
        num_info = card.find_all(class_='numbersChild___2qKMV')
        if num_info:
            # The last entry holds "time • distance"; a rating, when shown, comes first.
            time_taken, dist = self._split_time_distance(num_info[-1].text)
            rating = num_info[0].text if len(num_info) > 1 else None
        else:
            time_taken = dist = rating = None

        image = card.find(class_='realImage___2TyNE')
        link = card.find('a')
        href = link.get('href') if link is not None else None

        return {
            'Id': href.split('/')[-1].strip(' ?') if href else None,
            'Name': self._text_or_none(card.find(class_='name___2epcT')),
            'Cuisine': self._text_or_none(card.find(class_='cuisine___T2tCh')),
            'Time Taken': time_taken,
            'Distance': dist,
            'Restaurant Rating': rating,
            'Promo Text': self._text_or_none(card.find(class_='discountText___GQCkj')),
            'Image Link': image.get('src') if image is not None else None,
            'Has Promo': card.find(class_='promoTagHead___1bjRG') is not None,
        }

    def parse_restaurant_data(self):
        """
        Parse the restaurant data from HTML and store it in structured format.

        Appends one dict per restaurant card to ``self.restaurants``.

        Raises:
            RuntimeError: If no page has been extracted yet.
        """
        soup = getattr(self, 'soup', None)
        if soup is None:
            raise RuntimeError("No page HTML available; call extract_restaurant_data() first")

        cards = soup.find_all(class_='RestaurantListCol___1FZ8V')
        self.restaurants.extend(self._card_to_record(card) for card in cards)

    def get_lat_lon(self, restaurant_id):
        """
        Placeholder: return a fixed (latitude, longitude) pair of strings.

        Coordinates were not found on the detail pages; if they turn up there,
        fetch ``base_url`` + the restaurant href and parse them from that page.
        ``restaurant_id`` is currently unused.
        """
        return "1.3521", "103.8198"

    def save_to_ndjson(self, filename):
        """
        Save extracted data to NDJSON format.

        Each record is written as one JSON object per line, UTF-8 encoded.

        Args:
            filename: Path of the NDJSON file to create or overwrite.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            for restaurant in self.restaurants:
                # Serialize exactly once: one JSON object per line.
                f.write(json.dumps(restaurant, ensure_ascii=False))
                f.write('\n')

    def compress_file(self, filename):
        """
        Compress the NDJSON file into GZip format.

        Args:
            filename: Path of the file to compress; output is ``filename + '.gz'``.
        """
        with open(filename, 'rb') as f_in:
            with gzip.open(filename + '.gz', 'wb') as f_out:
                f_out.writelines(f_in)

    def run(self, address=None):
        """
        Execute the scraper: setting up, navigating, data extraction, parsing, saving, and compressing.

        Args:
            address: Delivery address to search for; defaults to the first
                entry of ``SAMPLE_ADDRESSES``.
        """
        if address is None:
            address = self.SAMPLE_ADDRESSES[0]

        # These steps share one WebDriver session and depend on each other
        # (the search bar only exists once the page has loaded), so they run
        # in order, not on separate threads.
        self.navigate_to_location()
        self.go_to_address(address)
        time.sleep(5)
        self.reach_end_of_page()
        self.extract_restaurant_data()
        self.parse_restaurant_data()
        self.save_to_ndjson("restaurants.ndjson")
        self.compress_file("restaurants.ndjson")


In [124]:
# Start from the GrabFood Singapore landing page and run the whole pipeline.
landing_page = "https://food.grab.com/sg/en/"
scraper = GrabFoodScraper(landing_page)
scraper.run()

Yay! Soup created!


#### How many restaurants were fetched

In [125]:

# Number of restaurant records collected by the scraper run above.
len(scraper.restaurants)

277

#### Debug code acting like a playground

In [104]:
def parse_time_distance(string):
    """
    Split a "<time> • <distance>" label into its two parts.

    Non-breaking spaces are treated as ordinary spaces and each part is
    stripped of surrounding whitespace.

    Returns:
        A ``(time, distance)`` tuple of strings, or ``(None, None)`` when the
        label does not contain exactly one "•" separator.
    """
    pieces = [piece.strip() for piece in string.replace('\xa0', ' ').split('•')]
    if len(pieces) != 2:
        return None, None
    eta, distance = pieces
    return eta, distance

# Playground: parse the first few restaurant cards by hand, then dump them to
# CSV and to a gzip-compressed NDJSON file to eyeball the output format.

# pandas is only needed by this cell (for the CSV dump below).
import pandas as pd

base_url = "https://food.grab.com"
yo = []

# NOTE(review): `restaurants` is not defined in this cell; presumably it is the
# list of restaurant card tags from an earlier soup.find_all(...) call —
# confirm it exists before running.
for restaurant in restaurants[:4]:
    name = restaurant.find(class_='name___2epcT').text
    cuisine = restaurant.find(class_='cuisine___T2tCh').text
    num_info = restaurant.find_all(class_='numbersChild___2qKMV')
    # The last entry holds "time • distance"; a rating, when shown, comes first.
    time_taken, dist = parse_time_distance(num_info[-1].text)
    rating = None if len(num_info)==1 else num_info[0].text
    discount_text = [discount.text for discount in restaurant.find_all(class_='discountText___GQCkj')]

    # Can't find any notice on any restaurants
    # notice = restaurant.find(class_='TBD')
    image_link = restaurant.find(class_='realImage___2TyNE')['src']
    has_promo = len(restaurant.find_all(class_='promoTagHead___1bjRG'))>0
    restaurant_href = restaurant.find('a')['href']
    restaurant_id = restaurant_href.split('/')[-1].strip(' ?')

    # Longitude and Latitude aren't on the detail pages as far as I know. If we can find those on
    # detail page, we can make a soup of those page like this (below code) and fetch the details similarly.

    # detail_page = requests.get(base_url + restaurant_href)
    # grab_soup = BeautifulSoup(detail_page.text, 'html.parser')

    data = {
        'Id': restaurant_id,
        'Name': name,
        'Cuisine': cuisine,
        'Time Taken': time_taken,
        'Distance': dist,
        'Restaurant Rating': rating,
        'Promo Text': discount_text[0] if discount_text else None,
        'Image Link': image_link,
        'Has Promo': has_promo,
    }
    print(data)
    yo.append(data)


# headers = ['Id', 'Name', 'Cuisine', 'Delivery Time', 'Distance', 'Rating', 'Promo Text', 'Image Link', 'Has Promo']
print(yo)
restaurant_df = pd.DataFrame(yo)
restaurant_df.to_csv('lets_see.csv')
# One JSON object per line, newline-terminated.
ndjson_str = '\n'.join(json.dumps(record) for record in yo) + '\n'

# Compress the ndjson string
with gzip.open('restaurants.ndjson.gz', 'wt', encoding='utf-8') as f:
    f.write(ndjson_str)


    

{'Id': '4-C6L3ERDYFACEGE', 'Name': 'Fiesta Mexico - Three Amigos Indian and Mexican Restaurant', 'Cuisine': 'Mexican, Western, Indian', 'Time Taken': '25 mins', 'Distance': '1.3 km', 'Restaurant Rating': None, 'Promo Text': None, 'Image Link': 'https://food-cms.grab.com/compressed_webp/merchants/4-C6L3ERDYFACEGE/hero/95d909e4cd014c8e95cb9fa951db6da5_1715937184937605107.webp', 'Has Promo': False}
{'Id': 'SGDD04944', 'Name': "McDonald's - Boat Quay", 'Cuisine': 'Burger, Fast Food, Halal', 'Time Taken': '20 mins', 'Distance': '1.5 km', 'Restaurant Rating': '4.2', 'Promo Text': 'S$0.20 off on selected items', 'Image Link': 'https://food-cms.grab.com/compressed_webp/merchants/SGDD04944/hero/d2447ac38a2541769b115b2a1d25291d_1677816675499200090.webp', 'Has Promo': True}
{'Id': '4-C6K3LLJBMFKZCT', 'Name': 'Burger King - Boat Quay', 'Cuisine': 'Burgers, Fast Food', 'Time Taken': '25 mins', 'Distance': '1.3 km', 'Restaurant Rating': '4.9', 'Promo Text': None, 'Image Link': 'https://food-cms.grab