In [1]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import re
from tqdm.notebook import tqdm, trange
from IPython.display import clear_output

import numpy as np

In [36]:
class TripAdvisorScrapper():
    restaurants_data = {}
    def __init__(self, city, show=False):
        self.PATH = 'chromedriver.exe'
        self.options = Options()
        if not show:
            self.options.add_argument("--headless")
        self.city = city
        # List for storing links to restaurants data
        self.restaurants_hrefs = []
        # Dict for storing dicts with restaurants information
        self.restaurants_data = []
    
    
    def _verbose(message):
        def decorator(func):
            def wrapper(*args, **kwargs):
                print(message)
                clear_output(wait=True)
                return func(*args, **kwargs) 
            return wrapper
        return decorator


    @_verbose(message = 'starting driver...')
    def start_driver(self):
        self.driver = webdriver.Chrome(options=self.options)
        self.driver.set_window_size(600, 800)
        
        
    @_verbose(message = 'opening TripAdvisor...')
    def open_tripadvisor(self):
        self.driver.get('https://www.tripadvisor.com/')
        
        
    @_verbose(message = 'accepting cookies...')
    def accept_cookies(self):
        # id
        button_accept = 'onetrust-accept-btn-handler'
        WebDriverWait(self.driver, timeout=10).until(lambda d: d.find_element_by_id(button_accept))
        self.driver.find_element_by_id(button_accept).click()
        
        
    @_verbose(message = 'searching the city...')
    def search_the_city(self, city):
        # classes
        search_field = "[aria-label='Search']"
        

        WebDriverWait(self.driver, timeout=3).until(lambda d: d.find_element(By.CSS_SELECTOR, search_field))
        Krakow.driver.find_element(By.CSS_SELECTOR, search_field).click()
        time.sleep(1.5)
        element = Krakow.driver.find_element(By.CSS_SELECTOR, search_field)
        element.send_keys('Krakow')
        element.submit()
    
    
    @_verbose(message = 'choosing the list of restaurants category...')
    def filter_restaurants(self):
        # CSS
        restaurants_filter = '[data-tab-name="Restaurants"]'
        time.sleep(1.5)
        WebDriverWait(self.driver, timeout=10).until(lambda d: d.find_element(By.CSS_SELECTOR, restaurants_filter))
        self.driver.find_element(By.CSS_SELECTOR, restaurants_filter).click()
        
        # attempts = 0
        # while True:
        #     try:
        #         WebDriverWait(self.driver, timeout=10).until(lambda d: d.find_element(By.CSS_SELECTOR, restaurans_filter))
        #         restaurants_filter = self.driver.find_element(By.CSS_SELECTOR, restaurans_filter)
        #         restaurants_filter.click()
        #         break
        #     except:
        #             if attempts !=10:
        #                 WebDriverWait(self.driver, timeout=10).until(lambda d: d.find_element_by_css_selector("[aria-label='Next']"))
        #                 time.sleep(3)
        #                 self.driver.find_element_by_css_selector("[aria-label='Next']").click()
        #             else:
        #                 raise RuntimeError('Cannot filter restaurants, FATAL')
        #                 break
                
                
    @_verbose(message = 'turning off filters...')                                              
    def clear_filters(self):
        try:
            WebDriverWait(self.driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, '_3bGkSG3Z'))
            self.driver.find_element(By.CLASS_NAME, '_3bGkSG3Z').click()
        except:
            pass
    
    
    @_verbose(message = 'getting links to the restaurants pages...')
    def get_restaurants_hrefs(self):
        
        def find_href_in_string(string):
            return re.findall(r"this, (.*), {type", string )[0]
        
        # class
        names = 'result-title'

        WebDriverWait(self.driver, timeout=10, ).until(lambda d: d.find_element(By.CLASS_NAME, names))
        return ['https://www.tripadvisor.com/'+ find_href_in_string(elem.get_attribute('onclick')) for elem in self.driver.find_elements(By.CLASS_NAME, names)]
    
    
    @_verbose(message = 'going to the next page...')
    def next_page(self):
        self.driver.find_elements(By.CLASS_NAME, 'next')[0].click()
        time.sleep(3)
                       
            
    @_verbose(message = 'collecting hrefs from restaurant lists...')
    def collect_hrefs(self, n_pages):
        if n_pages == 'all':
            n_pages = 9999
        n = 0
        pages=0
        while pages!=n_pages:
            while n!=40: # if links loaded
                time.sleep(1)
                hrefs_from_page = self.get_restaurants_hrefs()
                if len(hrefs_from_page)!=0:
                    
                    break
                else:
                    n+=1
            self.restaurants_hrefs = self.restaurants_hrefs + hrefs_from_page
            try:
                self.next_page()
                pages +=1
                print (f'collected {len(self.restaurants_hrefs)} links from {pages} pages')
                clear_output(wait=True)
            except:
                break
        
    
    @_verbose(message = 'collecting restaurant name...')
    def collect_name(self, rest_data):
        # class
        name = 'fHibz'
        # Make few attempts to be sure that page loaded correctly 
        attempts = 0
        while True:
            try:
                WebDriverWait(self.driver, timeout=10).until(lambda d: self.driver.find_element(By.CLASS_NAME, name))
                rest_data['Name'] = self.driver.find_element(By.CLASS_NAME, name).text
                break
            except:
                if attempts !=2:
                    self.driver.refresh()
                    attempts +=1
                else:
                    rest_data['Name'] = None
                    break
                    
    
    @_verbose(message = 'collecting restaurant reviews count...')
    def collect_reviews_count(self, rest_data):
        # class 
        n_reviews = 'eBTWs'
        ratings_filter = 'ui_tag_box'
        n_ratings = 'row_num'
        # Collecting data
        try:
            WebDriverWait(self.driver, timeout=2).until(lambda d: self.driver.find_element(By.CLASS_NAME, n_reviews))
            rest_data['N_reviews'] = re.findall('\d+', self.driver.find_element(By.CLASS_NAME, n_reviews).text)[0]
        except:
            rest_data['N_reviews'] = None   
        # Turning off review filters
        try:
            if len(self.driver.find_elements(By.CLASS_NAME, ratings_filter)) > 1:
                [i for i in self.driver.find_elements(By.CLASS_NAME, ratings_filter) if i.get_attribute('data-tracker') == 'English'][0].click()
                time.sleep(1)
        except: 
            pass
        rest_data['Review_ratings'] = {}
        # Collecting data
        rev_names = ['Excellent', 'Very good', 'Average', 'Poor', 'Terrible'] # Reviews marks
        try:
            WebDriverWait(self.driver, timeout=2).until(lambda d: self.driver.find_element(By.CLASS_NAME, n_ratings))
            rev_vals = [el.text for el in self.driver.find_elements(By.CLASS_NAME, n_ratings) if len(el.text)>0] # Reviews count
        except:
            rev_vals = [None]*5
        # Filling dictionary
        for k, v in zip(rev_names, rev_vals):
            rest_data['Review_ratings'][k] = v 

            
    @_verbose(message = 'collecting restaurant location data...')
    def collect_location_data(self, rest_data):
        # class
        location_map = 'eCPON'
        adres_district = 'brMTW'
        try:
            WebDriverWait(self.driver, timeout=2).until(lambda d: self.driver.find_elements(By.CLASS_NAME, location_map))
            src = self.driver.find_elements(By.CLASS_NAME, location_map)[0].get_attribute('src')
            rest_data['Latitude'], rest_data['Longitude'] = re.findall('\d+.\d+,\d+.\d+', src )[0].split(',')
        except:
            rest_data['Latitude'], rest_data['Longitude'] = (None, None)
        try:
            rest_data['Adres'] = self.driver.find_elements(By.CLASS_NAME, adres_district)[0].text
        except:
            rest_data['Adres'] = None
        try:
            rest_data['District'] = self.driver.find_elements(By.CLASS_NAME, adres_district)[1].text
        except: 
            rest_data['District'] = None
            
    
    @_verbose(message = 'collecting restaurant working hours...')
    def collect_working_hours_data(self, rest_data):
        # class
        working_time_table = 'cSjwK'
        working_time_data = 'ferBE'
        rest_data['Open_hours'] = {}
        rest_data['Close_hours'] = {}
        days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        try:
            # Open working time table
            WebDriverWait(self.driver, timeout=5).until(lambda d: self.driver.find_element(By.CLASS_NAME, working_time_table))
            self.driver.find_element(By.CLASS_NAME, working_time_table).click()
            # Collecting data
            
            working_days, working_hours = (np.array([i.text.split('\n') for i in self.driver.find_elements(By.CLASS_NAME, working_time_data)])).T.tolist()
            # Filling dictionary
            for i in days:
                if i in working_days:
                    opened, closed = working_hours[working_days.index(i)].split(' - ')
                    rest_data['Open_hours'][i] = opened
                    rest_data['Close_hours'][i] = closed
                else:
                    rest_data['Open_hours'][i] = 'Closed'
                    rest_data['Close_hours'][i] = 'Closed'
            # Close working time table
            self.driver.find_element(By.CLASS_NAME, working_time_table).click()
        except:
            for i in days:
                rest_data['Open_hours'][i] = None
                rest_data['Close_hours'][i] = None
    
    
    @_verbose(message = 'collecting restaurant details...')
    def collect_details(self, rest_data):
        # class
        all_details = 'ZlyLX'
        details_names = 'dMshX'
        details_info = 'cfvAV'
        # Open all the details
        try:
            WebDriverWait(self.driver, timeout=5).until(lambda d: self.driver.find_element(By.CLASS_NAME, all_details))
            self.driver.find_element(By.CLASS_NAME, all_details).click()
        except: 
            pass
        rest_data['Details'] = {}
        details_titles = ['Price_range', 'Cuisines', 'Special_diets', 'About', 'Meals', 'Features']
        # Setting all details to None
        for k in details_titles:
            rest_data['Details'][k] = None
        try:
            keys = [i.text for i in self.driver.find_elements(By.CLASS_NAME, details_names)] # Collected details names
            keys = [i[0]+str.lower(i[1:].replace(' ', '_')) for i in keys]
            values = [i.text for i in self.driver.find_elements(By.CLASS_NAME, details_info)] # Collected details info
            for k, v in zip(keys, values):
                rest_data['Details'][k] = v
        except:
            pass
        
    
    def get_hrefs(self, n_pages):
        self.start_driver()
        self.open_tripadvisor()
        self.accept_cookies()
        self.search_the_city(self.city)
        self.filter_restaurants()
        time.sleep(1)
        # self.clear_filters()
        self.collect_hrefs(n_pages)
        self.driver.close()
        clear_output()
        print ('Done')
        
        
    def get_restaurants_data(self):
        self.start_driver()
        assert len(self.restaurants_hrefs) !=0, "Can't find links to restaurants pages. Try get_hrefs(n_pages) before."
        for num, rest_href in enumerate(self.restaurants_hrefs):
            
            self.driver.execute_script(f"window.open('{rest_href}');")
            time.sleep(1)
            self.driver.switch_to.window(self.driver.window_handles[-1])
            
            rest_data = {}
            self.collect_name(rest_data)
            self.collect_reviews_count(rest_data)
            self.collect_location_data(rest_data)
            self.collect_working_hours_data(rest_data)
            self.collect_details(rest_data)
        
            print (f'Scrapped {num} from {len(self.restaurants_hrefs)} restauratns pages')
            clear_output(wait=True)
            self.restaurants_data.append(rest_data)
            self.driver.close()
            # switching to window with list of restaurants
            self.driver.switch_to.window(self.driver.window_handles[0])
        self.driver.close()
        clear_output()
        print ('Done')
        
        
    @_verbose('saving restaurants data...')        
    def save_collected_data(self, folder=""):
        with open(f'{folder}restaurants_data_{self.city}.json', 'w') as outfile:
            json.dump(self.restaurants_data, outfile)
        clear_output()
        print ('Done')
            
            
    @_verbose('saving restaurants links...')        
    def save_collected_hrefs(self, folder=""):
        with open(f'{folder}restaurants_hrefs_{self.city}.json', 'w') as outfile:
            json.dump(self.restaurants_hrefs, outfile)
        clear_output()
        print ('Done')
    
    
    @_verbose('loading restaurants hrefs...')
    def load_collected_hrefs(self, folder=""):
        with open(f'{folder}restaurants_hrefs_{self.city}.json') as inputfile:
            self.restaurants_hrefs = json.load(inputfile)
        clear_output()
        print ('Done')
    
    
    @_verbose('loading restaurants data...')
    def load_collected_data(self, folder=""):
        with open(f'{folder}restaurants_data_{self.city}.json') as inputfile:
            self.restaurants_data = json.load(inputfile)
        clear_output()
        print ('Done')

Algorithm to refactor. Selenium works to slow. Using async with bs4 is more efficient

In [39]:
Krakow = TripAdvisorScrapper('Krakow', show=True)
# try:
#     Krakow.load_collected_hrefs()
# except:
#     Krakow.get_hrefs(1)
#     Krakow.save_collected_hrefs()
# Krakow.load_collected_data()
# Krakow.save_collected_data('data/')
# Krakow.save_collected_hrefs('data/')

In [41]:
n_pages = 1
Krakow.get_hrefs(n_pages)

Done


In [42]:
Krakow.restaurants_hrefs[0:3]

["https://www.tripadvisor.com/'/Restaurant_Review-g274772-d1820442-Reviews-Hard_Rock_Cafe-Krakow_Lesser_Poland_Province_Southern_Poland.html'",
 "https://www.tripadvisor.com/'/Restaurant_Review-g274772-d1749958-Reviews-Restauracja_Starka-Krakow_Lesser_Poland_Province_Southern_Poland.html'",
 "https://www.tripadvisor.com/'/Restaurant_Review-g274772-d5953938-Reviews-Czarna_Kaczka_The_Black_Duck-Krakow_Lesser_Poland_Province_Southern_Poland.html'"]

In [43]:
len(Krakow.restaurants_hrefs)

30

In [55]:

Krakow.start_driver()
rest_href = Krakow.restaurants_hrefs[0]

Krakow.driver.execute_script(f'window.open("{rest_href}");')
time.sleep(5)
Krakow.accept_cookies()
Krakow.driver.switch_to.window(Krakow.driver.window_handles[-1])

rest_data = {}
Krakow.collect_name(rest_data)
Krakow.collect_reviews_count(rest_data)
Krakow.collect_location_data(rest_data)
Krakow.collect_working_hours_data(rest_data)
Krakow.collect_details(rest_data)

  WebDriverWait(self.driver, timeout=10).until(lambda d: d.find_element_by_id(button_accept))


TimeoutException: Message: 
Stacktrace:
#0 0x5577d2bfdee3 <unknown>
#1 0x5577d26cb608 <unknown>
#2 0x5577d2701aa1 <unknown>
#3 0x5577d2701c61 <unknown>
#4 0x5577d2734714 <unknown>
#5 0x5577d271f29d <unknown>
#6 0x5577d27323bc <unknown>
#7 0x5577d271f163 <unknown>
#8 0x5577d26f4bfc <unknown>
#9 0x5577d26f5c05 <unknown>
#10 0x5577d2c2fbaa <unknown>
#11 0x5577d2c45651 <unknown>
#12 0x5577d2c30b05 <unknown>
#13 0x5577d2c46a68 <unknown>
#14 0x5577d2c2505f <unknown>
#15 0x5577d2c61818 <unknown>
#16 0x5577d2c61998 <unknown>
#17 0x5577d2c7ceed <unknown>
#18 0x7f156f742609 <unknown>


In [12]:
Krakow.driver.execute_script(f'window.open("{rest_href}");')

In [53]:
rest_data

{'Name': 'Hard Rock Cafe',
 'N_reviews': '4',
 'Review_ratings': {'Excellent': '1,779',
  'Very good': '1,276',
  'Average': '709',
  'Poor': '255',
  'Terrible': '164'},
 'Latitude': '50.061474',
 'Longitude': '19.938503',
 'Adres': 'Main Market / Mariacki Square 9, Krakow 31-042 Poland',
 'District': "Stare Miasto\n0.0 miles from Krakow's Rynek Glowny Central Square",
 'Open_hours': {'Mon': None,
  'Tue': None,
  'Wed': None,
  'Thu': None,
  'Fri': None,
  'Sat': None,
  'Sun': None},
 'Close_hours': {'Mon': None,
  'Tue': None,
  'Wed': None,
  'Thu': None,
  'Fri': None,
  'Sat': None,
  'Sun': None},
 'Details': {'Price_range': 'PLN 14 - PLN 89',
  'Cuisines': 'American, Bar',
  'Special_diets': 'Vegetarian Friendly, Vegan Options, Gluten Free Options',
  'About': 'Hard Rock Cafe is a global phenomenon with 185 cafes that are visited by nearly 80 million guests each year. The first Hard Rock Cafe opened on June 14, 1971, in London, England, and from there the brand has expanded t

In [25]:
[i.text for i in Krakow.driver.find_elements(By.CLASS_NAME, 'brMTW')]

['Main Market / Mariacki Square 9, Krakow 31-042 Poland',
 "Stare Miasto\n0.0 miles from Krakow's Rynek Glowny Central Square",
 'Website',
 'Email',
 'Call']

In [50]:
Krakow.collect_working_hours_data(rest_data)

collecting restaurant working hours...


In [46]:
details_names = 'dMshX'
details_info = 'cfvAV'
[i.text for i in Krakow.driver.find_elements(By.CLASS_NAME, details_names)]

['PRICE RANGE',
 'CUISINES',
 'SPECIAL DIETS',
 'ABOUT',
 'PRICE RANGE',
 'CUISINES',
 'SPECIAL DIETS',
 'MEALS',
 'FEATURES']

In [47]:
[i.text for i in Krakow.driver.find_elements(By.CLASS_NAME, details_info)]

['PLN 14 - PLN 89',
 'American, Bar',
 'Vegetarian Friendly, Vegan Options, Gluten Free Options',
 'Hard Rock Cafe is a global phenomenon with 185 cafes that are visited by nearly 80 million guests each year. The first Hard Rock Cafe opened on June 14, 1971, in London, England, and from there the brand has expanded to major cities and exotic locations around the world.',
 'PLN 14 - PLN 89',
 'American, Bar',
 'Vegetarian Friendly, Vegan Options, Gluten Free Options',
 'Breakfast, Lunch, Dinner, Late Night, Drinks',
 'Wheelchair Accessible, Reservations, Outdoor Seating, Buffet, Seating, Television, Highchairs Available, Serves Alcohol, Full Bar, Wine and Beer, Accepts American Express, Accepts Mastercard, Accepts Visa, Cash Only, Free Wifi, Accepts Credit Cards, Table Service, Gift Cards Available']