In [2]:
import bs4 
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from multiprocessing.pool import ThreadPool
from tqdm import tqdm
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import ElementClickInterceptedException, StaleElementReferenceException
import os
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import pickle
import urllib
from functools import partial
sns.set()

In [3]:
from multiprocessing.pool import Pool
from contextlib import closing

In [4]:
def multiprocess_func(main_input, func, additional_inputs=None,
                      gather_func=None, to_split=True, gather_func_args=None,
                      chunk_size=100, n_processes=8, use_threads=False):
    if not gather_func_args:
        gather_func_args = []
    if not additional_inputs:
        additional_inputs = []
    if not gather_func:
        gather_func = lambda x: [z for i in x for z in i]
    if to_split:
        splitted = [(main_input[i:i + chunk_size], *additional_inputs) if additional_inputs else main_input[i:i + chunk_size]\
                    for i in range(0, len(main_input), chunk_size)]
    else:
        splitted = [(i, *additional_inputs) if additional_inputs else i for i in main_input]
        
    if not use_threads:
        with closing(Pool(n_processes)) as p:
            result = list(tqdm(p.imap(func, splitted),
                               total=len(splitted)))
    else:
        with closing(ThreadPool(n_processes)) as p:
            result = list(tqdm(p.imap(func, splitted),
                               total=len(splitted)))
    return gather_func(result, *gather_func_args)

# First level parsing

In [5]:
software_names = [SoftwareName.CHROME.value]
operating_systems = [OperatingSystem.LINUX.value, OperatingSystem.WINDOWS.value, OperatingSystem.MACOS.value]
user_agent_rotator = UserAgent(software_names=software_names,
                              operating_systems=operating_systems,
                              limit=100)
main_link = 'https://www.tripadvisor.ru/Restaurants-g294473-Ukraine.html#LOCATION_LIST'

In [6]:
main_url = 'https://www.tripadvisor.ru/'

In [7]:
def parse_sites(main_link, user_agent_rotator, max_ex=100):
    first_button_cls = 'nav.next'
    next_page_cls = 'guiArw.sprite-pageNext'
    prox = "socks5://localhost:9050"

    user = user_agent_rotator.get_random_user_agent()
    custom_options = webdriver.ChromeOptions()
    custom_options.add_argument(f'user_agent={user}')
    custom_options.add_argument('--proxy-server=%s' % prox)
    
    driver = webdriver.Chrome(options=custom_options)
    driver.get(main_link)
    WebDriverWait(driver, 90).until(EC.presence_of_element_located((By.CLASS_NAME, first_button_cls)))
    driver.find_element(by=By.CLASS_NAME, value=first_button_cls).click()
    
    pages = [driver.page_source]
    ex_counter=0
    while True:
        try:
            WebDriverWait(driver, 90).until(EC.presence_of_element_located((By.CLASS_NAME, next_page_cls)))
            driver.find_element(by=By.CLASS_NAME, value=next_page_cls).click()
            ex_counter = 0
        except Exception as ex:
            if not isinstance(ex, (StaleElementReferenceException, ElementClickInterceptedException)):
                print(ex)
                if not ex_counter:
                    pages.append(driver.page_source)
                break
            else:
                ex_counter+=1
                
            if ex_counter>=max_ex:
                break
        time.sleep(15)
        pages.append(driver.page_source)
    driver.quit()
    return pages

In [8]:
def parse_first_lvl(page):
    list_links = bs4.BeautifulSoup(page).find('ul', {'class':'geoList'})
    links = []
    for link in list_links.find_all('a'):
        links.append([link.get('href'), link.text])
    return links

In [9]:
pages = parse_sites(main_link, user_agent_rotator)

Message: 
Stacktrace:
0   chromedriver                        0x00000001010d62c8 chromedriver + 4752072
1   chromedriver                        0x0000000101056463 chromedriver + 4228195
2   chromedriver                        0x0000000100cb9b18 chromedriver + 441112
3   chromedriver                        0x0000000100cf6e21 chromedriver + 691745
4   chromedriver                        0x0000000100cf7061 chromedriver + 692321
5   chromedriver                        0x0000000100d325e4 chromedriver + 935396
6   chromedriver                        0x0000000100d17d2d chromedriver + 826669
7   chromedriver                        0x0000000100d30134 chromedriver + 926004
8   chromedriver                        0x0000000100d17b33 chromedriver + 826163
9   chromedriver                        0x0000000100ce89fd chromedriver + 633341
10  chromedriver                        0x0000000100cea051 chromedriver + 639057
11  chromedriver                        0x00000001010a330e chromedriver + 4543246
12 

In [10]:
sites_links = pd.DataFrame(multiprocess_func(pages, parse_first_lvl,
                      gather_func=None, to_split=False,
                      n_processes=8), columns=['link_to_city', 'title_of_city'])

100%|████████████████████████████████████████| 25/25 [00:00<00:00, 89.79it/s]


In [11]:
sites_links['title_of_city'] = sites_links['title_of_city'].apply(lambda x: x.replace('/','\\'))

In [12]:
sites_links['link_to_city'] = sites_links['link_to_city'].apply(lambda x: urllib.parse.urljoin(main_url, x))

In [13]:
sites_links.shape

(500, 2)

# Second level parsing

In [14]:
n_threads = 10

In [15]:
ABS_PATH = 'trip_advisor_data_restaurants_links'
if not os.path.exists(ABS_PATH):
    os.mkdir(ABS_PATH)
    
for i in sites_links['title_of_city']:
    dir_path = os.path.join(ABS_PATH,i)

    if not os.path.exists(dir_path):
        os.mkdir(dir_path)


In [16]:
def save_html(file, path):
    with open(path+'.html', 'w') as f:
        f.write(file)

In [17]:
def parse_sites_lvl2(main_link, path, 
                     abs_path, 
                     user_agent_rotator, max_ex=5):
    
    next_page_cls = 'nav.next'
    prox = "socks5://localhost:9050"

    user = user_agent_rotator.get_random_user_agent()
    custom_options = webdriver.ChromeOptions()
    custom_options.add_argument(f'user_agent={user}')
    custom_options.add_argument('--proxy-server=%s' % prox)
    custom_options.add_argument('headless')
    
    
    # overall path
    path_to_save = os.path.join(abs_path, path)

    
    try:
        driver = webdriver.Chrome(options=custom_options)
        driver.get(main_link)
    except:
        return 0
    
    c = 0
    ex_counter=0
    while True:
        try:
            WebDriverWait(driver, 90).until(EC.presence_of_element_located((By.CLASS_NAME, next_page_cls)))
            driver.find_element(by=By.CLASS_NAME, value=next_page_cls).click()
            ex_counter = 0
            c+=1
        except Exception as ex:
            if not isinstance(ex, (StaleElementReferenceException, ElementClickInterceptedException)):
                if not ex_counter:
                    save_html(driver.page_source, os.path.join(path_to_save, f'page_{str(c)}'))
                break
            else:
                ex_counter+=1
                
            if ex_counter>=max_ex:
                break
        time.sleep(2)
        save_html(driver.page_source, os.path.join(path_to_save, f'page_{str(c)}'))
        
    return 0

In [18]:
parse_sites_lvl2_partial = partial(parse_sites_lvl2, user_agent_rotator=user_agent_rotator,
                                  abs_path=ABS_PATH)

In [19]:
def parse_links_multiprocessing(input_tuple):
    link, title = input_tuple
    code = parse_sites_lvl2_partial(link, path=title)
    return code

In [20]:
with closing(ThreadPool(n_threads)) as p:
     list(tqdm(p.imap(parse_links_multiprocessing, sites_links[['link_to_city', 'title_of_city']].values),\
                            total=sites_links.shape[0]))

100%|████████████████████████████████████| 500/500 [2:00:19<00:00, 14.44s/it]


In [21]:
def read_file(path):
    with open(path, 'r') as f
        return f.read()

In [22]:
def bs4_parse_links(page, name):
    records = []
    for review_page in bs4.BeautifulSoup(page).find_all('div', {'class':'zdCeB Vt o'}):
        record = {}
        if  review_page.find('svg', {'class':'UctUV d H0'}):
            record['rating'] = review_page.find('svg', {'class':'UctUV d H0'}).get('aria-label')
            found_a = review_page.find('div',{'class':'RfBGI'}).a
            record['link'] = found_a.get('href')
            record['title_of_restaurant'] = found_a.text
            record['title_of_city'] = name
            records.append(record)
    return records

In [23]:
def parse_lvl2(name, abs_path):
    path = os.path.join(abs_path,name)
    records = []
    for path_page in os.listdir(path):
        page = read_file(os.path.join(path,path_page))
        records.extend(bs4_parse_links(page, name))
    return records

In [24]:
parse_lvl2_partial = partial(parse_lvl2, abs_path=ABS_PATH)

In [25]:
links = multiprocess_func([i for i in os.listdir(ABS_PATH) if not i.startswith('.')],
                  func=parse_lvl2_partial,
                  to_split=False,
                 n_processes=8,
                         use_threads=False)

100%|██████████████████████████████████████| 465/465 [01:32<00:00,  5.05it/s]


In [26]:
links = pd.DataFrame(links)

In [27]:
links = links.drop_duplicates(subset=['link'])

In [28]:
links.shape

(7397, 4)

In [45]:
links['rating'] = links['rating'].apply(lambda x: float(x.split('из')[0].replace(',','.')))

In [46]:
links['link'] = links['link'].apply(lambda x: urllib.parse.urljoin(main_url, x))

In [52]:
links['title_of_restaurant'] = links['title_of_restaurant'].apply(lambda x: '.'.join(x.split('.')[1:]).strip())

In [60]:
links['name_to_save'] = links['link'].apply(lambda x: x.split('/')[-1].split('.')[0])

In [6]:
links['parsed'] = False

In [7]:
links.to_csv('links_to_restaurants.csv', index=False) 