In [1]:
import bs4 
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from multiprocessing.pool import ThreadPool
from tqdm import tqdm
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import ElementClickInterceptedException, StaleElementReferenceException
import os
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import pickle
import urllib
sns.set()

In [2]:
from multiprocessing.pool import Pool
from contextlib import closing

In [3]:
def multiprocess_func(main_input, func, additional_inputs=None,
                      gather_func=None, to_split=True, gather_func_args=None,
                      chunk_size=100, n_processes=8):
    """Run ``func`` over ``main_input`` in a process pool and combine the results.

    Parameters
    ----------
    main_input : sequence
        Items to process. With ``to_split`` it is sliced into chunks, so it must
        support ``len`` and slicing; otherwise it is simply iterated.
    func : callable
        Worker invoked once per task by ``Pool.imap``.
    additional_inputs : sequence, optional
        Extra values attached to every task. When non-empty each task becomes
        the tuple ``(chunk_or_item, *additional_inputs)``.
    gather_func : callable, optional
        Combines the list of per-task results. Defaults to flattening one level.
    to_split : bool
        If true, tasks are consecutive slices of ``chunk_size`` items; if false,
        each item of ``main_input`` is its own task.
    gather_func_args : sequence, optional
        Extra positional arguments passed to ``gather_func`` after the results.
    chunk_size : int
        Slice length used when ``to_split`` is true.
    n_processes : int
        Number of worker processes.

    Returns
    -------
    Whatever ``gather_func`` returns.
    """
    extra_gather_args = gather_func_args or []
    shared_inputs = additional_inputs or []

    if not gather_func:
        def gather_func(nested):
            # Default combiner: flatten a list of iterables by one level.
            return [item for part in nested for item in part]

    if to_split:
        pieces = [main_input[start:start + chunk_size]
                  for start in range(0, len(main_input), chunk_size)]
    else:
        pieces = list(main_input)

    # Attach the shared inputs only when there are any; bare items otherwise.
    tasks = [(piece, *shared_inputs) for piece in pieces] if shared_inputs else pieces

    with closing(Pool(n_processes)) as pool:
        progress = tqdm(pool.imap(func, tasks), total=len(tasks))
        collected = list(progress)

    return gather_func(collected, *extra_gather_args)

# First level parsing

In [4]:
# Filters handed to the user-agent generator: Chrome on the three listed
# operating systems.
software_names = [SoftwareName.CHROME.value]
operating_systems = [OperatingSystem.LINUX.value, OperatingSystem.WINDOWS.value, OperatingSystem.MACOS.value]
# NOTE(review): `limit=100` presumably caps how many user-agent strings are
# loaded -- confirm against the random_user_agent documentation.
user_agent_rotator = UserAgent(software_names=software_names,
                              operating_systems=operating_systems,
                              limit=100)
# Hotel listing page that parse_sites() starts from.
main_link = 'https://www.tripadvisor.ru/Hotels-g294473-Ukraine-Hotels.html#LEAF_GEO_LIST'

In [5]:
main_url = 'https://www.tripadvisor.ru/'

In [9]:
def parse_sites(main_link, user_agent_rotator, max_ex=100):
    """Walk the paginated hotel listing and return the HTML of every page seen.

    Parameters
    ----------
    main_link : str
        URL of the first listing page.
    user_agent_rotator : object
        Provides ``get_random_user_agent()``; the result is passed to Chrome.
    max_ex : int
        Number of consecutive stale-element / intercepted-click errors that are
        tolerated before pagination stops.

    Returns
    -------
    list of str
        ``driver.page_source`` snapshots, one taken after the initial click and
        one after every pagination attempt (a retried attempt re-snapshots the
        same page, so duplicates are possible).

    Raises
    ------
    Any exception raised while loading the first page propagates; the browser
    is closed first.
    """
    first_button_xpath = '//*[@id="component_7"]/div/button'
    next_page_xpath = '//*[@id="taplc_main_pagination_bar_hotels_less_links_v2_0"]/div/div/div/span[2]'

    user = user_agent_rotator.get_random_user_agent()
    custom_options = webdriver.ChromeOptions()
    # Chrome's switch is spelled "user-agent" (hyphen); with an underscore the
    # argument is not a recognised switch and the user agent is left unchanged.
    custom_options.add_argument(f'user-agent={user}')

    driver = webdriver.Chrome(options=custom_options)
    pages = []
    try:
        driver.get(main_link)
        WebDriverWait(driver, 90).until(EC.presence_of_element_located((By.XPATH, first_button_xpath)))
        driver.find_element(by=By.XPATH, value=first_button_xpath).click()

        pages.append(driver.page_source)
        ex_counter = 0
        while True:
            try:
                WebDriverWait(driver, 90).until(EC.presence_of_element_located((By.XPATH, next_page_xpath)))
                driver.find_element(by=By.XPATH, value=next_page_xpath).click()
                ex_counter = 0
            except (StaleElementReferenceException, ElementClickInterceptedException):
                # Transient click problems: retry, but give up after max_ex in a row.
                ex_counter += 1
                if ex_counter >= max_ex:
                    break
            except Exception as ex:
                # Anything else (e.g. the wait timing out on the last page)
                # ends the pagination loop.
                print(ex)
                break
            # Give the next page time to render before snapshotting it.
            time.sleep(15)
            pages.append(driver.page_source)
    finally:
        # Always close the browser, even when the first page fails to load.
        driver.quit()
    return pages

In [10]:
def parse_first_lvl(page):
    """Extract ``(rating_alt, href, title_text)`` tuples from one listing page.

    Parameters
    ----------
    page : str
        HTML of a hotel listing page.

    Returns
    -------
    list of tuple
        One ``(alt, href, title)`` tuple per listing column that contains both
        a bubble-rating link and a ``listing_title`` div. Columns missing either
        element are skipped.
    """
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
    soup = bs4.BeautifulSoup(page)
    to_save = []
    for ui_column in soup.find_all('div', {'class': 'ui_column is-8 main_col allowEllipsis'}):
        bubble_rating_parsed = ui_column.find('a', {'data-clicksource': 'BubbleRating'})
        listing_title = ui_column.find('div', {'class': 'listing_title'})
        # Skip incomplete listings explicitly instead of hiding every possible
        # error (including Ctrl-C) behind a bare `except: pass`.
        if bubble_rating_parsed is None or listing_title is None:
            continue
        to_save.append((bubble_rating_parsed.get('alt'),
                        bubble_rating_parsed.get('href'),
                        listing_title.text))
    return to_save

In [None]:
pages = parse_sites(main_link, user_agent_rotator)

In [None]:
# Parse each listing page (one task per page, since to_split=False) and
# collect the resulting (rating, link, title) tuples into a DataFrame.
hotels_df = pd.DataFrame(multiprocess_func(pages, parse_first_lvl,
                      gather_func=None, to_split=False,
                      n_processes=8), columns=['rating', 'link', 'title'])

In [None]:
hotels_df = hotels_df.drop_duplicates()

In [13]:
# NOTE(review): this cell is not idempotent -- each statement overwrites its own
# input column, so running it twice corrupts `title` and fails on `rating`.
# Make relative hrefs absolute.
hotels_df['link'] = hotels_df['link'].apply(lambda x: urllib.parse.urljoin(main_url, x))
# Drop everything up to the first '.' in the title (presumably the "N." rank
# prefix -- confirm); a title containing no '.' becomes an empty string.
hotels_df['title'] = hotels_df['title'].apply(lambda x: '.'.join(x.split('.')[1:]).strip())
# Keep the text before 'of' in the rating label and read it as a float,
# accepting a decimal comma.
hotels_df['rating'] = hotels_df['rating'].apply(lambda x: float(x.split('of')[0].strip().replace(',','.')))

In [46]:
hotels_df['parsed'] = False

In [47]:
hotels_df.to_csv('hotels_links.csv', index=False)

# Second level parsing with translate + selenium

In [61]:
hotels_df = pd.read_csv('hotels_links.csv')

In [6]:
from functools import partial
from selenium.webdriver.common.action_chains import ActionChains
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import OperatingSystem, SoftwareName
import pyautogui
import threading
import multiprocessing
from selenium.webdriver.common.proxy import Proxy, ProxyType

In [7]:
# Filters for the second-level user-agent generator. Every entry uses the
# enum's string `.value`, matching the first-level configuration; LINUX and
# WINDOWS were previously passed as bare enum members.
software_names = [SoftwareName.CHROME.value]
operating_systems = [OperatingSystem.LINUX.value, OperatingSystem.MACOS.value,
                     OperatingSystem.WINDOWS.value]

In [8]:
user_agent_rotator = UserAgent(software_names=software_names,
                               operating_systems=operating_systems, limit=hotels_df.shape[0]*2)

In [9]:
max_ex = 100

In [10]:
def decline_offer(driver, time_sleep=15):
    """Dismiss the promotional pop-up if it appears; otherwise do nothing.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session showing the hotel page.
    time_sleep : int or float
        Seconds to wait for the "not interested" button before giving up.

    This is best effort: a missing button or a failed click is ignored.
    """
    not_interested_xpath = '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/div[14]/div/div/div[2]/div/div/div/div[2]/div[2]/button[1]'
    try:
        WebDriverWait(driver, time_sleep).until(EC.presence_of_element_located((By.XPATH, not_interested_xpath)))
        # Click the same element that was waited for. An undefined name here
        # would raise NameError, which the handler below would silently hide.
        driver.find_element(by=By.XPATH, value=not_interested_xpath).click()
    except Exception:
        # The pop-up is optional, so failing to dismiss it is not an error.
        pass
    

In [11]:
def wait_and_click_by_xpath(driver, button_xpath, time_sleep=15):
    """Wait until the element at ``button_xpath`` is present, then click it.

    Waits at most ``time_sleep`` seconds; errors from the wait, the lookup or
    the click propagate to the caller.
    """
    locator = (By.XPATH, button_xpath)
    element_present = EC.presence_of_element_located(locator)
    WebDriverWait(driver, time_sleep).until(element_present)

    # Look the element up again after the wait and click it.
    target = driver.find_element(by=By.XPATH, value=button_xpath)
    target.click()
    

In [12]:
def wait_and_click_by_css(driver, button_css, time_sleep=15):
    """Wait until the element matching ``button_css`` is present, then click it.

    Waits at most ``time_sleep`` seconds; errors from the wait, the lookup or
    the click propagate to the caller.
    """
    locator = (By.CSS_SELECTOR, button_css)
    element_present = EC.presence_of_element_located(locator)
    WebDriverWait(driver, time_sleep).until(element_present)

    # Look the element up again after the wait and click it.
    target = driver.find_element(by=By.CSS_SELECTOR, value=button_css)
    target.click()
    

In [13]:
def click_translate(driver, time_sleep=2):
    """Open the page's context menu and pick an entry with the keyboard.

    Right-clicks through Selenium, waits ``time_sleep`` seconds, then sends
    eight "down" key presses followed by "enter" via pyautogui.
    NOTE(review): the eighth entry is presumably Chrome's "Translate" item;
    its position depends on the browser build and locale -- confirm.
    """
    open_menu = ActionChains(driver).context_click()
    open_menu.perform()
    time.sleep(time_sleep)

    key_sequence = ['down'] * 8 + ['enter']
    for key in key_sequence:
        pyautogui.press(key)
    

In [14]:
def init(lock):
    """Pool initializer: publish ``lock`` as the module-level name ``starting``.

    parse_reviews() reads ``starting`` to serialise the translate step.
    """
    globals()['starting'] = lock

In [15]:
def save_pickle(file, path):
    """Serialise the object ``file`` with pickle into the file at ``path``.

    The target is opened in binary write mode, so existing content is replaced.
    """
    sink = open(path, 'wb')
    try:
        pickle.dump(file, sink)
    finally:
        sink.close()

In [16]:
def get_driver(address, run_headless=False):
    """Start a Chrome session that routes its traffic through a manual proxy.

    Parameters
    ----------
    address : str
        Proxy as ``host:port``; used for the HTTP, FTP and SSL proxy settings.
    run_headless : bool
        If true, the ``headless`` argument is passed to Chrome.

    Returns
    -------
    The ``webdriver.Chrome`` instance.
    """
    custom_options = webdriver.ChromeOptions()
    # Work on a copy: DesiredCapabilities.CHROME is a shared dict, and writing
    # the proxy into it would leak this proxy into every later driver created
    # from the same template.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    prox = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': address,
        'ftpProxy' : address,
        'sslProxy' : address,
        'noProxy'  : ''
    })

    prox.add_to_capabilities(capabilities)
    prox.autodetect = False

    if run_headless:
        custom_options.add_argument('headless')
    custom_options.add_argument("lang=uk")
    custom_options.add_argument('--ignore-certificate-errors')
    custom_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=custom_options, desired_capabilities=capabilities)
    return driver

In [17]:
def parse_reviews(link, path, abs_path, user_agent, max_ex=100,
                 parts_scroll=8, to_translate=False, to_scroll=False, 
                 parallel=False):
    """Page through one hotel's reviews and pickle the HTML of every page.

    A driver is taken from the module-level ``drivers`` queue and is always put
    back, even when the scrape raises. When ``parallel`` and ``to_translate``
    are both set, the module-level ``starting`` lock is held until shortly
    after the translate click, and is released here if an error occurs first.

    Parameters
    ----------
    link : str
        URL of the hotel review page.
    path : str
        File name (inside ``abs_path``) for the pickled list of page sources;
        also reported back as ``hotel_name``.
    abs_path : str
        Directory the pickle is written to.
    user_agent : str
        User agent applied through ``Network.setUserAgentOverride``.
    max_ex : int
        Consecutive stale-element / intercepted-click errors tolerated in the
        paging loop before it stops.
    parts_scroll : int
        Number of steps the page is scrolled in when ``to_scroll`` is set.
    to_translate : bool
        Trigger the context-menu translate step and wait for it.
    to_scroll : bool
        Scroll through each page before snapshotting it.
    parallel : bool
        Whether several workers run at once (see the lock note above).

    Returns
    -------
    dict
        Step flags (``got_initial_link``, ``see_all_languages``, ``show_more``,
        ``next_page`` and, when enabled, ``click_translate`` / ``scroll``) plus
        ``num_overall``, ``got_overall_num``, ``num_parsed``, ``exception``,
        ``link`` and ``hotel_name``.
    """
    # acquire a driver from the shared queue
    driver = drivers.get()
    # True while this call owns the `starting` lock and must release it itself
    lock_held = False

    try:
        if parallel and to_translate:
            starting.acquire()
            lock_held = True

        # change user agent
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})

        # next page css
        next_page_css = '#component_16 > div > div:nth-child(3) > div.MD > div > a'

        # per-step progress flags reported back to the caller
        passed = {'got_initial_link' : False,
                  'see_all_languages' : False}

        caught_ex = None

        # initial link getting
        time.sleep(5)
        driver.get(link)

        passed['got_initial_link'] = True

        # decline offer
        decline_offer(driver, time_sleep=15)

        # see all languages
        try:
            wait_and_click_by_xpath(driver, '//*[@id="component_16"]/div/div[3]/div[1]/div[1]/div[4]/ul/li[1]/label',
                                    180)
            passed['see_all_languages'] = True
            time.sleep(5)
        except Exception:
            passed['see_all_languages'] = False

        # click translate
        if to_translate:
            driver.switch_to.window(driver.current_window_handle)
            time.sleep(2)
            click_translate(driver)
            time.sleep(10)
            passed['click_translate'] = True

            if parallel:
                # From here on the timer is responsible for releasing the lock.
                threading.Timer(1, starting.release).start()
                lock_held = False

        pages = []
        ex_counter = 0
        c = 0

        current_height = driver.execute_script("return window.scrollY")

        while True:
            passed['show_more'] = False

            if to_scroll:
                passed['scroll'] = False

            passed['next_page'] = False

            # After the first page a more specific selector is used for "next".
            if c != 0:
                next_page_css = '#component_16 > div > div:nth-child(3) > div.MD > div > a.ui_button.nav.next.primary'

            try:
                # show more
                wait_and_click_by_css(driver, '#component_16 > div > div:nth-child(3) > div:nth-child(3) > div.WAllg._T > div.vTVDc > div._T.FKffI.bmUTE > div.lszDU > div > span.Ignyf._S.Z',
                                      30)
                time.sleep(2)
                passed['show_more'] = True

                if to_scroll:
                    # scrolling: step down in `parts_scroll` parts, then jump back
                    overall_height = driver.execute_script("return document.documentElement.scrollHeight")
                    batch = (overall_height - current_height) // parts_scroll
                    scroll_len = batch
                    for i in range(parts_scroll - 1):
                        driver.execute_script(f"window.scrollTo(0, {current_height+scroll_len});")
                        scroll_len += batch
                        time.sleep(2)

                    driver.execute_script(f"window.scrollTo(0, {current_height});")
                    time.sleep(2)

                    passed['scroll'] = True

                pages.append(driver.page_source)

                # next page
                wait_and_click_by_css(driver, next_page_css, 90)
                passed['next_page'] = True

                ex_counter = 0
                c += 1
            except Exception as ex:
                if not isinstance(ex, (StaleElementReferenceException, ElementClickInterceptedException)):
                    # Any other error (including the wait timing out) ends paging.
                    caught_ex = ex
                    break
                else:
                    ex_counter += 1
                if ex_counter >= max_ex:
                    break
            finally:
                # needed only if we wait for translate to load
                if to_translate:
                    time.sleep(3)

        time.sleep(2)

        if not caught_ex:
            passed = dict([(k, True) for k in passed.keys()])

        try:
            passed['num_overall'] = int(bs4.BeautifulSoup(pages[0]).find_all('span', {'data-test-target':'CC_TAB_Reviews_LABEL'})[0]\
            .find('span', {'class':'iypZC Mc _R b'}).text)
            passed['got_overall_num'] = True
        except Exception:
            # No page captured or the counter could not be read.
            passed['got_overall_num'] = False
            passed['num_overall'] = 0

        # NOTE(review): assumes five reviews per captured page -- confirm.
        passed['num_parsed'] = len(pages) * 5
        passed['exception'] = caught_ex
        passed['link'] = link
        passed['hotel_name'] = path

        # saving to file in order not to consume too much memory
        save_pickle(pages, os.path.join(abs_path, path))
        time.sleep(1)

        return passed
    finally:
        # Release the lock if an error occurred before the timer took it over,
        # and always hand the driver back; otherwise the other workers block
        # on an empty queue or on the lock.
        if lock_held:
            starting.release()
        drivers.put(driver)

In [18]:
def check_ip_proxy(address):
    """Return the public IP reported by api.ipify.org when going through ``address``.

    Parameters
    ----------
    address : str
        Proxy as ``host:port`` for the HTTP, FTP and SSL proxy settings.

    Returns
    -------
    str
        Text of the response body.

    Raises
    ------
    Whatever Selenium raises when the page cannot be loaded; the headless
    browser is closed before the exception propagates.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('headless')
    prox = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': address,
            'ftpProxy' : address,
            'sslProxy' : address,
            'noProxy'  : ''
        })
    prox.autodetect = True

    # Copy the template: DesiredCapabilities.CHROME is shared, and this function
    # runs in several threads at once -- writing into the shared dict lets one
    # thread start Chrome with another thread's proxy.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    prox.add_to_capabilities(capabilities)

    driver = webdriver.Chrome(options=options, desired_capabilities=capabilities)
    try:
        driver.get('https://api.ipify.org/')
        ip_address = driver.find_element(By.TAG_NAME, "body").text
    finally:
        # Dead proxies are the common case; never leave their browsers running.
        driver.quit()

    return ip_address

In [19]:
def check_change_ip(address, default_ip_address, debug=False):
    """Tell whether the proxy at ``address`` works and changes the visible IP.

    Parameters
    ----------
    address : str
        Proxy as ``host:port``, passed to check_ip_proxy().
    default_ip_address : str
        IP seen without the proxy.
    debug : bool
        Print the old and new IP and whether they differ.

    Returns
    -------
    bool
        True only when an IP was obtained through the proxy and it differs from
        ``default_ip_address``. A failing proxy yields False.
    """
    try:
        ip_address = check_ip_proxy(address)
    except Exception:
        # Dead or misbehaving proxy. `Exception` (not a bare except) keeps
        # Ctrl-C able to interrupt the scan.
        ip_address = None

    if debug:
        print(f'Old ip: {default_ip_address}, new ip : {ip_address}')

    if default_ip_address != ip_address and ip_address:
        if debug:
            print('IPs are different')
        return True
    return False

## proxies list

In [20]:
proxies_list = """
66.29.154.105:3128
45.79.111.38:9994
139.59.1.14:8080
149.129.191.110:8080
159.138.169.48:8080
169.57.1.85:8123
146.196.48.2:80
121.1.41.162:111
219.78.228.211:80
138.68.235.51:80
128.199.202.122:8080
47.254.47.61:8080
202.6.233.133:80
173.212.195.139:80
103.152.112.162:80
35.193.113.186:80
185.125.125.157:80
62.171.188.233:8000
139.99.237.62:80
68.183.111.90:80
47.74.152.29:8888
51.159.207.156:3128
83.229.72.174:80
20.54.56.26:8080
183.89.165.39:8080
158.193.206.136:8080
200.105.215.18:33630
20.111.54.16:80
20.206.106.192:8123
42.3.182.149:80
219.78.194.41:80
43.154.216.109:80
203.198.207.253:80
81.200.123.74:80
110.164.3.7:8888
112.120.41.171:80
219.77.188.21:80
20.210.113.32:8123
20.24.43.214:8123
201.91.18.82:8000
8.209.246.6:80
45.153.185.174:80
185.15.172.212:3128
193.122.71.184:3128
80.48.119.28:8080
41.65.252.105:1981
41.65.236.43:1976
198.59.191.234:8080
177.12.238.1:3128
45.169.162.1:3128
177.12.238.100:3128
198.49.68.80:80
165.154.243.252:80
130.41.55.190:8080
164.132.170.100:80
78.157.42.105:80
41.188.149.79:80
165.154.226.12:80
180.211.158.122:58375
165.154.243.53:80
221.132.18.26:8090
165.154.225.65:80
221.132.28.18:8090
129.154.54.57:3128
49.207.36.81:80
164.92.75.10:80
187.32.147.196:80
139.162.153.173:80
58.27.59.249:80
78.157.42.106:80
118.26.110.48:8080
23.238.33.186:80
161.35.223.141:80
149.129.239.170:8080
41.33.219.141:8080
51.15.242.202:8888
204.185.204.64:8080
187.60.46.119:80
178.79.191.47:54417
8.209.64.208:8080
137.184.197.190:80
118.42.15.57:4003
70.90.138.109:8080
190.120.250.122:999
187.188.167.108:8080
45.32.101.24:80
96.126.124.197:81
47.241.245.186:80
167.99.236.14:80
117.54.114.35:80
178.79.138.253:8080
178.217.172.206:55443
47.253.214.60:57114
144.217.240.185:9300
72.55.155.80:80
61.78.63.200:30000
47.252.1.180:3128
3.212.9.208:80
173.255.209.155:1080
194.195.216.153:4145
185.216.116.18:80
188.166.164.141:80
51.91.62.219:80
103.66.196.218:23500
188.0.147.102:3128
172.105.13.173:80
135.181.199.92:9300
45.79.94.19:80
172.104.206.170:80
148.204.171.217:80
103.167.134.31:80
172.104.111.212:80
213.14.174.70:3128
43.129.223.147:38080
113.161.59.136:8080
62.33.8.148:8081
143.0.67.18:8080
141.11.182.4:3128
62.122.201.105:8080
103.174.234.71:8080
45.56.75.90:5344
45.79.142.211:3128
47.254.195.196:8080
8.209.249.96:8080
201.229.250.22:8080
172.105.172.220:3128
146.59.199.12:80
207.154.252.96:80
44.193.229.111:80
51.38.71.232:80
143.244.134.24:80
142.93.223.246:80
141.95.122.232:80
158.193.206.137:8080
54.161.135.247:80
100.20.101.185:80
178.128.55.68:36894
82.223.102.92:9443
44.197.16.217:80
161.97.92.160:80
50.18.70.79:80
54.86.198.153:80
94.242.55.10:80
72.170.220.17:8080
52.47.137.181:80
116.111.93.89:5000
20.81.62.32:3128
34.81.72.31:80
187.60.46.118:80
47.90.202.21:24007
64.227.14.149:80
202.169.229.139:53281
194.31.55.247:80
34.142.43.20:80
167.99.210.216:80
13.81.217.201:80
207.154.228.158:80
3.109.85.109:80
167.99.174.59:80
45.120.136.104:80
128.199.13.74:80
74.208.212.244:3128
112.78.137.106:8080
77.68.117.95:3128
13.127.179.255:80
5.189.184.6:80
149.129.179.23:8080
185.105.102.179:80
158.193.206.135:8080
3.1.248.232:80
14.139.242.7:80
143.198.228.250:80
121.155.237.155:8080
116.58.166.194:8080
45.79.253.142:3128
187.60.46.114:80
124.106.226.240:3128
115.96.208.124:8080
45.149.43.56:53281
50.192.195.69:52018
207.180.228.55:80
46.242.131.110:8000
159.203.13.121:80
209.126.6.159:80
14.207.205.75:8080
91.233.169.23:8081
212.73.73.234:8081
187.87.206.97:3128
12.144.254.185:9080
192.53.163.144:3128
206.189.146.13:8080
62.193.68.91:1976
159.138.158.36:8888
188.235.0.207:8282
172.105.184.208:8001
162.144.233.16:80
158.193.206.131:8080
187.60.46.112:80
161.35.214.127:46795
41.39.244.130:32604
103.248.120.5:8080
103.80.237.211:3888
69.75.140.157:8080
138.121.161.83:8190
168.196.124.149:999
193.46.80.15:8086
45.79.208.64:44554
212.71.255.43:38613
170.187.141.197:8080
172.104.252.86:80
8.210.83.33:80
15.236.134.130:80
118.140.152.250:80
96.126.103.64:9991
172.105.190.51:8017
149.129.187.190:3128
47.245.34.161:8080
20.212.168.206:80
38.94.109.12:80
116.58.254.126:8080
198.74.56.87:3128
47.243.244.2:80
216.137.184.253:80
143.198.182.218:80
147.139.164.26:8080
149.129.213.200:8080
43.255.113.232:8082
54.210.239.35:80
187.60.46.117:80
187.60.46.116:80
173.249.37.45:5005
128.0.179.234:41258
134.209.241.12:80
52.200.191.158:80
80.81.243.26:8800
200.114.97.200:999
187.188.108.114:8080
212.23.217.47:8080
170.155.2.119:80
18.188.246.60:80
82.66.196.208:80
8.210.149.174:80
155.4.244.218:80
103.59.41.180:10919
200.7.10.158:8080
45.79.158.235:1080
198.11.175.192:8080
187.60.46.113:80
3.226.168.144:80
110.238.113.119:8080
196.1.95.117:80
103.133.26.107:8181
173.197.167.242:8080
203.24.103.63:80
45.85.160.67:7159
194.110.150.44:8085
23.95.69.132:3128
23.94.242.35:3128
106.107.205.112:80
23.227.38.200:80
193.233.138.229:8085
193.233.228.142:8085
154.201.34.215:3128
193.202.15.74:8085
149.18.56.148:8085
203.22.223.39:80
149.18.59.55:8085
154.201.33.156:3128
193.233.137.238:8085
154.201.38.52:3128
193.233.228.45:8085
91.188.247.130:8085
154.201.33.220:3128
45.199.132.85:3128
154.201.34.39:3128
203.34.28.124:80
203.24.102.76:80
185.162.231.163:80
45.145.129.81:8085
154.201.37.27:3128
85.208.209.219:8085
154.201.34.152:3128
154.201.37.222:3128
154.201.33.131:3128
170.244.93.150:7711
203.32.121.9:80
91.226.97.248:80
203.13.32.1:80
45.8.105.254:80
154.201.34.140:3128
85.208.209.154:8085
66.151.50.58:6861
193.233.143.232:8085
85.209.151.221:8085
154.201.34.244:3128
154.201.33.15:3128
154.201.37.45:3128
149.18.56.174:8085
5.183.253.175:8085
45.72.40.35:9129
"""

In [21]:
# Turn the pasted block into a list of "host:port" strings, dropping the empty
# lines at either end. NOTE(review): this rebinds `proxies_list` from str to
# list, so the cell cannot be re-run on its own.
proxies_list = list(filter(None, proxies_list.split('\n')))

In [22]:
default_ip = check_ip_proxy('')

In [23]:
default_ip

'185.235.174.60'

In [24]:
n_threads = 8

In [25]:
check_change_ip_partial = partial(check_change_ip, default_ip_address=default_ip)

In [26]:
# Probe every proxy in a thread pool; `mask[k]` is True when proxy k answered
# with an IP different from `default_ip` (see check_change_ip). imap keeps the
# results in input order.
with closing(ThreadPool(n_threads)) as p:
        mask = list(tqdm(p.imap(check_change_ip_partial, proxies_list), total=len(proxies_list)))

100%|█████████████████████████████████████████| 300/300 [05:14<00:00,  1.05s/it]


In [27]:
# Keep only the proxies whose probe succeeded (mask is aligned with the list).
proxies_list = [proxy for proxy, is_usable in zip(proxies_list, mask) if is_usable]

In [28]:
len(proxies_list)

18

## parsing itself

In [45]:
import os
import queue

In [46]:
# Directory that receives one pickle of page sources per hotel.
ABS_PATH = 'trip_advisor_data'
# makedirs(exist_ok=True) replaces the check-then-create pair: there is no
# window between the existence check and the mkdir, and nested paths also work.
os.makedirs(ABS_PATH, exist_ok=True)

In [47]:
# Run configuration for the review scrape.
n_threads = 2  # size of the driver queue and of the worker thread pool
to_translate = False  # passed to parse_reviews: trigger the translate step
to_scroll = False  # passed to parse_reviews: scroll each page before saving it
parallel = True  # passed to parse_reviews: serialise the translate step with a lock
headless = False  # passed to get_driver as run_headless

In [48]:
drivers = queue.Queue(maxsize=n_threads)
[drivers.put(get_driver(address=proxies_list[3],
                        run_headless=headless)) for i in range(n_threads)];

In [49]:
parse_reviews_partial = partial(parse_reviews,
                                 max_ex=max_ex, parts_scroll=5,
                                 to_translate=to_translate,
                                 to_scroll=to_scroll,
                               parallel=parallel,
                               abs_path=ABS_PATH)

In [50]:
user_agents = [user_agent_rotator.get_random_user_agent() for i in range(hotels_df.shape[0])]

In [51]:
# Only hotels that have not been parsed successfully yet.
sub_df = hotels_df[hotels_df['parsed']==False]
# One (link, title, user_agent) task per hotel; zip stops at the shortest input.
# NOTE(review): the trailing [:4] limits the run to the first four hotels --
# remove it for a full scrape.
input_tuples = list(zip(sub_df['link'].values.tolist(), sub_df['title'].values.tolist(), user_agents))[:4]

In [52]:
def parse_reviews_multiprocessing(input_tuple):
    """Adapter for ``imap``: unpack one ``(link, path, user_agent)`` task.

    Returns whatever ``parse_reviews_partial`` returns for that task.
    """
    hotel_link, hotel_path, agent = input_tuple
    return parse_reviews_partial(hotel_link, path=hotel_path, user_agent=agent)

In [53]:
# The workers need the shared `starting` lock only when the translate step runs
# in parallel; in that case it is installed in each worker through init().
if parallel and to_translate:
    pool = ThreadPool(n_threads, init, initargs=[multiprocessing.Lock()])
else:
    pool = ThreadPool(n_threads)

with closing(pool) as p:
    results = list(tqdm(p.imap(parse_reviews_multiprocessing, input_tuples), total=len(input_tuples)))

100%|████████████████████████████████████████████| 4/4 [17:18<00:00, 259.54s/it]


In [54]:
# Close every browser that is currently sitting in the queue.
while not drivers.empty():
    drivers.get().quit()

## mark those we parsed successfully

In [56]:
# A hotel counts as parsed when more than 80% of its reported reviews were
# captured. parse_reviews() reports num_overall == 0 when the total could not
# be read; such hotels are marked as not parsed instead of dividing by zero.
mask_passed = {
    res['link']: res['num_overall'] > 0 and res['num_parsed'] / res['num_overall'] > 0.8
    for res in results
}

In [57]:
mask_passed

{'https://www.tripadvisor.ru/Hotel_Review-g295377-d8318640-Reviews-Ibis_Styles_Lviv_Center-Lviv_Lviv_Oblast.html#REVIEWS': True,
 'https://www.tripadvisor.ru/Hotel_Review-g295377-d305221-Reviews-Grand_Hotel_Lviv_Luxury_Spa-Lviv_Lviv_Oblast.html#REVIEWS': False,
 'https://www.tripadvisor.ru/Hotel_Review-g1816350-d3426924-Reviews-Radisson_Blu_Resort_Bukovel-Yaremche_Ivano_Frankivsk_Oblast.html#REVIEWS': True,
 'https://www.tripadvisor.ru/Hotel_Review-g295377-d4061858-Reviews-Astoria_Hotel-Lviv_Lviv_Oblast.html#REVIEWS': True}

In [62]:
# Update the flag only for hotels not yet marked as parsed; links that were not
# part of this run stay False.
not_yet_parsed = hotels_df['parsed'] == False
hotels_df.loc[not_yet_parsed, 'parsed'] = (
    hotels_df.loc[not_yet_parsed, 'link'].apply(lambda hotel_link: mask_passed.get(hotel_link, False))
)

In [63]:
hotels_df.head()

Unnamed: 0,rating,link,title,parsed
0,4.5,https://www.tripadvisor.ru/Hotel_Review-g29537...,ibis Styles Lviv Center,True
1,4.5,https://www.tripadvisor.ru/Hotel_Review-g29536...,NEMO Hotel Resort & Spa,True
2,4.5,https://www.tripadvisor.ru/Hotel_Review-g29537...,Grand Hotel Lviv Luxury & Spa,False
3,4.5,https://www.tripadvisor.ru/Hotel_Review-g29447...,Ibis Kiev Railway Station,True
4,4.5,https://www.tripadvisor.ru/Hotel_Review-g18163...,Radisson Blu Resort Bukovel,True


In [64]:
hotels_df['parsed'].value_counts()

False    2924
True        5
Name: parsed, dtype: int64

In [65]:
hotels_df.to_csv('hotels_links.csv', index=False)

## parse reviews using bs4

In [66]:
def process_buble(x):
    """Convert a digit string such as ``'45'`` into the float ``4.5``.

    The characters of ``x`` are joined with dots and the result is parsed as a
    float.
    """
    dotted = '.'.join(x)
    return float(dotted)

In [67]:
def load_pickle(path):
    """Read and return the pickled object stored in the file at ``path``.

    NOTE: unpickling can execute arbitrary code; only load files this notebook
    wrote itself.
    """
    source = open(path, 'rb')
    try:
        return pickle.load(source)
    finally:
        source.close()

In [68]:
def bs4_parse_reviews(input_tuple):
    """Extract review records from one saved review page.

    Parameters
    ----------
    input_tuple : tuple
        ``(page_to_parse, hotel_name)`` -- the page HTML and the name stored
        with every record.

    Returns
    -------
    list of dict
        One record per review with ``overall_rating``, ``review``,
        ``hotel_name`` and, when present, one ``<label>_rating`` entry per
        category rating. A review that cannot be parsed is reported with
        ``print`` and skipped; the remaining reviews are still returned.
    """
    page_to_parse, hotel_name = input_tuple
    records = []

    try:
        review_blocks = bs4.BeautifulSoup(page_to_parse).find_all('div', {'class': 'WAllg _T'})
    except Exception as ex:
        print(ex)
        return records

    for review_page in review_blocks:
        # Handle errors per review so that one malformed entry does not drop
        # every review that follows it on the page.
        try:
            record = {}
            # The rating is the suffix of the last CSS class on the rating span.
            record['overall_rating'] = process_buble(review_page.find('div', {'data-test-target': 'review-rating'})
                                                     .span['class'][-1].split('_')[-1])
            per_type_bubble = review_page.find_all('div', {'class': 'hemdC S2 H2 WWOoy'})

            if per_type_bubble:
                for j in per_type_bubble:
                    record[j.text + '_rating'] = process_buble(j.span.span['class'][-1].split('_')[-1])

            record['review'] = review_page.find('div', {'class': 'fIrGe _T'}).text
            record['hotel_name'] = hotel_name

            records.append(record)
        except Exception as ex:
            print(ex)

    return records

In [69]:
def parse_reviews_multiproc(name, abs_path):
    """Load the pickled pages of hotel ``name`` and parse all of their reviews.

    The pickle is read from ``abs_path/name``; the records of all pages are
    returned as one flat list, in page order.
    """
    pickle_path = os.path.join(abs_path, name)
    return [record
            for page in load_pickle(pickle_path)
            for record in bs4_parse_reviews((page, name))]

In [70]:
partial_parse_reviews_multiproc = partial(parse_reviews_multiproc, abs_path=ABS_PATH)

In [71]:
inputs = hotels_df[hotels_df['parsed']==True]['title'].values.tolist()

In [72]:
# One task per successfully parsed hotel; the default gather flattens the
# per-hotel record lists into a single list of records.
reviews = multiprocess_func(
    main_input=inputs,
    func=partial_parse_reviews_multiproc,
    n_processes=8,
    to_split=False,
)

100%|█████████████████████████████████████████████| 5/5 [00:24<00:00,  4.80s/it]


In [73]:
reviews = pd.DataFrame(reviews)

In [74]:
reviews['overall_rating'].value_counts()

5.0    2298
4.0     534
3.0     170
1.0      67
2.0      59
Name: overall_rating, dtype: int64

In [76]:
reviews.isna().sum()

overall_rating             0
review                     0
hotel_name                 0
Цена/качество_rating    2261
Расположение_rating     2199
Обслуживание_rating     1731
Номера_rating           2217
Качество сна_rating     2285
Чистота_rating          2230
dtype: int64

In [77]:
reviews.shape

(3128, 9)