## Demo

In [33]:
import time
import random as rd
import pandas as pd
import numpy as np
from tqdm import tqdm

import requests # 페이지를 요청하는 기능 
from bs4 import BeautifulSoup # 파이썬으로 HTML을 다루는 기능
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# driver = webdriver.Chrome(ChromeDriverManager().install())

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

user_agent = "Mozilla/5.0 (Linux; Android 9; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.83 Mobile Safari/537.36"
headers = {
    "User-Agent": user_agent
}

In [34]:
def setRestaurantPages(url: str):
    url = url   
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # 200번대 코드가 아니면 코드가 멈춤
    print("Response successful!")
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    restraurant_pages = []
    total_restraurants = int(soup.find_all('span', class_="b")[1].text)
    main_pages = total_restraurants//30 if total_restraurants%30==0 else total_restraurants//30+1
    for i in tqdm(range(main_pages-1), desc="Set restaurant pages"):
        if i > 0:
            current_url = url[:48] + f'oa{i*30}' + url[47:]
            # print(f'current_url: {current_url}')
            response = requests.get(current_url, headers=headers)
            response.raise_for_status()
            html = response.text
            soup = BeautifulSoup(html, "html.parser")

        for restraurant_page in soup.find_all('div', class_="biGQs _P fiohW alXOW NwcxK GzNcM ytVPx UTQMg RnEEZ ngXxk"):
            restraurant_pages.append("https://www.tripadvisor.com/" + restraurant_page.find('a')['href'])
            # print(restraurant_page.find('a')['href'])
        time.sleep
        # print(f'Total restraurant pages: {len(restraurant_pages)}')
        
    print(f'Total restraurant pages: {len(restraurant_pages)}')
    
    return restraurant_pages

In [39]:
def createDataset():
    dataset = pd.DataFrame({
        "name": [],
        "category": [],
        "description": [],
        "time": [],
        "URL": [],
        "priceLow": [],
        "priceHigh": [],
        "reviews": [],
        "adress": [],
        "rating": []
    })
    
    return dataset

In [36]:
def setWebdriver():
    options = Options()
    options.add_argument('user-agent=' + user_agent)
    # options.add_argument('headless') #headless모드 브라우저가 뜨지 않고 실행됩니다.
    # options.add_argument('--window-size= x, y') #실행되는 브라우저 크기를 지정할 수 있습니다.
    # options.add_argument('--start-maximized') #브라우저가 최대화된 상태로 실행됩니다.
    # options.add_argument('--start-fullscreen') #브라우저가 풀스크린 모드(F11)로 실행됩니다.
    # options.add_argument('--blink-settings=imagesEnabled=false') #브라우저에서 이미지 로딩을 하지 않습니다.
    options.add_argument('--mute-audio') #브라우저에 음소거 옵션을 적용합니다.
    options.add_argument('incognito') #시크릿 모드의 브라우저가 실행됩니다.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    return driver

In [50]:
def buildDataset(dataset, pages: list):
    cnt = 0
    for restaurant_page in tqdm(pages, desc="Restaurants"):
        response = requests.get(restaurant_page, headers=headers)
        response.raise_for_status()
        html = response.text
        soup = BeautifulSoup(html, "html.parser")

        # name 
        name = soup.find('h1', class_="HjBfq").text;    print(f'Name: {name}')

        # category
        category = ''
        for x in soup.find('span', class_="DsyBj DxyfE"):
            if '$' not in x.text:
                category += f'{x.text}, '
        category = category[:-2]
        print(f'Category: {category}')

        # adress
        adress = soup.find_all('a', class_="AYHFM")[1].text;   print(f'Adress: {adress}')

        # rating
        rating = soup.find('span', class_="ZDEqb").text;    print(f'Rating: {rating}')


        # url
        url = restaurant_page;    print(f'URL: {url}')

        # price
        try:
            priceLow = soup.find('div', class_="SrqKb").text.split(' - ')[0] 
            priceHigh = soup.find('div', class_="SrqKb").text.split(' - ')[1]
        except:
            priceLow = np.NaN; priceHigh = np.NaN
        print (f'PriceLow: {priceLow}, PriceHigh: {priceHigh}')

        # reviews, only english reviews
        total_reviews = int(soup.find_all('span', class_="count")[0].text[1:-1].replace(',', ''));    print(f'Total_reviews: {total_reviews}')
        review_pages = total_reviews//15 if total_reviews%15 == 0 else total_reviews//15 + 1;    print(f'Review pages: {review_pages}')
        review_list = []
        for i in tqdm(range(0, review_pages), desc='Review crwaling...'):
            if i > 1:
                url_list = url.split('-Reviews-')
                current_url = url_list[0] + f'-Reviews-or{15*i}-' + url_list[1]
                # print(f'current_url: {current_url}')
                response = requests.get(current_url, headers=headers)
                response.raise_for_status()
                html = response.text
                soup = BeautifulSoup(html, "html.parser") 

            reviews = soup.find_all('p', class_="partial_entry");
            for review in reviews:
                review_list.append(review.text)
            #     print(review.text)
            # print(f'Reviews: {len(review_list)}')

            time.sleep(rd.uniform(0.1, 0.5))
        print(f'Reviews: {len(review_list)}')


        # Using selenium
        driver.get(url)

        # # time
        # openingTime = soup.find('span', class_="mMkhr").text[11:19];    
        # closingTime = soup.find('span', class_="mMkhr").text[22:30];   
        # print(f'OpeningTime: {openingTime}, ClosingTime: {closingTime}')
        try:
            WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#component_50 > div > div:nth-child(3) > span.DsyBj.YTODE > div > span.mMkhr")))
            element = driver.find_element(By.CSS_SELECTOR, "#component_50 > div > div:nth-child(3) > span.DsyBj.YTODE > div > span.mMkhr")
            time.sleep(1)
            element.click() 
        except:
            times = np.NaN

        time_list = [] 
        time_elements = driver.find_elements(By.CLASS_NAME, "RiEuX.f")
        for time_element in time_elements:
            time_list.append(time_element.text.replace('\n', ':'))
        print(f'Times: {time_list}')

        # description
        try:
            WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#component_52 > div.hILIJ > div > div:nth-child(2) > div > div > div.gmbZC > a")))
            element = driver.find_element(By.CSS_SELECTOR, "#component_52 > div.hILIJ > div > div:nth-child(2) > div > div > div.gmbZC > a")
            element.click()
            WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#BODY_BLOCK_JQUERY_REFLOW > div.VZmgo.D.X0.X1.Za > div > div.TocEc._Z.S2.H2._f > div > div > div.kwVln > div > div:nth-child(1) > div > div.jmnaM")))
            element = driver.find_element(By.CSS_SELECTOR, "#BODY_BLOCK_JQUERY_REFLOW > div.VZmgo.D.X0.X1.Za > div > div.TocEc._Z.S2.H2._f > div > div > div.kwVln > div > div:nth-child(1) > div > div.jmnaM")
            description = element.text
            print(description)
        except:
            description = np.NaN
        print(f'Description: {description}\n')
        
        time.sleep(rd.uniform(1, 2))

        dataset.loc[cnt] = [name, category, description, time_list, url, priceLow, priceHigh, review_list, adress, rating]
        cnt += 1
        
        
    return dataset 

In [51]:
url = "https://www.tripadvisor.com/Restaurants-g298085-Da_Nang.html"
restraurant_pages = setRestaurantPages(url)

# create dataset
dataset = createDataset()

# set webdriver
driver = setWebdriver()

# start crawling
dataset = buildDataset(dataset, restraurant_pages[:5])

# save dataset
dataset.to_csv('test.csv')
print(f'Dataset build complete!\n')
dataset

Response successful!


Set restaurant pages: 100%|██████████| 49/49 [01:12<00:00,  1.49s/it]


Total restraurant pages: 1811


Restaurants:   0%|          | 0/5 [00:00<?, ?it/s]

Name: Rainbowl Poke
Category: Japanese, Fusion, Healthy
Adress: 02 An Thượng 32 Bắc Mỹ Phú, Ngũ Hành Sơn, Da Nang 550000 Vietnam
Rating: 5.0 
URL: https://www.tripadvisor.com//Restaurant_Review-g298085-d24038346-Reviews-Rainbowl_Poke-Da_Nang.html
PriceLow: nan, PriceHigh: nan
Total_reviews: 19
Review pages: 2


Review crwaling...: 100%|██████████| 2/2 [00:00<00:00,  2.26it/s]


Reviews: 30
Times: []
Description: nan



  element = np.asarray(element)
Restaurants:  20%|██        | 1/5 [00:19<01:17, 19.34s/it]

Name: Cardi Pizzeria Bach Dang
Category: American, Steakhouse, Brew Pub
Adress: 124 Bạch Đằng, Hải Châu, Đà Nẵng, Da Nang Vietnam
Rating: 5.0 
URL: https://www.tripadvisor.com//Restaurant_Review-g298085-d24985405-Reviews-Cardi_Pizzeria_Bach_Dang-Da_Nang.html
PriceLow: nan, PriceHigh: nan
Total_reviews: 333
Review pages: 23


Review crwaling...: 100%|██████████| 23/23 [00:49<00:00,  2.17s/it]


Reviews: 333
Times: ['Sun:12:00 AM - 11:59 PM', 'Mon:12:00 AM - 11:59 PM', 'Tue:12:00 AM - 11:59 PM', 'Wed:12:00 AM - 11:59 PM', 'Thu:12:00 AM - 11:59 PM', 'Fri:12:00 AM - 11:59 PM', 'Sat:12:00 AM - 11:59 PM']
Description: nan



Restaurants:  40%|████      | 2/5 [01:20<02:12, 44.03s/it]

Name: Ca Chuon Co Vietnamese - Seafood Restaurant.
Category: Cafe, International, Asian
Adress: 99 Vo Nguyen Giap Street Ngu Hanh Son District, Da Nang 550000 Vietnam
Rating: 5.0 
URL: https://www.tripadvisor.com//Restaurant_Review-g298085-d7000524-Reviews-Ca_Chuon_Co_Vietnamese_Seafood_Restaurant-Da_Nang.html
PriceLow: ₩12,953, PriceHigh: ₩38,860
Total_reviews: 226
Review pages: 16


Review crwaling...: 100%|██████████| 16/16 [00:32<00:00,  2.03s/it]


Reviews: 226
Times: ['Sun:06:00 AM - 10:00 PM', 'Mon:06:00 AM - 10:00 PM', 'Tue:06:00 AM - 10:00 PM', 'Wed:06:00 AM - 10:00 PM', 'Thu:06:00 AM - 10:00 PM', 'Fri:06:00 AM - 10:00 PM', 'Sat:06:00 AM - 10:00 PM']
Description: nan



Restaurants:  60%|██████    | 3/5 [02:03<01:27, 43.67s/it]

Name: Bistecca Restaurant Danang
Category: Italian, Southwestern, European
Adress: 20 Dong Da 7F New Orient Hotel, Thuan Phuoc Ward, Hai Chau District, Da Nang 550000 Vietnam
Rating: 5.0 
URL: https://www.tripadvisor.com//Restaurant_Review-g298085-d13236259-Reviews-Bistecca_Restaurant_Danang-Da_Nang.html
PriceLow: ₩3,886, PriceHigh: ₩64,767
Total_reviews: 77
Review pages: 6


Review crwaling...: 100%|██████████| 6/6 [00:09<00:00,  1.53s/it]


Reviews: 77
Times: ['Sun:06:00 AM - 10:00 PM', 'Mon:06:00 AM - 10:00 PM', 'Tue:06:00 AM - 10:00 PM', 'Wed:06:00 AM - 10:00 PM', 'Thu:06:00 AM - 10:00 PM', 'Fri:06:00 AM - 10:00 PM', 'Sat:06:00 AM - 10:00 PM']
Description: nan



Restaurants:  80%|████████  | 4/5 [02:24<00:34, 34.51s/it]

Name: 3 Big - Nướng & Lẩu
Category: Pizza, Barbecue, Korean
Adress: 134 Phan Châu Trinh, Phước Ninh, Hải Châu, Đà Nẵng, Da Nang 550000 Vietnam
Rating: 5.0 
URL: https://www.tripadvisor.com//Restaurant_Review-g298085-d24082246-Reviews-3_Big_Nu_ng_L_u-Da_Nang.html
PriceLow: nan, PriceHigh: nan
Total_reviews: 13
Review pages: 1


Review crwaling...: 100%|██████████| 1/1 [00:00<00:00,  2.88it/s]


Reviews: 13
Times: ['Sun:11:00 AM - 11:00 PM', 'Mon:11:00 AM - 10:00 PM', 'Tue:11:00 AM - 10:00 PM', 'Wed:11:00 AM - 10:00 PM', 'Thu:11:00 AM - 10:00 PM', 'Fri:11:00 AM - 10:00 PM', 'Sat:11:00 AM - 10:00 PM']
Description: nan



Restaurants: 100%|██████████| 5/5 [02:36<00:00, 31.27s/it]

Dataset build complete!






Unnamed: 0,name,category,description,time,URL,priceLow,priceHigh,reviews,adress,rating
0,Rainbowl Poke,"Japanese, Fusion, Healthy",,[],https://www.tripadvisor.com//Restaurant_Review...,,,[Delicious food. Friendly staff and great musi...,"02 An Thượng 32 Bắc Mỹ Phú, Ngũ Hành Sơn, Da N...",5.0
1,Cardi Pizzeria Bach Dang,"American, Steakhouse, Brew Pub",,"[Sun:12:00 AM - 11:59 PM, Mon:12:00 AM - 11:59...",https://www.tripadvisor.com//Restaurant_Review...,,,[I came in for pizza because they look like th...,"124 Bạch Đằng, Hải Châu, Đà Nẵng, Da Nang Vietnam",5.0
2,Ca Chuon Co Vietnamese - Seafood Restaurant.,"Cafe, International, Asian",,"[Sun:06:00 AM - 10:00 PM, Mon:06:00 AM - 10:00...",https://www.tripadvisor.com//Restaurant_Review...,"₩12,953","₩38,860",[We are very impressed by the excellent servic...,99 Vo Nguyen Giap Street Ngu Hanh Son District...,5.0
3,Bistecca Restaurant Danang,"Italian, Southwestern, European",,"[Sun:06:00 AM - 10:00 PM, Mon:06:00 AM - 10:00...",https://www.tripadvisor.com//Restaurant_Review...,"₩3,886","₩64,767",[Beautiful dinner and great service! All meals...,"20 Dong Da 7F New Orient Hotel, Thuan Phuoc Wa...",5.0
4,3 Big - Nướng & Lẩu,"Pizza, Barbecue, Korean",,"[Sun:11:00 AM - 11:00 PM, Mon:11:00 AM - 10:00...",https://www.tripadvisor.com//Restaurant_Review...,,,[Great service. Good price and good food. The ...,"134 Phan Châu Trinh, Phước Ninh, Hải Châu, Đà ...",5.0


---
### Partial testing

In [48]:
# reviews, only english reviews

url = "https://www.tripadvisor.com//Restaurant_Review-g298085-d24082246-Reviews-3_Big_Nu_ng_L_u-Da_Nang.html"
response = requests.get(url, headers=headers)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

total_reviews = int(soup.find_all('span', class_="count")[0].text[1:-1].replace(',', ''));    print(f'Total_reviews: {total_reviews}')
review_pages = total_reviews//15 if total_reviews%15 == 0 else total_reviews//15 + 1;    print(f'Review pages: {review_pages}')
review_list = []
for i in tqdm(range(0, review_pages), desc='Review crwaling...'):
    if i > 1:
        url_list = url.split('-Reviews-')
        current_url = url_list[0] + f'-Reviews-or{15*i}-' + url_list[1]
        # print(f'current_url: {current_url}')
        response = requests.get(current_url, headers=headers)
        response.raise_for_status()
        html = response.text
        soup = BeautifulSoup(html, "html.parser") 

    reviews = soup.find_all('p', class_="partial_entry");
    for review in reviews:
        review_list.append(review.text)
    #     print(review.text)
    # print(f'Reviews: {len(review_list)}')

    time.sleep(rd.uniform(0.1, 0.5))
print(f'Reviews: {len(review_list)}')

Total_reviews: 13
Review pages: 1


Review crwaling...: 100%|██████████| 1/1 [00:00<00:00,  2.97it/s]

Reviews: 13





# time test

driver = setWebdriver()
url = "https://www.tripadvisor.com/Restaurant_Review-g15296807-d13810289-Reviews-Thia_G_Restaurant_Da_Nang-My_An_Da_Nang.html"
response = driver.get(url)

try:
    WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#component_50 > div > div:nth-child(3) > span.DsyBj.YTODE > div > span.mMkhr")))
    element = driver.find_element(By.CSS_SELECTOR, "#component_50 > div > div:nth-child(3) > span.DsyBj.YTODE > div > span.mMkhr")
    time.sleep(1)
    element.click() 
except:
    times = np.NaN

time_list = [] 
time_elements = driver.find_elements(By.CLASS_NAME, "RiEuX.f")
for time_element in time_elements:
    time_list.append(time_element.text.replace('\n', ':'))
    print(time_element)
print(time_list)