In [None]:
import os, sys
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

In [None]:
# Install the Python packages used for browser automation.
# NOTE(review): versions are unpinned, so a re-run may pull different releases.
!pip install selenium
!pip install webdriver_manager
# Refresh the apt package index, then install the Chromium chromedriver package.
# NOTE(review): no `-y` flag is passed — confirm this does not stop at a prompt.
!apt-get update
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin (presumably so it is found on PATH).
# NOTE(review): the source path depends on how the package lays out its files — verify it exists.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
from selenium.webdriver.common.by import By
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from openpyxl import Workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
import time
import datetime
import requests
import traceback

In [None]:
# Build a Chrome driver with no visible window. The sandbox and /dev/shm flags
# are passed as well; they are commonly needed when Chrome runs in a container.
chrome_flags = ('--headless', '--no-sandbox', '--disable-dev-shm-usage')

options = webdriver.ChromeOptions()
for flag in chrome_flags:
    options.add_argument(flag)

driver = webdriver.Chrome(options=options)

In [None]:
# HTTP session for plain `requests` calls, with automatic retries on
# transient server errors.
session = requests.Session()
headers = {
    "User-Agent": "user value"}  # placeholder value — replace with a real User-Agent string

# Attach the headers to the session so every request made through it sends them;
# previously the dict was defined but never used.
session.headers.update(headers)

# Retry up to 5 times, with a short exponential backoff, on 5xx responses.
retries = Retry(total=5,
                backoff_factor=0.1,
                status_forcelist=[500, 502, 503, 504])

# Mount the retrying adapter for both schemes. The URLs used in this notebook
# are https://, so mounting only 'http://' meant the retry policy never applied.
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

In [None]:
# Kakao Map place IDs; each place is crawled by its ID.
places = [
    "1140972979",
    "810636715",
    "1621874658",
    "18783705",
    "1497915448",
    "1879186093",
    "963022709",
    "592472562",
    "12009254",
    "1512705135",
    "1353342342",
]

In [None]:
# kakao
def _kakao_star_rating(star_span):
    """Convert the star-bar element of a Kakao review into a 0-5 rating.

    The rating is read from the element's inline ``style`` attribute, expected
    to look like ``<property>:<percent>%;``. Every 20 percent counts as one star.

    Args:
        star_span: The tag returned by ``select_one`` for the star bar, or
            ``None`` when the review has no such element.

    Returns:
        The rating as an ``int``, or ``''`` when the element is missing or its
        style cannot be parsed (the same fallback the other review fields use).
    """
    if star_span is None:
        return ''
    style = star_span.get('style') or ''
    try:
        # Split on '%' rather than '%;' so a missing trailing semicolon is tolerated.
        percent = style.split(':')[1].split('%')[0].strip()
        return int(float(percent)) // 20
    except (IndexError, ValueError):
        return ''


# The implicit wait applies to every later find_element call on this driver,
# so it is set once instead of on every iteration.
driver.implicitly_wait(30)

for place in places:
    url = 'https://place.map.kakao.com/'+place

    # Everything the save step needs is bound before the try block, so a
    # failure while scraping can never leave these names unset or stale.
    now = datetime.datetime.now()
    file_name = 'kakao_review_' + place + "_" + now.strftime('%Y-%m-%d_%H-%M-%S') + '.xlsx'
    xlsx = Workbook()
    list_sheet = xlsx.create_sheet('output')
    list_sheet.append(['nickname', 'content', 'date', 'star'])

    scraped_ok = True
    try:
        driver.get(url)
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)

        # Keep clicking the link under the review list until its class contains
        # 'link_unfold', which this loop treats as "nothing more to load".
        try:
            while True:
                elem = driver.find_element(By.CSS_SELECTOR, '#mArticle > div.cont_evaluation > div.evaluation_review > a')
                if 'link_unfold' in elem.get_attribute('class'):
                    print('finish')
                    break
                elem.click()
                time.sleep(0.4)
        except Exception as e:
            # Best effort: if the link is missing or cannot be clicked, carry on
            # with whatever reviews are already on the page.
            print(e)

        # Fixed pause before snapshotting the page (presumably to let the
        # remaining content finish rendering).
        time.sleep(25)
        bs = BeautifulSoup(driver.page_source, 'lxml')
        reviews = bs.select('#mArticle > div.cont_evaluation > div.evaluation_review > ul > li')

        for r in reviews:
            nickname = r.select('div.unit_info > a')
            content = r.select('div.comment_info > p > span')
            date = r.select('div.unit_info > span.time_write')
            star = _kakao_star_rating(r.select_one('div.star_info > div > span > span'))

            # Missing fields fall back to an empty string instead of raising.
            nickname = nickname[0].text if nickname else ''
            content = content[0].text if content else ''
            date = date[0].text if date else ''

            list_sheet.append([nickname, content, date, star])
    except Exception:
        print(traceback.format_exc())
        scraped_ok = False

    # Save the crawl result. On an error the partially filled workbook is still
    # written, so collected rows are not lost.
    xlsx.save(file_name)
    if scraped_ok:
        print(file_name)

In [None]:
# Naver Place IDs to crawl.
places = [
    "1307938559",
    "1501864632",
    "1825537456",
    "30811729",
    "38762477",
    "1356864288",
    "1903382169",
    "1202531417",
    "1655931682",
    "1051363994",
    "20050482",
    "33109015",
    "1278462966",
    "32839677",
    "34818294",
    "33860261",
    "20145002",
    "1655063202",
    "38463465",
    "1097784516",
]

In [None]:
# naver
# The implicit wait applies to every later find_element call on this driver,
# so it is set once instead of on every iteration.
driver.implicitly_wait(30)

for place in places:
    url = 'https://m.place.naver.com/restaurant/'+place+'/review/visitor?entry=ple&reviewSort=recent'

    # Everything the save step needs is bound before the try block, so a
    # failure while scraping can never leave these names unset or stale.
    now = datetime.datetime.now()
    file_name = 'naver_review_' + place + "_" + now.strftime('%Y-%m-%d_%H-%M-%S') + '.xlsx'
    xlsx = Workbook()
    list_sheet = xlsx.create_sheet('output')
    list_sheet.append(['nickname', 'content', 'date', 'revisit'])

    scraped_ok = True
    try:
        driver.get(url)
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)

        # Click the link at this XPath repeatedly. The loop ends only when
        # locating or clicking it raises, which is treated as "nothing more
        # to load".
        try:
            while True:
                driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a').click()
                time.sleep(0.4)
        except Exception:
            print('finish')

        # Fixed pause before snapshotting the page (presumably to let the
        # remaining content finish rendering).
        time.sleep(25)
        bs = BeautifulSoup(driver.page_source, 'lxml')
        reviews = bs.select('li.YeINN')

        for r in reviews:
            nickname = r.select('div.SdWYt>a.QAxJb>div.VYGLG')
            content = r.select('div.ZZ4OK.IwhtZ>a>span')
            date = r.select('div.qM6I7>div>div._7kR3e>span:nth-child(1)>time')
            revisit = r.select('div.qM6I7>div>div._7kR3e>span:nth-child(2)')

            # Missing fields fall back to an empty string instead of raising.
            nickname = nickname[0].text if nickname else ''
            content = content[0].text if content else ''
            date = date[0].text if date else ''
            revisit = revisit[0].text if revisit else ''

            list_sheet.append([nickname, content, date, revisit])
    except Exception:
        print(traceback.format_exc())
        scraped_ok = False

    # Save the crawl result. On an error the partially filled workbook is still
    # written, so collected rows are not lost.
    xlsx.save(file_name)
    if scraped_ok:
        print(file_name)