# In [None]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Matches a parenthesised review count such as "(123)" or "(999+)".
# Group 1 captures the digits together with the optional trailing "+".
# The former alternation `\d+|\d+\+?` was redundant: its first branch is a
# subset of the second, so the single form below matches exactly the same text.
REVIEW_REGEX_FORMAT = r'\((\d+\+?)\)'

# Products with fewer reviews than this are skipped by the category crawler.
MINIMUM_REVIEW_COUNT = 200

class ProductReviewCrawler:
    """Crawl an Olive Young category listing and collect product reviews.

    Each listing page is fetched with ``requests``. Every product whose
    displayed review count is at least ``MINIMUM_REVIEW_COUNT`` is handed to
    a ``ProductDetailReviewCrawler``, which appends that product's reviews to
    a shared column dictionary.
    """

    def __init__(self):
        self.crawler = ProductDetailReviewCrawler()

    def run(self, first_page=1, last_page=14, request_timeout=10):
        """Crawl the listing pages and return the collected reviews.

        Args:
            first_page: Index of the first listing page to fetch (inclusive).
            last_page: Index of the last listing page to fetch (inclusive).
            request_timeout: Seconds to wait for each listing-page response.

        Returns:
            A ``pandas.DataFrame`` with the columns ``id``, ``name``,
            ``date`` and ``content``. The frame is also printed.

        Raises:
            requests.RequestException: If a listing page cannot be fetched
                or the server answers with an HTTP error status.
        """
        review_dictionary = {
            'id': [],
            'name': [],
            'date': [],
            'content': []
        }

        for page_idx in range(first_page, last_page + 1):
            # A timeout keeps a stalled connection from hanging the crawl.
            response = requests.get(
                f'https://www.oliveyoung.co.kr/store/display/getMCategoryList.do?dispCatNo=100000100010013&fltDispCatNo=&prdSort=01&pageIdx={page_idx}&rowsPerPage=24&searchTypeSort=btn_list&plusButtonFlag=N&isLoginCnt=0&aShowCnt=0&bShowCnt=0&cShowCnt=0&trackingCd=Cat100000100010013_Small&amplitudePageGubun=&t_page=&t_click=&midCategory=%EC%8A%A4%ED%82%A8%2F%ED%86%A0%EB%84%88&smallCategory=%EC%A0%84%EC%B2%B4&checkBrnds=&lastChkBrnd=',
                timeout=request_timeout,
            )
            # Fail loudly instead of silently parsing an error page.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            product_items = soup.select('li[criteo-goods]')
            rating_areas = soup.select("p.prd_point_area")

            # NOTE(review): products and rating areas are paired by position,
            # which assumes one rating area per product in document order.
            # zip() stops at the shorter list instead of raising IndexError.
            for product_item, rating_area in zip(product_items, rating_areas):
                review_count = self._parse_review_count(rating_area.text)
                if review_count is None or review_count < MINIMUM_REVIEW_COUNT:
                    continue

                # e.g. A000000192697001 -> the attribute stores the real
                # product id with "001" appended, so drop the last 3 chars.
                product_id = product_item.attrs['criteo-goods'][:-3]
                review_dictionary = self.crawler.run(product_id, review_dictionary)

        df = pd.DataFrame(review_dictionary)
        print(df)
        return df

    @staticmethod
    def _parse_review_count(rating_text):
        """Return the review count shown in *rating_text*, or None if absent."""
        match = re.search(REVIEW_REGEX_FORMAT, rating_text)
        if match is None:
            return None
        # Capped counts are rendered with a trailing "+", e.g. "999+".
        return int(match.group(1).rstrip("+"))

class ProductDetailReviewCrawler:
    """Scrape paginated reviews from one product detail page with Selenium."""

    # Upper bound on reviews read from a single review page.
    REVIEWS_PER_PAGE = 10

    def run(self, product_id, review_dictionary, max_pages=20, timeout=10):
        """Append the reviews of *product_id* to *review_dictionary*.

        Opens the product detail page in a new Chrome session, switches to
        the review tab and walks through up to *max_pages* review pages.

        Args:
            product_id: Goods number used in the detail-page URL.
            review_dictionary: Dict with the list-valued keys ``id``,
                ``name``, ``date`` and ``content``; it is mutated in place.
            max_pages: Maximum number of review pages to read.
            timeout: Seconds each explicit wait may block.

        Returns:
            The same *review_dictionary* object, with one entry appended to
            every list per scraped review.

        Raises:
            selenium.common.exceptions.TimeoutException: If the review tab
                or the first review list does not appear within *timeout*.
        """
        # Use Chrome.
        driver = webdriver.Chrome()

        # try/finally guarantees the browser is closed even when a lookup or
        # wait fails part-way through.
        try:
            url = f'https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo={product_id}&dispCatNo=100000100010013&trackingCd=Cat100000100010013_Small&t_page='
            driver.get(url)

            wait = WebDriverWait(driver, timeout)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Store the product name as text, not the bs4 Tag object, so the
            # resulting column holds plain strings.
            name_tag = soup.select_one('p.prd_name')
            product_name = name_tag.get_text(strip=True) if name_tag is not None else ""

            wait.until(EC.element_to_be_clickable((By.ID, 'reviewInfo'))).click()

            # Wait until the review list has been rendered.
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.date')))

            for page_index in range(1, max_pages + 1):
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Review dates.
                review_dates = soup.select('span.date')
                # Review bodies.
                review_contents = soup.select('div.txt_inner')

                # A page may hold fewer than REVIEWS_PER_PAGE reviews, so
                # iterate over what is actually present.
                for idx, review_date in enumerate(review_dates[:self.REVIEWS_PER_PAGE]):
                    review_dictionary['id'].append(product_id)
                    review_dictionary['name'].append(product_name)
                    review_dictionary['date'].append(review_date.text)

                    # Some reviews have no text; store an empty string then.
                    # NOTE(review): dates and bodies are paired by position,
                    # so a missing body in the middle of the page would shift
                    # the later bodies - confirm against the page markup.
                    if idx < len(review_contents):
                        review_dictionary['content'].append(review_contents[idx].text)
                    else:
                        review_dictionary['content'].append("")

                # Nothing left to navigate to after the last requested page.
                if page_index == max_pages:
                    break

                next_page_locator = (By.CSS_SELECTOR, f'a[data-page-no="{page_index + 1}"]')
                try:
                    next_page_button = wait.until(EC.element_to_be_clickable(next_page_locator))
                except TimeoutException:
                    # No link to a further page: the product has fewer pages.
                    break

                next_page_button.click()
                # NOTE(review): 'span.date' from the previous page may still
                # be in the DOM at this point, so this wait can return before
                # the new page is rendered - confirm and tighten if needed.
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.date')))
        finally:
            # Close the product detail page / browser session.
            driver.quit()

        return review_dictionary

# Script entry point: build the category crawler and start the crawl.
# `crawler` is bound at module level when the file runs as a script.
if __name__ == "__main__":
    crawler = ProductReviewCrawler()
    crawler.run()