In [9]:
# 상품 리뷰 통계 정보 크롤링

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import requests
import re

REVIEW_REGEX_FORMAT = r'\((\d+|\d+\+?)\)'
MINIMUM_REVIEW_COUNT = 200

class ProductReviewCrawler:
    review_info_dictionary = {
            'id': [],
            'review_score': [],
            'review_count' : [],
            'skin_type1' : [],
            'skin_type2' : [],
            'skin_type3' : [],
            'skin_effect1' : [],
            'skin_effect2' : [],
            'skin_effect3' : [],
            'skin_stim1' : [],
            'skin_stim2' : [],
            'skin_stim3' : []
    }

    def __init__(self):
        self.crawler = ProductDetailReviewInfoCrawler()

    def run(self):

        # 상품 일련번호(id)
        # 사용자들의 리뷰 점수 평균과 수
        # skin type (1: 건성, 2: 복합성, 3: 지성), skin_effect(1: 보습 효과, 2: 진정 효과, 3: 주름/미백 효과), skin_stim(1: 자극 없음, 2: 보통, 3: 자극 있음)
        
        review_info_dictionary = {
            'id': [],
            'review_score': [],
            'review_count' : [],
            'skin_type1' : [],
            'skin_type2' : [],
            'skin_type3' : [],
            'skin_effect1' : [],
            'skin_effect2' : [],
            'skin_effect3' : [],
            'skin_stim1' : [],
            'skin_stim2' : [],
            'skin_stim3' : []
        }
        
        for page_idx in range(1, 15):
            response = requests.get(
                f'https://www.oliveyoung.co.kr/store/display/getMCategoryList.do?dispCatNo=100000100010013&fltDispCatNo=&prdSort=01&pageIdx={page_idx}&rowsPerPage=24&searchTypeSort=btn_list&plusButtonFlag=N&isLoginCnt=0&aShowCnt=0&bShowCnt=0&cShowCnt=0&trackingCd=Cat100000100010013_Small&amplitudePageGubun=&t_page=&t_click=&midCategory=%EC%8A%A4%ED%82%A8%2F%ED%86%A0%EB%84%88&smallCategory=%EC%A0%84%EC%B2%B4&checkBrnds=&lastChkBrnd=')
            html = response.text

            soup = BeautifulSoup(html, 'html.parser')
            product_id_list = soup.select('li[criteo-goods]')
            product_rating_list = soup.select("p.prd_point_area")

            for idx in range(len(product_id_list)):
                review_count_string = product_rating_list[idx].text

                match = re.search(REVIEW_REGEX_FORMAT, review_count_string)

                if match:
                    review_count = match.group(1)
                    
                    # 리뷰 수가 999+ 인 경우 처리
                    if review_count[-1] == "+":
                        review_count = review_count[:-1]

                    if int(review_count) >= MINIMUM_REVIEW_COUNT:
                        product_id = product_id_list[idx].attrs['criteo-goods']

                        # A000000192697001 -> 실제 product id + 001을 붙여서 저장되어 있음
                        product_id = product_id[:-3]
                        review_info_dictionary = self.crawler.run(product_id, review_info_dictionary)
                        
        review_info_df = pd.DataFrame(review_info_dictionary)
        
        return review_info_df

class ProductDetailReviewInfoCrawler:

    def run(self, product_id, review_info_dictionary):
        # 크롬 사용
        driver = webdriver.Chrome()

        url = f'https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo={product_id}&dispCatNo=100000100010013&trackingCd=Cat100000100010013_Small&t_page='
        driver.get(url)

        review_tab = driver.find_element(By.ID, 'reviewInfo')
        review_tab.click()

        wait = WebDriverWait(driver, 10)  # 10 seconds timeout

        # 대기 후 리뷰 탭이 나타날 때까지 wait
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.date')))

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        review_score = soup.select_one('p.num strong')
        review_count = soup.select_one('p.total em')
        skin_ratio_list = soup.select('em[data-value]')
        
        review_info_dictionary['id'].append(product_id)
        review_info_dictionary['review_score'].append(review_score.text)
        review_info_dictionary['review_count'].append(review_count.text)

        for idx, skin_ratio in enumerate(skin_ratio_list):
            value = int(skin_ratio['data-value'])
            
            if idx < 3:
                # 첫 번째 3개의 값은 skin_type 관련 데이터
                review_info_dictionary[f'skin_type{idx + 1}'].append(value)
                
            elif 3 <= idx < 6:
                # 다음 3개의 값은 skin_effect 관련 데이터
                review_info_dictionary[f'skin_effect{idx - 2}'].append(value)
                
            else:
                # 마지막 3개의 값은 skin_stim 관련 데이터
                review_info_dictionary[f'skin_stim{idx - 5}'].append(value)
        
        # 상품 상세페이지 종료
        driver.quit()
    
        return review_info_dictionary

if __name__ == "__main__":
    crawler = ProductReviewCrawler()
    review_info_df = crawler.run()

In [7]:
# 상품 리뷰 내용(댓글) 크롤링

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import requests
import re

REVIEW_REGEX_FORMAT = r'\((\d+|\d+\+?)\)'
MINIMUM_REVIEW_COUNT = 200

class ProductReviewCrawler:

    def __init__(self):
        self.crawler = ProductDetailReviewContentCrawler()

    def run(self):

        # 실제 댓글의 리뷰 내용을 담는다.
        review_content_dictionary = {
            'id': [],
            'name': [],
            'date': [],
            'content': []
        }
        
        for page_idx in range(1, 15):
            response = requests.get(
                f'https://www.oliveyoung.co.kr/store/display/getMCategoryList.do?dispCatNo=100000100010013&fltDispCatNo=&prdSort=01&pageIdx={page_idx}&rowsPerPage=24&searchTypeSort=btn_list&plusButtonFlag=N&isLoginCnt=0&aShowCnt=0&bShowCnt=0&cShowCnt=0&trackingCd=Cat100000100010013_Small&amplitudePageGubun=&t_page=&t_click=&midCategory=%EC%8A%A4%ED%82%A8%2F%ED%86%A0%EB%84%88&smallCategory=%EC%A0%84%EC%B2%B4&checkBrnds=&lastChkBrnd=')
            html = response.text

            soup = BeautifulSoup(html, 'html.parser')
            product_id_list = soup.select('li[criteo-goods]')
            product_rating_list = soup.select("p.prd_point_area")

            for idx in range(len(product_id_list)):
                review_count_string = product_rating_list[idx].text

                match = re.search(REVIEW_REGEX_FORMAT, review_count_string)

                if match:
                    review_count = match.group(1)

                    if review_count[-1] == "+":
                        review_count = review_count[:-1]

                    if int(review_count) >= MINIMUM_REVIEW_COUNT:
                        product_id = product_id_list[idx].attrs['criteo-goods']

                        # A000000192697001 -> 실제 product id + 001을 붙여서 저장되어 있음
                        product_id = product_id[:-3]
                        review_content_dictionary = self.crawler.run(product_id, review_content_dictionary)

        review_content_df = pd.DataFrame(review_content_dictionary)
        return review_content_df

class ProductDetailReviewContentCrawler:

    def run(self, product_id, review_content_dictionary):
        # 크롬 사용
        driver = webdriver.Chrome()

        url = f'https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo={product_id}&dispCatNo=100000100010013&trackingCd=Cat100000100010013_Small&t_page='
        driver.get(url)

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        
        product_name = soup.select_one('p.prd_name')

        review_tab = driver.find_element(By.ID, 'reviewInfo')
        review_tab.click()

        wait = WebDriverWait(driver, 10)  # 10 seconds timeout

        # 대기 후 리뷰 탭이 나타날 때까지 wait
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.date')))

        for page_index in range(1, 21):
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            # 리뷰 작성 날짜
            review_dates = soup.select('span.date')
            # 리뷰 내용
            review_contents = soup.select('div.txt_inner')

            # 필요한 데이터 저장
            for idx in range(10):
                review_content_dictionary['id'].append(product_id)
                
                if not product_name:
                    review_content_dictionary['name'].append("")
                else:
                    review_content_dictionary['name'].append(product_name.text)
                    
                review_content_dictionary['date'].append(review_dates[idx].text)
                
                # 리뷰 텍스트가 존재하지 않는 경우도 있음. 빈 string값으로 채운다.
                try:
                    review_content_dictionary['content'].append(review_contents[idx].text)
                except(IndexError):
                    review_content_dictionary['content'].append("")

            wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, f'a[data-page-no="{page_index + 1}"]')))
            next_page_button = driver.find_element(By.CSS_SELECTOR, f'a[data-page-no="{page_index + 1}"]')
            next_page_button.click()
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.date')))

        # 상품 상세페이지 종료
        driver.quit()

        return review_content_dictionary

if __name__ == "__main__":
    crawler = ProductReviewCrawler()
    review_content_df = crawler.run()

In [12]:
# 두 dataframe을 join

review_df = pd.concat([review_info_df, review_content_df],axis=1)
print(review_df)

                  id review_score review_count  skin_type1  skin_type2  \
0      A000000192697          4.8      16,418         19.0        61.0   
1      A000000192752          4.8      27,734         26.0        60.0   
2      A000000155253          4.8       7,477         23.0        62.0   
3      A000000192545          4.8       8,214         22.0        53.0   
4      A000000174370          4.8       7,217         35.0        52.0   
...              ...          ...          ...         ...         ...   
33595            NaN          NaN          NaN         NaN         NaN   
33596            NaN          NaN          NaN         NaN         NaN   
33597            NaN          NaN          NaN         NaN         NaN   
33598            NaN          NaN          NaN         NaN         NaN   
33599            NaN          NaN          NaN         NaN         NaN   

       skin_type3  skin_effect1  skin_effect2  skin_effect3  skin_stim1  \
0            20.0          20.0     

In [17]:
review_content_df.to_csv('review.csv', index=False, encoding='utf-8-sig')