## 라이브러리

In [1]:
import time
# Selenium 모듈에서 webdriver를 가져옵니다.
from selenium import webdriver 
# Chrome 브라우저 옵션을 다루는 Options 클래스를 가져옵니다.
from selenium.webdriver.chrome.options import Options
# Chrome 드라이버 서비스를 다루는 Service 클래스를 가져옵니다.
from selenium.webdriver.chrome.service import Service
# Chrome 드라이버를 다운로드하고 설치하는 데 사용되는 ChromeDriverManager를 가져옵니다.
from webdriver_manager.chrome import ChromeDriverManager

 
# Options 클래스의 인스턴스를 생성합니다.
options = Options()
 
# Chrome 브라우저 창이 즉시 닫히는 것을 방지합니다.
options.add_experimental_option('detach', True)
 
# 불필요한 콘솔 메시지를 제거합니다.
options.add_experimental_option('excludeSwitches', ['enable-logging'])
 
# Chrome 드라이버 서비스를 생성합니다.
service = Service(ChromeDriverManager().install())

## 네이버 api

In [2]:
from PyNaver import Naver

from dotenv import load_dotenv
import os
import random

load_dotenv()

# 애플리케이션 인증 정보
client_id = os.getenv("client_id")
client_secret = os.getenv("client_secret")

# 네이버 API 인스턴스 생성
api = Naver(client_id, client_secret)

## Shop api 추출

In [3]:
from bs4 import BeautifulSoup
import requests

def getdata_naverapi(url, keyword, params):
    headers = {'X-Naver-Client-Id': client_id,
               'X-Naver-Client-Secret': client_secret}

    r = requests.get(url, params=params, headers=headers)
    rescode = r.status_code
    
    if rescode == 200:
        j = r.json()
        # for i, item in enumerate(j['items'][:1], start=1):
        #     print(i, item)
        return j
    else:
        print("Error Code:", rescode)



url = 'https://openapi.naver.com/v1/search/shop.json' # api 주소
keyword = '정관장' # 키워드
# s = 1, d = 100, 100 
# s = 101, d = 100, 201
# s = 201, d = 100, 301
# ~~ 
# s = 901, d = 100, 1001
# s = 1000, d = 100, 1100
total_params = []
for s in range(1,1001,100):
    d = 100
    params = {'query': keyword,
              'display': d,
              'start': s,
              'sort': 'sim'} #파라미터
    total_params.append(params) 

d = 100
params = {'query': keyword,
              'display': d,
              'start': 1000,
              'sort': 'sim'} #파라미터
total_params.append(params) 


# title link image Iprice hprice mallName 
# productId productType brand maker category1~4
data = []
for params in total_params:
    result = getdata_naverapi(url, keyword, params)
    if result:
        for i, item in enumerate(result['items'], start=1):
            name = item['title']
            lprice = item['lprice']
            hprice = item['hprice']
            link = item['link']
            mallName = item['mallName']
            maker = item['maker']
            brand = item['brand']
            category1 = item['category1']
            category2 = item['category2']
            category3 = item['category3']
            category4 = item['category4']
            product_naver_id = item['productId']
            
            soup = BeautifulSoup(name, 'html.parser')
            cl_name = soup.get_text()
            dic = {
                'name': cl_name,
                'lprice': lprice,
                'hprice': hprice,
                'link': link,
                'mallName': mallName,
                'maker': maker,
                'brand': brand,
                'category1': category1,
                'category2': category2,
                'category3': category3,
                'category4': category4,
                'productId': product_naver_id


            }
           
            data.append(dic)



  soup = BeautifulSoup(name, 'html.parser')


In [4]:
data

[{'name': '정관장 홍삼정 에브리타임 밸런스 10ml x 30개입',
  'lprice': '17960',
  'hprice': '',
  'link': 'https://search.shopping.naver.com/gate.nhn?id=11259655619',
  'mallName': '네이버',
  'maker': '한국인삼공사',
  'brand': '정관장',
  'category1': '식품',
  'category2': '건강식품',
  'category3': '홍삼',
  'category4': '홍삼액',
  'productId': '11259655619'},
 {'name': '정관장 홍삼정 240g',
  'lprice': '150240',
  'hprice': '',
  'link': 'https://search.shopping.naver.com/gate.nhn?id=5944644989',
  'mallName': '네이버',
  'maker': '한국인삼공사',
  'brand': '정관장',
  'category1': '식품',
  'category2': '건강식품',
  'category3': '홍삼',
  'category4': '홍삼액',
  'productId': '5944644989'},
 {'name': '정관장 활기력 세트 20ml x 16개입',
  'lprice': '23800',
  'hprice': '',
  'link': 'https://search.shopping.naver.com/gate.nhn?id=5948365403',
  'mallName': '네이버',
  'maker': '한국인삼공사',
  'brand': '정관장',
  'category1': '식품',
  'category2': '건강식품',
  'category3': '홍삼',
  'category4': '홍삼액',
  'productId': '5948365403'},
 {'name': '정관장 홍삼정 에브리타임 밸런스 홍삼스틱 애브리타임 

## 링크만

In [6]:
links = [item['link'] for item in data]
links

['https://search.shopping.naver.com/gate.nhn?id=11259655619',
 'https://search.shopping.naver.com/gate.nhn?id=5944644989',
 'https://search.shopping.naver.com/gate.nhn?id=5948365403',
 'https://search.shopping.naver.com/gate.nhn?id=82064672715',
 'https://search.shopping.naver.com/gate.nhn?id=9255057462',
 'https://search.shopping.naver.com/gate.nhn?id=9689425728',
 'https://search.shopping.naver.com/gate.nhn?id=6663522597',
 'https://search.shopping.naver.com/gate.nhn?id=12022041834',
 'https://search.shopping.naver.com/gate.nhn?id=22188734073',
 'https://search.shopping.naver.com/gate.nhn?id=5770614300',
 'https://search.shopping.naver.com/gate.nhn?id=17789393164',
 'https://search.shopping.naver.com/gate.nhn?id=22206111184',
 'https://search.shopping.naver.com/gate.nhn?id=14758481330',
 'https://search.shopping.naver.com/gate.nhn?id=5594729682',
 'https://search.shopping.naver.com/gate.nhn?id=14240053390',
 'https://search.shopping.naver.com/gate.nhn?id=5769800164',
 'https://search

In [7]:
# 테스트용 링크 데이터 샘플 3개
test_link = links[:2].copy()

## 링크 데이터 추출

In [8]:
from selenium.webdriver.common.by import By
import re

# 리뷰 데이터 추출
def score_extract(element):
    return int(re.findall(r'\d+', element.text)[0])

def idx_extract(infos):
    try:
        try:
            if infos.text[-1] == '*':
                idx = True
            else:
                idx = False
        except:
            idx = False
        return idx
    except:
        # print(infos.text)
        return 'error'

def text_extract(infos):
    return infos.text

def extract_review_data(reviews):
    # 평점
    scores = reviews.find_elements(By.CLASS_NAME, "reviewItems_average__0kLWX")
    scores = list(map(score_extract, scores))
    
    # 리뷰에서 스토어, ID, DATE 추출
    infos = reviews.find_elements(By.CLASS_NAME, "reviewItems_etc__9ej69")

    # print(infos)
    id_idxs = list(map(idx_extract, infos))
    infos  = list(map(text_extract, infos))

    ids = []
    dates = []
    
    for id, b in enumerate(id_idxs):
        if b == True:
            ids.append(infos[id])
            dates.append(infos[id+1])
    
    # 리뷰에서 데이터 추출
    datas = reviews.find_elements(By.CLASS_NAME, "reviewItems_text__XrSSf")
    datas  = list(map(text_extract, datas))
    return list(zip(ids, scores, dates, datas))

# 활성 페이지 찾기
def search_index(a_list):
    for idx, a in enumerate(a_list):
        try:
            now = [True, a.find_element(By.TAG_NAME, 'span')]
            return idx
        except:
            now = [False, []]

In [10]:
# 링크입력
driver = webdriver.Chrome(service=service, options=options)
review_ls_list = []
for l in test_link:
    driver.get(l)
    time.sleep(random.uniform(2,4))

    # 사러가기 클릭
    # xpath = '//*[@id="wrap"]/div[2]/a[2]'
    # element = driver.find_element(By.XPATH, xpath)
    # category_1 = {idx:name for idx, name in enumerate(element.text.split('\n'), start=1)}
    # category_1

    xpath = '//*[@id="wrap"]/div[2]/a[2]'
    element = driver.find_element(By.XPATH, xpath)
    element.click()

    while True:
        # 리뷰 엘리먼트 탐색
        time.sleep(random.uniform(0,4,0.7))
        # start_time = time.time() # 시작
        xpath = '//*[@id="section_review"]/ul'
        reviews = driver.find_element(By.XPATH, xpath)
        
        review_ls_list += extract_review_data(reviews)

        # 페이지 넘기기
        xpath = '//*[@id="section_review"]/div[3]'
        page_views = driver.find_element(By.XPATH, xpath)
        a_list = page_views.find_elements(By.TAG_NAME, 'a')

        idx = search_index(a_list)
        page_num = a_list[idx].text.split('\n')[1]
        # print(f"{page_num}. {time.time()-start_time:.4f} sec") # 종료와 함께 수행시간 출력
        # print(f"{page_num} {time.time()-start_time:.4f} sec") # 종료와 함께 수행시간 출력
        print('Page', page_num)
        if int(page_num) == 100:
            break
        if (len(a_list)-1) != idx:
            # print(a_list[idx].text.split('\n')[1])
            a_list[idx+1].click()
        else:
            print('end')
            break

driver.close()

1. 0.9158 sec
2. 0.8449 sec
3. 0.7898 sec
4. 0.8058 sec
5. 0.8209 sec
6. 0.8172 sec
7. 0.8507 sec
8. 0.8203 sec
9. 0.8461 sec
10. 0.8422 sec
11. 0.8082 sec
12. 0.8221 sec
13. 0.7802 sec
14. 0.8122 sec
15. 0.8105 sec
16. 0.8404 sec
17. 0.8289 sec
18. 0.8224 sec
19. 0.8506 sec
20. 0.8509 sec
21. 0.8864 sec
22. 0.9294 sec
23. 0.9254 sec
24. 0.8037 sec
25. 0.8388 sec
26. 0.8105 sec
27. 0.8578 sec
28. 0.8516 sec
29. 1.0031 sec
30. 0.9290 sec
31. 0.8103 sec
32. 0.8110 sec
33. 0.8352 sec
34. 0.8216 sec
35. 0.8208 sec
36. 0.8123 sec
37. 0.9516 sec
38. 0.9463 sec
39. 0.9668 sec
40. 0.8642 sec
41. 0.8248 sec
42. 0.8188 sec
43. 0.8708 sec
44. 0.7934 sec
45. 0.8072 sec
46. 0.8144 sec
47. 0.8225 sec
48. 0.9190 sec
49. 0.8215 sec
50. 0.8712 sec
51. 0.8288 sec
52. 0.8109 sec
53. 0.8239 sec
54. 0.8240 sec
55. 0.8126 sec
56. 0.8512 sec
57. 0.8233 sec
58. 0.8224 sec
59. 0.8454 sec
60. 0.9059 sec
61. 0.7870 sec
62. 0.8074 sec
63. 0.8186 sec
64. 0.8567 sec
65. 0.8366 sec
66. 0.8289 sec
67. 0.8434 sec
68. 

StaleElementReferenceException: Message: stale element reference: stale element not found
  (Session info: chrome=113.0.5672.92)
Stacktrace:
0   chromedriver                        0x00000001009338ac chromedriver + 4257964
1   chromedriver                        0x000000010092bf40 chromedriver + 4226880
2   chromedriver                        0x00000001005689d4 chromedriver + 281044
3   chromedriver                        0x000000010056d200 chromedriver + 299520
4   chromedriver                        0x000000010056eb34 chromedriver + 305972
5   chromedriver                        0x000000010056ec24 chromedriver + 306212
6   chromedriver                        0x000000010059e0ec chromedriver + 499948
7   chromedriver                        0x00000001005991cc chromedriver + 479692
8   chromedriver                        0x00000001005da7e4 chromedriver + 747492
9   chromedriver                        0x000000010059798c chromedriver + 473484
10  chromedriver                        0x000000010059898c chromedriver + 477580
11  chromedriver                        0x00000001008f2900 chromedriver + 3991808
12  chromedriver                        0x00000001008f6354 chromedriver + 4006740
13  chromedriver                        0x00000001008f6940 chromedriver + 4008256
14  chromedriver                        0x00000001008fc33c chromedriver + 4031292
15  chromedriver                        0x00000001008f6f34 chromedriver + 4009780
16  chromedriver                        0x00000001008cf490 chromedriver + 3847312
17  chromedriver                        0x00000001009149f4 chromedriver + 4131316
18  chromedriver                        0x0000000100914b4c chromedriver + 4131660
19  chromedriver                        0x0000000100925230 chromedriver + 4198960
20  libsystem_pthread.dylib             0x000000018b603fa8 _pthread_start + 148
21  libsystem_pthread.dylib             0x000000018b5feda0 thread_start + 8


In [43]:
len(review_ls_list)

4020

In [64]:
#중복값 제거
unique_reviews = list(set(map(tuple, review_ls_list)))
len(unique_reviews)

3987

In [63]:
import pandas as pd

df = pd.DataFrame(unique_reviews)
df.to_excel('리스트.xlsx')