# Edmunds 크롤링 코드 
- 코랩 환경에서는 크롬드라이버 경로 지정에 문제가 발생해 코드파일을 첨부합니다. 
- URL : https://www.edmunds.com/
- 크롤링 대상 : Toyota RAV4의 소비자 리뷰, 평점
- 리뷰기간 :  2010~2023년 

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
def _extract_review(review):
    """Build one review record from a single review-item element.

    Args:
        review: BeautifulSoup tag for one review item.

    Returns:
        dict with the keys "날짜" (date), "리뷰" (review text) and
        "평점" (rating). A placeholder string is stored for any field whose
        element is missing from the markup.
    """
    text_div = review.find('div', class_='truncated-text size-16')
    review_text = text_div.get_text(strip=True) if text_div else "No Review Text Found"

    # Only the first matching element is used as the date.
    date_divs = review.find_all('div', class_='small text-gray-dark mb-2')
    review_date = date_divs[0].get_text(strip=True) if date_divs else "No Date Found"

    # The rating is read from the first screen-reader-only span of the item.
    rating_span = review.find('span', class_='sr-only')
    rating = rating_span.get_text(strip=True) if rating_span else "No Rating Found"

    return {
        "날짜": review_date,
        "리뷰": review_text,
        "평점": rating,
    }


def get_reviews(year, chromedriver_path='C:/Users/USER/Desktop/chromedriver-win64/chromedriver.exe'):
    """Crawl every Edmunds consumer review of the Toyota RAV4 for one model year.

    Pages of 50 reviews are requested one after another (newest update first)
    until a page has no review list or no review items.

    Args:
        year: Model year inserted into the Edmunds URL.
        chromedriver_path: Location of the ChromeDriver executable. Defaults
            to the path originally hard-coded in this notebook.

    Returns:
        list of dicts, one per review, with the keys "날짜", "리뷰" and "평점".
    """
    options = ChromeOptions()
    # Pass the flag explicitly: the `Options.headless` setter is deprecated
    # and no longer available in newer Selenium 4 releases.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("lang=ko_KR")
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    options.add_argument(f"user-agent={user_agent}")

    driver = webdriver.Chrome(service=ChromeService(executable_path=chromedriver_path), options=options)

    all_reviews = []
    try:
        page = 1
        while True:
            url = f"https://www.edmunds.com/toyota/rav4/{year}/consumer-reviews/?pagenum={page}&pagesize=50&sorting=%7B%22updated%22%3A%22DESC%22%7D"
            driver.get(url)

            time.sleep(5)  # fixed wait for the page to finish loading

            soup = BeautifulSoup(driver.page_source, "html.parser")

            reviews_list = soup.find('div', attrs={'aria-label': 'Consumer Reviews', 'class': 'reviews-list'})
            if not reviews_list:
                print("No reviews list found.")
                break

            reviews = reviews_list.find_all('div', class_='review-item text-gray-darker')
            if not reviews:
                print("No reviews found on page", page)
                break

            all_reviews.extend(_extract_review(review) for review in reviews)
            page += 1
    finally:
        # Always shut the browser down, even when a page load or parse fails,
        # so no Chrome/chromedriver process is left running.
        driver.quit()

    return all_reviews

In [None]:
def process_reviews(year):
    """Fetch one model year's reviews and return them as a cleaned DataFrame.

    The review text is cut at the first occurrence of 'Safety' (that word and
    everything after it are dropped) and then stripped of surrounding
    whitespace. If the '리뷰' column is absent, an error message is printed
    and the frame is returned unchanged.

    Args:
        year: Model year passed through to `get_reviews`.

    Returns:
        pandas.DataFrame built from the crawled review records.
    """
    frame = pd.DataFrame(get_reviews(year))

    # Guard clause: without the review column there is nothing to clean.
    if '리뷰' not in frame.columns:
        print("Error: '리뷰' column not found")
        return frame

    before_safety = frame['리뷰'].str.split('Safety').str[0]
    frame['리뷰'] = before_safety.str.strip()
    return frame

In [2]:
# The crawl is split into two year ranges because of data-volume limits.
years = range(2015, 2019)

# Collect each year's frame and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on every pass.
year_frames = [process_reviews(year) for year in years]
combined_df = pd.concat(year_frames, ignore_index=True)

No reviews list found.
No reviews list found.
No reviews list found.
No reviews list found.


In [4]:
# Write the combined reviews to an Excel workbook without the index column.
# NOTE(review): the cell execution counts suggest this export ran after the
# 2019-2023 cell had rebound `combined_df` - confirm the file's contents.
output_path = "combined_reviews_2018.xlsx"
combined_df.to_excel(output_path, index=False)

In [3]:
# Second half of the crawl: model years 2019 through 2023.
years = range(2019, 2024)

# Collect each year's frame and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on every pass.
year_frames = [process_reviews(year) for year in years]
combined_df = pd.concat(year_frames, ignore_index=True)

No reviews list found.
No reviews list found.
No reviews list found.
No reviews list found.
No reviews list found.


In [5]:
# Write the combined reviews to an Excel workbook without the index column.
output_path = "combined_reviews_2023.xlsx"
combined_df.to_excel(output_path, index=False)