In [6]:
# 무신사 허락 user-agents
USER_AGENTS = [
    "Googlebot/2.1 (+http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; Yeti/1.1; +http://naver.me/bot)",
    "Daumoa/5.0 (+http://partner.daum.net)",
    "Mozilla/5.0 (compatible; DaumWebCrawler; +http://partner.daum.net)",
    "Mozilla/5.0 (compatible; FacebookBot/1.0; +http://www.facebook.com/externalhit_uatext.php)",
    "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"
]

In [10]:
import aiohttp
import asyncio
import random
from bs4 import BeautifulSoup
import csv
import nest_asyncio
from tqdm.asyncio import tqdm
import pandas as pd

nest_asyncio.apply()

# 입력 데이터 설정
years = [2023, 2024]
max_pages = float('inf')  # 페이지 수에 제한 없음



async def fetch(session, url, params, headers):
    async with session.get(url, params=params, headers=headers) as response:
        return await response.text()

async def get_total_review_count(session, year, hash_id):
    base_url = "https://www.musinsa.com/goods/reviews/lists"
    params = {
        'searchYear': year,
        'searchMonth': 0,
        'type': '',
        'page': 1,
        'shop_domain': 'https://www.musinsa.com',
        'hashId': hash_id
    }
    headers = {
        'User-Agent': random.choice(USER_AGENTS)
    }
    response_text = await fetch(session, base_url, params, headers)
    soup = BeautifulSoup(response_text, 'html.parser')
    count_element = soup.select_one("body > main > form > div.wrap-estimate-list > div.n-list-util > div.count > strong")
    
    if not count_element:
        print(f"No review count element found for {year}.")
        return 0
    
    count_text = count_element.text.strip().replace(',', '')
    
    try:
        total_count = int(count_text)
    except ValueError:
        print(f"Failed to convert review count to an integer for {year}.")
        return 0
    
    return total_count

async def get_reviews_page(session, year, hash_id, page):
    base_url = "https://www.musinsa.com/goods/reviews/lists"
    params = {
        'searchYear': year,
        'searchMonth': 0,
        'type': '',
        'page': page,
        'shop_domain': 'https://www.musinsa.com',
        'hashId': hash_id
    }
    headers = {
        'User-Agent': random.choice(USER_AGENTS)
    }
    response_text = await fetch(session, base_url, params, headers)
    soup = BeautifulSoup(response_text, 'html.parser')
    review_list = soup.select("#reviewContentWrap > div")

    reviews = []
    for review in review_list:
        # Parsing logic for each review
        user_id = review.select_one('div.review-profile > div > div.review-profile__text > p.review-profile__name').text.strip() if review.select_one('div.review-profile > div > div.review-profile__text > p.review-profile__name') else ''
        user_info = review.select_one('div.review-profile > div > div.review-profile__information.review-profile__information--brandshop > p > span').text.strip() if review.select_one('div.review-profile > div > div.review-profile__information.review-profile__information--brandshop > p > span') else ''
        review_date = review.select_one('p.review-profile__date').text.strip() if review.select_one('p.review-profile__date') else ''
        product_id = ''
        product_link = review.select_one('a.review-goods-information__name')['href'] if review.select_one('a.review-goods-information__name') else ''
        if product_link:
            product_id = product_link.split('/')[-2]
        product_brand = review.select_one('a.review-goods-information__brand').text.strip() if review.select_one('a.review-goods-information__brand') else ''
        product_name = review.select_one('a.review-goods-information__name').text.strip() if review.select_one('a.review-goods-information__name') else ''
        product_option = review.select_one('div.review-goods-information__item > p > span').text.strip() if review.select_one('div.review-goods-information__item > p > span') else ''

        review_text = review.select_one('div.review-contents__text').text.strip() if review.select_one('div.review-contents__text') else ''
        star_rating_elem = review.select_one('div.review-list__rating-wrap > span > span > span.review-list__rating__active')
        star_rating = int(star_rating_elem['style'].split(': ')[1].replace('%', '')) // 20 if star_rating_elem else 0

        evaluate_tags = {
            '사이즈': '',
            '밝기': '',
            '색감': '',
            '두께감': ''
        }
        evaluation_elements = review.select('div.review-contents > div.review-evaluation--type2 > ul > li')
        for elem in evaluation_elements:
            key = elem.text.split(' ')[0].strip()
            value = elem.select_one('span').text.strip() if elem.select_one('span') else ''
            if key in evaluate_tags:
                evaluate_tags[key] = value

        helpful_count = review.select_one('div.review-evaluation-button--type3 > ul > li:nth-child(1) > label > span').text.strip() if review.select_one('div.review-evaluation-button--type3 > ul > li:nth-child(1) > label > span') else ''
        nicestyle_count = review.select_one('div.review-evaluation-button--type3 > ul > li:nth-child(2) > label > span').text.strip() if review.select_one('div.review-evaluation-button--type3 > ul > li:nth-child(2) > label > span') else ''

        review_data = {
            'user_id': user_id,
            'user_info': user_info,
            'review_date': review_date,
            'product_id': product_id,
            'product_brand': product_brand,
            'product_name': product_name,
            'product_option': product_option,
            'review_text': review_text,
            'star_rating': star_rating,
            'helpful_count': helpful_count,
            'nicestyle_count': nicestyle_count,
            'hash_id': hash_id  # Add hash_id to each review
        }
        review_data.update(evaluate_tags)
        reviews.append(review_data)

    return reviews

async def get_reviews_year(session, year, hash_id, max_pages):
    total_count = await get_total_review_count(session, year, hash_id)
    pages_to_crawl = min((total_count // 20) + 1, max_pages)

    tasks = [get_reviews_page(session, year, hash_id, page) for page in range(1, pages_to_crawl + 1)]
    results = []

    for task in tqdm(asyncio.as_completed(tasks), total=pages_to_crawl, desc=f"{year} 크롤링 진행 상황"):
        result = await task
        results.append(result)

    reviews = [review for result in results for review in result]
    return reviews

def remove_duplicates(reviews):
    unique_reviews = []
    seen = set()

    for review in reviews:
        identifier = (review['review_date'], review['product_id'])

        if identifier not in seen:
            unique_reviews.append(review)
            seen.add(identifier)

    return unique_reviews

async def crawl_reviews_for_all_hash_ids(hash_ids, years, max_pages):
    async with aiohttp.ClientSession() as session:
        all_reviews = []

        for hash_id in hash_ids:
            for year in years:
                try:
                    reviews = await get_reviews_year(session, year, hash_id, max_pages)
                    all_reviews.extend(reviews)
                except Exception as e:
                    print(f"Error on {year} for hash_id {hash_id}: {e}")

        unique_reviews = remove_duplicates(all_reviews)
        return unique_reviews

def read_hash_ids_from_csv(file_path):
    df = pd.read_csv(file_path, encoding='utf-8')
    return df['hash_id'].tolist()

async def main(file_path, years, max_pages):
    hash_ids = read_hash_ids_from_csv(file_path)
    all_reviews = await crawl_reviews_for_all_hash_ids(hash_ids, years, max_pages)

    with open('review_test.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['user_id', 'user_info', 'hash_id', 'review_date', 'product_id', 'product_brand', 'product_name', 'product_option', 'review_text', 'star_rating', 'helpful_count', 'nicestyle_count', '사이즈', '밝기', '색감', '두께감']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_reviews)

    print(f"크롤링 완료! 총 {len(all_reviews)}개의 리뷰를 저장했습니다.")
#################################################
# CSV 파일 경로와 실행 설정
csv_file_path = 'data.csv'  # hash_id가 포함된 CSV 파일 경로를 여기에 입력하세요
asyncio.run(main(csv_file_path, years, max_pages))
##################################################

2023 크롤링 진행 상황: 100%|██████████| 28/28 [00:03<00:00,  8.14it/s]
2024 크롤링 진행 상황: 100%|██████████| 15/15 [00:01<00:00,  8.15it/s]
2023 크롤링 진행 상황: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
2024 크롤링 진행 상황: 100%|██████████| 23/23 [00:02<00:00,  9.81it/s]
2023 크롤링 진행 상황: 100%|██████████| 22/22 [00:02<00:00,  8.87it/s]
2024 크롤링 진행 상황: 100%|██████████| 19/19 [00:02<00:00,  8.92it/s]
2023 크롤링 진행 상황: 100%|██████████| 15/15 [00:01<00:00,  8.27it/s]
2024 크롤링 진행 상황: 100%|██████████| 23/23 [00:02<00:00,  8.40it/s]
2023 크롤링 진행 상황: 100%|██████████| 2/2 [00:00<00:00,  4.40it/s]
2024 크롤링 진행 상황: 100%|██████████| 24/24 [00:02<00:00,  9.27it/s]
2023 크롤링 진행 상황: 100%|██████████| 23/23 [00:02<00:00, 10.33it/s]
2024 크롤링 진행 상황: 100%|██████████| 15/15 [00:01<00:00,  9.80it/s]
2023 크롤링 진행 상황: 100%|██████████| 41/41 [00:04<00:00, 10.23it/s]
2024 크롤링 진행 상황: 100%|██████████| 10/10 [00:01<00:00,  7.04it/s]
2023 크롤링 진행 상황: 100%|██████████| 14/14 [00:01<00:00,  9.47it/s]
2024 크롤링 진행 상황: 100%|██████████| 23/23 [00:0

크롤링 완료! 총 38905개의 리뷰를 저장했습니다.
