# 1990~2023년도 TOP50 크롤링

In [None]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Basic settings shared by every request and by the result table.
# The User-Agent is sent with each HTTP request issued by this script.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
)
headers = {'User-Agent': _USER_AGENT}

# Endpoint queried for the per-year chart listing.
age_url = "https://www.melon.com/chart/age/list.htm"

# Column order of the per-year result table / CSV file.
columns = [
    'chartDate',
    'rank',
    'song_id',
    'title',
    'singer',
    'lyric',
]

# Output directory for the generated CSV files; created up front if missing.
output_dir = "melon_charts"
os.makedirs(output_dir, exist_ok=True)

# Matches the quoted numeric song id inside the serialized chart-row anchors.
_SONG_ID_PATTERN = re.compile(r"\'(\d+)\'")


def _extract_title(meta):
    """Return the stripped song title of one chart row, or '' if none is found."""
    node = meta.select_one('a[href*=playSong]')
    if node is None:
        # Rows without a play link fall back to the plain title element.
        node = meta.select_one('.wrap_song_info .ellipsis')
    return node.text.strip() if node is not None else ''


def _extract_song_id(meta):
    """Return the song id found in the row's SongDetail anchors, or None."""
    anchors_html = str(meta.select('a[onclick*=SongDetail]'))
    matched = _SONG_ID_PATTERN.search(anchors_html)
    return matched.group(1) if matched else None


def _fetch_song_detail(song_id, timeout):
    """Return ``(singer, lyric)`` scraped from the song detail page.

    Falls back to 'Various Artists' / '없음' for whatever cannot be obtained,
    including when the request itself fails.
    """
    singer, lyric = 'Various Artists', '없음'
    song_url = 'https://www.melon.com/song/detail.htm?songId=' + song_id
    try:
        response = requests.get(song_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable detail page should not discard the whole year.
        print(f"song {song_id}: detail request failed ({exc}); using defaults.")
        return singer, lyric

    detail = BeautifulSoup(response.text, 'html.parser')

    # Artist names live in the anchors' title attribute; anchors without a
    # (non-empty) title are ignored instead of raising KeyError.
    names = [anchor.get('title') for anchor in detail.select('.wrap_info .artist a')]
    names = [name for name in names if name]
    if names:
        singer = ', '.join(names)

    lyric_node = detail.select_one('.section_lyric .wrap_lyric .lyric')
    if lyric_node is not None:
        lyric = lyric_node.get_text(strip=True, separator='\n')

    return singer, lyric


def fetch_song_data(year, timeout=10):
    """Scrape the yearly chart for ``year`` and save it as a CSV file.

    For every ``.lst50`` row of the chart page, the song title and id are
    read from the row, then the song detail page is fetched for the artist
    names and lyrics. The collected rows are written to
    ``<output_dir>/melon_chart_<year>.csv`` (UTF-8 with BOM).

    Args:
        year: Chart year; converted with ``str()`` for the request and the
            ``chartDate`` column.
        timeout: Seconds to wait for each HTTP request before giving up, so
            a stalled connection cannot block a worker thread forever.

    Returns:
        The ``year`` argument, unchanged.

    Raises:
        requests.RequestException: If the chart page cannot be fetched or
            answers with an HTTP error status.
    """
    chart_date = str(year)
    params = {
        'idx': '1',
        'chartType': 'YE',
        'chartGenre': 'KPOP',
        'chartDate': chart_date,
        'moved': 'Y',
    }

    chart_response = requests.get(age_url, params=params, headers=headers, timeout=timeout)
    # Fail loudly rather than silently saving an empty CSV for an error page.
    chart_response.raise_for_status()
    chart_soup = BeautifulSoup(chart_response.text, 'html.parser')

    # Rows are collected in a list and turned into a DataFrame once at the
    # end; concatenating inside the loop copies the frame on every iteration.
    rows = []
    for rank, meta in enumerate(chart_soup.select('.lst50'), 1):
        title = _extract_title(meta)

        song_id = _extract_song_id(meta)
        if song_id is None:
            print(f"[{year}] rank {rank}: song id not found, row skipped.")
            continue

        singer, lyric = _fetch_song_detail(song_id, timeout)
        rows.append([chart_date, rank, song_id, title, singer, lyric])

        sleep(0.1)  # short pause between requests to reduce the risk of an IP block

    song_data = pd.DataFrame(rows, columns=columns)

    # Save as CSV
    file_name = f"melon_chart_{year}.csv"
    file_path = os.path.join(output_dir, file_name)
    song_data.to_csv(file_path, index=False, encoding='utf-8-sig')
    print(f"{file_name} saved.")

    return year

# NOTE(review): the notebook heading mentions 1990~2023, but this range
# starts at 2000 — confirm which start year is intended.
years = list(range(2000, 2024))
max_workers = 5  # number of threads running at the same time

failed_years = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(fetch_song_data, year) for year in years]
    # Remember which year each future belongs to so failures can be attributed.
    year_of = dict(zip(futures, years))
    for future in tqdm(as_completed(futures), total=len(futures), desc="Overall progress"):
        try:
            future.result()
        except Exception as exc:
            # Top-level boundary: report the failed year and keep draining the
            # remaining futures instead of aborting the progress loop.
            failed_years.append(year_of[future])
            print(f"Year {year_of[future]} failed: {exc!r}")

if failed_years:
    print(f"Finished with failures for years: {sorted(failed_years)}")
else:
    print("All data collected and saved.")
