In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to scrape data for a specific year
def scrape_year_data(year, timeout=10):
    """Scrape one year of the Melon yearly K-pop chart, including lyrics.

    Fetches the chart page for ``year`` and then the detail page of every
    listed song to read its lyrics.

    Args:
        year: Chart year, interpolated into the ``chartDate`` query parameter.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of ``[song_id, title, artist, lyrics]`` lists, one per chart
        row. ``lyrics`` is ``'No lyrics found'`` when the detail page has no
        ``div.lyric`` element or could not be fetched.

    Raises:
        requests.RequestException: If the chart page itself cannot be
            fetched or answers with an HTTP error status.
    """
    url = f'https://www.melon.com/chart/age/index.htm?chartType=YE&chartGenre=KPOP&chartDate={year}'
    headers = {'User-Agent': 'Mozilla/5.0'}

    def node_text(row, selector, fallback_selector):
        # Text of the preferred node; if the row lacks it, fall back to the
        # enclosing container's text instead of failing on ``None.text``.
        node = row.select_one(selector)
        if node is not None:
            return node.text
        container = row.select_one(fallback_selector)
        return container.get_text(strip=True) if container is not None else ''

    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly rather than return an empty chart for an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    songs_data = []

    # Every chart row carries its song id in the data-song-no attribute.
    for song in soup.select('tr[data-song-no]'):
        song_id = song['data-song-no']
        title = node_text(song, 'div.ellipsis.rank01 > span > a', 'div.ellipsis.rank01')
        artist = node_text(song, 'div.ellipsis.rank02 > span', 'div.ellipsis.rank02')
        lyrics_url = f'https://www.melon.com/song/detail.htm?songId={song_id}'

        try:
            lyrics_response = requests.get(lyrics_url, headers=headers, timeout=timeout)
            lyrics_response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable detail page should not discard the whole year.
            print(f'Could not fetch lyrics for song {song_id}: {exc}')
            lyrics = 'No lyrics found'
        else:
            lyrics_soup = BeautifulSoup(lyrics_response.text, 'html.parser')
            lyric_node = lyrics_soup.select_one('div.lyric')
            lyrics = lyric_node.text.strip() if lyric_node is not None else 'No lyrics found'

        songs_data.append([song_id, title, artist, lyrics])

    return songs_data

# Walk the chart years in descending order, from 2023 down to 1990
CSV_COLUMNS = ['Song ID', 'Title', 'Artist', 'Lyrics']

for year in reversed(range(1990, 2024)):
    filename = f'melon_songs_{year}.csv'

    print(f'Scraping data for the year {year}...')
    year_data = scrape_year_data(year)

    # Each year gets its own CSV file
    df = pd.DataFrame(year_data, columns=CSV_COLUMNS)
    df.to_csv(filename, encoding='utf-8-sig', index=False)
    print(f'Data for the year {year} saved to {filename}')

    # Pause between years to avoid being blocked
    time.sleep(5)

print('Data scraping completed for all years.')


Scraping data for the year 2023...
Data for the year 2023 saved to melon_songs_2023.csv
Scraping data for the year 2022...
Data for the year 2022 saved to melon_songs_2022.csv
Scraping data for the year 2021...
Data for the year 2021 saved to melon_songs_2021.csv


KeyboardInterrupt: 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures

# Melon TOP 100 chart page URL
url = 'https://www.melon.com/chart/age/index.htm?chartType=YE&chartGenre=KPOP&chartDate=2023'

# Request the web page; the timeout keeps a stalled connection from hanging
# the cell forever, and an HTTP error status is raised instead of being parsed.
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

# Create the BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')

# Collected song IDs, titles and artists (parallel lists, same order)
song_ids = []
song_titles = []
song_artists = []


def _row_link_text(row, div_class):
    """Return the text of the first link in the row's ``div_class`` div.

    Falls back to the div's own stripped text when it holds no link, and to
    an empty string when the div is missing, so a single unusual row cannot
    abort the scrape with ``AttributeError``.
    """
    container = row.find('div', class_=div_class)
    if container is None:
        return ''
    link = container.find('a')
    if link is not None:
        return link.text
    return container.get_text(strip=True)


# Select every row that carries a data-song-no attribute instead of only rows
# of class 'lst50', so rows using another row class are not dropped.
# NOTE(review): presumably ranks 51-100 use a different row class; confirm
# against the live page.
for song_tag in soup.select('tr[data-song-no]'):
    song_id = song_tag.get('data-song-no')
    if song_id:
        song_ids.append(song_id)
        song_titles.append(_row_link_text(song_tag, 'rank01'))
        song_artists.append(_row_link_text(song_tag, 'rank02'))

# Lyrics extraction function
def get_lyrics(song_id, timeout=10):
    """Fetch the lyrics text for one song from its Melon detail page.

    Uses the module-level ``headers`` for the request.

    Args:
        song_id: Song identifier, interpolated into the ``songId`` query
            parameter of the detail-page URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The stripped text of the page's ``div`` with class ``lyric``, or an
        empty string when that element is absent or the request fails.
    """
    song_url = f'https://www.melon.com/song/detail.htm?songId={song_id}'
    try:
        # Without a timeout one stalled connection would block a worker forever.
        song_response = requests.get(song_url, headers=headers, timeout=timeout)
        song_response.raise_for_status()
    except requests.RequestException as exc:
        # Report and continue: one failed page should not discard every other
        # song's result when this runs under executor.map.
        print(f'Could not fetch lyrics for song {song_id}: {exc}')
        return ""

    song_soup = BeautifulSoup(song_response.text, 'html.parser')
    lyrics_tag = song_soup.find('div', class_='lyric')
    if lyrics_tag is None:
        return ""
    return lyrics_tag.text.strip()

# Worker used for parallel lyrics extraction
def fetch_lyrics(song_id):
    """Return a ``(song_id, lyrics)`` pair, with lyrics from ``get_lyrics``."""
    lyrics = get_lyrics(song_id)
    return (song_id, lyrics)

# Use a ThreadPoolExecutor to fetch the lyrics in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = [*executor.map(fetch_lyrics, song_ids)]

# Index the fetched lyrics by song ID, then line them up with song_ids
lyrics_dict = {fetched_id: text for fetched_id, text in results}
lyrics_list = [lyrics_dict[key] for key in song_ids]

# Assemble the columns and build the DataFrame
data = dict(zip(
    ['Song ID', 'Title', 'Artist', 'Lyrics'],
    [song_ids, song_titles, song_artists, lyrics_list],
))
df = pd.DataFrame(data)

# Save as a CSV file
df.to_csv('melon_top_100_with_lyrics.csv', encoding='utf-8-sig', index=False)

print("CSV 파일이 저장되었습니다.")
