In [6]:
# ✅ 크롬 & 드라이버 자동 설치
!apt-get update > /dev/null
!apt install -y wget unzip > /dev/null
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb || apt -fy install > /dev/null

import re, json, urllib.request

# 크롬 버전 확인
chrome_version = !google-chrome --version
chrome_version = re.search(r'(\d+\.\d+\.\d+)', chrome_version[0]).group(1)

# 드라이버 다운로드 URL 추출
metadata_url = 'https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json'
with urllib.request.urlopen(metadata_url) as url:
    data = json.load(url)
    matched = next(item for item in data['channels'].values() if chrome_version in item['version'])
    driver_url = next(dl['url'] for dl in matched['downloads']['chromedriver'] if dl['platform'] == 'linux64')

# 다운로드 및 압축 해제
!wget -q "$driver_url" -O chromedriver_linux64.zip
!unzip -o chromedriver_linux64.zip > /dev/null
!mv chromedriver-linux64/chromedriver /usr/bin/chromedriver
!chmod +x /usr/bin/chromedriver


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


(Reading database ... 126464 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (138.0.7204.49-1) over (138.0.7204.49-1) ...
Setting up google-chrome-stable (138.0.7204.49-1) ...
Processing triggers for mailcap (3.70+nmu1ubuntu1) ...
Processing triggers for man-db (2.10.2-1) ...


In [7]:
!pip install selenium > /dev/null
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import re
import pandas as pd
import time

# Chrome configuration (usable in a headless environment)
chrome_options = Options()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "user-agent=Mozilla/5.0",
):
    chrome_options.add_argument(chrome_flag)
chrome_options.binary_location = "/usr/bin/google-chrome"

# Start the driver from the binary installed at /usr/bin/chromedriver
service = Service("/usr/bin/chromedriver")
driver = webdriver.Chrome(options=chrome_options, service=service)


# Melon yearly chart URL
# NOTE(review): the URL requests chartDate=2000 while the CSV written below is
# named "..._1994". Confirm which year is intended; the file name is left
# unchanged here because a later cell reads it.
url = "https://www.melon.com/chart/age/index.htm?chartType=YE&chartGenre=KPOP&chartDate=2000"

# Always release the browser, even if loading the page raises.
try:
    driver.get(url)
    time.sleep(5)  # wait for JS rendering

    # Grab the full page source as a string
    html = driver.page_source
finally:
    driver.quit()

# ✅ Find every goAlbumDetail('123456') occurrence with a regular expression
album_id_matches = re.findall(r"goAlbumDetail\('(\d+)'\)", html)

# De-duplicate while keeping first-seen order, so rows follow the page order
# instead of the arbitrary order a set would give.
album_ids = list(dict.fromkeys(album_id_matches))

print("페이지 내 albumDetail 함수 개수:", len(album_id_matches))

# ✅ Build the album detail URLs
album_urls = [f"https://www.melon.com/album/detail.htm?albumId={aid}" for aid in album_ids]

# ✅ Show and save the result
df = pd.DataFrame({
    'album_id': album_ids,
    'album_url': album_urls
})
print(df)
df.to_csv("melon_album_urls_1994.csv", index=False)

페이지 내 albumDetail 함수 개수: 200
   album_id                                          album_url
0      4556  https://www.melon.com/album/detail.htm?albumId...
1      4830  https://www.melon.com/album/detail.htm?albumId...
2       457  https://www.melon.com/album/detail.htm?albumId...
3      6543  https://www.melon.com/album/detail.htm?albumId...
4      4986  https://www.melon.com/album/detail.htm?albumId...
..      ...                                                ...
79     7874  https://www.melon.com/album/detail.htm?albumId...
80      709  https://www.melon.com/album/detail.htm?albumId...
81     3260  https://www.melon.com/album/detail.htm?albumId...
82      116  https://www.melon.com/album/detail.htm?albumId...
83     4397  https://www.melon.com/album/detail.htm?albumId...

[84 rows x 2 columns]


In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time

# ▶ 1. Load the album URL list
# Read relative to the working directory: the cell that produces this file
# writes it with a relative path, so an absolute "/content/..." path only works
# when the working directory happens to be /content.
album_df = pd.read_csv("melon_album_urls_1994.csv")
album_urls = album_df['album_url'].tolist()

# ▶ 2. Chrome driver setup
chrome_options = Options()
chrome_options.binary_location = "/usr/bin/google-chrome"
chrome_arguments = [
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "user-agent=Mozilla/5.0",
]
for chrome_argument in chrome_arguments:
    chrome_options.add_argument(chrome_argument)

service = Service("/usr/bin/chromedriver")
driver = webdriver.Chrome(options=chrome_options, service=service)

# ▶ 3. Collect the lyrics of every track on every album
import re  # needed for the song-id pattern; not in this cell's import list

result = []

# try/finally so the browser is released even if the run is interrupted.
try:
    for album_url in album_urls:
        try:
            driver.get(album_url)
            time.sleep(2)

            # Album title and artist name
            album_title = driver.find_element(By.CLASS_NAME, "song_name").text.replace("앨범명", "").strip()
            artist_name = driver.find_element(By.CLASS_NAME, "artist").text.strip()

            # Read every song id from the album page BEFORE navigating away.
            # Elements located on this page become stale once the driver loads
            # a song page, so iterating row elements while navigating fails
            # after the first track. This uses the same goSongDetail('<id>')
            # pattern as the single-album cell; ids are de-duplicated in
            # first-seen order in case an id appears more than once.
            song_ids = list(dict.fromkeys(
                re.findall(r"goSongDetail\('(\d+)'\)", driver.page_source)
            ))
        except Exception as e:
            print(f"❌ 앨범 처리 실패: {e}")
            continue

        for song_id in song_ids:
            try:
                # Go straight to the song detail page
                driver.get(f"https://www.melon.com/song/detail.htm?songId={song_id}")
                time.sleep(2)

                song_title = driver.find_element(By.CSS_SELECTOR, 'div.song_name').text.replace("곡명", "").strip()

                # Lyrics; a song without a lyrics element raises and is skipped
                lyrics_element = driver.find_element(By.CSS_SELECTOR, 'div.lyric')
                lyrics = lyrics_element.text.strip().replace('\n', ' ')

                result.append({
                    "album": album_title,
                    "artist": artist_name,
                    "title": song_title,
                    "lyrics": lyrics
                })
            except Exception as e:
                print(f"⚠️ 곡 처리 실패: {e}")
                continue
finally:
    driver.quit()

# ▶ 4. Save the results
# Explicit columns keep the CSV header present even when nothing was collected.
df_result = pd.DataFrame(result, columns=["album", "artist", "title", "lyrics"])
df_result.to_csv("melon_lyrics_1994.csv", index=False)
print("✅ melon_lyrics_1994.csv 저장 완료")


✅ melon_lyrics_1994.csv 저장 완료


In [9]:
# ▶ 환경 설치 (Colab 전용)
!apt-get update > /dev/null
!apt install -y wget unzip > /dev/null
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb || apt -fy install > /dev/null

# ▶ 크롬 버전 확인 및 드라이버 자동 설치
import re, json, urllib.request

chrome_version = !google-chrome --version
chrome_version = re.search(r'(\d+\.\d+\.\d+)', chrome_version[0]).group(1)

metadata_url = 'https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json'
with urllib.request.urlopen(metadata_url) as url:
    data = json.load(url)
    matched = next(item for item in data['channels'].values() if chrome_version in item['version'])
    driver_url = next(dl['url'] for dl in matched['downloads']['chromedriver'] if dl['platform'] == 'linux64')

!wget -q "$driver_url" -O chromedriver_linux64.zip
!unzip -o chromedriver_linux64.zip > /dev/null
!mv chromedriver-linux64/chromedriver /usr/bin/chromedriver
!chmod +x /usr/bin/chromedriver
!pip install selenium > /dev/null

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


(Reading database ... 126464 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (138.0.7204.49-1) over (138.0.7204.49-1) ...
Setting up google-chrome-stable (138.0.7204.49-1) ...
Processing triggers for mailcap (3.70+nmu1ubuntu1) ...
Processing triggers for man-db (2.10.2-1) ...


In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import pandas as pd
import time
import re

# ▶ 2. Chrome driver setup
chrome_options = Options()
for option_arg in ['--headless', '--no-sandbox', '--disable-dev-shm-usage', "user-agent=Mozilla/5.0"]:
    chrome_options.add_argument(option_arg)
chrome_options.binary_location = "/usr/bin/google-chrome"

# Driver binary is expected at /usr/bin/chromedriver
service = Service("/usr/bin/chromedriver")
driver = webdriver.Chrome(
    service=service,
    options=chrome_options,
)

# ▶ Target album URL
album_url = "https://www.melon.com/album/detail.htm?albumId=3933"
album_id = re.search(r'albumId=(\d+)', album_url).group(1)

# ▶ Per-song records
result = []

# try/finally so the browser is released even if a page load raises.
try:
    driver.get(album_url)
    time.sleep(3)

    # ▶ Album title / artist name
    album_title = driver.find_element(By.CLASS_NAME, "song_name").text.replace("앨범명", "").strip()
    artist_name = driver.find_element(By.CLASS_NAME, "artist").text.strip()

    # ▶ Song ids (goSongDetail), de-duplicated in first-seen order so a song
    # whose id appears more than once in the page source is fetched only once.
    html = driver.page_source
    song_ids = list(dict.fromkeys(re.findall(r"goSongDetail\('(\d+)'\)", html)))
    print(f"🎶 추출된 곡 수: {len(song_ids)}")

    # ▶ Visit each song detail page
    for song_id in song_ids:
        try:
            song_url = f"https://www.melon.com/song/detail.htm?songId={song_id}"
            driver.get(song_url)
            time.sleep(2)

            # Song title
            title = driver.find_element(By.CSS_SELECTOR, 'div.song_name').text.replace("곡명", "").strip()

            # Lyrics; empty string when the page has no lyrics element
            lyrics_tags = driver.find_elements(By.CSS_SELECTOR, 'div.lyric')
            lyrics = lyrics_tags[0].text.strip().replace('\n', ' ') if lyrics_tags else ""

            result.append({
                "albumid": album_id,
                "songid": song_id,
                "album": album_title,
                "artist": artist_name,
                "title": title,
                "lyrics": lyrics
            })
        except Exception as e:
            print(f"⚠️ 오류 (songId={song_id}):", e)
            continue
finally:
    driver.quit()

# ▶ Save to CSV
# Explicit columns keep the header present even when no song was collected.
# The file name is derived from album_id so it always matches the scraped album.
df = pd.DataFrame(
    result,
    columns=["albumid", "songid", "album", "artist", "title", "lyrics"],
)
output_path = f"melon_lyrics_{album_id}.csv"
df.to_csv(output_path, index=False)
print(f"✅ {output_path} 저장 완료!")


🎶 추출된 곡 수: 20
✅ melon_lyrics_3933.csv 저장 완료!
