In [1]:
import selenium.webdriver as webdriver
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from time import sleep
import datetime
from collections import defaultdict
from pytz import timezone

import pandas as pd
from tqdm import tqdm

In [2]:
# Create a non-visible 1024x768 virtual display and start it before any
# browser is launched (presumably so Chrome can run on a machine with no
# screen -- confirm against the deployment host).
display = Display(
    visible=0,
    size=(1024, 768),
)
display.start()

<pyvirtualdisplay.display.Display at 0x7ff3e839a4c0>

In [3]:
# Location of the chromedriver binary on the machine running this notebook.
# NOTE(review): absolute, host-specific path -- adjust when running elsewhere.
CHROMEDRIVER_PATH = '/home/ubuntu/chromedriver'

# Single shared browser session; every scraper below reads this global.
driver = webdriver.Chrome(CHROMEDRIVER_PATH)

In [22]:
def get_table(pages=4):
    """Collect the song ids listed on the Genie top-200 chart pages.

    Each chart page is loaded through the module-level ``driver`` and the id
    of every ``<tr class="list">`` row inside the ``newest-list`` element is
    extracted.

    Args:
        pages: Number of chart pages to fetch, starting at page 1. The
            default of 4 reproduces the original behaviour of fetching
            pages 1 through 4.

    Returns:
        list[str]: Song ids in page order, then in row order within a page.
        An empty list when ``pages`` is 0 or negative.

    Raises:
        AttributeError: If a fetched page has no ``newest-list`` element.
    """

    def find_id(element):
        # Characters 25..32 of the row's markup hold the id.
        # NOTE(review): this assumes the row serialises with a fixed-width
        # prefix followed by an 8-character id -- confirm against the page.
        return str(element)[25:33]

    def chart(n):
        # Fetch chart page ``n`` and return the ids of its rows.
        driver.get(f'https://www.genie.co.kr/chart/top200?pg={n}')
        soup = BeautifulSoup(driver.page_source, 'lxml')

        rows = soup.find(attrs={'class': 'newest-list'}) \
                   .find_all('tr', attrs={'class': 'list'})

        return [find_id(row) for row in rows]

    song_ids = []
    for page_no in range(1, pages + 1):
        song_ids.extend(chart(page_no))
    return song_ids

In [23]:
# Scrape the chart pages through the shared `driver`; the resulting list of
# ids is the input of the full crawl further down.
song_ids = get_table()

In [4]:
class song_page():
    """Scraper for one Genie song-detail page.

    Typical use: construct with a song id, call ``set_attributes()`` to fetch
    and parse the page, then ``get_attributes()`` to obtain a flat tuple.
    All network access goes through the module-level ``driver``.
    """

    def __init__(self, id, rank=None):
        """Store the song id and, optionally, its chart rank.

        Args:
            id: Song id used in the ``xgnm`` query parameter.
            rank: Chart position of the song; optional so that a song can be
                looked up on its own, outside of a chart crawl.
        """
        self.id = id
        self.rank = rank

    def get_page(self):
        """Load the song page with ``driver`` and keep its parsed soup."""
        self.url = f"https://www.genie.co.kr/detail/songInfo?xgnm={self.id}"

        driver.get(self.url)
        self.soup = BeautifulSoup(driver.page_source, 'lxml')

    def get_name(self):
        """Read the song title from the ``<h2 class="name">`` element."""
        self.name = self.soup.find('h2', attrs={'class': 'name'}).text.strip()

    def get_values(self):
        """Parse the ``info-data`` list into artist, album, genre and credits.

        Sets ``artist``, ``album``, ``genre``, ``playtime``, ``lyricists``,
        ``composers`` and ``arrangers``. A credit that is not present on the
        page is left as ``None``.
        """
        # Look the list up once; both queries below run against it.
        info = self.soup.find('ul', attrs={'class': 'info-data'})
        values = info.find_all(attrs={'class': 'value'})
        attrs = info.find_all(attrs={'class': 'attr'})
        items = defaultdict(lambda: None)

        def find_id(soup, char, num):
            # Take the ``num``-th piece of the element's markup when split on
            # ``char`` (presumably a quoted id inside a link -- verify).
            return str(soup).split(char)[num]

        items['artist']   = find_id(values[0], "'", 3)
        items['album']    = find_id(values[1], "'", 3)
        items['장르']      = values[2].text
        items['재생시간']   = "0:" + values[3].text

        # Rows from index 4 onwards are keyed by the label taken from the
        # matching ``attr`` element; each holds a list of linked ids.
        for i, item in enumerate(attrs[4:]):
            attr = find_id(item, '"', 3)
            items[attr] = [find_id(v, "'", 1) for v in values[4 + i].find_all('a')]

        self.artist    = items['artist']
        self.album     = items['album']
        self.genre     = items['장르']
        self.playtime  = items['재생시간']
        self.lyricists = items['작사가']
        self.composers = items['작곡가']
        self.arrangers = items['편곡자']

    def get_likes(self):
        """Read the like counter (``em#emLikeCount``) as an int."""
        likes_raw = self.soup.find('em', attrs={'id': 'emLikeCount'}).text
        self.likes = int(likes_raw.strip().replace(",", ""))

    def get_totals(self):
        """Read the two counters inside ``div.total`` as ints.

        The first nested ``div`` is stored as ``listners`` and the second as
        ``numplays``.
        """
        counts = self.soup.find('div', attrs={'class': 'total'}) \
                     .find_all('div')
        self.listners = int(counts[0].text.strip().replace(",", ""))
        self.numplays = int(counts[1].text.strip().replace(",", ""))

    def get_comments(self):
        """Read the comment count from ``span.article`` as an int."""
        comments_raw = self.soup.find('span', attrs={'class': 'article'}).text
        # The first three characters are skipped and everything from "개" on
        # is dropped (presumably a label prefix and a counter suffix).
        # Thousands separators are removed, as the other counters do, so a
        # value such as "1,123" does not break int().
        comments_str = str(comments_raw)[3:].split("개")[0].replace(",", "")
        self.comments = int(comments_str)

    def get_now(self):
        """Record the current Seoul time truncated to the hour.

        ``now`` has the form ``YYYY-MM-DD HH:00:00``.
        """
        KST = timezone('Asia/Seoul')
        self.now = datetime.datetime.now(KST).strftime("%Y-%m-%d %H:00:00")

    def set_attributes(self):
        """Fetch the page and populate every attribute, in dependency order."""
        self.get_page()
        self.get_name()
        self.get_values()
        self.get_likes()
        self.get_totals()
        self.get_comments()
        self.get_now()

    def get_attributes(self):
        """Return the scraped fields as a flat tuple.

        Returns:
            tuple: ``(id, name, genre, playtime, likes, listners, numplays,
            comments, now)``.
        """
        return (self.id, self.name, self.genre, self.playtime,
                self.likes,
                self.listners, self.numplays,
                self.comments,
                self.now)

In [5]:
class musician_page():
    """Scraper for one Genie artist-detail page.

    Typical use: construct with an artist id, call ``set_attributes()`` to
    fetch and parse the page, then ``get_attributes()`` for a flat tuple.
    All network access goes through the module-level ``driver``.
    """

    def __init__(self, id):
        """Store the artist id used in the ``xxnm`` query parameter."""
        self.id = id

    def get_page(self):
        """Load the artist page with ``driver`` and keep its parsed soup."""
        self.url = f"https://www.genie.co.kr/detail/artistInfo?xxnm={self.id}"

        driver.get(self.url)
        self.soup = BeautifulSoup(driver.page_source, 'lxml')

    def get_name(self):
        """Read the artist name from the ``<h2 class="name">`` element."""
        self.name = self.soup.find('h2', attrs={'class': 'name'}).text.strip()

    def get_values(self):
        """Parse the ``info-data`` list into the artist's profile fields.

        Sets ``gender``, ``type``, ``period``, ``debut`` and ``nationality``.
        ``type`` is ``None`` when the first value has no ``/`` separator,
        ``debut`` is ``None`` when no year is present, and ``nationality`` is
        ``None`` when the list has fewer than four values.

        Raises:
            IndexError: If the list has fewer than three values.
        """
        values = self.soup.find('ul', attrs={'class': 'info-data'}) \
                          .find_all(attrs={'class': 'value'})

        # The first value has the form "<gender>/<type>". partition() keeps
        # working when the separator is missing, where tuple-unpacking a
        # split() result would raise ValueError.
        gender, slash, act_type = values[0].text.partition("/")
        period = values[1].text
        # Keep only the part before the first "/" and cut at "년" (year).
        debut = values[2].text.split("/")[0].strip().split("년")[0]
        nationality = values[3].text if len(values) > 3 else None

        # Assign only after everything parsed, so a failure above does not
        # leave the instance half-populated.
        self.gender      = gender
        self.type        = act_type if slash else None
        self.period      = period
        self.nationality = nationality
        self.debut       = debut if debut != "" else None

    def set_attributes(self):
        """Fetch the page and populate every attribute."""
        self.get_page()
        self.get_name()
        self.get_values()

    def get_attributes(self):
        """Return the scraped fields as a flat tuple.

        Returns:
            tuple: ``(id, name, gender, type, period, debut, nationality)``.
        """
        return (self.id, self.name,
                self.gender, self.type, self.period, self.debut, self.nationality)
    
    

In [6]:
class album_page():
    """Scraper for one Genie album-detail page.

    Typical use: construct with an album id, call ``set_attributes()`` to
    fetch and parse the page, then ``get_attributes()`` for a flat tuple.
    All network access goes through the module-level ``driver``.
    """

    def __init__(self, id):
        """Store the album id used in the ``axnm`` query parameter."""
        self.id = id

    def get_page(self):
        """Load the album page with ``driver`` and keep its parsed soup."""
        self.url = f"https://www.genie.co.kr/detail/albumInfo?axnm={self.id}"

        driver.get(self.url)
        self.soup = BeautifulSoup(driver.page_source, 'lxml')

    def get_name(self):
        """Read the album title from the ``<h2 class="name">`` element."""
        self.name = self.soup.find('h2', attrs={'class': 'name'}).text.strip()

    def get_values(self):
        """Parse the ``info-data`` list into the album's metadata fields.

        Sets ``genre``, ``style``, ``release_co``, ``agency`` and
        ``release_date`` (dots replaced by dashes). ``style`` is ``None``
        when the second value has no `` / `` separator. The first value of
        the list is not used.

        Raises:
            IndexError: If the list has fewer than five values.
        """
        values = self.soup.find('ul', attrs={'class': 'info-data'}) \
                          .find_all(attrs={'class': 'value'})

        # The second value has the form "<genre> / <style>". partition()
        # keeps working when the separator is missing, where tuple-unpacking
        # a split() result would raise ValueError.
        genre, sep, style = values[1].text.partition(" / ")
        release_co = values[2].text
        agency = values[3].text
        release_date = values[4].text.strip().replace(".", "-")

        # Assign only after everything parsed, so a failure above does not
        # leave the instance half-populated.
        self.genre        = genre
        self.style        = style if sep else None
        self.release_co   = release_co
        self.agency       = agency
        self.release_date = release_date

    def get_tracks(self):
        """Collect ``(track_id, title_flag)`` pairs from the song list.

        ``title_flag`` is ``'Title'`` when the text of the row's
        ``title ellipsis`` element starts with ``'\\nTITLE'``, otherwise
        ``None``.
        """
        rows = self.soup.find('div', attrs={'class': 'songlist-box'}) \
                        .find_all('tr', attrs={'class': 'list'})

        tracks = []
        for row in rows:
            # The id is the fourth piece of the row's markup when split on
            # double quotes (presumably its second attribute value -- verify).
            track_id = str(row).split('"')[3]
            title_text = row.find(attrs={'class': 'title ellipsis'}).text
            flag = 'Title' if title_text.startswith('\nTITLE') else None
            tracks.append((track_id, flag))
        self.tracks = tracks

    def set_attributes(self):
        """Fetch the page and populate every attribute."""
        self.get_page()
        self.get_name()
        self.get_values()
        self.get_tracks()

    def get_attributes(self):
        """Return the scraped fields as a flat tuple.

        Returns:
            tuple: ``(id, name, genre, style, release_co, agency,
            release_date, tracks)``.
        """
        return (self.id, self.name,
                self.genre, self.style, self.release_co, self.agency, self.release_date,
                self.tracks)

### 곡 하나에 대해서 실험

In [7]:
# Smoke-test the three scrapers on one song before running the full crawl.
print('Song info')
# song_page takes (id, rank); there is no chart position for a one-off
# lookup, so rank is passed explicitly as None instead of being omitted.
first_song = song_page('93619160', rank=None)
first_song.set_attributes()
print(first_song.get_attributes())

print('\nAlbum info')
first_album = album_page(first_song.album)
first_album.set_attributes()
print(first_album.get_attributes())

print('\nMusicians info')
# song_page leaves a credit as None when the page does not list it, so each
# one is treated as an empty list rather than concatenated directly (which
# would raise TypeError).
credited_ids = [first_song.artist]
for credit in (first_song.lyricists, first_song.composers, first_song.arrangers):
    credited_ids += credit or []

for musician_id in credited_ids:
    musician = musician_page(musician_id)
    musician.set_attributes()
    print(musician.get_attributes())

Song info
('93619160', '바라만 본다', '가요 / 발라드', '0:03:32', 25015, 1241761, 17086284, 727, '2021-07-17 17:00:00')

Album info
('82110804', 'MSG워너비 1집', '발라드', '가요', '지니뮤직, Stone Music Entertainment', '유야호', '2021-06-26', [('93619160', 'Title'), ('93619161', 'Title'), ('93619162', None), ('93619163', None)])

Musicians info
('81123111', 'MSG워너비 (M.O.M)', '남성', '그룹', '2020년대', '2021', '한국')
('80292211', '강은경', '여성', '솔로', '2010년대.2020년대', None, '한국')
('80292013', 'Dokkun (김도훈)', '남성', '솔로', '2010년대.2020년대', '2017', '한국')
('80292096', '박근태', '남성', '솔로', '2000년대.2010년대.2020년대', None, '한국')
('80292013', 'Dokkun (김도훈)', '남성', '솔로', '2010년대.2020년대', '2017', '한국')
('80720110', '강지원 (MVL)', '여성', '솔로', '2010년대', None, '한국')
('80720110', '강지원 (MVL)', '여성', '솔로', '2010년대', None, '한국')


### 모든 곡에 대해 적용

In [27]:
song_info = []

# Wrap the list itself rather than the enumerate() generator: tqdm reads the
# total from len(), which a generator does not provide, so the original
# ordering left the bar without a total or ETA. start=1 gives 1-based ranks.
for rank, sid in enumerate(tqdm(song_ids), start=1):
    song = song_page(sid, rank)
    song.set_attributes()
    song_info.append(song.get_attributes())

100%|██████████| 200/200 [02:53<00:00,  1.15it/s]


In [28]:
# One row per scraped song; the labels follow the order of the tuple
# returned by song_page.get_attributes().
Songs = pd.DataFrame(
    data=song_info,
    columns=[
        'id', 'name', 'genre', 'playtime', 'likes',
        'listners', 'numplays', 'comments', 'time(now)',
    ],
)

In [29]:
# Bare expression: the notebook renders the DataFrame as the cell output.
Songs

Unnamed: 0,id,name,artist,album,genre,playtime,lyricists,composer,arranger,likes,listners,numplays,comments,time(now)
0,93721048,Permission to Dance,방탄소년단,Butter / Permission to Dance,가요 / 댄스,03:08,"Ed Sheeran,Steve Mac,Johnny McDaid,Jenna Andrews","Ed Sheeran,Steve Mac,Johnny McDaid,Jenna Andrews",,10628,181656,673191,374,2021-07-10 0:00:00
1,93619160,바라만 본다,MSG워너비 (M.O.M),MSG워너비 1집,가요 / 발라드,03:32,"강은경,Dokkun (김도훈)","박근태,Dokkun (김도훈),강지원 (MVL)",강지원 (MVL),21249,1095692,12651942,676,2021-07-10 0:00:00
2,93700014,Weekend,태연 (TAEYEON),Weekend,가요 / 댄스,03:53,황유빈,"RoseInPeace,Saimon,Willemijn van der Neut,Marc...","Saimon,RoseInPeace",8662,505238,1957446,244,2021-07-10 0:00:00
3,93352112,Butter,방탄소년단,Butter,가요 / 댄스,02:44,"Jenna Andrews,Rob Grimaldi,Stephen Kirk,RM,Ale...","Jenna Andrews,Rob Grimaldi,Stephen Kirk,RM,Ale...",,38248,1349639,28357050,1123,2021-07-10 0:00:00
4,93300145,신호등,이무진,신호등,가요 / 락,03:52,이무진,이무진,"유종호,이무진",16014,680790,6418712,121,2021-07-10 0:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,89238855,그대라는 시,태연 (TAEYEON),호텔 델루나 OST Part.3 (tvN 토일드라마),OST / 드라마,03:30,"지훈,박세준",밍지션,밍지션,47081,2585083,62703228,231,2021-07-10 0:00:00
196,89815269,시작,가호 (Gaho),이태원 클라쓰 OST Part.2 (JTBC 금토드라마),OST / 드라마,03:22,서동성,박성일,엉클샘,51220,2256460,61472155,379,2021-07-10 0:00:00
197,93688032,네가 원했던 것들,DAY6 (Even of Day),Right Through Me,가요 / 발라드,03:16,Young K,"Young K,원필 (DAY6),홍지상",홍지상,1888,30146,211372,19,2021-07-10 0:00:00
198,90928056,밤하늘의 저 별처럼,헤이즈 (Heize) & 펀치 (Punch),밤하늘의 저 별처럼 (브람스를 좋아하세요? OST 스페셜 트랙) (SBS 월화드라마),OST / 드라마,03:49,"펀치 (Punch),지훈,Jay Kim (제이킴)","로코베리,Pinkpage",로코베리,21003,1659660,24268029,124,2021-07-10 0:00:00


In [None]:
# Print the column dtypes and non-null counts of the scraped table.
Songs.info()

In [8]:
# Release the browser first, then the virtual display it was started under.
# try/finally guarantees the display is stopped even if closing the browser
# raises; otherwise the display would be left running.
try:
    driver.quit()
finally:
    display.stop()

<pyvirtualdisplay.display.Display at 0x7ff3e839a4c0>