In [1]:
import selenium.webdriver as webdriver
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from time import sleep
import datetime
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

In [2]:
display = Display(visible=0,size=(1024, 768))
display.start()

<pyvirtualdisplay.display.Display at 0x7f863d0bbb20>

In [3]:
driver = webdriver.Chrome('/home/ubuntu/chromedriver')

In [22]:
def get_table():
    
    def find_id(element):
        return str(element)[25:33]
    
    def chart(n):
        driver.get(f'https://www.genie.co.kr/chart/top200?pg={n}')
        page = driver.page_source
        soup = BeautifulSoup(page, 'lxml')
        
        table = soup.find(attrs={'class':'newest-list'}) \
                    .find_all('tr', attrs={'class':'list'})
        
        return [find_id(e) for e in table]
        
    return chart(1) + chart(2) + chart(3) + chart(4)

In [23]:
song_ids = get_table()

In [20]:
class song_page():
    
    def __init__(self, id):
        self.id = id
    
    def get_page(self):
        url = f"https://www.genie.co.kr/detail/songInfo?xgnm={self.id}"
        self.url = url

        driver.get(url)
        song_page = driver.page_source
        song_soup = BeautifulSoup(song_page, 'lxml')
        self.soup = song_soup
        
    def get_name(self):
        self.name = self.soup.find('h2',attrs={'class':'name'}).text.strip()

    def get_values(self):
        values = self.soup.find('ul',attrs={'class':'info-data'}) \
                          .find_all(attrs={'class':'value'})
        
        self.artist   = values[0].text
        self.album    = values[1].text
        self.genre    = values[2].text
        self.playtime = values[3].text

        len_v = len(values)
        if len_v == 4:
            self.lyricists = None
            self.composers = None
            self.arrangers = None
        else:
            value4 = self.soup.find('ul',attrs={'class':'info-data'}) \
                          .find_all(attrs={'class':'attr'})
            ref_list = []
            
            for row in value4:
                items = row.find_all("img")
                s = str(items)
                s = s[10:].split('"')[1]
                if s == '작곡가' or s == '작사가' or s == '편곡자':
                    ref_list.append(s)
            
            if len(ref_list) == 1:
                if ref_list[0] == '작사가':
                    self.lyricists = values[4].text
                    self.composers = None
                    self.arrangers = None
                elif ref_list[0] == '작곡가':
                    self.lyricists = None
                    self.composers = values[4].text
                    self.arrangers = None
                else:
                    self.lyricists = None
                    self.composers = None
                    self.arrangers = values[4].text
            
            if len(ref_list) == 2:
                if ref_list[0] == '작사가':
                    self.lyricists = values[4].text
                    if ref_list[1] == '작곡가':
                        self.composers = values[5].text
                        self.arrangers = None
                    else:
                        self.composers = None
                        self.arrangers = values[5].text
                elif ref_list[0] == '작곡가':
                    self.lyricists = None
                    self.composers = values[4].text
                    self.arrangers = values[5].text
                    
            if len(ref_list) == 3:
                self.lyricists = values[4].text
                self.composers = values[5].text
                self.arrangers = values[6].text
        
    def get_likes(self):
        likes_raw = self.soup.find('em',attrs={'id':'emLikeCount'}).text
        likes_list = likes_raw.strip().split(",")
        likes_str = '0'
        for li in likes_list:
            likes_str = likes_str + li
        self.likes = int(likes_str)

    def get_totals(self):
        counts = self.soup.find('div', attrs={'class':'total'}) \
                     .find_all('div')
        listners_list = counts[0].text.strip().split(",")
        listners_str = '0'
        for li in listners_list:
            listners_str = listners_str + li
        self.listners = int(listners_str)
        
        numplays_list = counts[1].text.strip().split(",")
        numplays_str = '0'
        for nu in numplays_list:
            numplays_str = numplays_str + nu
        self.numplays = int(numplays_str)
        
    def get_comments(self):
        comments_raw = self.soup.find('span', attrs={'class':'article'}).text
        comments_str = str(comments_raw)[3:].split("개")[0]
        self.comments = int(comments_str)

    def get_now(self):
        #self.now = datetime.datetime.now()
        #time_str = '2019-04-14T02:14:30-06'
        #updated_time = datetime.strptime(time_str, '%Y-%m-%dT%H:%M:%S%z')
        from pytz import timezone
        KST = timezone('Asia/Seoul')

        #print(kst_updated_time.astimezone(KST))
        #now_raw = datetime.strptime(''.join(time_str.rsplit(':', 1), '%Y-%m-%dT%H:%M:%S%z')
        now_raw = datetime.datetime.now()
        now_kr_raw = now_raw.astimezone(KST)
        #now_str = str(datetime.datetime.now())
        now_str = str(now_kr_raw)
        #print(now_str)
        now_date = now_str.split(" ")[0]
        now_time = now_date[1].split(":")[0] + ":00:00"
        self.now = now_date + " " + now_time
        
   #     2021-07-09 13:58:52.618787
   #     2021-07-01 16:57:37.465595 → 2021/07/01, 16:00

    def set_attributes(self):
        self.get_page()
        self.get_name()
        self.get_values()
        self.get_likes()
        self.get_totals()
        self.get_comments()
        self.get_now()

    def get_attributes(self):
        return (self.id, self.name,self.artist,self.album,self.genre,self.playtime,
                 self.lyricists,
                 self.composers,self.arrangers,
                 self.likes,
                 self.listners,self.numplays,
                 self.comments,
                 self.now)


### 곡 하나에 대해서 실험

In [21]:
first_song = song_page('93619160')
first_song.set_attributes()
first_song.get_attributes()

('93619160',
 '바라만 본다',
 'MSG워너비 (M.O.M)',
 'MSG워너비 1집',
 '가요 / 발라드',
 '03:32',
 '강은경,Dokkun (김도훈)',
 '박근태,Dokkun (김도훈),강지원 (MVL)',
 '강지원 (MVL)',
 21246,
 1095568,
 12648132,
 676,
 '2021-07-10 0:00:00')

### 모든 곡에 대해 적용

In [10]:
song_info = []

for sid in tqdm(song_ids):
    song = song_page(sid)
    song.set_attributes()
    song_info.append(song.get_attributes())

100%|██████████| 200/200 [08:12<00:00,  2.46s/it]


In [11]:
Songs = pd.DataFrame(song_info, 
                     columns = ['id',
                                'name','artist','album','genre','playtime',
                                'lyricists','composer','arranger',
                                'likes',
                                'listners','numplays',
                                'comments',
                                'time(now)'])

In [12]:
Songs

Unnamed: 0,id,name,artist,album,genre,playtime,lyricists,composer,arranger,likes,listners,numplays,comments,time(now)
0,93721048,Permission to Dance,방탄소년단,Butter / Permission to Dance,가요 / 댄스,03:08,"Ed Sheeran,Steve Mac,Johnny McDaid,Jenna Andrews","Ed Sheeran,Steve Mac,Johnny McDaid,Jenna Andrews",,10056,169179,585219,(총 345개),2021-07-09 13:58:52.618787
1,93619160,바라만 본다,MSG워너비 (M.O.M),MSG워너비 1집,가요 / 발라드,03:32,"강은경,Dokkun (김도훈)","박근태,Dokkun (김도훈),강지원 (MVL)",강지원 (MVL),21219,1094305,12610547,(총 676개),2021-07-09 13:58:53.673997
2,93700014,Weekend,태연 (TAEYEON),Weekend,가요 / 댄스,03:53,황유빈,"RoseInPeace,Saimon,Willemijn van der Neut,Marc...","Saimon,RoseInPeace",8571,502240,1924754,(총 242개),2021-07-09 13:58:54.603202
3,93330349,Next Level,aespa,Next Level,가요 / 댄스,03:42,유영진,"Mario Marchetti,Adam McInnis,Sophie Curtis,유영진","Mario Marchetti,Adam McInnis,유영진",25596,1241831,21551239,(총 1125개),2021-07-09 13:58:55.520941
4,93300145,신호등,이무진,신호등,가요 / 락,03:52,이무진,이무진,"유종호,이무진",15947,679022,6394576,(총 120개),2021-07-09 13:58:56.368634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,88703035,On My Way,Alan Walker & Sabrina Carpenter & Farruko,On My Way,POP / 일렉트로니카,03:14,,,,40991,2034793,41590347,(총 178개),2021-07-09 14:06:44.443772
196,42774923,연 (捐),빅마마 (Big Mama),For The People,가요 / 발라드,04:55,,,,9513,745355,9245765,(총 31개),2021-07-09 14:06:48.742464
197,92749704,봄 안녕 봄,아이유 (IU),IU 5th Album 'LILAC',가요 / 발라드,05:24,아이유 (IU),나얼,강화성,12468,1010676,8998901,(총 209개),2021-07-09 14:06:52.732304
198,93576072,"같은 꿈, 같은 맘, 같은 밤",세븐틴 (SEVENTEEN),SEVENTEEN 8th Mini Album 'Your Choice',가요 / R&B/소울,04:08,"WOOZI (SEVENTEEN),BUMZU","WOOZI (SEVENTEEN),BUMZU,박기태","박기태,BUMZU",5880,125694,2529189,(총 33개),2021-07-09 14:07:00.034766


In [None]:
Songs.info()

In [None]:
#driver.quit()
#display.stop()