# 유튜브 랭킹 데이터 수집하기

In [38]:
# Refresh the apt package index (stdout is discarded).
!apt-get update > /dev/null
# Install the Selenium Python bindings (stdout is discarded).
!pip install selenium > /dev/null
# Install the chromium-chromedriver apt package (stdout is discarded).
!apt install chromium-chromedriver > /dev/null





In [39]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

In [40]:
# Configure a headless Chrome session.
options = webdriver.ChromeOptions()
options.add_argument('--headless')   # run without a visible browser window
options.add_argument('--no-sandbox')
options.add_argument("--single-process")
options.add_argument("--disable-dev-shm-usage")
# Do not pass the driver path positionally: Selenium 4 deprecated and later
# removed that argument, so 'chromedriver' would collide with `options`.
# The chromedriver executable is resolved automatically (e.g. from PATH).
driver = webdriver.Chrome(options=options)

In [41]:
# Load page 1 of the youtube-rank.com ranking board in the browser session.
url = 'https://youtube-rank.com/board/bbs/board.php?bo_table=youtube&page=1'
driver.get(url)
# An optional pause after loading (time.sleep(2)) is left disabled here.

In [42]:
# Count every <tr> element on the loaded page.
# The find_elements_by_* helpers were removed in Selenium 4.3, so the generic
# find_elements(By.<strategy>, value) form is used instead.
trs = driver.find_elements(By.TAG_NAME, 'tr')
len(trs)

102

In [43]:
# Count only the elements carrying the 'aos-init' class.
# The find_elements_by_* helpers were removed in Selenium 4.3, so the generic
# find_elements(By.<strategy>, value) form is used instead.
trs = driver.find_elements(By.CSS_SELECTOR, '.aos-init')
len(trs)

100

In [44]:
# Take the HTML currently held by the browser and parse it with BeautifulSoup
# so the rows can be queried with CSS selectors.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [45]:
# Collect the elements with the 'aos-init' class from the parsed page.
# Disabled alternative selector: soup.select('form > table > tbody > tr')
channel_list = soup.select('.aos-init')
len(channel_list)

100

In [46]:
# Try the category extraction on the first row only.
channel = channel_list[0]
category_tag = channel.select_one('p.category')
# Trim surrounding spaces, newlines and square brackets from the label.
category = category_tag.get_text().strip(' \n[]')
category

'음악/댄스/가수'

In [47]:
# The channel name is the text of the link inside the '.subject' element.
name_link = channel.select_one('.subject a')
name = name_link.text.strip()
name

'BLACKPINK'

In [48]:
# Read the three count cells of the current row as raw text.
subscriber_tag = channel.select_one('.subscriber_cnt')
view_tag = channel.select_one('.view_cnt')
video_tag = channel.select_one('.video_cnt')
subscriber = subscriber_tag.text
view = view_tag.text
# Drop the last character of the video count (presumably a trailing unit
# suffix such as '개' -- confirm against the page markup).
video = video_tag.text[:-1]
subscriber, view, video

('6400만', '190억0381만', '371')

In [49]:
# Build one [category, name, subscribers, views, videos] row per element in
# channel_list, keeping the counts as the raw text shown on the page.
channels = [
    [
        row.select_one('p.category').get_text().strip(' \n[]'),
        row.select_one('.subject a').text.strip(),
        row.select_one('.subscriber_cnt').text,
        row.select_one('.view_cnt').text,
        row.select_one('.video_cnt').text[:-1],  # drop the trailing character
    ]
    for row in channel_list
]

In [50]:
# Tabulate the collected rows under the Korean column headers and preview
# the first five entries.
columns = ['카테고리', '채널명', '구독자수', '조회수', '비디오수']
df = pd.DataFrame(data=channels, columns=columns)
df.head()

Unnamed: 0,카테고리,채널명,구독자수,조회수,비디오수
0,음악/댄스/가수,BLACKPINK,6400만,190억0381만,371
1,음악/댄스/가수,HYBE LABELS,6030만,187억1305만,654
2,음악/댄스/가수,BANGTANTV,5640만,121억9496만,1579
3,음악/댄스/가수,SMTOWN,2850만,218억7092만,3729
4,키즈/어린이,Boram Tube Vlog [보람튜브 브이로그],2650만,110억5288만,223


In [51]:
# Show the last five rows to check the end of the collected data.
df.tail()

Unnamed: 0,카테고리,채널명,구독자수,조회수,비디오수
95,음식/요리/레시피,까니짱 [ G-NI ],408만,10억3471만,482
96,키즈/어린이,로미유 스토리[Romiyu Story],408만,10억6557만,433
97,음악/댄스/가수,NCT DREAM,386만,3억7422만,228
98,BJ/인물/연예인,허팝Heopop,381만,31억3892만,1847
99,취미/라이프,JaeYeol ASMR 재열,380만,10억1154만,970


- '만'과 '억'을 숫자로 변환하는 함수

In [55]:
def convert_unit(s):
    """Convert a Korean-unit count to a comma-grouped digit string.

    The input may contain the units '억' (10**8) and '만' (10**4), thousands
    separators, and the counter suffix '개', e.g. '123억6,557만개'.

    Each unit is applied with its real multiplier, so values without a '만'
    part ('3억') or with a non-padded '만' part ('1억500만') are converted
    correctly instead of relying on string concatenation of digits.

    Args:
        s: Text such as '6400만', '190억0381만' or '371개'.

    Returns:
        The value as a string with thousands separators, e.g. '64,000,000'.

    Raises:
        ValueError: If no number is left after removing separators and '개',
            or if a part of the text is not a valid integer.
    """
    # Remove thousands separators and the '개' counter suffix first.
    text = s.replace(',', '').replace('개', '').strip()
    if not text:
        raise ValueError(f'no number found in {s!r}')

    total = 0
    # Consume the units from largest to smallest; whatever follows a unit is
    # handed on to the next, smaller unit.
    for unit, factor in (('억', 10 ** 8), ('만', 10 ** 4)):
        head, found, rest = text.partition(unit)
        if found:
            # A unit with no digits in front of it contributes nothing.
            total += int(head or 0) * factor
            text = rest
    # Digits left after the last unit are plain ones.
    if text.strip():
        total += int(text)
    return f'{total:,d}'

In [56]:
# Quick check of convert_unit on a value that mixes both units, a thousands
# separator and the '개' suffix.
convert_unit('123억6,557만개')

'12,365,570,000'

- 두번째 페이지 - XPath로 찾아 Click() 하여 이동하기

In [57]:
# Follow the pagination link located by this XPath:
#   //*[@id="list-skin"]/nav/span/a[1]
# find_element_by_xpath was removed in Selenium 4.3, so the generic
# find_element(By.XPATH, ...) form is used instead.
# NOTE(review): the table recorded after this step starts with the same
# channels as page 1 -- verify that this link really leads to page 2 and that
# the new page has loaded before page_source is read.
driver.find_element(By.XPATH, '//*[@id="list-skin"]/nav/span/a[1]').click()

In [58]:
# Re-read the browser's current HTML after the click and parse it again.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [59]:
# Collect the 'aos-init' elements from the newly parsed page and count them.
channel_list = soup.select('.aos-init')
len(channel_list)

100

In [60]:
# Build one row per element in channel_list, passing the three count cells
# through convert_unit.
# NOTE(review): convert_unit returns a comma-grouped string, so the count
# columns hold text rather than integers.
results = [
    [
        row.select_one('p.category').get_text().strip(' \n[]'),
        row.select_one('.subject a').text.strip(),
        convert_unit(row.select_one('.subscriber_cnt').text),
        convert_unit(row.select_one('.view_cnt').text),
        convert_unit(row.select_one('.video_cnt').text),
    ]
    for row in channel_list
]

In [61]:
# Tabulate the converted rows under the Korean column headers and preview
# the first five entries.
columns = ['카테고리', '채널명', '구독자수', '조회수', '비디오수']
df = pd.DataFrame(data=results, columns=columns)
df.head()

Unnamed: 0,카테고리,채널명,구독자수,조회수,비디오수
0,음악/댄스/가수,BLACKPINK,64000000,19003810000,371
1,음악/댄스/가수,HYBE LABELS,60300000,18713050000,654
2,음악/댄스/가수,BANGTANTV,56400000,12194960000,1579
3,음악/댄스/가수,SMTOWN,28500000,21870920000,3729
4,키즈/어린이,Boram Tube Vlog [보람튜브 브이로그],26500000,11052880000,223


- 페이지 1에서 10까지 크롤링하기

In [62]:
# Crawl ranking pages 1 through 10 and gather one row per channel element.
base_url = 'https://youtube-rank.com/board/bbs/board.php?bo_table=youtube&page='
results = []
for page in range(1, 11):
    url = base_url + str(page)
    driver.get(url)
    time.sleep(3)  # fixed pause before reading the page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Each 'aos-init' element yields [category, name, subscribers, views,
    # videos]; the three counts go through convert_unit.
    results.extend(
        [
            row.select_one('p.category').get_text().strip(' \n[]'),
            row.select_one('.subject a').text.strip(),
            convert_unit(row.select_one('.subscriber_cnt').text),
            convert_unit(row.select_one('.view_cnt').text),
            convert_unit(row.select_one('.video_cnt').text),
        ]
        for row in soup.select('.aos-init')
    )

In [63]:
# Tabulate every crawled row under the Korean column headers and preview the
# first five entries.
columns = ['카테고리', '채널명', '구독자수', '조회수', '비디오수']
df = pd.DataFrame(data=results, columns=columns)
df.head()

Unnamed: 0,카테고리,채널명,구독자수,조회수,비디오수
0,음악/댄스/가수,BLACKPINK,64000000,19003810000,371
1,음악/댄스/가수,HYBE LABELS,60300000,18713050000,654
2,음악/댄스/가수,BANGTANTV,56400000,12194960000,1579
3,음악/댄스/가수,SMTOWN,28500000,21870920000,3729
4,키즈/어린이,Boram Tube Vlog [보람튜브 브이로그],26500000,11052880000,223


In [64]:
# Show the last five rows to check the end of the crawled data.
df.tail()

Unnamed: 0,카테고리,채널명,구독자수,조회수,비디오수
995,게임,미소,510000,210770000,3084
996,미분류,밉지않은 관종언니,510000,67740000,170
997,미분류,복지마블TV [Welfare Marble],510000,49960000,240
998,음식/요리/레시피,Muggo,510000,183720000,1388
999,게임,임선비,510000,234290000,1134


In [65]:
# Write the table to CSV without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
df.to_csv('유튜브_순위.csv', index=False)

In [66]:
# End the whole WebDriver session. close() would only close the current
# window and leave the browser session and chromedriver process running.
driver.quit()