# 유튜브 랭킹 사이트 크롤링

## 유튜브 랭킹 데이터 수집

### 라이브러리 추가

In [3]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

### 데이터 추출

browser = webdriver.Chrome('C:/Web_Crawling/chromedriver/chromedriver.exe')
url = 'https://youtube-rank.com/board/bbs/board.php?bo_table=youtube'
browser.get(url)

html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')

channel_list = soup.select('form>table>tbody>tr')
for channel in channel_list:
    title = channel.select('h1>a')[0].text.strip()
    category = channel.select('p.category')[0].text.strip()
    subscriber = channel.select('.subscriber_cnt')[0].text
    view = channel.select('.view_cnt')[0].text
    video = channel.select('.video_cnt')[0].text
    print(title, category, subscriber, view, video)

### 페이지별 URL 만들기

In [16]:
page = 1
url = 'https://youtube-rank.com/board/board.php?bo_table=youtube&page={}'.format(page)
print(url)

https://youtube-rank.com/board/board.php?bo_table=youtube&page=1


In [4]:
# 반복문을 이용해서 여러 페이지 크롤링

browser = webdriver.Chrome('C:/Web_Crawling/chromedriver/chromedriver.exe')
results = []

for page in range (1,11):
    url = 'https://youtube-rank.com/board/board.php?bo_table=youtube&page={}'.format(page)
    browser.get(url)
    time.sleep(5)
    html = browser.page_source
    soup = BeautifulSoup(html,'html.parser')
    channel_list = soup.select('form>table>tbody>tr')
    for channel in channel_list:
        category = channel.select('p.category')[0].text.strip()
        title = channel.select('h1>a')[0].text.strip()
        subscriber = channel.select('.subscriber_cnt')[0].text
        view = channel.select('.view_cnt')[0].text
        video = channel.select('.video_cnt')[0].text
        data = [title, category, subscriber, view, video]
        results.append(data)
        
results[:5]

[]

In [None]:
df = pd.DataFrame(results, columns =['title','category','subscriber','view','video'])
df.to_excel('C:/Web_Crawling/MiniProject/Youtube_Crawling.xlsx', index = False)

## 유튜브 랭킹 데이터 시각화

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import font_manager, rc
if platform.system() == 'Windows':
    path = 'C:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family = font_name)
elif platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
else:
    print('Check your OS system')

In [None]:
df = pd.read_excel('C:/Web_Crawling/MiniProject/Youtube_Crawling.xlsx')
df.head()

In [None]:
df['subscriber'][0:10]

In [None]:
df['subscriber'].str.replace('만', '0000')[0:10]

#### replaced_subscriber Series 문자열 변경

In [None]:
df['repalced_subscriber'] = df['subscriber'].str.replace('만', '0000')
df.head()

In [None]:
df.info()

#### Series 데이터 타입 변환

In [None]:
df['replaced_subscriber'] = df['replaced_subscriber'].astype('int')
df.info()

#### 카테고리별 구독자 수, 채널 수 피봇 테이블 생성

In [None]:
pivot_df = df.pivot_table(index = 'category', values = 'replaced_subscriber', aggfunc = ['sum','count'])
pivot_df.head()

In [None]:
pivot_df.columns = ['subscriber_sum', 'category_count']
pivot_df.head()

#### 인덱스 초기화

In [None]:
pivot_df = pivot_df.reset_index()
pivot_df.head

#### 데이터프레임 내림차순 정렬

In [None]:
pivot_df = pivot_df.sort_values(by = 'subscriber_sum', ascending=False)
pivot_df.head()

#### 카테고리별 구독자수 시각화

In [None]:
plt.figure(figsize = (30,10))
plt.pie(pivot_df['subscriber_sum'], labels=pivot_df['category'], autopct='%1.1f%%')
plt.show()

#### 카테고리별 채널수 시각화

In [None]:
pivot_df = pivot_df.sort_values(by='category_count', ascending=False)
plt.figure(figsize = (30,10))
plt.pie(pivot_df['category_count'], labels = pivot_df['category'], autopct='%1.1f%%')
plt.show()