## Collecting data using the YouTube Data API

1. 검색어를 넣어 검색된 결과 영상들의 채널 목록을 가져온다
2. 채널 목록에 있는 채널들의 플레이리스트 아이디를 가져온다
3. 플레이리스트 목록에 있는 모든 영상 목록을 가져온다
4. 영상 목록의 조회수를 찾아 추가한다

In [None]:
import pandas as pd
from googleapiclient.discovery import build
import config

In [None]:
# Build the YouTube Data API v3 client used by every request in this notebook;
# the developer key is read from the local config module.
youtube = build('youtube', 'v3', developerKey = config.API_KEY)

### 1. 검색어를 넣어 검색된 결과 영상들의 채널 목록을 가져온다 (channelId)

In [None]:
# Accumulator for channel ids; get_youtube_channel_search_list appends to it.
channel_list = []

In [None]:
def get_youtube_channel_search_list(keyword):
    """Search YouTube for *keyword* and record the channel id of each hit.

    Every result page is requested in turn (50 results per request) until the
    response no longer carries a ``nextPageToken``. For a video result the
    owning channel's id is taken from the snippet; for a channel result the id
    comes from the result's own ``id`` object. Other result kinds are ignored.

    Ids are appended to the module-level ``channel_list``; nothing is
    returned.
    """
    next_token = ''
    has_more = True

    while has_more:
        request = youtube.search().list(
            q=keyword,
            part='snippet',
            maxResults=50,
            pageToken=next_token,
        )
        page = request.execute()

        for entry in page['items']:
            kind = entry['id']['kind']
            if kind == 'youtube#video':
                channel_list.append(entry['snippet']['channelId'])
            elif kind == 'youtube#channel':
                channel_list.append(entry['id']['channelId'])

        # Continue only while the API reports a further page.
        has_more = 'nextPageToken' in page
        if has_more:
            next_token = page['nextPageToken']


In [None]:
# Collect channel ids for the search keyword, then hold them in a
# one-column DataFrame for the following steps.
get_youtube_channel_search_list('브이로그')
channel_list = pd.DataFrame(data=channel_list, columns=['channel_id'])
print(f'data num: {len(channel_list)}')
channel_list.head(10)

In [None]:
# A channel id is recorded once per matching search result, so ids repeat;
# keep a single row per channel.
channel_list = channel_list.drop_duplicates()
print('unique data num: ' + str(len(channel_list)))
# Renumber the rows 0..n-1. reset_index(drop=True) discards the old index
# directly; the previous `.reset_index().drop('index', 1)` passed the axis
# positionally, which pandas 2.0 no longer accepts.
channel_list = channel_list.reset_index(drop=True)

### 2. 채널 목록에 있는 채널들의 플레이리스트 아이디를 가져온다

In [None]:
# For each channel, ask the API for its contentDetails and keep the id of the
# "uploads" playlist listed under relatedPlaylists.
playlist_ids_of_channel = []

for channel_id in channel_list['channel_id']:
    request = youtube.channels().list(part='contentDetails', id=channel_id)
    channel_resource = request.execute()['items'][0]
    related = channel_resource['contentDetails']['relatedPlaylists']
    playlist_ids_of_channel.append(related['uploads'])

# Expected to equal len(channel_list): one playlist id per channel.
print(f'data num: {len(playlist_ids_of_channel)}')

In [None]:
# Store each channel's uploads playlist id next to its channel id. The list
# was built in channel_list row order, so it lines up row for row.
channel_list['playlist_id'] = playlist_ids_of_channel
channel_list

### 3. 플레이리스트 목록에 있는 모든 영상 목록을 가져온다

In [None]:
# Empty frame, one column per video field, that get_youtube_video_data fills
# with one row per playlist item.
video_columns = ['title', 'video_id', 'channel_name', 'channel_id', 'publish_time']
df = pd.DataFrame(columns=video_columns)

In [None]:
def get_youtube_video_data(playlist_id):
    """Append every item of the playlist *playlist_id* to the global ``df``.

    The playlist is read page by page (50 items per request) until the
    response has no ``nextPageToken``. Each item contributes one row with the
    keys ``title``, ``video_id``, ``channel_name``, ``channel_id`` and
    ``publish_time``, all taken from the item's snippet.

    Rows are collected in a list and concatenated onto ``df`` once per call:
    ``DataFrame.append`` was removed in pandas 2.0 and copied the whole frame
    for every row. Nothing is returned; any exception raised by the API call
    propagates to the caller.
    """
    global df

    rows = []
    page_token = ''

    try:
        while True:
            response = youtube.playlistItems().list(
                playlistId=playlist_id,
                part='snippet',
                maxResults=50,
                pageToken=page_token,
            ).execute()

            for item in response['items']:
                snippet = item['snippet']
                rows.append({
                    'title': snippet['title'],
                    'video_id': snippet['resourceId']['videoId'],
                    'channel_name': snippet['channelTitle'],
                    'channel_id': snippet['channelId'],
                    'publish_time': snippet['publishedAt'],
                })

            if 'nextPageToken' not in response:
                break
            page_token = response['nextPageToken']
    finally:
        # Flush in `finally` so rows fetched before a failing request are
        # still kept, as they were when each row was appended immediately.
        if rows:
            df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

In [None]:
# Pull the video listing of every uploads playlist into the shared df.
for uploads_playlist_id in channel_list['playlist_id']:
    get_youtube_video_data(uploads_playlist_id)

print(f'data num: {len(df)}')
df.head(10)

### 4. 영상 목록의 조회수를 찾아 추가한다

In [None]:
# -1 marks "no view count fetched"; rows still holding it are filtered out
# before the CSV export.
df['views'] = -1

In [None]:
# Fetch view counts in batches instead of one request per video: videos.list
# accepts a comma-separated id list, so 50 ids per call cuts the number of
# requests (and the API quota spent) by up to 50x.
video_ids = df['video_id'].tolist()
view_counts = {}

for start in range(0, len(video_ids), 50):
    id_batch = video_ids[start:start + 50]
    response = youtube.videos().list(
        part='statistics', id=','.join(id_batch)).execute()
    for item in response.get('items', []):
        # viewCount can be absent from statistics; keep None in that case.
        view_counts[item['id']] = item['statistics'].get('viewCount')

# Videos missing from the response keep the -1 sentinel. Assigning the whole
# column at once avoids writing strings into an integer column cell by cell,
# which recent pandas versions warn about or reject.
df['views'] = [view_counts.get(video_id, -1) for video_id in video_ids]

In [None]:
# Row count and a preview of the frame after the view-count step.
print(f'data num: {len(df)}')
df.head(10)

In [None]:
# Remove rows with any missing field, then rows whose view count is still the
# -1 sentinel or is missing.
df = df.dropna(axis=0)
# The previous `df['views'] is None` compared the Series object itself with
# None and was always False; isna() performs the element-wise check.
indexNames = df[(df['views'] == -1) | df['views'].isna()].index
df.drop(indexNames, inplace=True)
print('data num: ' + str(len(df)))

In [None]:
# Write the cleaned rows to vlog_data.csv, leaving out the DataFrame index.
df.to_csv('vlog_data.csv', index = False)