In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import json
# import requests
import asyncio
import aiohttp
from aiohttp import ClientSession
import datetime
import matplotlib
import os
from functools import partial
from aiohttp import ClientConnectorError
%matplotlib inline

In [2]:
# `pd.Series.from_csv` was deprecated in pandas 0.21 and removed in 1.0.
# Reading with `read_csv` and taking the first data column reproduces what
# it did: the first CSV column becomes the index (dict keys) and the next
# column supplies the values.
label_mapping = (
    pd.read_csv('../label_names.csv', header=0, index_col=0)
    .iloc[:, 0]
    .to_dict()
)

# The training labels file has no header row, so name the columns on load.
labels = pd.read_csv("../train_labels.csv", header=None, names=["videoId", "labels"])

# The YouTube Data API key is read from the environment so the credential
# never lives in the notebook. Any key that was previously committed here
# should be treated as leaked and revoked.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not API_KEY:
    print("YOUTUBE_API_KEY is not set; requests will be sent without a valid API key.")

async def extract_video_info(video_id, session, max_attempts=3):
    """Fetch metadata for one video from the YouTube Data API v3 ``videos`` endpoint.

    Parameters
    ----------
    video_id : str
        Id of the video to look up.
    session : aiohttp.ClientSession
        Open session used to issue the GET request (any object whose
        ``get(url)`` result works with ``async with`` and exposes an
        awaitable ``json()`` is sufficient).
    max_attempts : int, optional
        How many times the request is tried when a connector error occurs.

    Returns
    -------
    list or None
        ``[video_id, duration, title, channel_id, channel_title,
        published_at, views, likes, dislikes, comments]``. ``likes``,
        ``dislikes`` and ``comments`` are each ``-999`` when the API response
        omits them. ``None`` is returned when the request keeps failing, the
        response cannot be decoded, or a mandatory field is missing.
    """
    url = "https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id={0}&key={1}"\
        .format(video_id, API_KEY)

    for _ in range(max_attempts):
        try:
            # `async with` guarantees the connection goes back to the pool
            # even if decoding the body fails.
            async with session.get(url) as res:
                meta = await res.json()
            break
        except ClientConnectorError:
            # Connection could not be established: try again.
            print("Connector error occurred!")
        except (aiohttp.ClientError, ValueError) as err:
            # Non-JSON body or another client-side failure: skip this video
            # instead of aborting the whole batch.
            print("Request failed for video {0}: {1!r}".format(video_id, err))
            return None
    else:
        # Every attempt ended in a connector error.
        return None

    # Mandatory fields: without them the record is useless, so give up.
    try:
        item = meta['items'][0]
        snippet = item['snippet']
        stats = item['statistics']
        record = [
            video_id,
            item['contentDetails']['duration'],
            snippet['title'],
            snippet['channelId'],
            snippet['channelTitle'],
            snippet['publishedAt'],
            stats['viewCount'],
        ]
    except (KeyError, IndexError, TypeError):
        return None

    # Optional counters are looked up independently, so a missing
    # `dislikeCount` no longer discards a perfectly valid `likeCount`.
    missing = -999  # sentinel for a missing value
    record.extend(
        stats.get(field, missing)
        for field in ('likeCount', 'dislikeCount', 'commentCount')
    )
    return record

In [3]:
# Sanity check: display the (rows, columns) shape of the labels frame.
labels.shape

(4906660, 2)

In [5]:
# Batch window over the rows of `labels`: metadata is fetched for rows
# [start, end), `step` rows per batch.
step = 1000
start = 1222 * step
end = 1339 * step

async def gather_results(curr, step):
    """Fetch metadata concurrently for rows ``curr`` to ``curr + step`` of ``labels``.

    One ``extract_video_info`` task is scheduled per video id, all sharing a
    single ``ClientSession``. Returns the list of task results in the same
    order as the video ids.
    """
    async with ClientSession() as session:
        pending = [
            asyncio.ensure_future(extract_video_info(vid, session))
            for vid in labels['videoId'][curr : curr + step]
        ]
        # Await inside the `async with` so the session stays open until
        # every request has finished.
        return await asyncio.gather(*pending)

def _parse_published_at(timestamp):
    """Parse an API ``publishedAt`` string into a naive ``datetime``.

    Accepts the timestamp both with and without fractional seconds.
    """
    for fmt in ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ'):
        try:
            return datetime.datetime.strptime(timestamp, fmt)
        except ValueError:
            continue
    raise ValueError("Unrecognised publishedAt timestamp: {0!r}".format(timestamp))


def process_df(future, curr, step):
    """Turn one finished batch into a typed DataFrame and save it as CSV.

    Parameters
    ----------
    future : asyncio.Future
        Completed future whose ``result()`` is the list produced by
        ``gather_results``; ``None`` entries (failed videos) are dropped.
    curr, step : int
        Batch offset and size; only used to name the output file
        ``meta/meta_<curr>_<curr + step>.csv``.

    Raises
    ------
    RuntimeError
        If no video in the batch yielded a record, which is taken to mean
        the API quota has been exhausted.
    """
    rows = [r for r in future.result() if r is not None]
    if not rows:
        # Raising a bare string is itself a TypeError in Python 3.
        raise RuntimeError("Quota exceeded!")

    df = pd.DataFrame(rows, columns=['video_id', 'duration', 'title', 'channel_id',
                   'channel_title', 'published_at', 'views', 'likes', 'dislikes', 'comments'])
    # The API delivers counters as strings; the -999 sentinels are already ints.
    for column in ('views', 'likes', 'dislikes', 'comments'):
        df[column] = df[column].astype(int)
    df['published_at'] = df['published_at'].apply(_parse_published_at)

    os.makedirs('meta/', exist_ok=True)
    df.to_csv('meta/meta_{0}_{1}.csv'.format(curr, curr + step))
    
# Fetch and persist the metadata batch by batch.
# `process_df` is called directly rather than attached as a done-callback:
# asyncio only logs exceptions raised inside callbacks, so a failed batch
# (e.g. exhausted quota) would not stop the loop.
loop = asyncio.get_event_loop()
for curr in range(start, end, step):
    print("loading data from {0} to {1}".format(curr, curr + step))
    future = loop.create_task(gather_results(curr, step))
    loop.run_until_complete(future)
    process_df(future, curr=curr, step=step)

loading data from 1142000 to 1143000
loading data from 1143000 to 1144000
loading data from 1144000 to 1145000
loading data from 1145000 to 1146000
loading data from 1146000 to 1147000
loading data from 1147000 to 1148000
loading data from 1148000 to 1149000
loading data from 1149000 to 1150000
loading data from 1150000 to 1151000
loading data from 1151000 to 1152000
loading data from 1152000 to 1153000
loading data from 1153000 to 1154000
loading data from 1154000 to 1155000
loading data from 1155000 to 1156000
loading data from 1156000 to 1157000
loading data from 1157000 to 1158000
loading data from 1158000 to 1159000
loading data from 1159000 to 1160000
loading data from 1160000 to 1161000
loading data from 1161000 to 1162000
loading data from 1162000 to 1163000
loading data from 1163000 to 1164000
loading data from 1164000 to 1165000
loading data from 1165000 to 1166000
loading data from 1166000 to 1167000
loading data from 1167000 to 1168000
loading data from 1168000 to 1169000
l

KeyboardInterrupt: 