In [None]:
# SECURITY: API keys should not live in source control. Set the
# YOUTUBE_API_KEY environment variable to supply your own key. The literal
# below is kept only as a backward-compatible fallback; it should be rotated
# and then removed from this notebook.
import os

api_key = os.environ.get("YOUTUBE_API_KEY") or "AIzaSyCGEBtZPu5Bpfa-wJIMcE6QPnzIYiNwD5k"

First, we use an API key to get access to the YouTube Data API.

In [None]:
from apiclient.discovery import build

In [None]:
# Service object for the YouTube Data API v3; every request below goes through it.
yt = build(
    "youtube",
    "v3",
    developerKey=api_key,
)

Then, we write a function to retrieve comments from any given 
video by its video ID. This function returns up to 100 top-level comments in a 
dictionary keyed by each comment author's display name. Each author's comments
and their like counts are stored as the value: a list of 
[comment text, like count] pairs. The video statistics are also retrieved.

In [None]:
def get_video_comments(video_id, max_pages=1):
    """Fetch a video's statistics and its top-level comments.

    Args:
        video_id: ID of the video to query.
        max_pages: Maximum number of comment-thread pages (up to 100 threads
            each) to request. The default of 1 fetches only the first page.
            Pass ``None`` to follow ``nextPageToken`` until every page has
            been read; note that each page costs an additional API request.

    Returns:
        A tuple ``(authors, stats)``. ``authors`` maps an author's display
        name to a list of ``[comment_text, like_count]`` pairs, and ``stats``
        is the ``statistics`` dictionary returned for the video.

    Raises:
        IndexError: If the API returns no item for ``video_id``.
        Exception: Any error raised by the API client (for example when the
            comment request is rejected) is propagated to the caller.
    """
    stats = yt.videos().list(id=video_id,
                             part="statistics").execute()['items'][0]['statistics']
    authors = {}
    page_token = None
    pages_fetched = 0
    while max_pages is None or pages_fetched < max_pages:
        res = yt.commentThreads().list(videoId=video_id,
                                       part="snippet,replies",
                                       maxResults=100,
                                       pageToken=page_token).execute()
        pages_fetched += 1
        for thread in res.get('items', []):
            top_level = thread['snippet']['topLevelComment']['snippet']
            comment = top_level['textOriginal']
            likes = top_level['likeCount']
            author = top_level['authorDisplayName']
            # Display names are not guaranteed unique, so comments from
            # different users sharing a name end up in the same list.
            authors.setdefault(author, []).append([comment, likes])
        page_token = res.get('nextPageToken')
        if page_token is None:
            # No further pages are available.
            break
    return authors, stats

Our next function retrieves a given channel's uploaded YouTube
videos, up to a fixed number of 50-video pages, and stores each video's
title, date of publishing, description, and statistics. The video ID is used
only to look up the comments and is not stored. It uses the get_video_comments
function to create a dictionary of the video's comments, and it stores all of
these attributes as a heterogeneous list within a list of other video lists.

In [None]:
def channel_videos(channel_id, pages=10):
    """Collect metadata, statistics and comments for a channel's uploads.

    The channel's "uploads" playlist is read 50 items at a time. For every
    video, ``get_video_comments`` is called; videos for which that call fails
    are skipped and the reason is printed.

    Args:
        channel_id: ID of the channel whose uploads should be fetched.
        pages: Maximum number of playlist pages to request. At least one
            page is always requested.

    Returns:
        A list with one entry per successfully processed video, each of the
        form ``[title, publish_date, description, stats, authors]``.

    Raises:
        IndexError: If the API returns no item for ``channel_id``.
    """
    channel_res = yt.channels().list(id=channel_id,
                                     part='contentDetails').execute()
    uploads = channel_res['items'][0]['contentDetails']['relatedPlaylists']
    playlist_id = uploads['uploads']
    next_page = None
    page = 0
    videos = []
    while True:
        result = yt.playlistItems().list(playlistId=playlist_id,
                                         part='snippet',
                                         maxResults=50,
                                         pageToken=next_page).execute()
        for item in result['items']:
            snippet = item['snippet']
            video_id = snippet['resourceId']['videoId']
            try:
                authors, stats = get_video_comments(video_id)
            except Exception as err:
                # Best effort: one failing video (comments disabled, video
                # removed, ...) must not abort the whole crawl, but report
                # the real cause instead of assuming comments are disabled.
                print("Skipping video {}: {}".format(video_id, err))
                continue
            videos.append([snippet['title'], snippet['publishedAt'],
                           snippet['description'], stats, authors])
        next_page = result.get('nextPageToken')
        page += 1
        print(page)  # progress indicator: number of pages fetched so far
        # ">=" so that exactly `pages` pages are fetched (">" fetched one extra).
        if next_page is None or page >= pages:
            break
    return videos

Some channels might appear as a "user" instead of a "channel," so we convert those by hand.

In [None]:
# Look up the channel ID that belongs to the legacy username "CNN".
cnn_id = (
    yt.channels()
    .list(part="id", forUsername='CNN')
    .execute()['items'][0]['id']
)

In [None]:
# Look up the channel ID that belongs to the legacy username "FoxNewsChannel".
fox_id = (
    yt.channels()
    .list(part="id", forUsername='FoxNewsChannel')
    .execute()['items'][0]['id']
)

In [None]:
# Download CNN's uploads with their statistics and comments (issues API requests).
cnn_data = channel_videos(cnn_id)

In [None]:
# Download Fox News' uploads with their statistics and comments (issues API requests).
fox_data = channel_videos(fox_id) 

We will now process our data and organize it so that 
it may be used for analysis. We will also cache our data
at this point, since Google restricts the number of requests
we can make with the API and we want to make the most of
every request we are allowed.

In [None]:
def cache_channel_data(data, channel):
    """Write a channel's videos and comments to a plain-text cache file.

    For each video a header line (title, publish date and description
    separated by tabs, immediately followed by the ``str()`` of the
    statistics) is written, then one line per comment, then a blank
    separator.

    Args:
        data: List of ``[title, publish_date, description, stats, authors]``
            entries, where ``authors`` maps an author name to a list of
            ``[text, likes]`` pairs.
        channel: Path of the file to write; any existing file is overwritten.
    """
    new_line = "\n\n"
    # The context manager guarantees the file is flushed and closed; UTF-8 is
    # set explicitly because comment text may contain non-ASCII characters
    # that the platform default encoding cannot represent.
    with open(channel, "w", encoding="utf-8") as channel_file:
        for video in data:
            video_title, video_publish_date, \
                video_description, stats, authors = video
            # NOTE(review): there is no separator between the description and
            # the stats; kept as-is so the cache format does not change.
            vid_doc = video_title + "\t" + video_publish_date + "\t" + \
                video_description + str(stats) + new_line
            channel_file.write(vid_doc)
            for author, author_comments in authors.items():
                for text, likes in author_comments:
                    comment_doc = author + " said: " + \
                        text + "(" + str(likes) + " likes)\n"
                    channel_file.write(comment_doc)
            channel_file.write(new_line)

In [None]:
# Write the CNN data to a file named "CNN" (relative to the working directory).
cache_channel_data(cnn_data, "CNN")

In [None]:
# Write the Fox News data to a file named "Fox" (relative to the working directory).
cache_channel_data(fox_data, "Fox")

With our data cached, we can now begin to look at some basic trends. We'll do some analysis on the video statistics and plot the like/dislike ratios for videos.

We'll import the necessary libraries to do plotting and sentiment analysis.

In [None]:
%matplotlib inline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime
from matplotlib.dates import (YEARLY, DateFormatter,rrulewrapper, RRuleLocator, drange)

We'll write some helper functions to take care of the details. First, a function to convert the raw dates from our list of videos.

In [None]:
def convert_raw_date(date):
    """Split a timestamp such as "2019-03-07T12:00:00Z" into integers.

    Everything from the first "T" onwards is ignored; the remaining part is
    split on "-" and its first three fields are returned as the tuple
    ``(year, month, day)``.
    """
    day_part, _, _ = date.partition("T")
    fields = day_part.split("-")
    return tuple(int(fields[position]) for position in range(3))

Next, we'll write a function to process the statistics for each video and return the number of views, likes, dislikes, and comments.

In [85]:
def process_video_stats(video):
    """Unpack one video entry into typed values.

    Args:
        video: A ``[title, publish_date, description, stats, authors]``
            entry, where ``stats`` maps counter names to numeric strings.

    Returns:
        The tuple ``(title, date, description, views, likes, dislikes,
        num_comments, authors)`` where ``date`` is a ``datetime.date`` and
        the four counters are ``int``. A counter that is absent from
        ``stats`` is reported as 0 instead of raising ``KeyError``; the API
        may omit counters (for example hidden ratings or disabled comments).
    """
    title, publish_date, description, stats, authors = video
    date = datetime.date(*convert_raw_date(publish_date))

    def count(key):
        # Missing counters default to 0 so a single video cannot abort a run.
        return int(stats.get(key, 0))

    return (title, date, description,
            count('viewCount'), count('likeCount'),
            count('dislikeCount'), count('commentCount'), authors)

We'll use our processing function to build up lists of different values that we can plot against the publishing date of each video. We'll plot likes, dislikes, and comments, and we will do some sentiment analysis.

In [88]:
def process_videos(videos):
    """Turn a list of video entries into parallel per-video lists.

    Args:
        videos: List of entries accepted by ``process_video_stats``.

    Returns:
        The tuple ``(like_nums, dislike_nums, comment_nums,
        like_dislike_ratios, dates, titles, comments)``. Every list has one
        element per input video, in input order. The ratio is
        ``likes / dislikes``; for a video with zero dislikes the ratio is
        undefined and is recorded as NaN instead of raising
        ``ZeroDivisionError``.
    """
    like_nums = []
    dislike_nums = []
    comment_nums = []
    like_dislike_ratios = []
    dates = []
    titles = []
    comments = []
    for video in videos:
        (title, date, _description, _views, likes, dislikes,
         num_comments, authors) = process_video_stats(video)
        like_nums.append(likes)
        dislike_nums.append(dislikes)
        # Guard against division by zero for videos without dislikes.
        ratio = likes / dislikes if dislikes else float("nan")
        like_dislike_ratios.append(ratio)
        comment_nums.append(num_comments)
        dates.append(date)
        titles.append(title)
        comments.append(authors)
    return like_nums, dislike_nums, comment_nums, like_dislike_ratios, dates, titles, comments

In [89]:
# Tuple of parallel lists: likes, dislikes, comment counts, like/dislike ratios, dates, titles, comments.
processed_cnn_data = process_videos(cnn_data)

In [90]:
processed_cnn_data

([4623,
  5043,
  6953,
  5175,
  1882,
  7786,
  11337,
  2101,
  4511,
  1243,
  4772,
  2312,
  11299,
  4131,
  3579,
  2928,
  182,
  243,
  396,
  743,
  1415,
  1856,
  3404,
  1146,
  2309,
  594,
  11712,
  3171,
  7374,
  7383,
  8267,
  8909,
  6453,
  4371,
  3530,
  212,
  1018,
  674,
  305,
  8677,
  812,
  419,
  3281,
  5538,
  1146,
  3040,
  7412,
  6124,
  6447,
  6695,
  3059,
  2406,
  950,
  1823,
  9934,
  9038,
  337,
  270,
  310,
  538,
  13606,
  3589,
  14387,
  4857,
  22856,
  18759,
  7959,
  1399,
  1087,
  6688,
  8293,
  874,
  596,
  735,
  819,
  591,
  3323,
  841,
  4062,
  26422,
  19876,
  7389,
  1125,
  716,
  8302,
  814,
  4724,
  794,
  1286,
  1659,
  6047,
  10090,
  3548,
  7060,
  595,
  2163,
  4914,
  930,
  3342,
  257,
  18507,
  5040,
  16020,
  3714,
  4090,
  2517,
  2625,
  952,
  691,
  3108,
  4804,
  4695,
  9128,
  3815,
  8832,
  1664,
  6597,
  26050,
  5431,
  3569,
  725,
  1399,
  8595,
  6483,
  15165,
  6962,
  1837,
