# Step 1: Data Collection

First, we lay out some methods in order to treat our Youtube Data API keys as though they are being fed through a revolving door. If we run queries with an API object built from just one key, we hit quota limits that block us from downloading more information and the program crashes, destroying our data. With our system, an API object is built from the first key in the pool of API keys. When the program encounters an HTTP Error from Google, a new API object is built from the next key in the pool. The new object is passed back to the method running queries and the method carries on making requests to the API. The API keys are a bit like ammunition because once they've exhausted their allocated bandwidth from Google, they are useless for the day.

In [None]:
from apiclient.discovery import build
from apiclient import errors
import ast
import json
import random
# Author: Jared Prior, Cedric Blaise

# API Revolving Door 
# intercepts rejected API requests and spins 
# up a new Youtube build using the next API
# key in the pool. The rejected key is discarded
# and not considered again, as the revolving
# door only dispenses each API key from the 
# pool once.
import os


def load_api_keys(raw):
    """Split a comma-separated string of API keys into an ordered, de-duplicated list.

    Whitespace around each key is stripped and empty entries are ignored.
    Duplicates are dropped because the revolving door dispenses each pool entry
    only once, so a repeated key would just waste a rotation on a key that has
    already been rejected.
    """
    keys = []
    for key in raw.split(","):
        key = key.strip()
        if key and key not in keys:
            keys.append(key)
    return keys


# SECURITY: API keys are credentials and must not live in the notebook source.
# Any key that was ever committed here should be treated as compromised and
# revoked. Supply the pool through the environment instead, e.g.
#   export YOUTUBE_API_KEYS="first-key,second-key,third-key"
current_key = 0  # index of the key the current API build was created from
api_revolving_door = load_api_keys(os.environ.get("YOUTUBE_API_KEYS", ""))
if not api_revolving_door:
    print("No API keys found: set YOUTUBE_API_KEYS to a comma-separated list of keys")
def yt_build(key):
    """Return a Youtube Data API v3 client object built with ``key`` as its developer key."""
    return build("youtube", 'v3', developerKey=key)
def door_spin():
    """Advance to the next API key in the pool and return a client built from it.

    Returns ``None`` (leaving ``current_key`` untouched) when the current key
    is already the last one in ``api_revolving_door``.
    """
    global current_key
    upcoming = current_key + 1
    if upcoming >= len(api_revolving_door):
        # pool exhausted: there is no further key to dispense
        return None
    current_key = upcoming
    print("Spinning up new API build: " + str(current_key))
    return yt_build(api_revolving_door[current_key])

In [None]:
yt = yt_build(api_revolving_door[0])

Then, we craft a function to retrieve comments from any given 
video by its video ID. This function returns all of the comments in a 
dictionary, where the comment authors are keys. Each of their comments
and their number of likes are stored as the values in a heterogeneous list. The video statistics are also retrieved.

In [None]:
def get_video_comments(video_id, yt):
    """Collect the top-level comments and the statistics of one video.

    Args:
        video_id: ID of the video to query.
        yt: API client object used for the requests.

    Returns:
        A pair ``(authors, stats)``. ``authors`` maps each author display name
        to a list of ``[comment_text, like_count]`` entries; ``stats`` is the
        ``statistics`` dictionary of the first item returned for the video.
    """
    last_page_index = 5  # pages are counted from 1, so at most 6 pages are read
    video_listing = yt.videos().list(id=video_id, part="statistics").execute()
    stats = video_listing['items'][0]['statistics']

    authors = {}
    token = None
    pages_read = 0
    while True:
        request = yt.commentThreads().list(videoId=video_id,
                                           part="snippet,replies",
                                           maxResults=100,
                                           pageToken=token)
        response = request.execute()
        for thread in response['items']:
            top_level = thread['snippet']['topLevelComment']['snippet']
            entry = [top_level['textOriginal'], top_level['likeCount']]
            authors.setdefault(top_level['authorDisplayName'], []).append(entry)

        token = response.get('nextPageToken')
        pages_read += 1
        if token is None or pages_read > last_page_index:
            return authors, stats

Our next function retrieves all of a given channel's Youtube
videos and stores each video's video ID, title, date of publishing,
its description, and its statistics. It uses the get_video_comments method to 
create a dictionary of all the comments, and it stores all of these
attributes as a heterogeneous list within a list of other video-lists. I personally think this method is ugly and I will look into thinning it out and making it more legible.

In [None]:
def error_reason(err):
    """Extract the machine-readable ``reason`` from an API HTTP error.

    Args:
        err: error object exposing ``resp`` (mapping of response headers) and
            ``content`` (the raw response body).

    Returns:
        The ``reason`` string of the first entry in the body's
        ``error.errors`` list, or ``"Unexplained"`` when the response is not
        JSON, cannot be parsed, or does not have that structure. A string is
        always returned so callers can safely call ``.split`` on it.
    """
    if not err.resp.get('content-type', '').startswith('application/json'):
        return "Unexplained"
    try:
        reason = json.loads(err.content)['error']['errors'][0]['reason']
    except (ValueError, KeyError, IndexError, TypeError):
        # malformed JSON or an unexpected shape: this function runs inside
        # exception handlers, so it must not raise a second error itself
        return "Unexplained"
    return reason if isinstance(reason, str) else "Unexplained"
def _recover_from_http_error(error, yt, label=""):
    """Log an API error and choose the client to carry on with.

    Returns a ``(client, rotated)`` pair. ``rotated`` is True when the error
    was a 403 that triggered a key rotation; ``client`` is then whatever
    ``door_spin`` returned (``None`` once the key pool is exhausted). For any
    other error the client passed in is handed back unchanged.
    """
    # Checked before the generic 403 branch so that a video with disabled
    # comments never costs a key rotation.
    if (error_reason(error) or "").split(" ")[-1] == "commentsDisabled":
        print("Disabled comments: %s" % error)
        return yt, False
    if error.resp.status == 403:
        print("Error1%s: %s" % (label, error))
        return door_spin(), True
    print("Error2: %s" % error)
    return yt, False


def channel_videos(channel_id, yt, pages):
    """Download a random sample of a channel's uploads together with their comments.

    The channel's "uploads" playlist is walked page by page (50 items per
    request). Roughly 80% of the listed videos are picked at random and
    their comments fetched with ``get_video_comments``; after each page there
    is a ~20% chance that the following page is skipped entirely. Whenever
    the API answers with a 403 the next key from the pool is swapped in via
    ``door_spin``.

    Args:
        channel_id: ID of the channel to download.
        yt: API client object to start with.
        pages: page budget; the walk stops once the count of visited pages
            (page skips are not counted) exceeds this value.

    Returns:
        A list with one ``[title, publish_date, description, stats, authors]``
        entry per sampled video. If the key pool runs dry, or a request keeps
        failing, the videos collected up to that point are returned instead
        of being thrown away. ``None`` is returned only when the channel
        lookup itself could not be completed.
    """
    print("Retrieving data for " + channel_id)
    max_failures = 3  # consecutive non-quota failures tolerated per request

    # Step 1: resolve the channel's uploads playlist.
    res = None
    failures = 0
    while res is None:
        try:
            res = yt.channels().list(id=channel_id, part='contentDetails').execute()
        except errors.HttpError as error:
            yt, rotated = _recover_from_http_error(error, yt)
            if not rotated:
                failures += 1
            if yt is None or failures >= max_failures:
                # no keys left, or an error that retrying does not cure
                print("Breaking")
                return None
    items = res['items']
    playlist_id = items[0]['contentDetails']['relatedPlaylists']['uploads']

    # Step 2: walk the playlist.
    next_page = None
    page = 0
    videos = []
    failures = 0
    while True:
        print("Current API index: " + str(current_key))
        try:
            result = yt.playlistItems().list(playlistId=playlist_id, part='snippet',
                                             maxResults=50,
                                             pageToken=next_page).execute()
        except errors.HttpError as error:
            yt, rotated = _recover_from_http_error(error, yt, " outer")
            if not rotated:
                failures += 1
            if yt is None or failures >= max_failures:
                print("Breaking")
                return videos
            # Retry the same page. This must not fall through to the end-of-
            # playlist test below: on the first page next_page is still None
            # and the walk would stop before a single video was read.
            continue
        failures = 0

        for item in result['items']:
            snippet = item['snippet']
            video_id = snippet['resourceId']['videoId']
            video_title = snippet['title']
            video_publish_date = snippet['publishedAt']
            video_description = snippet['description']
            if random.random() > 0.20:
                try:
                    authors, stats = get_video_comments(video_id, yt)
                except errors.HttpError as error:
                    # The failed video is skipped rather than retried, so a
                    # video that always answers 403 costs at most one key.
                    yt, _ = _recover_from_http_error(error, yt, " inner")
                    if yt is None:
                        print("Breaking")
                        return videos
                else:
                    videos.append([video_title, video_publish_date,
                                   video_description, stats, authors])

        following = result.get('nextPageToken')
        # Only skip when there is a page to skip: with no token the request
        # below would be sent with pageToken=None, i.e. it would fetch the
        # FIRST page and restart the walk from the beginning.
        if following is not None and random.random() > 0.80:
            try:
                next_page = yt.playlistItems().list(playlistId=playlist_id, part='snippet',
                                                    maxResults=50,
                                                    pageToken=following).execute().get('nextPageToken')
            except errors.HttpError as error:
                yt, _ = _recover_from_http_error(error, yt, " outer")
                if yt is None:
                    print("Breaking")
                    return videos
                # could not look past the next page: just visit it instead
                next_page = following
        else:
            next_page = following
            page += 1
            print("Current page: " + str(page))

        if next_page is None or page > pages:
            break
        print(str(len(videos)) + " videos retrieved from the API so far")
    return videos

Some channels might appear as a "user" instead of a "channel," so we convert those by hand.

In [None]:
cnn_id = yt.channels().list(part="id",forUsername='CNN').execute()['items'][0]['id']

In [None]:
fox_id = yt.channels().list(part="id",forUsername='FoxNewsChannel').execute()['items'][0]['id']

In [None]:
ny_times_id = yt.channels().list(part="id",forUsername='TheNewYorkTimes').execute()['items'][0]['id']

In [None]:
msnbc_id = yt.channels().list(part="id",forUsername='msnbcleanforward').execute()['items'][0]['id']

In [None]:
breitbart_id = "UCmgnsaQIK1IR808Ebde-ssA"

Then, we retrieve the videos using our channel IDs and our channel_videos method.

In [None]:
cnn_data = channel_videos(cnn_id, yt, 20) # left wing

In [None]:
fox_data = channel_videos(fox_id, yt, 20) # right wing 

In [None]:
ny_times_data = channel_videos(ny_times_id, yt, 20) # left wing 

In [None]:
msnbc_data = channel_videos(msnbc_id, yt, 20) # left wing

In [None]:
breitbart_data = channel_videos(breitbart_id, yt, 20) # alt right 

# Step 2: Data Caching
We will now process our data and organize it so that 
it may be used for analysis. We will also cache our data
at this point, since Google restricts the number of requests
we can make with the API and we want to make the most of
every request we are allowed. It also takes a frustratingly long
time to make the requests and store the data.

In [None]:
def cache_channel_data(data, channel):
    """Write ``str(data)`` to the text file ``<channel>.txt``, replacing any existing file.

    Args:
        data: the object to cache; it is stored as its ``str()`` representation.
        channel: file name (or path) without the ``.txt`` extension.
    """
    # the context manager guarantees the file is flushed and closed, so the
    # cache is complete on disk before anything tries to read it back
    with open(channel + ".txt", "w") as channel_file:
        channel_file.write(str(data))

We will include an unpackaging 
method so that we can retrieve the data from our text files
in the same format as it is returned in the channel_videos
function.

In [None]:
def unpack_cached_data(channel):
    """Read a cache file and rebuild the Python object stored in it.

    Args:
        channel: path of the cache file to read (including its extension).

    Returns:
        The object obtained by evaluating the file's text as a Python literal.
        ``ast.literal_eval`` only accepts literals, so the file contents are
        never executed as code.
    """
    print("Unpacking data: ", channel)
    with open(channel, "r") as channel_file:
        cached_data = channel_file.read()
    return ast.literal_eval(cached_data)

We now use our caching function to dump our data to text files. We can use the unpacking method to retrieve all the data in its pre-cache format.

In [None]:
cache_channel_data(cnn_data, "CNN")

In [None]:
cache_channel_data(fox_data, "Fox")

In [None]:
cache_channel_data(breitbart_data, "Breitbart News")

In [None]:
cache_channel_data(ny_times_data, "NYTimes")

In [None]:
cache_channel_data(msnbc_data, "MSNBC")

In [None]:
ucnn_data = unpack_cached_data("CNN.txt")
ufox_data = unpack_cached_data("Fox.txt")
ubreitbart_data = unpack_cached_data("Breitbart News.txt")
umsnbc_data = unpack_cached_data("MSNBC.txt")
uny_times_data = unpack_cached_data("NYTimes.txt")

# Step 3: Data Processing and Organization
We'll write a function to process the statistics for each video and return the number of views, likes, dislikes, and comments. But first we will need some helper methods to help us convert the video dates and timestamps to the proper format.

In [None]:
import datetime
def convert_raw_date(date):
    """Split a ``YYYY-MM-DD[T...]`` timestamp string into ``(year, month, day)`` integers."""
    # everything from the first "T" onwards (the time of day) is discarded
    fields = date.partition("T")[0].split("-")
    return int(fields[0]), int(fields[1]), int(fields[2])
def to_integer(dt_time):
    """Encode a date as the integer ``YYYYMMDD`` built from its year, month and day."""
    year_and_month = dt_time.year * 100 + dt_time.month
    return year_and_month * 100 + dt_time.day

In [None]:
def process_video_stats(video):
    """Turn one raw video record into typed values.

    Args:
        video: a ``[title, publish_date, description, stats, authors]`` entry
            as produced by ``channel_videos``.

    Returns:
        ``(title, date, description, views, likes, dislikes, num_comments,
        authors)`` where ``date`` is a ``datetime.date`` and the four counters
        are integers.
    """
    title, publish_date, description, stats, authors = video
    date = datetime.date(*convert_raw_date(publish_date))

    def counter(name):
        # A counter can be absent from the statistics (for example when it is
        # hidden for the video, and dislike counts are no longer public);
        # treat a missing counter as 0 instead of raising KeyError.
        return int(stats.get(name, 0))

    return (title, date, description,
            counter('viewCount'), counter('likeCount'),
            counter('dislikeCount'), counter('commentCount'),
            authors)

We'll use our processing function to build up lists of different values that we can plot against the publishing dates of each video.

In [None]:
def process_videos(videos):
    """Reorganise a list of video records into parallel per-field lists.

    Index ``i`` of every returned list describes video ``i``, e.g.
    ``titles[i]`` and ``like_nums[i]`` are the title and the number of likes
    of the same video.

    Args:
        videos: iterable of records accepted by ``process_video_stats``.

    Returns:
        A 10-tuple, in this order: ``like_nums, dislike_nums, comment_nums,
        dislike_ratios, dates, titles, comments, view_nums, descriptions,
        interactions``. ``dislike_ratios`` holds dislikes / (likes + dislikes)
        and ``interactions`` holds comments + likes + dislikes.
    """
    like_nums = []
    dislike_nums = []
    comment_nums = []
    dislike_ratios = []
    dates = []
    titles = []
    comments = []
    view_nums = []
    interactions = []
    descriptions = []
    for video in videos:
        (title, date, description, views, likes, dislikes,
         num_comments, authors) = process_video_stats(video)
        ratings = likes + dislikes
        like_nums.append(likes)
        dislike_nums.append(dislikes)
        # a video nobody rated has no meaningful ratio; record 0.0 rather
        # than dividing by zero, so the lists stay aligned
        dislike_ratios.append(dislikes / ratings if ratings else 0.0)
        comment_nums.append(num_comments)
        dates.append(date)
        titles.append(title)
        comments.append(authors)
        view_nums.append(views)
        interactions.append(num_comments + ratings)
        descriptions.append(description)
    return like_nums, dislike_nums, comment_nums, dislike_ratios,\
    dates, titles, comments, view_nums, descriptions, interactions

Now, we can process all of our data so that it is more accessible and easier to plot.

In [None]:
processed_cnn_data = process_videos(cnn_data)
processed_fox_data = process_videos(fox_data)
processed_nytimes_data = process_videos(ny_times_data)
processed_msnbc_data = process_videos(msnbc_data)
processed_breitbart_data = process_videos(breitbart_data)

We can write some auxiliary functions to help us merge authors amongst all the channels.

In [None]:
def cross_channel_authors(author_dicts):
    """Merge several author dictionaries into one.

    Args:
        author_dicts: iterable of dictionaries keyed by author.

    Returns:
        A dictionary mapping every author found in any input to the list of
        values stored for that author, one element per input dictionary the
        author appears in, in input order.
    """
    author_dict = {}
    for authors in author_dicts:
        for author, entries in authors.items():
            # setdefault creates the list on first sight of an author; without
            # it nothing was ever inserted and the result was always empty
            author_dict.setdefault(author, []).append(entries)
    return author_dict

In [None]:
def unique_authors(comment_dictionaries):
    """Group comments by author across several per-video comment dictionaries.

    Args:
        comment_dictionaries: iterable of dictionaries (one per video) that
            map an author to that author's comments on the video.

    Returns:
        A dictionary with one key per distinct author; each value is a list
        holding the author's comment collection from every video they
        commented on, in input order.
    """
    by_author = {}
    # iterate the argument itself (the function used to read an unrelated
    # global named `comments` and ignore what it was given)
    for video in comment_dictionaries:
        for author, author_comments in video.items():
            by_author.setdefault(author, []).append(author_comments)
    return by_author

In [None]:
def write_list_to_file(y, title):
    """Write every string in ``y`` to the file ``title``, one per line.

    Args:
        y: iterable of strings.
        title: path of the file to create or overwrite.
    """
    # the context manager closes the file so the last lines are not left
    # sitting in an unflushed buffer
    with open(title, "w") as out:
        out.writelines(x + "\n" for x in y)

# Step 4: Analysis and Plotting

We'll now write a plotting function to plot based on the metrics we've produced from our processing functions.

In [None]:
import numpy
%matplotlib inline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as style
import datetime
from matplotlib.dates import WeekdayLocator
from matplotlib.dates import DayLocator
from matplotlib.dates import (YEARLY, DateFormatter,rrulewrapper, RRuleLocator, drange)
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
import statistics
def plot_data(data, dates, title):
    """Plot a per-video metric against publishing dates, with its mean as a red line.

    Args:
        data: sequence of numbers, one per video. Must not be empty
            (``statistics.mean`` raises on empty input).
        dates: sequence of dates, aligned with ``data``.
        title: title shown above the plot.
    """
    # Style and size have to be set BEFORE the figure is created: applied
    # afterwards they do not affect this plot, and a late plt.figure(...)
    # call only adds a second, empty figure to the output.
    style.use('fivethirtyeight')
    mpl.rcParams['figure.dpi'] = 150
    fig, ax = plt.subplots(figsize=(20, 5))

    y_mean = statistics.mean(data)
    st = "{:.2f}".format(y_mean)

    # markers only, no connecting line (what the deprecated plot_date drew)
    ax.plot(np.array(dates), np.array(data), 'o')
    ax.set_title(title)
    ax.xaxis.set_major_locator(DayLocator(interval=30))  # one tick every 30 days
    ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_tick_params(rotation=30, labelsize=10)
    ax.axhline(y=y_mean, color='r', linestyle='-',
               label='Mean Average: {}'.format(float(st)))
    ax.legend()
    plt.show()
def plot_datas(data, data2, title):
    """Draw a scatter plot of ``data2`` (y axis) against ``data`` (x axis).

    Args:
        data: sequence of x values.
        data2: sequence of y values, aligned with ``data``.
        title: title shown above the plot.
    """
    # set the style before the figure exists so that it applies to this plot
    style.use('fivethirtyeight')
    mpl.rcParams['figure.dpi'] = 150
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.scatter(data, data2, s=2)
    # The linear fit that used to be computed here was never drawn; it has
    # been removed so that it can no longer fail on inputs that cannot be fitted.
    plt.show()

We can use this function to plot video statistics over time from any given channel.

In [None]:
plot_data(processed_cnn_data[0],processed_cnn_data[4], "CNN Video Likes")
plot_data(processed_fox_data[0],processed_fox_data[4], "FOX News Video Likes")

In [None]:
plot_data(processed_cnn_data[1],processed_cnn_data[4], "CNN Video Dislikes")
plot_data(processed_fox_data[1],processed_fox_data[4], "FOX News Video Dislikes")

In [None]:
plot_data(processed_cnn_data[3],processed_cnn_data[4], "CNN Video Dislikes/Total")
plot_data(processed_fox_data[3],processed_fox_data[4], "FOX News Video Dislikes/Total")

In [None]:
plot_data(processed_cnn_data[7],processed_cnn_data[4], "CNN Video Views")
plot_data(processed_fox_data[7],processed_fox_data[4], "FOX News Video Views")

In [None]:
plot_data(processed_cnn_data[2],processed_cnn_data[4], "CNN Video Comments")
plot_data(processed_fox_data[2],processed_fox_data[4], "FOX News Video Comments")

In [None]:
plot_data(processed_cnn_data[9],processed_cnn_data[4], "CNN Video Interactions")
plot_data(processed_fox_data[9],processed_fox_data[4], "FOX News Video Interactions")

If we'd like to do a broad sentiment analysis for each channel's videos, we can write some functions to help us do that. We'll loop through our processed data and retrieve the comments. For each video, we'll find the mean average positive, negative, and compound sentiment scores by aggregating these scores from each comment in the video. We'll also create an overloaded method for analyzing sentiments towards a specific keyword.

In [None]:
analyzer = SentimentIntensityAnalyzer()
def analyze(comment):
    """Score one comment with the shared analyzer.

    Returns the ``(compound, pos, neg)`` entries of its polarity scores.
    """
    scores = analyzer.polarity_scores(comment)
    compound, positive, negative = scores["compound"], scores["pos"], scores["neg"]
    return compound, positive, negative
def _mean_comment_scores(author_dict):
    """Average the sentiment scores over every comment of one video.

    ``author_dict`` maps an author to a list of comments whose first element
    is the comment text. Returns ``(pos, neg, compound)`` means, or ``None``
    when the video has no comments at all (a mean of nothing is undefined).
    """
    scores = [analyze(comment[0])
              for author_comments in author_dict.values()
              for comment in author_comments]
    if not scores:
        return None
    # analyze() yields (compound, pos, neg); transpose to one column per score
    comp, pos, neg = (statistics.mean(column) for column in zip(*scores))
    return pos, neg, comp


def analyze_channel(data):
    """Compute mean comment sentiment for every video of a channel.

    Args:
        data: the 10-tuple returned by ``process_videos``.

    Returns:
        ``(pos_scores, neg_scores, compound_scores)``, three lists with one
        entry per video, in the same order as the lists in ``data``. A video
        without comments gets ``0.0`` for all three scores so that the lists
        stay aligned with the other per-video lists.
    """
    comments = data[6]
    pos_scores = []
    neg_scores = []
    compound_scores = []
    for author_dict in comments:  # one dictionary of comments per video
        scores = _mean_comment_scores(author_dict)
        if scores is None:
            scores = (0.0, 0.0, 0.0)
        pos, neg, comp = scores
        pos_scores.append(pos)
        neg_scores.append(neg)
        compound_scores.append(comp)
    return pos_scores, neg_scores, compound_scores


def analyze_channel_by_keyword(data, keyword):
    """Compute mean comment sentiment for the videos that mention ``keyword``.

    A video is selected when ``keyword`` occurs (case-sensitively) in its
    title or in its description.

    Args:
        data: the 10-tuple returned by ``process_videos``.
        keyword: substring to look for.

    Returns:
        ``(pos_scores, neg_scores, compound_scores, dates_by_keyword)``, four
        lists aligned with each other, one entry per selected video. Selected
        videos without any comments are left out, since they have no
        sentiment to report.
    """
    dates, titles, comments, descriptions = data[4], data[5], data[6], data[8]
    pos_scores = []
    neg_scores = []
    compound_scores = []
    dates_by_keyword = []
    for author_dict, title, description, date in zip(comments, titles,
                                                     descriptions, dates):
        if keyword not in title and keyword not in description:
            continue
        scores = _mean_comment_scores(author_dict)
        if scores is None:
            continue
        pos, neg, comp = scores
        pos_scores.append(pos)
        neg_scores.append(neg)
        compound_scores.append(comp)
        dates_by_keyword.append(date)
    return pos_scores, neg_scores, compound_scores, dates_by_keyword

Now, we'll retrieve the sentiment data for each channel.

In [None]:
cnn_pos, cnn_neg, cnn_comp, cnn_dates = analyze_channel_by_keyword(processed_cnn_data, "Trump")

In [None]:
fox_pos, fox_neg, fox_comp, fox_dates = analyze_channel_by_keyword(processed_fox_data, "Trump")

Then, we'll take the sentiment data we got from our channel analysis with a keyword parameter of "Trump" to see how CNN viewers and Fox viewers tend to speak when the President is mentioned in or is the subject of one of their Youtube videos.

In [None]:
plot_data(cnn_pos,cnn_dates, "CNN Viewer Sentiments (Trump, positive score)") 

In [None]:
plot_data(fox_comp,fox_dates, "Fox Viewer Sentiments (Trump, compound score)")

In [None]:
plot_data(cnn_pos,cnn_dates,"CNN Viewer Sentiments (Trump, positive score)")
plot_data(fox_pos,fox_dates,"Fox Viewer Sentiments (Trump, positive score)")

In [None]:
plot_data(cnn_neg,cnn_dates,"CNN Viewer Sentiments (Trump, negative score)")
plot_data(fox_neg,fox_dates,"Fox Viewer Sentiments (Trump, negative score)")

In [None]:
pos_scores,neg_scores,compound_scores = analyze_channel(processed_cnn_data)

In [None]:
plot_datas(processed_cnn_data[3],pos_scores, "Dislike Ratio Against Positive Sentiment Scores")

In [None]:
plot_datas(processed_cnn_data[3],neg_scores, "Dislike Ratio Against Negative Sentiment Scores")

In [None]:
plot_datas(processed_cnn_data[3],compound_scores, "Dislike Ratio Against Compound Sentiment Scores")

In [None]:
plot_datas(processed_cnn_data[9],compound_scores, "Interactions vs. Compound Sentiment")

# Step 5: Data Pre-Processing for Classification
We'll need to tokenize our data for processing. We'll begin with tokenizing our video titles and descriptions, as they'll need to be narrowed down to keywords.

In [271]:
data_set = [(processed_cnn_data, "CNN"),
            (processed_fox_data, "FOX"),
            (processed_msnbc_data, "MSNBC"),
            (processed_breitbart_data, "Breitbart"),
            (processed_nytimes_data, "NYTimes")]

Since we have a lot of data, if we want to reduce running time, we can abbreviate the data.

In [272]:
# Build a lighter copy of data_set that keeps only every second video of
# each channel (every per-video list is sliced with the same step of 2).
abbreviated_data_set = []
for processed, channel in data_set:
    thinned = tuple(series[::2] for series in processed)
    abbreviated_data_set.append((thinned, channel))

In [273]:
# channel names, in the order the channels appear in abbreviated_data_set
label_names = [entry[1] for entry in abbreviated_data_set]

Our dataset is relatively large (certainly, it's massive for my 8 GB RAM MacBook Pro), so finalizing the pre-processing might take a little while.

In [274]:
from absl import logging
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

def create_comment_frame(data, channel):
    """Flatten one channel's processed data into a DataFrame with one row per comment.

    Args:
        data: processed channel data, indexed positionally: ``data[0]`` likes,
            ``data[1]`` dislikes, ``data[2]`` comment counts, ``data[5]``
            titles, ``data[6]`` per-video comment dictionaries, ``data[7]``
            views and ``data[8]`` descriptions.
        channel: label stored in the ``channel`` column of every row.

    Returns:
        A DataFrame with the columns ``comment``, ``commentLikes``,
        ``authorName``, ``title``, ``description``, ``videoLikes``,
        ``videoDislikes``, ``views``, ``commentNum`` and ``channel``.
    """
    column_names = ('comment', 'commentLikes', 'authorName', 'title',
                    'description', 'videoLikes', 'videoDislikes', 'views',
                    'commentNum')
    columns = {name: [] for name in column_names}
    for video_index, comment_dict in enumerate(data[6]):
        # values shared by every comment left on this video
        video_fields = {
            'videoLikes': data[0][video_index],
            'videoDislikes': data[1][video_index],
            'commentNum': data[2][video_index],
            'title': data[5][video_index],
            'views': data[7][video_index],
            'description': data[8][video_index],
        }
        for author, author_comments in comment_dict.items():
            for entry in author_comments:  # entry is [text, like count]
                columns['comment'].append(entry[0])
                columns['commentLikes'].append(entry[1])
                columns['authorName'].append(author)
                for name, value in video_fields.items():
                    columns[name].append(value)
    channel_comments = pd.DataFrame.from_dict(columns)
    channel_comments['channel'] = channel
    return channel_comments
def create_comment_dataframe(dataset):
    """Combine the comments of several channels into one shuffled DataFrame.

    Args:
        dataset: non-empty sequence of ``(processed_data, channel_name)``
            entries, each suitable for ``create_comment_frame``.

    Returns:
        A DataFrame holding the rows of every channel in random order, with a
        fresh 0..n-1 index.
    """
    frames = [create_comment_frame(entry[0], entry[1]) for entry in dataset]
    # Concatenate once and shuffle once: doing both inside the loop copied the
    # growing frame for every channel without making the result any more random.
    return pd.concat(frames).sample(frac=1).reset_index(drop=True)

In [275]:
all_comments_dataframe = create_comment_dataframe(data_set)

In [276]:
all_comments_dataframe.head()

Unnamed: 0,comment,commentLikes,authorName,title,description,videoLikes,videoDislikes,views,commentNum,channel
0,NPV is an initiative that is underway to do aw...,0,Self Learning,McCarthy on impeachment: 'yesterday was no cau...,House Minority Leader McCarthy holds his weekl...,6698,204,308208,1318,FOX
1,If you know your training your replacement tea...,0,Grant Smith,'Tucker Carlson Tonight' investigates: Are US ...,"AT&T outsourcing American jobs, forcing employ...",7652,216,240931,2618,FOX
2,"Macau, HK, Tibet... Japan,Portugal,UK,USA atta...",0,Ronald Dumb,New York Times reporter: Uyghurs say Xinjiang ...,"CNN's Rosemary Church interviews Austin Ramzy,...",879,264,78196,1318,CNN
3,Has anyone asked Cuomo if he wants to be the n...,0,John Whale,Brit Hume analyzes the impact of coronavirus c...,Fox News senior political analyst Brit Hume jo...,1501,142,91328,924,FOX
4,Nobody cares about basketball anymore. Whateve...,0,mark tito,Stephen A. Smith makes prediction about NBA se...,ESPN's Stephen A. Smith explains why he believ...,725,193,84396,663,CNN


# Step 6: Classification
So we have our dataframes all loaded up! Pandas is great because it allows us to perform vectorized operations on our data, which is fairly dense. We will divide our dataframe into a training and test set and we will create text embeddings for our features. I've adapted some code used for sentiment analysis to suit our purposes. I've organized things so that our model considers each comment's content and its context (so the actual language and the title, description, and statistics of the video it was left on). The model treats the channel names as labels, so when it is trained and shown comments, it can make predictions about which channel the comments originated from. Here, we will first train the model to try to recognize channel origins for just singular comments. Then, we will merge comments to form a corpus of comment-books for each video, treating a video as though it were an author of a book of all of its comments, and we will try to train the model to recognize which channel a body of comments came from.

https://www.tensorflow.org/tutorials/keras/text_classification_with_hub

We will be using a Deep Neural Network classifier from TensorFlow. The folks at Google's TensorFlow Hub put together a Jupyter notebook in which they trained a classifier to classify sentiment polarity in movie reviews. I adapted this model so that it can use multiple labels (our channel names), and multiple features (the fields from our dataframes, which I created from our processed data).

In [277]:
import sklearn
from sklearn.model_selection import train_test_split
# Hold out 20% of the comments as a test set. Fixing random_state makes the
# shuffle deterministic, so the same rows land in each split on every run.
train_df, test_df = train_test_split(
    all_comments_dataframe, test_size=0.2, random_state=42)

Above, we divided our dataframe into a training set and a test set (an 80/20 split). Next, we build the input functions that feed each set to the estimator.

In [278]:
# Training input: the whole training set, shuffled, with no cap on the number
# of epochs (num_epochs=None).
train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=train_df,
    y=train_df["channel"],
    num_epochs=None,
    shuffle=True,
)
# Evaluation input: the whole training set, unshuffled.
predict_train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=train_df,
    y=train_df["channel"],
    shuffle=False,
)
# Evaluation input: the held-out test set, unshuffled.
predict_test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=test_df,
    y=test_df["channel"],
    shuffle=False,
)

Then, we'll create feature columns for all our features: text embeddings for the comment, the video description, the name of the comment's author, and the title of the video the comment was left on, plus a numeric column for the number of likes on each comment.

In [279]:
# The four free-text fields all use the same TensorFlow Hub NNLM embedding
# module; one embedding column is built per field.
(embedded_text_feature_column,
 embedded_text_feature_column2,
 embedded_text_feature_column4,
 embedded_text_feature_column5) = [
    hub.text_embedding_column(
        key=text_field,
        module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")
    for text_field in ("comment", "description", "authorName", "title")
]
# Despite its name, column 3 is not an embedding: the comment's like count is
# passed through as a single float32 value.
embedded_text_feature_column3 = tf.feature_column.numeric_column(
    key="commentLikes",
    shape=(1,),
    default_value=None,
    dtype=tf.dtypes.float32,
    normalizer_fn=None,
)

Now, we will pass our features and labels to a TensorFlow DNN estimator and fire up the estimator so that we can run training.

In [None]:
# DNN classifier with two hidden layers (500 and 100 units) that predicts the
# channel label from the four text-embedding columns and the like count.
estimator = tf.estimator.DNNClassifier(
    hidden_units=[500, 100],
    feature_columns=[embedded_text_feature_column,
                     embedded_text_feature_column2,
                     embedded_text_feature_column3,
                     embedded_text_feature_column4,
                     embedded_text_feature_column5],
    label_vocabulary=label_names,
    # Derive the class count from the vocabulary so the two cannot drift apart
    # if a channel is added or removed.
    n_classes=len(label_names),
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.003))

In [None]:
# Train for 5000 steps on the training input function. The trailing semicolon
# keeps the notebook from echoing the call's return value.
estimator.train(input_fn=train_input_fn, steps=5000);

In [284]:
# Evaluate the trained estimator on both splits first, then report the
# "accuracy" metric from each result dict.
train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
print("Training set accuracy: {}".format(train_eval_result["accuracy"]))
print("Test set accuracy: {}".format(test_eval_result["accuracy"]))

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2020-05-05T20:46:02Z


INFO:tensorflow:Starting evaluation at 2020-05-05T20:46:02Z


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from /var/folders/tn/fcl1261n75l3455rc43h8df80000gn/T/tmps6lcsp55/model.ckpt-5000


INFO:tensorflow:Restoring parameters from /var/folders/tn/fcl1261n75l3455rc43h8df80000gn/T/tmps6lcsp55/model.ckpt-5000


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Inference Time : 114.23019s


INFO:tensorflow:Inference Time : 114.23019s


INFO:tensorflow:Finished evaluation at 2020-05-05-20:47:56


INFO:tensorflow:Finished evaluation at 2020-05-05-20:47:56


INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.9855486, average_loss = 0.06835699, global_step = 5000, loss = 0.06835705


INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.9855486, average_loss = 0.06835699, global_step = 5000, loss = 0.06835705


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: /var/folders/tn/fcl1261n75l3455rc43h8df80000gn/T/tmps6lcsp55/model.ckpt-5000


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: /var/folders/tn/fcl1261n75l3455rc43h8df80000gn/T/tmps6lcsp55/model.ckpt-5000


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2020-05-05T20:47:57Z


INFO:tensorflow:Starting evaluation at 2020-05-05T20:47:57Z


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from /var/folders/tn/fcl1261n75l3455rc43h8df80000gn/T/tmps6lcsp55/model.ckpt-5000


INFO:tensorflow:Restoring parameters from /var/folders/tn/fcl1261n75l3455rc43h8df80000gn/T/tmps6lcsp55/model.ckpt-5000


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Inference Time : 34.36947s


INFO:tensorflow:Inference Time : 34.36947s


INFO:tensorflow:Finished evaluation at 2020-05-05-20:48:32


INFO:tensorflow:Finished evaluation at 2020-05-05-20:48:32


INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.9858319, average_loss = 0.06781122, global_step = 5000, loss = 0.0678078


INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.9858319, average_loss = 0.06781122, global_step = 5000, loss = 0.0678078


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: /var/folders/tn/fcl1261n75l3455rc43h8df80000gn/T/tmps6lcsp55/model.ckpt-5000


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: /var/folders/tn/fcl1261n75l3455rc43h8df80000gn/T/tmps6lcsp55/model.ckpt-5000


Training set accuracy: 0.9855486154556274
Test set accuracy: 0.9858319163322449


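These results seem too good to be true. A likely cause is label leakage: every comment carries the title and description of its video, and those often name the channel outright (e.g. "CNN's ...", "Fox News ..."). Because the split is done per comment rather than per video, comments from the same video end up in both the training set and the test set.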

In [287]:
# Empty placeholder for the hand-built sample data; the next cell re-creates
# and fills it.
test_dataframe = {}

In [289]:
# Hand-written sample rows for spot-checking the classifier. The values of
# each row appear in the order named by `row_fields`.
row_fields = ('comment', 'commentLikes', 'authorName', 'videoLikes',
              'videoDislikes', 'commentNum', 'title', 'description', 'views')
comment_list = [["This is the DUMBEST president ever in HISTORY! 🤔",
                 80,
                 "MagicSantos",
                 452,
                 53,
                 383,
                 "Trump explains why he plans to wind down the coronavirus task force",
                 "CNN's Anderson Cooper reports that President Donald Trump plans to wind down the coronavirus task force near the end of May. #CNN #News",
                 4970]]

# One empty list per column; the key order here fixes the column order of the
# final DataFrame.
test_dataframe = {
    column: []
    for column in ('comment', 'commentLikes', 'authorName', 'title',
                   'description', 'videoLikes', 'videoDislikes', 'views',
                   'commentNum')
}
# Route every value of every row into the list for its field.
for comment in comment_list:
    for field, value in zip(row_fields, comment):
        test_dataframe[field].append(value)
test_dataframe = pd.DataFrame.from_dict(test_dataframe)