# Importing Required Modules

In [1]:
import json
import os
import re
import time
import urllib.request as req
import webbrowser
from html import escape
from pathlib import Path

import demoji
import numpy as np
import pandas as pd
import requests
from langdetect import detect
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from pytube import YouTube
from stop_words import get_stop_words

ModuleNotFoundError: No module named 'demoji'

##### Activating AutoCompleter in Notebook

In [None]:
# Set IPython's Completer.use_jedi option to False for this session.
%config Completer.use_jedi=False

# Constants

In [None]:
# --- Endpoints and request settings used by the comment downloader ---
YOUTUBE_VIDEO_URL = 'https://www.youtube.com/watch?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_service_ajax'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

# Values accepted by the `sort_by` / `sort` arguments of the download functions.
SORT_BY_POPULAR = 0
SORT_BY_RECENT = 1

# Same watch-page address as YOUTUBE_VIDEO_URL, with a positional placeholder.
URL = 'https://www.youtube.com/watch?v={}'

# --- Output path templates ---
# Forward slashes are used everywhere (as FILENAME already did): they are
# accepted by Python's file APIs on Windows as well as on POSIX systems, and
# they are the correct separator when the plot paths are embedded in HTML.
FILENAME = 'json/{}.json'          # cached comments, formatted with the video id
PIEPLOT = 'plots/pie_{}_{}.png'    # formatted with (video id, chart title without spaces)
BARPLOT = 'plots/bar_{}_{}.png'    # formatted with (video id, chart title without spaces)
SUBPLOT = 'plots/sub_{}_{}.png'    # formatted with (video id, chart title without spaces)
HTML = 'webpages/{}.html'          # generated report page, formatted with the video id

# API to retrieve youtube comments
### Do not Edit the following cell

In [None]:
def find_value(html, key, num_chars=2, separator='"'):
    """Return the text that follows `key` in `html`, up to the next `separator`.

    The returned slice begins `num_chars` characters after the end of the
    first occurrence of `key` and ends right before the first `separator`
    located at or after that position.
    """
    key_offset = html.find(key)
    value_start = key_offset + len(key) + num_chars
    value_stop = html.find(separator, value_start)
    return html[slice(value_start, value_stop)]

def ajax_request(session, url, params=None, data=None, headers=None, retries=5, sleep=20):
    """POST to `url` through `session`, retrying on unexpected status codes.

    Parameters
        session : object exposing `post(url, params=..., data=..., headers=...)`
        url     : address to post to
        params, data, headers : forwarded unchanged to `session.post`
        retries : maximum number of attempts
        sleep   : seconds to wait between two attempts

    Returns
        The decoded JSON body of the first response with status 200.
        An empty dict when the server answers 403 or 413, or when every
        attempt failed (previously the latter case returned None implicitly;
        both values are falsy, so callers testing `if not response` behave
        the same).
    """
    for attempt in range(retries):
        response = session.post(url, params=params, data=data, headers=headers)
        if response.status_code == 200:
            return response.json()
        if response.status_code in (403, 413):
            return {}
        # Only wait when another attempt is actually going to happen.
        if attempt < retries - 1:
            time.sleep(sleep)
    return {}

def download_comments(youtube_id, sort_by=SORT_BY_RECENT, sleep=.1):
    """Generator that yields the comments of a YouTube video, one dict at a time.

    Parameters
        youtube_id : video id inserted into YOUTUBE_VIDEO_URL
        sort_by    : index into the sort menu returned by the server
                     (SORT_BY_POPULAR or SORT_BY_RECENT)
        sleep      : seconds to wait after each AJAX response has been handled

    Yields
        dict with the keys 'cid', 'text', 'time', 'author', 'channel',
        'votes', 'photo' and 'heart'.

    Raises
        RuntimeError : when a response contains 'externalErrorMessage', or when
                       `sort_by` is not a valid index into the sort menu.

    The generator ends without yielding anything when no
    'nextContinuationData' is found in the page data.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT
    response = session.get(YOUTUBE_VIDEO_URL.format(youtube_id=youtube_id))

    # NOTE(review): 'uxe=' in the final URL presumably marks a redirect to a
    # consent page; a CONSENT cookie is set and the page is requested again.
    if 'uxe=' in response.request.url:
        session.cookies.set('CONSENT', 'YES+cb', domain='.youtube.com')
        response = session.get(YOUTUBE_VIDEO_URL.format(youtube_id=youtube_id))
    
    ncd = None
    html = response.text
    # The token is sent back as 'session_token' with every AJAX request below.
    session_token = find_value(html, 'XSRF_TOKEN', 3)
    session_token = session_token.encode('ascii').decode('unicode-escape')
    
    # Parse the JSON object assigned to `ytInitialData` in the page; the '}'
    # consumed by the '};' separator is appended again before decoding.
    data = json.loads(find_value(html, 'var ytInitialData = ', 0, '};') + '}')
    # Use the first 'nextContinuationData' found inside an 'itemSectionRenderer'.
    for renderer in search_dict(data, 'itemSectionRenderer'):
        ncd = next(search_dict(renderer, 'nextContinuationData'), None)
        if ncd: break

    if not ncd:
        # Comments disabled?
        return

    # The sort menu is only consulted when something other than index 0 is requested.
    needs_sorting = sort_by != SORT_BY_POPULAR
    # Work list of (continuation token, click tracking params, action name).
    continuations = [(ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comments')]
    while continuations:
        # pop() takes the most recently added entry.
        continuation, itct, action = continuations.pop()
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL,
                                params={action: 1,
                                        'pbj': 1,
                                        'ctoken': continuation,
                                        'continuation': continuation,
                                        'itct': itct},
                                data={'session_token': session_token},
                                headers={'X-YouTube-Client-Name': '1',
                                         'X-YouTube-Client-Version': '2.20201202.06.01'})

        # An empty/None result from ajax_request stops the download.
        if not response:
            break
        if list(search_dict(response, 'externalErrorMessage')):
            raise RuntimeError('Error returned from server: ' + next(search_dict(response, 'externalErrorMessage')))

        if needs_sorting:
            # Replace the work list with the continuation of the requested
            # sort menu entry and start over; this happens at most once.
            sort_menu = next(search_dict(response, 'sortFilterSubMenuRenderer'), {}).get('subMenuItems', [])
            if sort_by < len(sort_menu):
                ncd = sort_menu[sort_by]['continuation']['reloadContinuationData']
                continuations = [(ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comments')]
                needs_sorting = False
                continue
            raise RuntimeError('Failed to set sorting')

        if action == 'action_get_comments':
            section = next(search_dict(response, 'itemSectionContinuation'), {})
            # Continuations listed on the section are queued as further comment requests.
            for continuation in section.get('continuations', []):
                ncd = continuation['nextContinuationData']
                continuations.append((ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comments'))
            # Continuations found inside individual items are queued as reply requests.
            for item in section.get('contents', []):
                continuations.extend([(ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comment_replies')
                                      for ncd in search_dict(item, 'nextContinuationData')])

        elif action == 'action_get_comment_replies':
            continuations.extend([(ncd['continuation'], ncd['clickTrackingParams'], 'action_get_comment_replies')
                                  for ncd in search_dict(response, 'nextContinuationData')])

        # Emit one dict per 'commentRenderer' found anywhere in the response.
        for comment in search_dict(response, 'commentRenderer'):
            yield {'cid': comment['commentId'],
                   'text': ''.join([c['text'] for c in comment['contentText'].get('runs', [])]),
                   'time': comment['publishedTimeText']['runs'][0]['text'],
                   'author': comment.get('authorText', {}).get('simpleText', ''),
                   'channel': comment['authorEndpoint']['browseEndpoint']['browseId'],
                   'votes': comment.get('voteCount', {}).get('simpleText', '0'),
                   'photo': comment['authorThumbnail']['thumbnails'][-1]['url'],
                   'heart': next(search_dict(comment, 'isHearted'), False)}
        time.sleep(sleep)

def search_dict(partial, search_key):
    """Lazily yield every value stored under `search_key` anywhere in `partial`.

    `partial` may be any nesting of dicts and lists. The structure is walked
    with an explicit stack (last pushed, first visited). A value found under
    `search_key` is yielded as-is and is not searched any further; every other
    dict value and every list element is queued for inspection.
    """
    pending = [partial]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            for name, child in node.items():
                if name != search_key:
                    pending.append(child)
                    continue
                yield child
            continue
        if isinstance(node, list):
            pending.extend(node)

#### Function for Downloading emoji codes and stopwords

In [None]:
def downloadEmojiCodes():
    """Call `demoji.download_codes()`; takes no arguments and returns nothing."""
    demoji.download_codes()
def downloadNLTKStopWords():
    """Download the NLTK 'stopwords' resource through `nltk.download`."""
    # Imported here because nltk itself is only needed for this download.
    from nltk import download as nltk_download
    nltk_download('stopwords')

# Function to get Video Details

In [None]:
def getVideoInfo(video_id):
    """Return the watch URL, title and thumbnail URL of a video.

    Parameters
        video_id : (str) id of the video

    Returns
        (url, title, thumbnail_url), where `url` is built from
        YOUTUBE_VIDEO_URL and the other two values are read from the
        pytube `YouTube` object created for that URL.
    """
    url = YOUTUBE_VIDEO_URL.format(youtube_id=video_id)
    video = YouTube(url)
    # Only the attributes that are returned are read; the former `views` and
    # `rating` lookups were never used.
    return url, video.title, video.thumbnail_url

# Function to open link in browser

In [None]:
def openInBrowser(link):
    """Open `link` in a new browser tab.

    Parameters
        link : (str) a URL, or the path of a local file

    When `link` is the path of an existing local file it is converted to an
    absolute `file://` URI first, because handing a bare (relative) filename
    to `webbrowser` is not portable across platforms. Anything else is passed
    through unchanged.
    """
    if os.path.exists(link):
        link = Path(link).resolve().as_uri()
    webbrowser.open_new_tab(link)

# Function to Download Comments

In [None]:
def downloadComments(youtube_id = None, limit=10, sort=1):
    """Download up to `limit` comments of a video and return them as a list.

    Parameters
        youtube_id : (str) id of the video; when empty, the id, the limit and
                     the sort order are asked for interactively
        limit      : (int) maximum number of comments to collect (must be > 0)
        sort       : (int) 0 for popular, 1 for recent comments

    Returns
        list of comment dictionaries as produced by `download_comments`.
        Errors raised during the download are printed, not propagated, and
        the comments collected so far are still returned.
    """
    if not youtube_id:
        youtube_id = input("Enter Youtube Video ID :").strip()
        limit_text = input("Enter number of comments to be retrieved :").strip()
        sort_text = input("Enter sorting order \n(download popular (0) ; recent comments (1). Defaults to 1) \t :").strip()
        # A blank answer keeps the value passed by the caller (the prompt
        # promises a default) instead of failing inside int('').
        if limit_text:
            limit = int(limit_text)
        if sort_text:
            sort = int(sort_text)
    comments = []
    try:
        if not youtube_id:
            raise ValueError('You need to specify a Youtube ID')
        if limit <= 0:
            raise ValueError('limit must be a positive number')
        print('Downloading Youtube comments for video :', youtube_id)
        # 100-character ruler; the progress bar below fills it with '*'.
        print('_' * 100)
        stars_printed = 0
        for comment in download_comments(youtube_id, sort):
            comments.append(comment)
            # Print however many stars are due, so the bar also advances when
            # one comment is worth more than one percent (limit < 100).
            stars_due = min(100, int(len(comments) * 100 / limit))
            if stars_due > stars_printed:
                print('*' * (stars_due - stars_printed), end='')
                stars_printed = stars_due
            if len(comments) >= limit:
                break
        if comments:
            print('\nDownloaded')
        else:
            print('\nNo Comments Found')
    except Exception as e:
        # Best effort: report the problem and keep what was downloaded so far.
        print('\nError :', str(e))
    print('{} Comments Downloaded'.format(len(comments)))
    return comments

# Function to get Views, Likes, Dislikes count

In [None]:
def getVLDCount(video_id, html=None):
    """Extract the view, like and dislike counts from a video's watch page.

    Parameters
        video_id : (str) id of the video; used to download the page when
                   `html` is not supplied
        html     : (str, optional) page source to parse instead of
                   downloading it

    Returns
        (views, likes, dislikes) as ints. For each of them the first non-zero
        number followed by the matching word in the page is used; a count
        that is not found stays 0.
    """
    if html is None:
        # The context manager closes the HTTP response once it has been read.
        with req.urlopen(URL.format(video_id)) as response:
            html = response.read().decode('utf-8', errors='replace')
    counts = {'views': 0, 'likes': 0, 'dislikes': 0}
    # Numbers may contain thousands separators, e.g. "1,234,567 views".
    matches = re.findall(r'(\d+(?:,\d+)*) (dislikes|likes|views|comments)', html)
    for number, label in matches:
        value = int(number.replace(',', ''))
        # 'comments' matches are ignored; only the first non-zero value counts.
        if label in counts and value and not counts[label]:
            counts[label] = value
    return counts['views'], counts['likes'], counts['dislikes']

# Function to Download the requirements

In [None]:
def getRequirements(video_id, limit=1000, sort=1):
    """
        Load the comments of a video from the local JSON cache, downloading
        (and caching) them when no usable cache file exists, and fetch the
        current view / like / dislike counts.

        This Function takes the following inputs
            video_id : (str) id of the video
            limit(optional) : (int) number of comments to be downloaded
            sort(optional) : (int) 0 (popular) or 1 (recent); defaults to 1
        Returns
            comments : list of dictionaries
            views : no_of_views
            likes : no_of_likes
            dislikes : no_of_dislikes
    """
    FILE_NAME = FILENAME.format(video_id)
    try:
        with open(FILE_NAME) as J:
            comments = json.load(J)
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or invalid JSON (ValueError).
        print('Dataset Not Found. Downloading...')
        comments = downloadComments(video_id, limit, sort)
        # An empty result is not cached, otherwise a failed download would be
        # reloaded as a valid but empty dataset on every later run.
        if comments:
            try:
                folder = os.path.dirname(FILE_NAME)
                if folder:
                    os.makedirs(folder, exist_ok=True)
                with open(FILE_NAME, 'w') as J:
                    json.dump(comments, J)
            except (OSError, TypeError, ValueError):
                print('Unable to save the Dataset')
    else:
        print('Dataset Exists. Opening...')
        print('Opened')
        print('{} Comments Found'.format(len(comments)))
    views, likes, dislikes = getVLDCount(video_id)
    return comments, views, likes, dislikes

# Function to detect language

In [None]:
def detect_lang(x):
    """Return the language detected for the text `x`, or None.

    None is returned when `x` is empty/None or when `detect` raises an error.
    Only `Exception` subclasses are suppressed, so KeyboardInterrupt and
    SystemExit still stop a long-running `apply` over many comments.
    """
    if not x:
        return None
    try:
        return detect(x)
    except Exception:
        return None

# Function to process the comments

In [None]:
def _parse_votes(text):
    """Convert a vote label such as '12', '1.2K' or '3M' to a float (NaN if unparseable)."""
    cleaned = str(text).strip()
    if not cleaned:
        return 0.0
    multipliers = {'k': 1000, 'm': 1000000, 'b': 1000000000}
    suffix = cleaned[-1].lower()
    try:
        if suffix in multipliers:
            return float(cleaned[:-1]) * multipliers[suffix]
        return float(cleaned)
    except ValueError:
        return float('nan')


def _safe_ratio(numerator, denominator, default=0.5):
    """Return numerator / denominator, or `default` when the ratio is undefined."""
    if not denominator or not np.isfinite(denominator):
        return default
    return numerator / denominator


def process(comments, likes=1, dislikes=0):
    """Score the sentiment of a video from its likes, comment emojis and words.

    Parameters
        comments : list of comment dictionaries (needs the 'text' and 'votes' keys)
        likes    : (int) number of likes
        dislikes : (int) number of dislikes

    Returns
        df_comments        : DataFrame of the English comments with the derived columns
        likes_per          : likes / (likes + dislikes)
        emoji_per          : positive emoji score / total absolute emoji score
        word_per           : mean share of non-negative words per comment
        result             : 0.70 * likes_per + 0.20 * emoji_per + 0.10 * word_per
        pos_comments_count : number of comments without any negative word
        neg_comments_count : number of comments with at least one negative word

        A ratio that cannot be computed (no likes/dislikes, no emojis, no
        English comments) is reported as the neutral value 0.5 instead of
        raising ZeroDivisionError or turning into NaN.

    Raises
        ValueError : when `comments` is empty or has no 'text' field.

    Reads 'Emoji_Sentiment_Data.csv' and 'negative_words.txt' from the
    current working directory.
    """
    print('Processing DataFrame...')
    df_comments = pd.DataFrame(comments)
    if df_comments.empty or 'text' not in df_comments.columns:
        raise ValueError('No comments to process')

    # --- Emoji lexicon -----------------------------------------------------
    df_emoji_data = pd.read_csv('Emoji_Sentiment_Data.csv')[['Emoji', 'Negative', 'Neutral', 'Positive']]
    # NOTE(review): formula kept as written; the two Neutral terms reduce to
    # 2000 / Neutral, which is infinite for rows where Neutral is 0.
    df_emoji_data['emoji_score'] = df_emoji_data['Positive'] - df_emoji_data['Negative'] + (60 / df_emoji_data['Neutral'] * 100) - (40 / df_emoji_data['Neutral'] * 100)
    # Non-finite scores are dropped so a single emoji cannot turn the totals into inf/NaN.
    emoji_scores = {emoji: score
                    for emoji, score in df_emoji_data.set_index('Emoji')['emoji_score'].items()
                    if np.isfinite(score)}

    # --- Word lists --------------------------------------------------------
    pattern = r"[^\w']+"
    # Both stop-word sources are merged (the merged list used to be
    # overwritten by the NLTK list alone) and stored lower-cased in a set.
    stop_words = {word.lower() for word in get_stop_words('en')}
    try:
        nltk_words = stopwords.words('english')
    except LookupError:
        # NLTK raises LookupError while the corpus has not been downloaded yet.
        downloadNLTKStopWords()
        nltk_words = stopwords.words('english')
    stop_words.update(word.lower() for word in nltk_words)
    with open('negative_words.txt', 'r') as f:
        negative_words_data = {word.lower() for word in f.read().split()}

    # --- Emojis and votes --------------------------------------------------
    try:
        df_comments['emojies'] = df_comments['text'].apply(lambda x: demoji.findall_list(x, desc=False))
    except Exception:
        # Retry once after fetching the emoji codes.
        downloadEmojiCodes()
        df_comments['emojies'] = df_comments['text'].apply(lambda x: demoji.findall_list(x, desc=False))
    df_comments['comments'] = df_comments['text'].apply(lambda x: demoji.replace(x, ""))
    df_comments['votes'] = df_comments['votes'].apply(_parse_votes)
    df_comments['emoji_scores'] = df_comments['emojies'].apply(
        lambda found: sum(emoji_scores.get(emoji, 0) for emoji in found))
    df_comments['emoji_scores'] = df_comments['emoji_scores'].fillna(0)

    # --- Keep English comments only ------------------------------------------
    df_comments['language'] = df_comments['comments'].apply(detect_lang)
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not operate on a slice of the unfiltered frame.
    df_comments = df_comments[df_comments['language'] == 'en'].copy()

    # --- Word based score ------------------------------------------------------
    df_comments['comments'] = df_comments['comments'].apply(lambda x: re.sub(pattern, " ", x))
    # Matching is case-insensitive so that e.g. "The" or "BAD" are recognised.
    df_comments['comments_stopword_less'] = df_comments['comments'].apply(
        lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words))
    df_comments['words'] = df_comments['comments_stopword_less'].apply(lambda x: x.split())
    df_comments['negative_words'] = df_comments['words'].apply(
        lambda words: [word for word in words if word.lower() in negative_words_data])
    negative_counts = df_comments['negative_words'].apply(len)
    word_counts = df_comments['words'].apply(len)
    # Comments without any remaining word produce 0/0 = NaN and are scored 0.
    df_comments['word_scores'] = 1 - negative_counts / word_counts
    df_comments['word_scores'] = df_comments['word_scores'].fillna(0)

    # --- Aggregates -------------------------------------------------------------
    word_per = _safe_ratio(df_comments['word_scores'].sum(), len(df_comments))
    likes_per = _safe_ratio(likes, likes + dislikes)
    pve = df_comments[df_comments['emoji_scores'] > 0]['emoji_scores'].sum()
    nve = -df_comments[df_comments['emoji_scores'] < 0]['emoji_scores'].sum()
    emoji_per = _safe_ratio(pve, pve + nve)
    df_comments['neg_comment'] = df_comments['negative_words'].apply(lambda x: 1 if len(x) > 0 else 0)
    neg_comments_count = df_comments['neg_comment'].sum()
    pos_comments_count = len(df_comments['neg_comment']) - neg_comments_count
    result = (likes_per * 0.70) + (emoji_per * 0.20) + (word_per * 0.10)
    return df_comments, likes_per, emoji_per, word_per, result, pos_comments_count, neg_comments_count

# Function for Pieplot

In [None]:
def piePlot(values, title='Chart', explode=(0.2, 0), colors=['green', 'red'], labels=['positive', 'negative'],\
            shadow=True, startangle=0, autopct='%1.2f%%', save=0, video_id=None):
    """Draw a pie chart, optionally save it as a PNG, then show it.

    Parameters
        values     : wedge sizes, one per label
        title      : chart title; with spaces removed it is also part of the file name
        explode, colors, labels, shadow, startangle, autopct : forwarded to `plt.pie`
        save       : when truthy, the chart is written to PIEPLOT.format(video_id, title-without-spaces)
        video_id   : (str, optional) id used in the file name; when omitted, the
                     module-level `video_id` variable is used (previous behaviour)

    Raises
        ValueError : when saving is requested and no video id is available.
    """
    # `startangle` used to be accepted but never reached matplotlib.
    plt.pie(values, labels=labels, autopct=autopct, explode=explode, shadow=shadow,
            startangle=startangle, colors=colors)
    plt.legend()
    plt.title(title)
    if save:
        if video_id is None:
            video_id = globals().get('video_id')
        if video_id is None:
            raise ValueError('video_id is required to save the plot')
        f_name = PIEPLOT.format(video_id, ''.join(title.split()))
        folder = os.path.dirname(f_name)
        if folder:
            # savefig does not create missing directories.
            os.makedirs(folder, exist_ok=True)
        plt.savefig(f_name)
        print('{} Saved'.format(f_name))
    plt.show()
    plt.clf()

# Function for BarPlot

In [None]:
def barPlot(values, title='Chart', labels=['positive', 'negative'], colors=['green', 'red'],\
            xlabel='Positive - Negative', ylabel='Positive - Negative Count', save=0, video_id=None):
    """Draw a bar chart, optionally save it as a PNG, then show it.

    Parameters
        values   : bar heights, one per label
        title    : chart title; with spaces removed it is also part of the file name
        labels   : bar labels (x positions)
        colors   : bar colours
        xlabel, ylabel : axis labels
        save     : when truthy, the chart is written to BARPLOT.format(video_id, title-without-spaces)
        video_id : (str, optional) id used in the file name; when omitted, the
                   module-level `video_id` variable is used (previous behaviour)

    Raises
        ValueError : when saving is requested and no video id is available.
    """
    plt.bar(labels, values, color=colors)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    if save:
        if video_id is None:
            video_id = globals().get('video_id')
        if video_id is None:
            raise ValueError('video_id is required to save the plot')
        f_name = BARPLOT.format(video_id, ''.join(title.split()))
        folder = os.path.dirname(f_name)
        if folder:
            # savefig does not create missing directories.
            os.makedirs(folder, exist_ok=True)
        plt.savefig(f_name)
        print('{} Saved'.format(f_name))
    plt.show()
    plt.clf()

# Function for SubPlot

In [None]:
def subPlot(value1, value2, value3, value4, title='Chart', plotlabels=['Likes', 'Emojies', 'Comments', 'Result'], explode=(0.2, 0),\
            colors=['green', 'red'], labels=['positive', 'negative'], shadow=True, startangle=0, autopct='%1.2f%%', save=0,
            video_id=None):
    """Draw four pie charts on a 2x2 grid, optionally save the figure, then show it.

    Parameters
        value1..value4 : wedge sizes of the four pies, placed row by row
        title      : with spaces removed, part of the file name when saving
        plotlabels : titles of the four pies, in the same order as the values
        explode, colors, labels, shadow, startangle, autopct : forwarded to every `pie` call
        save       : when truthy, the figure is written to SUBPLOT.format(video_id, title-without-spaces)
        video_id   : (str, optional) id used in the file name; when omitted, the
                     module-level `video_id` variable is used (previous behaviour)

    Raises
        ValueError : when saving is requested and no video id is available.
    """
    fig, axs = plt.subplots(2, 2, figsize=(10, 10))
    # axs.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
    for ax, values, plot_title in zip(axs.flat, (value1, value2, value3, value4), plotlabels):
        # `startangle` used to be accepted but never reached matplotlib.
        ax.pie(values, labels=labels, autopct=autopct, explode=explode, shadow=shadow,
               startangle=startangle, colors=colors)
        ax.set_title(plot_title)
    fig.legend()
    for ax in axs.flat:
        ax.label_outer()
    if save:
        if video_id is None:
            video_id = globals().get('video_id')
        if video_id is None:
            raise ValueError('video_id is required to save the plot')
        f_name = SUBPLOT.format(video_id, ''.join(title.split()))
        folder = os.path.dirname(f_name)
        if folder:
            # savefig does not create missing directories.
            os.makedirs(folder, exist_ok=True)
        fig.savefig(f_name)
        print('{} Saved'.format(f_name))
    plt.show()
    # Close the figure created above so repeated calls do not accumulate open figures.
    plt.close(fig)

# All-in-One Function (runs the complete pipeline)

In [None]:
def doAll(video_id, count=1000, sort=1, save=0):
    """Run the whole pipeline for one video: load data, score it, draw every chart.

    Parameters
        video_id : (str) id of the video
        count    : (int) number of comments requested from `getRequirements`
        sort     : (int) sort order passed to `getRequirements`
        save     : forwarded to every plotting helper

    NOTE: the plotting helpers build their file names from the notebook-level
    `video_id` variable, so when saving, that variable should hold the same id
    as the one passed here.
    """
    comments, views, likes, dislikes = getRequirements(video_id, count, sort)
    print("Views : {} ; Likes : {} ; Dislikes : {}".format(views, likes, dislikes))
    (_, likes_per, emoji_per, word_per, result,
     pos_comments_count, neg_comments_count) = process(comments, likes, dislikes)

    # Every pie shows a share and its complement.
    pie_charts = [
        ("Likes Chart", [likes_per, 1 - likes_per]),
        ("Emojies Chart", [emoji_per, 1 - emoji_per]),
        ("Comments Chart", [word_per, 1 - word_per]),
        ("Result Chart", [result, 1 - result]),
    ]
    for chart_title, shares in pie_charts:
        piePlot(shares, chart_title, save=save)

    barPlot([pos_comments_count, neg_comments_count], 'Positive-Negative Comments Chart', save=save)

    likes_shares, emoji_shares, word_shares, result_shares = [shares for _, shares in pie_charts]
    subPlot(likes_shares, emoji_shares, word_shares, result_shares, save=save)

# All in One Action

## Download the comments of the given YouTube video ID and get its view, like and dislike counts

In [None]:
# Ask for the video id only when the notebook has not defined one yet.
if "video_id" not in globals():
    video_id = input("Enter Video ID :").strip()

# Analyse up to 5000 comments and save every chart.
doAll(video_id, count=5000, save=1)

In [None]:
# Watch URL, title and thumbnail URL of the analysed video (title is used in the report below).
url, title, thumb = getVideoInfo(video_id)

In [None]:
# Read the report template; its placeholders are filled in with str.format in the next cell.
with open('Template.txt') as T: Temp = T.read()

In [None]:
# Values substituted into the placeholders of the report template.
# (The "video_id" key used to be listed twice; the duplicate entry is removed.)
html_values = {"video_id": video_id,
               # The title comes from YouTube, so it is escaped before being embedded in HTML.
               "video_title": escape(title),
               "chart1_title": 'Likes Chart',
               "chart2_title": 'Emojies Chart',
               "chart3_title": 'Comments Chart',
               "chart4_title": 'Result Chart',
               # Same file names as the ones piePlot saves: the chart title without spaces.
               "chart1-image": PIEPLOT.format(video_id, 'LikesChart'),
               "chart2-image": PIEPLOT.format(video_id, 'EmojiesChart'),
               "chart3-image": PIEPLOT.format(video_id, 'CommentsChart'),
               "chart4-image": PIEPLOT.format(video_id, 'ResultChart')}
html = Temp.format(**html_values)

In [None]:
# Write the rendered report. An explicit UTF-8 encoding keeps non-ASCII
# characters (e.g. in the video title) writable regardless of the platform's
# default encoding.
# NOTE(review): assumes the template is meant to be served as UTF-8 - confirm.
with open(HTML.format(video_id), 'w', encoding='utf-8') as H:
    H.write(html)

In [None]:
# Open the report page written in the previous cell in a new browser tab.
openInBrowser(HTML.format(video_id))