In [34]:
import pandas as pd
import json
import numpy as np
import os
import glob
from tqdm import tqdm

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
output_path = 'data/comments-data'
plots_path = 'data/plots'

In [36]:
data_excel = 'video-list-ranked new coding system May Final (Completed Set) (22_5_23).xlsx'
df = pd.read_excel(f'data/{data_excel}', sheet_name='Clean Video Sheet')
df = df.sort_values(by='debateScore2', ascending=False)
df

Unnamed: 0,id,nrComments,nrThreads,nrReplies,maxThreadLength,disabled,nrAuthors,nrAuthors3Posts,maxPostsByAuthor,url,...,popularityScore2,debateScore,debateScore2,Age,Language,Duration,Pro or agai,Type,Delivery,Other notes
9,uqwvf6R1_QY,141,19,41,493,False,99,11,8,https://www.youtube.com/watch?v=uqwvf6R1_QY,...,14.795546,0.619581,6.624065,2 yrs,English,30 mins,Pro,Monologue,Monologue,Gaming channel of an internet personality
3,yiw6_JakZFc,150,26,50,458,False,111,7,17,https://www.youtube.com/watch?v=yiw6_JakZFc,...,16.403931,0.652462,6.614726,1 yr,English,15 mins,Both,Ed Vid,Illustration,Kurzgesagt channel
0,eRLJscAlk1M,118,10,19,500,False,115,0,2,https://www.youtube.com/watch?v=eRLJscAlk1M,...,17.312566,0.607542,6.612041,7 yrs,English,6 mins,Pro,Song,Drama,Weird add at the end
8,EhAemz1v7dQ,165,31,65,388,False,118,9,8,https://www.youtube.com/watch?v=EhAemz1v7dQ,...,15.935968,0.658505,6.555357,1 yr,English,10 mins,Neither,Ed Vid,Illustration,Kurzgesast channel Nuclear Energy and CH
5,wbR-5mHI6bo,128,17,28,445,False,103,9,3,https://www.youtube.com/watch?v=wbR-5mHI6bo,...,15.781058,0.585561,6.542472,2 yrs,English,10 mins,Pro,Ed Vid,Illustration,Kurzgesagt channel - complicated if its too la...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,WkvPdUtYhX8,0,0,0,0,True,0,0,0,https://www.youtube.com/watch?v=WkvPdUtYhX8,...,12.392996,0.000000,0.000000,1 yr,English,11 mins,Pro,Ed Vid,Informative,Learn Bright CH for kids
134,N6t6QHQtdVw,0,0,0,0,True,0,0,0,https://www.youtube.com/watch?v=N6t6QHQtdVw,...,13.174443,0.000000,0.000000,8 months,English,30 mins,Pro,Ed Vid,Illustration,"Peekaboo kidz, CH for kids"
133,-D_Np-3dVBQ,0,0,0,0,True,0,0,0,https://www.youtube.com/watch?v=-D_Np-3dVBQ,...,13.852708,0.000000,0.000000,5 yrs,English,8 mins,Pro,Ed Vid,Illustration,Matt Miltonberger on what are the probs and so...
132,cn9PhiDJp-A,0,0,0,0,True,0,0,0,https://www.youtube.com/watch?v=cn9PhiDJp-A,...,14.841249,0.000000,0.000000,1 yr,English,4 mins,Pro,Song,Illustration,Hopscotch climate change song


In [37]:
df[df['commentCount'] > 20][['id', 'nrComments', 'nrThreads', 'nrReplies', 'commentCount']]

Unnamed: 0,id,nrComments,nrThreads,nrReplies,commentCount
9,uqwvf6R1_QY,141,19,41,26181
3,yiw6_JakZFc,150,26,50,63876
0,eRLJscAlk1M,118,10,19,169326
8,EhAemz1v7dQ,165,31,65,26269
5,wbR-5mHI6bo,128,17,28,42908
...,...,...,...,...,...
115,hOdKBLDXH9o,46,5,5,49
116,2M_xtMmaB9s,33,7,13,33
104,N7Qot_aax9M,32,5,13,82
117,E6wzsoB7Xoo,20,2,3,31


In [5]:
def createCommentsRepliesDf(df):
    comments_df = pd.DataFrame()

    for yt_id in df['id'].tolist():
        # load comments
        with open(f'data/comments-data/{yt_id}.json', 'r') as file:
            data = json.load(file)

        # flatten 'items'
        items_df = pd.json_normalize(data['items'])

        # create a temp dataframe from comments column
        temp = pd.DataFrame()
        for row in items_df['replies.comments']:
            if row is not np.nan:
                temp = pd.concat([temp, pd.DataFrame(row)], sort=False)

        # flatten 'snippet' column (which is each comment's reply)
        df_normalized = pd.json_normalize(temp['snippet'])
        df_normalized.index = temp.index
        temp = pd.concat([temp, df_normalized], axis=1)

        # merge the original top level comments to the replies to form one dataframe
        items_df = pd.concat([temp, items_df], sort=False)
        items_df['textDisplay'] = items_df['textDisplay'].fillna(items_df['snippet.topLevelComment.snippet.textDisplay'])
        items_df['videoId'] = items_df['videoId'].fillna(items_df['snippet.topLevelComment.snippet.videoId'])
        items_df['updatedAt'] = items_df['updatedAt'].fillna(items_df['snippet.topLevelComment.snippet.updatedAt'])
        items_df = items_df[['id', 'videoId', 'textDisplay', 'parentId', 'updatedAt']]

        comments_df = pd.concat([comments_df, items_df], sort=False)
        
        return comments_df

In [6]:
def getParentId(comment_id):
    comment_id_split = comment_id.split('.')
    if len(comment_id_split) == 1:
        return None
    return comment_id_split[0]

In [7]:
def convertToNumber(x):
    if isinstance(x, int):
        return
    
    if 'K' in x:
        x = x.split('K')[0]
        if '.' in x:
            x = x.split('.')[0] + x.split('.')[1] + '00'
        else:
            x = x + '000'
    return int(x)

# Sentiment Analysis

In [8]:
## TextBlob
def getTextBlobPolarity(text):
    return TextBlob(text).sentiment.polarity

def getTextBlobSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

## VADER (optimised for social media)
def getVaderSentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(text)
    return vs['compound']

# https://github.com/cjhutto/vaderSentiment
def getVaderClassification(sentiment_compound):
    classifiction = "Error"
    if sentiment_compound >= 0.05:
        classifiction = 'Positive'
    elif sentiment_compound > -0.05 and sentiment_compound < 0.05:
        classifiction = 'Neutral'
    elif sentiment_compound <= -0.05:
        classifiction = 'Negative'
    
    return classifiction

In [9]:
def getSentiment(df):
    comments_df = df.copy()
    
    comments_df['TextBlob_subjectivity'] = comments_df['text'].apply(lambda x: getTextBlobSubjectivity(x))
    comments_df['TextBlob_polarity'] = comments_df['text'].apply(lambda x: getTextBlobPolarity(x))

    comments_df['VADER_sentiment'] = comments_df['text'].apply(lambda x: getVaderSentiment(x))
    comments_df['VADER_sentiment_classify'] = comments_df['VADER_sentiment'].apply(lambda x: getVaderClassification(x))

    return comments_df

In [33]:
analyzer = SentimentIntensityAnalyzer()
sentence = "nice"
vs = analyzer.polarity_scores(sentence)
print(vs)
print(getVaderClassification(vs['compound']))

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}
Positive


In [37]:
# attach sentiment to each dataframe and save as json
for folder in glob.glob(os.path.join(output_path, '*')):
    yt_id = folder.split('/')[-1]
    print(yt_id)
    
    if not os.path.exists(f'{folder}/{yt_id}_sentiment.json') and \
       len(pd.read_json(f'{folder}/{yt_id}.json', lines=True)) > 0:  
            
        temp = pd.read_json(f'{folder}/{yt_id}.json', lines=True)
        temp['parentId'] = temp['cid'].apply(lambda x: getParentId(x))
        temp['videoId'] = yt_id
        temp['votes'] = temp['votes'].apply(lambda x: convertToNumber(x))

        temp = getSentiment(temp)
    
        temp.to_json(f'{folder}/{yt_id}_sentiment.json')

n9Ej5E47TNI
V7cCYstvCG0
RLqXkYrdmjY
j-tzy7MJXiU
8z5nHZVRu4Y
_DntRvQ5h6g
AlR33oOjiNU
7yYn-ATFF-k
2zN2uycYcF0
AvXFZz8DIMY
Ok8rMT2KCy0
Wpy4xBftFuY
Lq0iua0r0KQ
F6LTn8NwB0A
u9I0j_D2oSw
cn9PhiDJp-A
6iM3kHSRNu8
9PFhrpyWV-w
YfWCUYX2_U0
surwq9exCKA
QppBsx28RwI
cK1spdjQxP0
nawMoUExYKU
wbR-5mHI6bo
W3xkniUYxow
hTtR_WaLHnY
61KdJ2KQaG4
PvPlCr_fPSA
LxgMdjyw8uw
L6S63VXfqMw
dIsjcG7hTmo
GR46_ohNh9U
E6wzsoB7Xoo
hhI4IBC66sw
GGtAilkWTtI
HqHuJngkJkE
Ffq4YTRFfeY
YVQWd5mXYVQ
dcBXmj1nMTQ
m_hgffJuSaA
CJnF_3jzeMc
5azavfLcQFo
MCXNbd4yvyM
uynhvHZUOOo
saVb8vq0d-c
-n4A0BssFd0
o-TMOeCDeus
re_9Wa006oU
ZfFvLJyPf3o
6lo16d2n0RM
sYXqObiahRE
-D_Np-3dVBQ
2sySOtltups
7yXEqb0HWdk
8URGCX4R3bc
2CAG6Fg6Vlo
PciRrn0fJ3c
CqtZdnpfgIc
j2ETr6X1lOk
4SkDSmanKeY
dDKuVeU_Pwk
9Zxz73bKMu0
eRLJscAlk1M
3DOcQRl9ASc
SHp6oysLGJQ
TVQm5lTRMO0
IDtuNEAKNR8
8RVooYlyl20
NhMU1tBL4EI
Ff3yjCmMy9A
ifrHogDujXw
AkopY80phF8
uqwvf6R1_QY
48zAWYkrBIw
qJv1IPNZQao
pKZuc4blMDA
ULpGDnuz308
G4H1N_yXBiA
f4zul0BuO8A
w1bq0yqmSPs
EhAemz1v7dQ
Cts7Tz53ajA
Ar0qt_i1tMs
9oJ0

In [10]:
# get average sentiment for each video and map it back to original df
mean_sentiment_dict = {}
for yt_id in tqdm(df[df['commentCount'] > 0]['id'].tolist()):
    if len(pd.read_json(f'{output_path}/{yt_id}/{yt_id}.json', lines=True)) > 0:
        sentiment_df = pd.read_json(f'{output_path}/{yt_id}/{yt_id}_sentiment.json')
        mean_sentiment_dict[yt_id] = sentiment_df['VADER_sentiment'].mean()
    
df['mean_sentiment'] = df['id'].map(mean_sentiment_dict)
df['VADER_sentiment_classify'] = df['mean_sentiment'].apply(lambda x: getVaderClassification(x))
df

100%|█████████████████████████████████████████| 132/132 [00:08<00:00, 14.88it/s]


Unnamed: 0,id,nrComments,nrThreads,nrReplies,maxThreadLength,disabled,nrAuthors,nrAuthors3Posts,maxPostsByAuthor,url,...,debateScore2,Age,Language,Duration,Pro or agai,Type,Delivery,Other notes,mean_sentiment,VADER_sentiment_classify
9,uqwvf6R1_QY,141,19,41,493,False,99,11,8,https://www.youtube.com/watch?v=uqwvf6R1_QY,...,6.624065,2 yrs,English,30 mins,Pro,Monologue,Monologue,Gaming channel of an internet personality,0.100063,Positive
3,yiw6_JakZFc,150,26,50,458,False,111,7,17,https://www.youtube.com/watch?v=yiw6_JakZFc,...,6.614726,1 yr,English,15 mins,Both,Ed Vid,Illustration,Kurzgesagt channel,0.077032,Positive
0,eRLJscAlk1M,118,10,19,500,False,115,0,2,https://www.youtube.com/watch?v=eRLJscAlk1M,...,6.612041,7 yrs,English,6 mins,Pro,Song,Drama,Weird add at the end,0.068703,Positive
8,EhAemz1v7dQ,165,31,65,388,False,118,9,8,https://www.youtube.com/watch?v=EhAemz1v7dQ,...,6.555357,1 yr,English,10 mins,Neither,Ed Vid,Illustration,Kurzgesast channel Nuclear Energy and CH,0.113529,Positive
5,wbR-5mHI6bo,128,17,28,445,False,103,9,3,https://www.youtube.com/watch?v=wbR-5mHI6bo,...,6.542472,2 yrs,English,10 mins,Pro,Ed Vid,Illustration,Kurzgesagt channel - complicated if its too la...,0.082061,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,WkvPdUtYhX8,0,0,0,0,True,0,0,0,https://www.youtube.com/watch?v=WkvPdUtYhX8,...,0.000000,1 yr,English,11 mins,Pro,Ed Vid,Informative,Learn Bright CH for kids,,Error
134,N6t6QHQtdVw,0,0,0,0,True,0,0,0,https://www.youtube.com/watch?v=N6t6QHQtdVw,...,0.000000,8 months,English,30 mins,Pro,Ed Vid,Illustration,"Peekaboo kidz, CH for kids",,Error
133,-D_Np-3dVBQ,0,0,0,0,True,0,0,0,https://www.youtube.com/watch?v=-D_Np-3dVBQ,...,0.000000,5 yrs,English,8 mins,Pro,Ed Vid,Illustration,Matt Miltonberger on what are the probs and so...,,Error
132,cn9PhiDJp-A,0,0,0,0,True,0,0,0,https://www.youtube.com/watch?v=cn9PhiDJp-A,...,0.000000,1 yr,English,4 mins,Pro,Song,Illustration,Hopscotch climate change song,,Error


In [11]:
# get viable videos (videos with sentiment)
viable_videos_df = df[~df['mean_sentiment'].isna()]

# N-Gram Analysis

In [44]:
import re
import string
import nltk
import nltk.sentiment.util
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [45]:
def clean_text(text): 
    # remove symbols and emojis
    text = text.lower()
    text = re.sub('@', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r"[^a-zA-Z ]+", "", text)
    
    # tokenize 
    text = nltk.word_tokenize(text)
    
    # lemmatize
    text = [lemmatizer.lemmatize(t) for t in text]
    text = [lemmatizer.lemmatize(t, 'v') for t in text]
    
    # mark Negation
    tokens_neg_marked = nltk.sentiment.util.mark_negation(text)
    
    # remove stopwords
    text = [t for t in tokens_neg_marked
             if t.replace("_NEG", "").isalnum() and
             t.replace("_NEG", "") not in sw]
    
    return text

In [120]:
def plot_ngram(ngram, tokens, name):
    if ngram == 1:
        # create FreqDist, keep the 20 most common tokens
        freq_dist = nltk.FreqDist(tokens).most_common(20)
        # convert to Pandas series for easier plotting
        all_fdist = pd.Series(dict(freq_dist))

        fig, ax = plt.subplots(figsize=(14,8))
        bar_plot = sns.barplot(x=all_fdist.index, y=all_fdist.values, ax=ax)
        plt.xticks(rotation=40)
        plt.title(f'{name.capitalize()} Frequency Distribution')
        plt.savefig(f'{plots_path}/{yt_id}/ngrams/{name}.png', bbox_inches="tight")
        plt.close()
        
    else:
        freq_dist = nltk.FreqDist(tokens)
        # sort values by highest frequency
        ngram_sorted = {k:v for k,v in sorted(freq_dist.items(), key=lambda item:item[1], reverse=True)}
        # get 20 most frequest phrases
        ngram_most_frequent = {k: ngram_sorted[k] for k in list(ngram_sorted)[:20]}

        # join ngram tokens with '_' + maintain sorting
        ngram_joined = {'_'.join(map(str, k)):v for k,v in sorted(ngram_most_frequent.items(), key=lambda item:item[1], reverse=True)}

        # convert to Pandas series for easy plotting
        ngram_freqdist = pd.Series(ngram_joined)

        fig, ax = plt.subplots(figsize=(10,10))
        bar_plot = sns.barplot(x=ngram_freqdist.values, y=ngram_freqdist.index, orient='h', ax=ax)
        plt.title(f'{name.capitalize()} Frequency Distribution')
        plt.savefig(f'{plots_path}/{yt_id}/ngrams/{name}.png', bbox_inches="tight")
        plt.close()

In [133]:
for yt_id in tqdm(viable_videos_df['id'].tolist()):
    if not os.path.exists(f'{plots_path}/{yt_id}/ngrams'):
        os.makedirs(f'{plots_path}/{yt_id}/ngrams')
        
        comments_df = pd.read_json(f'{output_path}/{yt_id}/{yt_id}.json', lines=True)
        comments_df['cleaned_text'] = comments_df['text'].apply(lambda x: clean_text(x))

        # get list of all words in video 
        all_words = comments_df['cleaned_text'].explode().to_list()
        # monogram
        plot_ngram(ngram=1, tokens=all_words, name='monogram')

        # bigrams
        bigram = nltk.bigrams(all_words)
        plot_ngram(ngram=2, tokens=bigram, name='bigram')

        # trigrams
        trigram = nltk.trigrams(all_words)
        plot_ngram(ngram=3, tokens=trigram, name='trigram')

100%|█████████████████████████████████████████| 130/130 [00:02<00:00, 63.07it/s]


# Analysis

In [43]:
# plot each videos sentiment count
for yt_id in tqdm(viable_videos_df['id'].tolist()):
    if not os.path.exists(f'{plots_path}/{yt_id}'):
        os.makedirs(f'{plots_path}/{yt_id}')
    
        fig, ax = plt.subplots()

        sentiment_df = pd.read_json(f'{output_path}/{yt_id}/{yt_id}_sentiment.json')
        sentiment = sentiment_df[sentiment_df['videoId'] == yt_id]['VADER_sentiment_classify'].value_counts().reset_index()

        sentiment_labels = sentiment['index'].tolist()
        sentiment_values = sentiment['VADER_sentiment_classify'].tolist()

        ax.bar(sentiment_labels, sentiment_values)
        ax.set_title(yt_id)

        plt.savefig(f'{plots_path}/{yt_id}/{yt_id}_sentiment.png')
        plt.close()

100%|█████████████████████████████████████████| 130/130 [00:07<00:00, 17.23it/s]


In [28]:
# get average sentiment 
parent_mean_sentiment_dict = {}
reply_mean_sentiment_dict = {}
for yt_id in tqdm(viable_videos_df['id'].tolist()):
    sentiment_df = pd.read_json(f'{output_path}/{yt_id}/{yt_id}_sentiment.json')
    # only get parent comments
    parent_comments_df = sentiment_df[sentiment_df['reply'] == False]
    parent_mean_sentiment_dict[yt_id] = parent_comments_df['VADER_sentiment'].mean()
    # only get reply comments
    reply_comments_df = sentiment_df[sentiment_df['reply'] == True]
    reply_mean_sentiment_dict[yt_id] = reply_comments_df['VADER_sentiment'].mean()
    
viable_videos_df.loc[:, 'parent_mean_sentiment'] = viable_videos_df['id'].map(parent_mean_sentiment_dict)
viable_videos_df.loc[:, 'parent_VADER_sentiment_classify'] = viable_videos_df['parent_mean_sentiment'].apply(lambda x: getVaderClassification(x))
viable_videos_df.loc[:, 'reply_mean_sentiment'] = viable_videos_df['id'].map(reply_mean_sentiment_dict)
viable_videos_df.loc[:, 'reply_VADER_sentiment_classify'] = viable_videos_df['reply_mean_sentiment'].apply(lambda x: getVaderClassification(x))
viable_videos_df

100%|█████████████████████████████████████████| 130/130 [00:04<00:00, 27.43it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,id,nrComments,nrThreads,nrReplies,maxThreadLength,disabled,nrAuthors,nrAuthors3Posts,maxPostsByAuthor,url,...,Pro or agai,Type,Delivery,Other notes,mean_sentiment,VADER_sentiment_classify,parent_mean_sentiment,parent_VADER_sentiment_classify,reply_mean_sentiment,reply_VADER_sentiment_classify
9,uqwvf6R1_QY,141,19,41,493,False,99,11,8,https://www.youtube.com/watch?v=uqwvf6R1_QY,...,Pro,Monologue,Monologue,Gaming channel of an internet personality,0.100063,Positive,0.162456,Positive,0.058856,Positive
3,yiw6_JakZFc,150,26,50,458,False,111,7,17,https://www.youtube.com/watch?v=yiw6_JakZFc,...,Both,Ed Vid,Illustration,Kurzgesagt channel,0.077032,Positive,0.087312,Positive,0.064645,Positive
0,eRLJscAlk1M,118,10,19,500,False,115,0,2,https://www.youtube.com/watch?v=eRLJscAlk1M,...,Pro,Song,Drama,Weird add at the end,0.068703,Positive,0.079641,Positive,0.043310,Neutral
8,EhAemz1v7dQ,165,31,65,388,False,118,9,8,https://www.youtube.com/watch?v=EhAemz1v7dQ,...,Neither,Ed Vid,Illustration,Kurzgesast channel Nuclear Energy and CH,0.113529,Positive,0.161086,Positive,0.059929,Positive
5,wbR-5mHI6bo,128,17,28,445,False,103,9,3,https://www.youtube.com/watch?v=wbR-5mHI6bo,...,Pro,Ed Vid,Illustration,Kurzgesagt channel - complicated if its too la...,0.082061,Positive,0.105552,Positive,0.061022,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,Ff3yjCmMy9A,4,0,0,0,False,3,0,2,https://www.youtube.com/watch?v=Ff3yjCmMy9A,...,Pro,News Report,Informative,Citizen TV Kenya on CH,0.084220,Positive,0.141425,Positive,-0.144600,Negative
127,nm5LlWNqkJY,2,0,0,0,False,1,0,2,https://www.youtube.com/watch?v=nm5LlWNqkJY,...,Pro,Monologue,Argument,Wahsington post live posted a monologue of a w...,0.148433,Positive,0.148433,Positive,,Error
129,YVQWd5mXYVQ,1,0,0,0,False,1,0,1,https://www.youtube.com/watch?v=YVQWd5mXYVQ,...,Pro,Short Clip,Informative,CLIMATE change ch on daily CH news,0.000000,Neutral,0.000000,Neutral,,Error
130,e75rohU1kn4,1,0,0,0,False,1,0,1,https://www.youtube.com/watch?v=e75rohU1kn4,...,Pro,News Report,Informative,CitiTube on Adaptation summit in Africa on CH,0.885400,Positive,0.885400,Positive,,Error


In [29]:
viable_videos_df.to_excel('data/sentiment.xlsx')