In [1]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package

# general DS packages
import pandas as pd
import numpy as np

# cleaning and pre-processing
from src.processing.text_cleaning import (normalize_text, process_contractions, remove_all_punctuation, remove_emojis, 
remove_html_unescape, remove_href_pattern, remove_digits, remove_extra_whitespace, remove_website_links)

from src.processing.text_processing import (tokenize_comment, lemmatize_comment, remove_stop_words, remove_tiny_tokens, 
remove_tekken_character_names_from_tokens, part_of_speech, part_of_speech_tag, part_of_speech_dependency, part_of_speech_shape, 
part_of_speech_alpha, part_of_speech_is_stop, word_count)

# modeling
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# visualisation
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

In [2]:
# import data from csv
raw_data = pd.read_csv("data/raw/new_character_reveal_comments.csv", )
data = raw_data.copy()
df = pd.DataFrame(data)

# Clean and process dataframe

In [3]:
%%time

# clean
df['textDisplay'] = df['textDisplay'].apply(normalize_text)
df['textDisplay'] = df['textDisplay'].apply(process_contractions)
df['textDisplay'] = df['textDisplay'].apply(remove_website_links)
df['textDisplay'] = df['textDisplay'].apply(remove_html_unescape)
df['textDisplay'] = df['textDisplay'].apply(remove_emojis)
df['textDisplay'] = df['textDisplay'].apply(remove_digits)
df['textDisplay'] = df['textDisplay'].apply(remove_all_punctuation)
df['textDisplay'] = df['textDisplay'].apply(remove_href_pattern)
df['textDisplay'] = df['textDisplay'].apply(remove_extra_whitespace)

# process
df["textDisplayWordCount"] = df['textDisplay'].apply(word_count)
df["textStopWordsRemoved"] = df["textDisplay"].apply(remove_stop_words)
df["textTokenized"] = df['textStopWordsRemoved'].apply(tokenize_comment)
df["textLemmatized"] = df["textStopWordsRemoved"].apply(lemmatize_comment)
# remove short meaningless tokens from lemmatized tokens
df["textLemmatized"] = df['textLemmatized'].apply(remove_tiny_tokens)
df["textTekkenCharactersRemoved"] = df["textLemmatized"].apply(remove_tekken_character_names_from_tokens)
df["textProcessedCharactersRemoved"] = df["textTekkenCharactersRemoved"].apply(lambda x: ' '.join(x))

# part of speech operations
df["pos"] = df["textStopWordsRemoved"].apply(part_of_speech)
df["posTag"] = df["textStopWordsRemoved"].apply(part_of_speech_tag)
df["posDependency"] = df["textStopWordsRemoved"].apply(part_of_speech_dependency)
df["posShape"] = df["textStopWordsRemoved"].apply(part_of_speech_shape)
df["posAlpha"] = df["textStopWordsRemoved"].apply(part_of_speech_alpha)
df["posStopWord"] = df["textStopWordsRemoved"].apply(part_of_speech_is_stop)


# remove rows with empty strings in the 'textProcessedCharactersRemoved' column as these will have nothing to pass to the vectorizer when we come to transforming the text input
# to numerical input
df = df[df["textProcessedCharactersRemoved"].astype(str) != '']
df.reset_index(drop=True, inplace=True)

df.head()

CPU times: user 39.8 s, sys: 154 ms, total: 39.9 s
Wall time: 40.4 s


Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textDisplayWordCount,textStopWordsRemoved,textTokenized,textLemmatized,textTekkenCharactersRemoved,textProcessedCharactersRemoved,pos,posTag,posDependency,posShape,posAlpha,posStopWord
0,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now...,10,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[see, get, view, bamco]",see get view bamco,"[VERB, VERB, NOUN, NOUN]","[VBN, VBG, NNS, NNS]","[ROOT, xcomp, dobj, dobj]","[xxxx, xxxx, xxxx, xxxx]","[True, True, True, True]","[False, False, False, False]"
1,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,1,wow,[wow],[wow],[wow],wow,[INTJ],[UH],[ROOT],[xxx],[True],[False]
2,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,2,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[oww, yeaah]",oww yeaah,"[PROPN, PROPN]","[NNP, NNP]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
3,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,150,18,i hope we get an angel version of jin,9,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[hope, version]",hope version,"[PROPN, PROPN, PROPN, PROPN]","[NNP, NNP, NNP, NNP]","[compound, compound, compound, ROOT]","[xxxx, xxxx, xxxx, xxx]","[True, True, True, True]","[False, False, False, False]"
4,rDxrpSqYHD8,@kazamataurus337,2023-11-01 16:10:08+00:00,2023-11-01 16:10:08+00:00,1,0,so it begins,3,begins,[begins],[begin],[begin],begin,[VERB],[VBZ],[ROOT],[xxxx],[True],[False]


# Topic modeling: Latent Dirichlet Allocation model
We're going to build an LDA model, but first we need to prepare the data so it can be read by an algorithm

## Create a corpus and document term matrix

The first thing that needs to be done is to create a corpus (a list that contains all of the YouTube comments in it)) document term matrix.

In [9]:
# create a corpus
corpus = list(df['textTekkenCharactersRemoved'].values)
print(corpus)

[['see', 'get', 'view', 'bamco'], ['wow'], ['oww', 'yeaah'], ['hope', 'version'], ['begin'], ['waiting', 'room', 'right'], ['let'], ['wow'], ['marvelous'], ['late', 'bandai'], ['excitement'], ['marvelous'], ['new', 'trailer', 'drop', 'excited', 'game'], ['need', 'rest', 'trailer', 'mfs'], ['reason', 'get'], ['primer', 'comentario'], ['legend', 'shiiet', 'throw'], ['character', 'bandai', 'love'], ['good', 'fight', 'game', 'franchise'], ['want', 'action', 'point'], ['get', 'trailer', 'crime'], ['know', 'shriek', 'like', 'thatt'], ['amazing'], ['hailo', 'bandai', 'sayonara'], ['finally', 'sadly', 'design', 'suck'], ['think', 'new', 'trailer'], ['hope', 'bring', 'character', 'creation'], ['gdmf'], ['year', 'epic'], ['gaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], ['tvt'], ['black', 'goku'], ['finally', 'mean'], ['need', 'beg'], ['dragunov'], ['lol'], ['queen'], ['weekly', 'trailer'], ['bring', 'crime', 'ashame'], ['take', 'inspiration', 'goku', 'black'], ['roster', 'look', 'lovely', 'w

Use Gensim to create a dictionary that will store each word in the corpus and assign a unique ID to it. Then create a bag of words document-term matrix which will return a tuple with the word's unique ID and how many times it occurs in the document i.e., (word_id, frequnecy_in_document)

In [15]:
# create dictionary
comments_dictionary = corpora.Dictionary(corpus)

# create term document frequency
document_term_matrix = [comments_dictionary.doc2bow(doc) for doc in corpus]

print(document_term_matrix)

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1)], [(5, 1), (6, 1)], [(7, 1), (8, 1)], [(9, 1)], [(10, 1), (11, 1), (12, 1)], [(13, 1)], [(4, 1)], [(14, 1)], [(15, 1), (16, 1)], [(17, 1)], [(14, 1)], [(18, 1), (19, 1), (20, 1), (21, 1), (22, 1)], [(22, 1), (23, 1), (24, 1), (25, 1)], [(1, 1), (26, 1)], [(27, 1), (28, 1)], [(29, 1), (30, 1), (31, 1)], [(15, 1), (32, 1), (33, 1)], [(20, 1), (34, 1), (35, 1), (36, 1)], [(37, 1), (38, 1), (39, 1)], [(1, 1), (22, 1), (40, 1)], [(41, 1), (42, 1), (43, 1), (44, 1)], [(45, 1)], [(15, 1), (46, 1), (47, 1)], [(48, 1), (49, 1), (50, 1), (51, 1)], [(21, 1), (22, 1), (52, 1)], [(7, 1), (32, 1), (53, 1), (54, 1)], [(55, 1)], [(56, 1), (57, 1)], [(58, 1)], [(59, 1)], [(60, 1), (61, 1)], [(49, 1), (62, 1)], [(24, 1), (63, 1)], [(64, 1)], [(65, 1)], [(66, 1)], [(22, 1), (67, 1)], [(40, 1), (53, 1), (68, 1)], [(60, 1), (61, 1), (69, 1), (70, 1)], [(71, 1), (72, 1), (73, 1), (74, 1), (75, 1)], [(76, 1)], [(77, 1)], [(75, 1)], [(32, 1), (78, 1)], [(36, 1), (71,

The document term matrix is a list of lists. Each document (YouTube comment) is now a list of tuples instead of tokens (words). The tuples provide the word ID (each word has a unique ID created by Gensim) and the frequency with which that word occurs in the document.

We can also view the actual word with the frequency...

In [23]:
# first word in dictionary
comments_dictionary[0]

'bamco'

In [25]:
# human readable format of document term matrix for first 10 documents (first 10 YouTube comments)
[[(comments_dictionary[id], freq) for id, freq in cp] for cp in document_term_matrix[:10]]

[[('bamco', 1), ('get', 1), ('see', 1), ('view', 1)],
 [('wow', 1)],
 [('oww', 1), ('yeaah', 1)],
 [('hope', 1), ('version', 1)],
 [('begin', 1)],
 [('right', 1), ('room', 1), ('waiting', 1)],
 [('let', 1)],
 [('wow', 1)],
 [('marvelous', 1)],
 [('bandai', 1), ('late', 1)]]

We now have the data in a state where we can build the topic model...

# LDA

In [26]:
# LDA model
lda_number_of_topics = 3
lda_model = LdaModel(corpus=document_term_matrix,
                     num_topics=lda_number_of_topics, 
                     id2word=comments_dictionary,
                     passes=10,   # number of iterations over the entire corpus during model training
                    random_state=42,)

# Results
lda_model.print_topics(num_topics=lda_number_of_topics, num_words=5)

[(0,
  '0.051*"want" + 0.036*"capoeira" + 0.019*"return" + 0.018*"need" + 0.018*"game"'),
 (1,
  '0.020*"come" + 0.016*"trailer" + 0.016*"look" + 0.013*"character" + 0.012*"not"'),
 (2,
  '0.045*"character" + 0.022*"new" + 0.021*"like" + 0.018*"look" + 0.016*"get"')]

## Visualise model

In [28]:
pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(lda_model, document_term_matrix, comments_dictionary)
pyLDAvis.save_html(viz, "models/visualisations/lda_topics_viz.html")

### How to interpret pyLDAvis’s output?

- Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.
- A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
- A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.
- If you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

# Evaluation

## Perplexity and coherence
- Model perplexity and topic coherence provide a convenient measure to indicate how good a given topic model is.
- Lower the perplexity better the model.
- Higher the topic coherence, the topic is more human interpretable.

In [46]:
# compute perplexity
print(f"\nLDA perplexity: {lda_model.log_perplexity(chunk=document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.

# compute coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=comments_dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"\nLDA coherence score: {coherence_lda}")


LDA perplexity: -7.226635993873205

LDA coherence score: 0.43499782779343726


- The perplexity and coherence scores are good
- The _pyLDAvis_ gives insight into the top 30 words for each topic. There is some overlap.
- Next we'll see if using bigrams and trigrams will improve the model...

# Create corpus of bigrams and trigrams

### Build the bigrams and trigrams

In [30]:
# build the bigram and trigram models
bigram = gensim.models.Phrases(corpus, min_count=3, threshold=10) # higher threshold fewer phrases
trigram = gensim.models.Phrases(bigram[corpus], threshold=10)  

# Phraser objects take Phrases models as inputs and are optimized for faster phrase application
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# see trigram example
print(trigram_mod[bigram_mod[corpus[1]]])   # applies the bigram model to the first document, combining common word pairs. trigram_mod[] applies the trigram model 
                                            # to the result of the bigram application, forming trigrams from frequent bigrams and single words.
print(trigram_mod[bigram_mod[corpus[13]]])

['wow']
['need', 'rest', 'trailer', 'mfs']


In [34]:
def make_bigrams(corpus):
    return [bigram_mod[doc] for doc in corpus]

def make_trigrams(corpus):
    return [trigram_mod[bigram_mod[doc]] for doc in corpus]

## Create bigrams corpus

In [35]:
# create bigrams corpus
corpus_bigrams = make_bigrams(corpus)

In [36]:
# create dictionary
bigrams_dictionary = corpora.Dictionary(corpus_bigrams)

# create term document frequency
bigrams_document_term_matrix = [bigrams_dictionary.doc2bow(doc) for doc in corpus_bigrams]

### Create bigrams model

In [37]:
# LDA model
lda_number_of_topics = 3
lda_bigrams_model = LdaModel(corpus=bigrams_document_term_matrix,
                     num_topics=lda_number_of_topics, 
                     id2word=bigrams_dictionary,
                     passes=10,   # number of iterations over the entire corpus during model training
                    random_state=42,)

# Results
lda_bigrams_model.print_topics(num_topics=lda_number_of_topics, num_words=5)

[(0,
  '0.020*"need" + 0.019*"capoeira_capoeira" + 0.015*"return" + 0.013*"legend" + 0.011*"game"'),
 (1,
  '0.020*"want_want" + 0.017*"look" + 0.016*"trailer" + 0.015*"want" + 0.015*"get"'),
 (2,
  '0.041*"character" + 0.020*"game" + 0.018*"bring" + 0.016*"not" + 0.011*"like"')]

In [50]:
pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(lda_bigrams_model, bigrams_document_term_matrix, bigrams_dictionary)
pyLDAvis.save_html(viz, "models/visualisations/lda_topics_bigrams_viz.html")

# Evaluation

In [47]:
# compute Perplexity
print(f"\nLDA bigrams perplexity: {lda_bigrams_model.log_perplexity(chunk=bigrams_document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_bigrams = CoherenceModel(model=lda_bigrams_model, texts=corpus_bigrams, dictionary=bigrams_dictionary, coherence='c_v')
coherence_lda_bigrams = coherence_model_lda_bigrams.get_coherence()
print(f"\nLDA bigrams coherence score: {coherence_lda_bigrams}")


LDA bigrams perplexity: -7.4298109021026075

LDA bigrams coherence score: 0.4626374352486382


## Model assessment

In [49]:
print(f"\nLDA perplexity: {lda_model.log_perplexity(chunk=document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.
print(f"\nLDA bigrams perplexity: {lda_bigrams_model.log_perplexity(chunk=bigrams_document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.
print(f"\nLDA coherence score: {coherence_lda}")
print(f"\nLDA bigrams coherence score: {coherence_lda_bigrams}")



LDA perplexity: -7.22663619971668

LDA bigrams perplexity: -7.429813865596894

LDA coherence score: 0.43499782779343726

LDA bigrams coherence score: 0.4626374352486382


- The two models are similar in terms of perplexity and coherence, although the bigrams model has slightly better scores, meaning that:
    - the bigrams model is very slightly more able to measure the model's ability to predict a held-out word in a document and assess how well words within a topic connect semantically and form meaningful themes.
    - However, the scores for both models are good.