In [1]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package

# general DS packages
import pandas as pd
import numpy as np

# cleaning and pre-processing
from src.processing.text_cleaning import normalize_text, process_contractions, remove_all_punctuation, remove_emojis, remove_html_unescape, remove_digits, remove_extra_whitespace, remove_website_links
from src.processing.text_processing import tokenize_comment, lemmatize_comment, remove_stop_words, part_of_speech, part_of_speech_tag, part_of_speech_dependency, part_of_speech_shape, part_of_speech_alpha, part_of_speech_is_stop

# modeling
import gensim
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# visualisation
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# import data from csv
raw_data = pd.read_csv("data/raw/new_character_reveal_comments.csv", )
data = raw_data.copy()
df = pd.DataFrame(data)

# Clean and process dataframe

In [3]:
%%time

# clean
df['textDisplay'] = df['textDisplay'].apply(normalize_text)
df['textDisplay'] = df['textDisplay'].apply(process_contractions)
df['textDisplay'] = df['textDisplay'].apply(remove_website_links)
df['textDisplay'] = df['textDisplay'].apply(remove_html_unescape)
df['textDisplay'] = df['textDisplay'].apply(remove_emojis)
df['textDisplay'] = df['textDisplay'].apply(remove_digits)
df['textDisplay'] = df['textDisplay'].apply(remove_all_punctuation)
df['textDisplay'] = df['textDisplay'].apply(remove_extra_whitespace)

# process
df["textStopWordsRemoved"] = df["textDisplay"].apply(remove_stop_words)
df["textTokenized"] = df['textStopWordsRemoved'].apply(tokenize_comment)
df["textLemmatized"] = df["textStopWordsRemoved"].apply(lemmatize_comment)

# part of speech operations
(df["pos"], 
 df["posTag"],
 df["posDependency"],
 df["posShape"],
 df["posAlpha"],
 df["posStopWord"]) = (df["textStopWordsRemoved"].apply(part_of_speech),
                                 df["textStopWordsRemoved"].apply(part_of_speech_tag),
                                 df["textStopWordsRemoved"].apply(part_of_speech_dependency),
                                 df["textStopWordsRemoved"].apply(part_of_speech_shape),
                                 df["textStopWordsRemoved"].apply(part_of_speech_alpha),
                                 df["textStopWordsRemoved"].apply(part_of_speech_is_stop)
                                )
df.head()

CPU times: user 39.5 s, sys: 38.3 ms, total: 39.6 s
Wall time: 39.6 s


Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textStopWordsRemoved,textTokenized,textLemmatized,pos,posTag,posDependency,posShape,posAlpha,posStopWord
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,lei wulong,"[lei, wulong]","[lei, wulong]","[PROPN, NOUN]","[NNP, NN]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now...,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[VERB, VERB, NOUN, NOUN]","[VBN, VBG, NNS, NNS]","[ROOT, xcomp, dobj, dobj]","[xxxx, xxxx, xxxx, xxxx]","[True, True, True, True]","[False, False, False, False]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,wow,[wow],[wow],[INTJ],[UH],[ROOT],[xxx],[True],[False]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[PROPN, PROPN]","[NNP, NNP]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[PROPN, PROPN, PROPN, PROPN]","[NNP, NNP, NNP, NNP]","[compound, compound, compound, ROOT]","[xxxx, xxxx, xxxx, xxx]","[True, True, True, True]","[False, False, False, False]"


# Topic modeling

## Create corpus and document-term matrix

Create a corpus (a list that contains all of the YouTube comments in it))...

In [4]:
# create a corpus
corpus = list(df['textLemmatized'].values)
corpus

[['lei', 'wulong'],
 ['see', 'get', 'view', 'bamco'],
 ['wow'],
 ['oww', 'yeaah'],
 ['hope', 'angel', 'version', 'jin'],
 ['begin'],
 ['let'],
 ['miguel', 'waiting', 'room', 'right'],
 ['wow'],
 ['yo'],
 ['marvelous'],
 ['late', 'bandai'],
 ['excitement'],
 ['marvelous'],
 ['new', 'trailer', 'drop', 'excited', 'game'],
 ['need', 'rest', 'trailer', 'mfs'],
 ['background', 'music', 'trailer', 'bchef', 'kissb'],
 ['reason', 'm', 'get', 'ps'],
 ['primer', 'comentario'],
 ['legend', 'shiiet', 'throw', 'lei', 'st'],
 ['character', 'bandai', 'go', 'love'],
 ['good', 'fight', 'game', 'franchise'],
 ['want', 'steve', 'action', 'point'],
 ['yoshimitsu', 'get', 'trailer', 'crime'],
 ['know', 'devil', 'jin', 'shriek', 'like', 'thatt'],
 ['hrefvrdxrpsqyhdtmsa'],
 ['amazing'],
 ['hailo', 'bandai', 'sayonara'],
 ['finally', 'lee', 'sadly', 'design', 'suck'],
 ['think', 'new', 'trailer'],
 ['devil', 'jin', 'vs', 'devil', 'kazuya', 'go', 'surreal'],
 ['lucky', 'chloe'],
 ['hope', 'bring', 'character', 

Use Gensim to create a dictionary that will store each word in the corpus and assign a unique ID to it. Then create a bag of words document-term matrix which will return a tuple with the word's unique ID and how many times it occurs in the document i.e., (word_id, frequnecy_in_document)

In [5]:
# create dictionary
comments_dictionary = corpora.Dictionary(corpus)

# create term document frequency
document_term_matrix = [comments_dictionary.doc2bow(doc) for doc in corpus]

In [6]:
comments_dictionary

<gensim.corpora.dictionary.Dictionary at 0x179811580>

In [7]:
document_term_matrix

[[(0, 1), (1, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1)],
 [(6, 1)],
 [(7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1)],
 [(14, 1)],
 [(15, 1), (16, 1), (17, 1), (18, 1)],
 [(6, 1)],
 [(19, 1)],
 [(20, 1)],
 [(21, 1), (22, 1)],
 [(23, 1)],
 [(20, 1)],
 [(24, 1), (25, 1), (26, 1), (27, 1), (28, 1)],
 [(28, 1), (29, 1), (30, 1), (31, 1)],
 [(28, 1), (32, 1), (33, 1), (34, 1), (35, 1)],
 [(3, 1), (36, 1), (37, 1), (38, 1)],
 [(39, 1), (40, 1)],
 [(0, 1), (41, 1), (42, 1), (43, 1), (44, 1)],
 [(21, 1), (45, 1), (46, 1), (47, 1)],
 [(26, 1), (48, 1), (49, 1), (50, 1)],
 [(51, 1), (52, 1), (53, 1), (54, 1)],
 [(3, 1), (28, 1), (55, 1), (56, 1)],
 [(11, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1)],
 [(62, 1)],
 [(63, 1)],
 [(21, 1), (64, 1), (65, 1)],
 [(66, 1), (67, 1), (68, 1), (69, 1), (70, 1)],
 [(27, 1), (28, 1), (71, 1)],
 [(11, 1), (46, 1), (57, 2), (72, 1), (73, 1), (74, 1)],
 [(75, 1), (76, 1)],
 [(10, 1), (45, 1), (77, 1), (78, 1)],
 [(79, 1)],
 [],
 [(46, 1), (80, 

The document term matrix is a list and each document (YouTube comment) within the list is now a list of tuples instead of tokens (words). The tuples provide the word ID (each word has a unique ID created by Gensim) and the frequency with which that word occurs in the document.


We can also view the actual word with the frequency...

In [8]:
comments_dictionary[0]

'lei'

In [9]:
# human readable format of document term matrix for first youtube comments
[[(comments_dictionary[id], freq) for id, freq in cp] for cp in document_term_matrix[:10]]

[[('lei', 1), ('wulong', 1)],
 [('bamco', 1), ('get', 1), ('see', 1), ('view', 1)],
 [('wow', 1)],
 [('oww', 1), ('yeaah', 1)],
 [('angel', 1), ('hope', 1), ('jin', 1), ('version', 1)],
 [('begin', 1)],
 [('let', 1)],
 [('miguel', 1), ('right', 1), ('room', 1), ('waiting', 1)],
 [('wow', 1)],
 [('yo', 1)]]

We now have the data in a state where we can build the topic model...

# Build the models

In [10]:
gensim.__version__

'4.3.2'

## LDA

In [11]:
# LDA model
lda_number_of_topics = 3
lda_model = LdaModel(corpus=document_term_matrix,
                     num_topics=lda_number_of_topics, 
                     id2word=comments_dictionary,
                     passes=10,   # number of iterations over the entire corpus during model training
                    random_state=42,)

# Results
lda_model.print_topics(num_topics=lda_number_of_topics, num_words=5)

[(0,
  '0.036*"wulong" + 0.029*"lei" + 0.021*"play" + 0.021*"lee" + 0.017*"alisa"'),
 (1,
  '0.041*"game" + 0.027*"jin" + 0.026*"devil" + 0.025*"trailer" + 0.024*"new"'),
 (2,
  '0.041*"tekken" + 0.038*"eddy" + 0.031*"bring" + 0.022*"go" + 0.022*"king"')]

### Visualise model

In [13]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, document_term_matrix, comments_dictionary)
pyLDAvis.save_html(vis, "visualisations/lda_topics_viz.html")
# vis

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


### How to interpret pyLDAvis’s output?

- Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.
- A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
- A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.
- If you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

## Perplexity and coherence
- Model perplexity and topic coherence provide a convenient measure to indicate how good a given topic model is.
- Lower the perplexity better the model.
- Higher the topic coherence, the topic is more human interpretable.

In [14]:
# compute Perplexity
print(f"\nPerplexity: ' {lda_model.log_perplexity(chunk=document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=comments_dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"\nCoherence Score: ' {coherence_lda}")


Perplexity: ' -7.240124806010448

Coherence Score: ' 0.4194747091257774


# LDA: bigram and trigram

### Build the bigrams and trigrams

In [15]:
# build the bigram and trigram models
bigram = gensim.models.Phrases(corpus, min_count=3, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[corpus], threshold=10)  

# Phraser objects take Phrases models as inputs and are optimized for faster phrase application
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[corpus[0]]])   # applies the bigram model to the first document, combining common word pairs. trigram_mod[] applies the trigram model 
                                            # to the result of the bigram application, forming trigrams from frequent bigrams and single words.
print(trigram_mod[bigram_mod[corpus[135]]])

['lei_wulong']
['eddy_gordo']


In [16]:
def make_bigrams(corpus):
    return [bigram_mod[doc] for doc in corpus]

def make_trigrams(corpus):
    return [trigram_mod[bigram_mod[doc]] for doc in corpus]

### Create bigrams corpus

In [17]:
# create bigrams corpus
corpus_bigrams = make_bigrams(corpus)

In [18]:
# create dictionary
bigrams_dictionary = corpora.Dictionary(corpus_bigrams)

# create term document frequency
bigrams_document_term_matrix = [bigrams_dictionary.doc2bow(doc) for doc in corpus_bigrams]

### Create bigrams model

In [19]:
# LDA model
lda_number_of_topics = 3
lda_bigrams_model = LdaModel(corpus=bigrams_document_term_matrix,
                     num_topics=lda_number_of_topics, 
                     id2word=bigrams_dictionary,
                     passes=10,   # number of iterations over the entire corpus during model training
                    random_state=42,)

# Results
lda_bigrams_model.print_topics(num_topics=lda_number_of_topics, num_words=5)

[(0,
  '0.041*"game" + 0.033*"character" + 0.027*"new" + 0.023*"tekken" + 0.019*"get"'),
 (1,
  '0.028*"trailer" + 0.027*"devil_jin" + 0.025*"tekken" + 0.021*"zafina" + 0.020*"come"'),
 (2,
  '0.045*"eddy" + 0.032*"bring" + 0.018*"lee" + 0.015*"wait" + 0.015*"lei_wulong"')]

In [20]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_bigrams_model, bigrams_document_term_matrix, bigrams_dictionary)
pyLDAvis.save_html(vis, "visualisations/lda_bigrams_topics_viz.html")
# vis

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


In [21]:
# compute Perplexity
print(f"\nPerplexity: ' {lda_bigrams_model.log_perplexity(chunk=bigrams_document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_bigrams = CoherenceModel(model=lda_bigrams_model, texts=corpus_bigrams, dictionary=bigrams_dictionary, coherence='c_v')
coherence_lda_bigrams = coherence_model_lda_bigrams.get_coherence()
print(f"\nCoherence Score: ' {coherence_lda_bigrams}")


Perplexity: ' -7.624250366214033

Coherence Score: ' 0.4912911921353462


The bigrams model is more perplex and coherent. This means:
- The model is able to...

## Next steps
- BERT model
- Non negative matrix/explore other models that may be suitable
- try one where you remove all words apart from character names?
- Wordcloud?