In [1]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package

# general DS packages
import pandas as pd
import numpy as np

# cleaning and pre-processing
from src.processing.text_cleaning import (normalize_text, process_contractions, remove_all_punctuation, remove_emojis, 
remove_html_unescape, remove_href_pattern, remove_digits, remove_extra_whitespace, remove_website_links)
from src.processing.text_processing import (tokenize_comment, lemmatize_comment, remove_stop_words, remove_tekken_character_names_from_tokens, 
part_of_speech, part_of_speech_tag, part_of_speech_dependency, part_of_speech_shape, part_of_speech_alpha, part_of_speech_is_stop)

# modeling
import gensim
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# visualisation
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# import data from csv
raw_data = pd.read_csv("data/raw/new_character_reveal_comments.csv", )
data = raw_data.copy()
df = pd.DataFrame(data)

# Clean and process dataframe

In [3]:
%%time

# clean
df['textDisplay'] = df['textDisplay'].apply(normalize_text)
df['textDisplay'] = df['textDisplay'].apply(process_contractions)
df['textDisplay'] = df['textDisplay'].apply(remove_website_links)
df['textDisplay'] = df['textDisplay'].apply(remove_html_unescape)
df['textDisplay'] = df['textDisplay'].apply(remove_emojis)
df['textDisplay'] = df['textDisplay'].apply(remove_digits)
df['textDisplay'] = df['textDisplay'].apply(remove_all_punctuation)
df['textDisplay'] = df['textDisplay'].apply(remove_href_pattern)
df['textDisplay'] = df['textDisplay'].apply(remove_extra_whitespace)

# process
df["textStopWordsRemoved"] = df["textDisplay"].apply(remove_stop_words)
df["textTokenized"] = df['textStopWordsRemoved'].apply(tokenize_comment)
df["textLemmatized"] = df["textStopWordsRemoved"].apply(lemmatize_comment)
df["textTekkenCharactersRemoved"] = df["textLemmatized"].apply(remove_tekken_character_names_from_tokens)


# part of speech operations
(df["pos"], 
 df["posTag"],
 df["posDependency"],
 df["posShape"],
 df["posAlpha"],
 df["posStopWord"]) = (df["textStopWordsRemoved"].apply(part_of_speech),
                                 df["textStopWordsRemoved"].apply(part_of_speech_tag),
                                 df["textStopWordsRemoved"].apply(part_of_speech_dependency),
                                 df["textStopWordsRemoved"].apply(part_of_speech_shape),
                                 df["textStopWordsRemoved"].apply(part_of_speech_alpha),
                                 df["textStopWordsRemoved"].apply(part_of_speech_is_stop)
                                )
df.head()

CPU times: user 39.5 s, sys: 32.2 ms, total: 39.5 s
Wall time: 39.6 s


Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textStopWordsRemoved,textTokenized,textLemmatized,textTekkenCharactersRemoved,pos,posTag,posDependency,posShape,posAlpha,posStopWord
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,lei wulong,"[lei, wulong]","[lei, wulong]",[],"[PROPN, NOUN]","[NNP, NN]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now...,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[see, get, view, bamco]","[VERB, VERB, NOUN, NOUN]","[VBN, VBG, NNS, NNS]","[ROOT, xcomp, dobj, dobj]","[xxxx, xxxx, xxxx, xxxx]","[True, True, True, True]","[False, False, False, False]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,wow,[wow],[wow],[wow],[INTJ],[UH],[ROOT],[xxx],[True],[False]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[oww, yeaah]","[PROPN, PROPN]","[NNP, NNP]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[hope, version]","[PROPN, PROPN, PROPN, PROPN]","[NNP, NNP, NNP, NNP]","[compound, compound, compound, ROOT]","[xxxx, xxxx, xxxx, xxx]","[True, True, True, True]","[False, False, False, False]"


# Topic modeling

## Create corpus and document-term matrix

Create a corpus (a list that contains all of the YouTube comments in it))...

In [7]:
# create a corpus
corpus = list(df['textTekkenCharactersRemoved'].values)
corpus

[[],
 ['see', 'get', 'view', 'bamco'],
 ['wow'],
 ['oww', 'yeaah'],
 ['hope', 'version'],
 ['begin'],
 ['let'],
 ['waiting', 'room', 'right'],
 ['wow'],
 ['yo'],
 ['marvelous'],
 ['late', 'bandai'],
 ['excitement'],
 ['marvelous'],
 ['new', 'trailer', 'drop', 'excited', 'game'],
 ['need', 'rest', 'trailer', 'mfs'],
 ['background', 'music', 'trailer', 'bchef', 'kissb'],
 ['reason', 'm', 'get', 'ps'],
 ['primer', 'comentario'],
 ['legend', 'shiiet', 'throw', 'st'],
 ['character', 'bandai', 'go', 'love'],
 ['good', 'fight', 'game', 'franchise'],
 ['want', 'action', 'point'],
 ['get', 'trailer', 'crime'],
 ['know', 'shriek', 'like', 'thatt'],
 [],
 ['amazing'],
 ['hailo', 'bandai', 'sayonara'],
 ['finally', 'sadly', 'design', 'suck'],
 ['think', 'new', 'trailer'],
 ['vs', 'kazuya', 'go', 'surreal'],
 [],
 ['hope', 'bring', 'character', 'creation'],
 ['gdmf'],
 [],
 ['year', 'go', 'epic'],
 [],
 ['gaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'],
 [],
 ['tvt'],
 [],
 ['black', 'goku'],
 ['

Use Gensim to create a dictionary that will store each word in the corpus and assign a unique ID to it. Then create a bag of words document-term matrix which will return a tuple with the word's unique ID and how many times it occurs in the document i.e., (word_id, frequnecy_in_document)

In [8]:
# create dictionary
comments_dictionary = corpora.Dictionary(corpus)

# create term document frequency
document_term_matrix = [comments_dictionary.doc2bow(doc) for doc in corpus]

In [9]:
comments_dictionary

<gensim.corpora.dictionary.Dictionary at 0x16956a570>

In [10]:
document_term_matrix

[[],
 [(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1)],
 [(5, 1), (6, 1)],
 [(7, 1), (8, 1)],
 [(9, 1)],
 [(10, 1)],
 [(11, 1), (12, 1), (13, 1)],
 [(4, 1)],
 [(14, 1)],
 [(15, 1)],
 [(16, 1), (17, 1)],
 [(18, 1)],
 [(15, 1)],
 [(19, 1), (20, 1), (21, 1), (22, 1), (23, 1)],
 [(23, 1), (24, 1), (25, 1), (26, 1)],
 [(23, 1), (27, 1), (28, 1), (29, 1), (30, 1)],
 [(1, 1), (31, 1), (32, 1), (33, 1)],
 [(34, 1), (35, 1)],
 [(36, 1), (37, 1), (38, 1), (39, 1)],
 [(16, 1), (40, 1), (41, 1), (42, 1)],
 [(21, 1), (43, 1), (44, 1), (45, 1)],
 [(46, 1), (47, 1), (48, 1)],
 [(1, 1), (23, 1), (49, 1)],
 [(50, 1), (51, 1), (52, 1), (53, 1)],
 [],
 [(54, 1)],
 [(16, 1), (55, 1), (56, 1)],
 [(57, 1), (58, 1), (59, 1), (60, 1)],
 [(22, 1), (23, 1), (61, 1)],
 [(41, 1), (62, 1), (63, 1), (64, 1)],
 [],
 [(7, 1), (40, 1), (65, 1), (66, 1)],
 [(67, 1)],
 [],
 [(41, 1), (68, 1), (69, 1)],
 [],
 [(70, 1)],
 [],
 [(71, 1)],
 [],
 [(72, 1), (73, 1)],
 [(74, 1), (75, 1), (76, 1)],
 [(58, 1), (77, 1)],
 [],
 [(25, 1

The document term matrix is a list and each document (YouTube comment) within the list is now a list of tuples instead of tokens (words). The tuples provide the word ID (each word has a unique ID created by Gensim) and the frequency with which that word occurs in the document.


We can also view the actual word with the frequency...

In [11]:
comments_dictionary[0]

'bamco'

In [12]:
# human readable format of document term matrix for first youtube comments
[[(comments_dictionary[id], freq) for id, freq in cp] for cp in document_term_matrix[:10]]

[[],
 [('bamco', 1), ('get', 1), ('see', 1), ('view', 1)],
 [('wow', 1)],
 [('oww', 1), ('yeaah', 1)],
 [('hope', 1), ('version', 1)],
 [('begin', 1)],
 [('let', 1)],
 [('right', 1), ('room', 1), ('waiting', 1)],
 [('wow', 1)],
 [('yo', 1)]]

We now have the data in a state where we can build the topic model...

# Build the models

In [13]:
gensim.__version__

'4.3.2'

## LDA

In [14]:
# LDA model
lda_number_of_topics = 3
lda_model = LdaModel(corpus=document_term_matrix,
                     num_topics=lda_number_of_topics, 
                     id2word=comments_dictionary,
                     passes=10,   # number of iterations over the entire corpus during model training
                    random_state=42,)

# Results
lda_model.print_topics(num_topics=lda_number_of_topics, num_words=5)

[(0,
  '0.024*"character" + 0.022*"wait" + 0.015*"baki" + 0.014*"plz" + 0.014*"marvelous"'),
 (1,
  '0.035*"trailer" + 0.023*"bring" + 0.023*"want" + 0.022*"new" + 0.022*"come"'),
 (2, '0.034*"game" + 0.026*"m" + 0.023*"get" + 0.022*"love" + 0.022*"new"')]

### Visualise model

In [15]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, document_term_matrix, comments_dictionary)
pyLDAvis.save_html(vis, "visualisations/lda_topics_tekken_characters_removed_viz.html")
# vis

  pid = os.fork()
  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)


### How to interpret pyLDAvis’s output?

- Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.
- A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
- A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.
- If you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

## Perplexity and coherence
- Model perplexity and topic coherence provide a convenient measure to indicate how good a given topic model is.
- Lower the perplexity better the model.
- Higher the topic coherence, the topic is more human interpretable.

In [16]:
# compute Perplexity
print(f"\nPerplexity: ' {lda_model.log_perplexity(chunk=document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=comments_dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"\nCoherence Score: ' {coherence_lda}")


Perplexity: ' -7.672085734750174

Coherence Score: ' 0.42544079214268077


# Create corpus of bigrams and trigrams

### Build the bigrams and trigrams

In [17]:
# build the bigram and trigram models
bigram = gensim.models.Phrases(corpus, min_count=3, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[corpus], threshold=10)  

# Phraser objects take Phrases models as inputs and are optimized for faster phrase application
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[corpus[0]]])   # applies the bigram model to the first document, combining common word pairs. trigram_mod[] applies the trigram model 
                                            # to the result of the bigram application, forming trigrams from frequent bigrams and single words.
print(trigram_mod[bigram_mod[corpus[135]]])

[]
[]


In [18]:
def make_bigrams(corpus):
    return [bigram_mod[doc] for doc in corpus]

def make_trigrams(corpus):
    return [trigram_mod[bigram_mod[doc]] for doc in corpus]

### Create bigrams corpus

In [19]:
# create bigrams corpus
corpus_bigrams = make_bigrams(corpus)

In [20]:
# create dictionary
bigrams_dictionary = corpora.Dictionary(corpus_bigrams)

# create term document frequency
bigrams_document_term_matrix = [bigrams_dictionary.doc2bow(doc) for doc in corpus_bigrams]

### Create bigrams model

In [21]:
# LDA model
lda_number_of_topics = 3
lda_bigrams_model = LdaModel(corpus=bigrams_document_term_matrix,
                     num_topics=lda_number_of_topics, 
                     id2word=bigrams_dictionary,
                     passes=10,   # number of iterations over the entire corpus during model training
                    random_state=42,)

# Results
lda_bigrams_model.print_topics(num_topics=lda_number_of_topics, num_words=5)

[(0, '0.020*"add" + 0.018*"get" + 0.017*"come" + 0.015*"story" + 0.015*"pls"'),
 (1,
  '0.037*"character" + 0.036*"game" + 0.023*"play" + 0.023*"go" + 0.022*"get"'),
 (2,
  '0.031*"trailer" + 0.030*"bring" + 0.024*"s" + 0.021*"game" + 0.020*"love"')]

In [22]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_bigrams_model, bigrams_document_term_matrix, bigrams_dictionary)
pyLDAvis.save_html(vis, "visualisations/lda_bigrams_topics_tekken_characters_removed_viz.html")
# vis

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


In [23]:
# compute Perplexity
print(f"\nPerplexity: ' {lda_bigrams_model.log_perplexity(chunk=bigrams_document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_bigrams = CoherenceModel(model=lda_bigrams_model, texts=corpus_bigrams, dictionary=bigrams_dictionary, coherence='c_v')
coherence_lda_bigrams = coherence_model_lda_bigrams.get_coherence()
print(f"\nCoherence Score: ' {coherence_lda_bigrams}")


Perplexity: ' -7.8668283611868475

Coherence Score: ' 0.4583093389006425


The bigrams model is..

## Next steps
- BERT model
- Non negative matrix/explore other models that may be suitable
- try one where you remove all words apart from character names?
- Wordcloud?