In [1]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package

# general ds packages
import pandas as pd
import numpy as np

# cleaning and pre-processing
from src.processing.text_cleaning import (normalize_text, process_contractions, remove_all_punctuation, remove_emojis, 
remove_html_unescape, remove_href_pattern, remove_digits, remove_extra_whitespace, remove_website_links)

from src.processing.text_processing import (tokenize_comment, lemmatize_comment, remove_stop_words, remove_tiny_tokens, 
remove_tekken_character_names_from_tokens, part_of_speech, part_of_speech_tag, part_of_speech_dependency, part_of_speech_shape, 
part_of_speech_alpha, part_of_speech_is_stop, word_count)

# modeling
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# visualisation
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

# Load the data

In [2]:
# import data from csv
raw_data = pd.read_csv("data/raw/new_character_reveal_comments.csv", )
data = raw_data.copy()
df = pd.DataFrame(data)

# Clean and process data

In [3]:
%%time

# clean
df['textDisplay'] = df['textDisplay'].apply(normalize_text)
df['textDisplay'] = df['textDisplay'].apply(process_contractions)
df['textDisplay'] = df['textDisplay'].apply(remove_website_links)
df['textDisplay'] = df['textDisplay'].apply(remove_html_unescape)
df['textDisplay'] = df['textDisplay'].apply(remove_emojis)
df['textDisplay'] = df['textDisplay'].apply(remove_digits)
df['textDisplay'] = df['textDisplay'].apply(remove_all_punctuation)
df['textDisplay'] = df['textDisplay'].apply(remove_href_pattern)
df['textDisplay'] = df['textDisplay'].apply(remove_extra_whitespace)

# process
df["textDisplayWordCount"] = df['textDisplay'].apply(word_count)
df["textStopWordsRemoved"] = df["textDisplay"].apply(remove_stop_words)
df["textTokenized"] = df['textStopWordsRemoved'].apply(tokenize_comment)
df["textLemmatized"] = df["textStopWordsRemoved"].apply(lemmatize_comment)
# remove short meaningless tokens from lemmatized tokens
df["textLemmatized"] = df['textLemmatized'].apply(remove_tiny_tokens)
df["textTekkenCharactersRemoved"] = df["textLemmatized"].apply(remove_tekken_character_names_from_tokens)
df["textProcessedCharactersRemoved"] = df["textTekkenCharactersRemoved"].apply(lambda x: ' '.join(x))

# part of speech operations
df["pos"] = df["textStopWordsRemoved"].apply(part_of_speech)
df["posTag"] = df["textStopWordsRemoved"].apply(part_of_speech_tag)
df["posDependency"] = df["textStopWordsRemoved"].apply(part_of_speech_dependency)
df["posShape"] = df["textStopWordsRemoved"].apply(part_of_speech_shape)
df["posAlpha"] = df["textStopWordsRemoved"].apply(part_of_speech_alpha)
df["posStopWord"] = df["textStopWordsRemoved"].apply(part_of_speech_is_stop)


# remove rows with empty strings in the 'textProcessedCharactersRemoved' column as these will have nothing to pass to the vectorizer when we come to transforming the text input
# to numerical input
df = df[df["textProcessedCharactersRemoved"].astype(str) != '']
df.reset_index(drop=True, inplace=True)

df.head()

CPU times: user 39.7 s, sys: 81 ms, total: 39.8 s
Wall time: 40.1 s


Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textDisplayWordCount,textStopWordsRemoved,textTokenized,textLemmatized,textTekkenCharactersRemoved,textProcessedCharactersRemoved,pos,posTag,posDependency,posShape,posAlpha,posStopWord
0,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now...,10,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[see, get, view, bamco]",see get view bamco,"[VERB, VERB, NOUN, NOUN]","[VBN, VBG, NNS, NNS]","[ROOT, xcomp, dobj, dobj]","[xxxx, xxxx, xxxx, xxxx]","[True, True, True, True]","[False, False, False, False]"
1,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,1,wow,[wow],[wow],[wow],wow,[INTJ],[UH],[ROOT],[xxx],[True],[False]
2,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,2,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[oww, yeaah]",oww yeaah,"[PROPN, PROPN]","[NNP, NNP]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
3,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,150,18,i hope we get an angel version of jin,9,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[hope, version]",hope version,"[PROPN, PROPN, PROPN, PROPN]","[NNP, NNP, NNP, NNP]","[compound, compound, compound, ROOT]","[xxxx, xxxx, xxxx, xxx]","[True, True, True, True]","[False, False, False, False]"
4,rDxrpSqYHD8,@kazamataurus337,2023-11-01 16:10:08+00:00,2023-11-01 16:10:08+00:00,1,0,so it begins,3,begins,[begins],[begin],[begin],begin,[VERB],[VBZ],[ROOT],[xxxx],[True],[False]


# Topic Modeling
- We're going to build an Latent Dirichlet Allocation (LDA) model, but first we need to prepare the data so it can be read by the algorithm. To do this we need to:
    - Create a corpus (a list that contains all of the YouTube comments in it)
    - Create a document term matrix (a tuple of word ID and frequency of occurrence in a given document).

## Create a corpus and document term matrix

In [23]:
# create a corpus (list of all the comments)
corpus = list(df['textTekkenCharactersRemoved'].values)
print(corpus[:20])

[['see', 'get', 'view', 'bamco'], ['wow'], ['oww', 'yeaah'], ['hope', 'version'], ['begin'], ['waiting', 'room', 'right'], ['let'], ['wow'], ['marvelous'], ['late', 'bandai'], ['excitement'], ['marvelous'], ['new', 'trailer', 'drop', 'excited', 'game'], ['need', 'rest', 'trailer', 'mfs'], ['reason', 'get'], ['primer', 'comentario'], ['legend', 'shiiet', 'throw'], ['character', 'bandai', 'love'], ['good', 'fight', 'game', 'franchise'], ['want', 'action', 'point']]


The output shows that our corpus is a list of all the YouTube comments; the items in each list are the words that make up the comment. We now have two next steps:
1. Use _Gensim_ to create a dictionary that will store each word in the corpus and assign a unique ID to it.
2. Create a bag of words document-term matrix which will return a tuple with the word's unique ID and how many times it occurs in the document i.e., (word_id, frequnecy_in_document)

In [5]:
# create dictionary that will store each word in the corpus and assign a unique ID to each word
comments_dictionary = corpora.Dictionary(corpus)

# explore dictionary
print(comments_dictionary[0])
print(comments_dictionary[1])
print(comments_dictionary[2])

bamco
get
see


We now have a dictionary that stores the words in our corpus. Note that the first words in the dictioanry are the words in the first comment sorted alphabetically.

In [20]:
# create a document term matrix
document_term_matrix = [comments_dictionary.doc2bow(doc) for doc in corpus]

print(document_term_matrix[:10])

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1)], [(5, 1), (6, 1)], [(7, 1), (8, 1)], [(9, 1)], [(10, 1), (11, 1), (12, 1)], [(13, 1)], [(4, 1)], [(14, 1)], [(15, 1), (16, 1)]]


The document term matrix is a list of lists. Each document (nested list) is a YouTube comment represented as a list of tuples. The tuples provide the word ID (each word has a unique ID created by _Gensim_) and the frequency with which that word occurs in the document (YouTube comment).

We can also view the actual word with the frequency...

In [7]:
# human readable format of document term matrix for first 10 documents (first 10 YouTube comments)
[[(comments_dictionary[id], freq) for id, freq in cp] for cp in document_term_matrix[:10]]

[[('bamco', 1), ('get', 1), ('see', 1), ('view', 1)],
 [('wow', 1)],
 [('oww', 1), ('yeaah', 1)],
 [('hope', 1), ('version', 1)],
 [('begin', 1)],
 [('right', 1), ('room', 1), ('waiting', 1)],
 [('let', 1)],
 [('wow', 1)],
 [('marvelous', 1)],
 [('bandai', 1), ('late', 1)]]

We now have the data in a state where we can build the topic model.

# Latent Dirichlet Allocation (LDA) model

In [8]:
# build an LDA model
lda_number_of_topics = 3   # manually specify the number of topics we want
lda_model = LdaModel(corpus=document_term_matrix,
                     num_topics=lda_number_of_topics, 
                     id2word=comments_dictionary,
                     passes=10,   # number of iterations over the entire corpus during model training
                     random_state=42,)

# print the topics and the words most closely alighned with this topic
lda_model.print_topics(num_topics=lda_number_of_topics, num_words=5)

[(0,
  '0.063*"want" + 0.042*"capoeira" + 0.016*"look" + 0.016*"game" + 0.015*"like"'),
 (1,
  '0.044*"character" + 0.022*"look" + 0.022*"new" + 0.020*"game" + 0.019*"like"'),
 (2,
  '0.026*"hope" + 0.020*"trailer" + 0.014*"come" + 0.012*"get" + 0.009*"baki"')]

The output shows our three topics and the 5 words most closely aligned with that topic.
The output is a list of tuples with the first item in the tuple the topic ID, and the second item contains a string with the words that make up the topic and their weighting.
The greater the weight the more important the word is to the given topic.

### Topics
- 0 appears to focus an interest in 'capoeira', a martial art.
- 1 appears to relate to discussion of the new characters and the 'look' or visuals of the game.
- 2 appears focus on excitement surrounding the trailer with 'hope' showing anticipation.

# Visualise model

In [9]:
pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(lda_model, document_term_matrix, comments_dictionary)
pyLDAvis.save_html(viz, "models/visualisations/lda_topics_viz.html")

### How to interpret pyLDAvis’s output?
(see html output for the visual)

#### Bubble plot
- Each bubble on the plot represents a topic. The larger the bubble, the more prevalent the topic.
- A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
- A model with too many topics will typically have many overlaps and/or small bubbles clustered in one region of the chart.
- If you move the cursor over one of the bubbles the words and bars on the right-hand side will update. These words are the most important words that form the selected topic.

# Evaluation

### Perplexity and coherence
Model perplexity and topic coherence provide a convenient measure to indicate how good a given topic model is.

**Perplexity**:
- The lower the perplexity better the model.
- It is said that around 10-20 for smaller datasets and closer to 50 for larger ones is a good aim.
- Interpretation: Very low perplexity might suggest overfitting, while excessively high values indicate poor topic capture.
  
**Coherence**:
- The higher the topic coherence, the more likely the topic is to be human interpretable.
- 'Good scores' will vary depending on the measure and dataset.
- Measurements: For 'u_mass', above 0.5 can be good, while for 'c_v' or 'c_uci', above 0.1 might be suitable.

In [10]:
# get perplexity score
lda_perplexity_score = lda_model.log_perplexity(chunk=document_term_matrix, total_docs=None)   # a measure of how good the model is. lower the better.
print(f"\nLDA perplexity:{lda_perplexity_score}")   

# get coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=comments_dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"\nLDA coherence score: {coherence_lda}")


LDA perplexity:-7.234274494855926

LDA coherence score: 0.452657714270358


- The perplexity and coherence scores are good.
- The _pyLDAvis_ gives insight into the top 30 words for each topic. There is some overlap between topics.
- Next we'll see if using bigrams and trigrams will improve the model...

# Create corpus of bigrams and trigrams
- Creating bigrams and trigrams can potentially better capture semantic meaning, improve topic coherence, and handle ambguity better.
- They achieve this by:
    - **Capturing meaning that single words might miss.** For example, "machine learning" or "United States" convey more specific concepts than "machine" or "States" alone.
    - **Improving topic coherence by capturing more interpretable and meaningful topics**, as sometimes pairs of words can represent specific subject areas or themes better than single words.
    - **Handling ambiguity by capturing crucial information in the second word.** For example, "jaguar car" vs. "jaguar animal".

## Build the bigrams and trigrams corpus
We want to create a corpus that includes bigrams and trigrams. To do this we need to create bigram and trigram models, and to achieve this we:

- Use _gensim_'s Phrases to train a model on our corpus. This model will anlyse the input corpus using statistical methods and build a model that captures frequent word co-occurrences.
- Pass the Phrases model to _gensim_'s Phraser - this takes a trained Phrases model as input and creates an internal representation for faster phrase detection. When presented with a document, it applies the stored rules to identify and replace frequent word sequences with their corresponding phrases.

### Notes
- Experimentation is required with the _min_count_ and _threshold_ parameters.

In [11]:
# build the bigram and trigram models
bigram = gensim.models.Phrases(corpus, 
                               min_count=5,   # the minimum absolute frequency a word pair needs to have in the corpus to be considered for joining into a bigram
                               threshold=0.1)  # higher threshold leads to fewer bigrams, the bigram will need to occur at least 10 times in the document, can also
                                              # be set as a perentage the bigram occurs in the document

# use the bigram corpus to create a tigram
trigram = gensim.models.Phrases(bigram[corpus], threshold=10)  

# Phraser objects take Phrases models as inputs and are optimized for faster phrase application
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [12]:
# create functions that will create bigram and trigram corpora quickly
def make_bigrams(corpus):
    return [bigram_mod[doc] for doc in corpus]

def make_trigrams(corpus):
    return [trigram_mod[bigram_mod[doc]] for doc in corpus]

## Create bigrams corpus

In [13]:
# create bigrams corpus
corpus_bigrams = make_bigrams(corpus)

# create dictionary
bigrams_dictionary = corpora.Dictionary(corpus_bigrams)

# create term document frequency
bigrams_document_term_matrix = [bigrams_dictionary.doc2bow(doc) for doc in corpus_bigrams]

# Create bigrams model

In [14]:
# LDA model
lda_number_of_topics = 3
lda_bigrams_model = LdaModel(corpus=bigrams_document_term_matrix,
                     num_topics=lda_number_of_topics, 
                     id2word=bigrams_dictionary,
                     passes=10,   # number of iterations over the entire corpus during model training
                    random_state=42,)

# show results
lda_bigrams_model.print_topics(num_topics=lda_number_of_topics, num_words=5)

[(0,
  '0.014*"wait" + 0.012*"legend" + 0.012*"not" + 0.009*"character" + 0.009*"return"'),
 (1,
  '0.023*"hope" + 0.021*"come" + 0.020*"want_want" + 0.019*"want" + 0.015*"game"'),
 (2,
  '0.023*"need" + 0.020*"look" + 0.020*"character" + 0.017*"capoeira_capoeira" + 0.017*"like"')]

The output shows our three topics and the 5 words most closely aligned with that topic.
The output is a list of tuples with the first item in the tuple the topic ID, and the second item contains a string with the words that make up the topic and their weighting.
The greater the weight the more important the word is to the given topic.

### Topics
- 0 appears to focus on returning characters and 'legends' of the game.
- 1 appears to show excitement for the upcoming game.
- 2 appears focus on capoeira - a martial art - and the visual appearance of this fighting style.

In [15]:
pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(lda_bigrams_model, bigrams_document_term_matrix, bigrams_dictionary)
pyLDAvis.save_html(viz, "models/visualisations/lda_topics_bigrams_viz.html")

# Evaluation

In [16]:
# get perplexity score
lda_bigrams_perplexity_score = lda_bigrams_model.log_perplexity(chunk=bigrams_document_term_matrix, total_docs=None)
print(f"\nLDA bigrams perplexity: {lda_bigrams_perplexity_score}")   # a measure of how good the model is. lower the better.

# get coherence Score
coherence_model_lda_bigrams = CoherenceModel(model=lda_bigrams_model, texts=corpus_bigrams, dictionary=bigrams_dictionary, coherence='c_v')
coherence_lda_bigrams = coherence_model_lda_bigrams.get_coherence()
print(f"\nLDA bigrams coherence score: {coherence_lda_bigrams}")


LDA bigrams perplexity: -7.465235348984373

LDA bigrams coherence score: 0.44210836940752


## Model assessment

In [17]:
print(f"\nLDA perplexity: {lda_model.log_perplexity(chunk=document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.
print(f"\nLDA bigrams perplexity: {lda_bigrams_model.log_perplexity(chunk=bigrams_document_term_matrix, total_docs=None)}")   # a measure of how good the model is. lower the better.
print(f"\nLDA coherence score: {coherence_lda}")
print(f"\nLDA bigrams coherence score: {coherence_lda_bigrams}")


LDA perplexity: -7.234274714514751

LDA bigrams perplexity: -7.465235588683538

LDA coherence score: 0.452657714270358

LDA bigrams coherence score: 0.44210836940752


- The two models are similar in terms of perplexity and coherence, although the bigrams model has a slightly lower perplexity score, meaning that:
    - the bigrams model is very slightly more able to measure the model's ability to predict a held-out word in a document
    - Both models seem to be simialr in their assessing of how well words within a topic connect semantically and form meaningful themes.
    - The scores for both models are good and **meet the key results of the project**.