In [1]:
import os

# Work from the project root so that the `src.*` imports and the relative `data/` paths below resolve.
# Guarded on the presence of the `src` package so that re-running this cell does not climb a further
# directory level (an unconditional chdir('..') moves up again on every execution).
if not os.path.isdir('src'):
    os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package

# general DS packages
import pandas as pd
import numpy as np

# cleaning and pre-processing
from nltk.tokenize import RegexpTokenizer
from src.processing.text_cleaning import (normalize_text, process_contractions, remove_all_punctuation, remove_emojis, 
remove_html_unescape, remove_href_pattern, remove_digits, remove_extra_whitespace, remove_website_links)

from src.processing.text_processing import (tokenize_comment, lemmatize_comment, remove_stop_words, remove_tiny_tokens, 
remove_tekken_character_names_from_tokens, part_of_speech, part_of_speech_tag, part_of_speech_dependency, part_of_speech_shape, 
part_of_speech_alpha, part_of_speech_is_stop, word_count, unique_words_from_tokens)

from src.modeling.topic_dataframe import topic_dataframe, _heaviest_words_indices

# modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# visualisation
import matplotlib.pyplot as plt

# other
from operator import itemgetter

# Load the data

In [2]:
# import data from csv
# The path is relative to the working directory set in the first cell.
# raw_data holds the comments as read from disk, data is a copy of it, and df is the frame
# that the cleaning and processing steps below operate on.
# NOTE(review): df is built from data without an explicit copy - confirm whether data is
# expected to stay unmodified once df's columns are overwritten.
raw_data = pd.read_csv("data/raw/new_character_reveal_comments.csv", )
data = raw_data.copy()
df = pd.DataFrame(data)

# Clean and process data

In [3]:
%%time

# clean: run every cleaning function over the comment text, in this exact order
cleaning_steps = (
    normalize_text,
    process_contractions,
    remove_website_links,
    remove_html_unescape,
    remove_emojis,
    remove_digits,
    remove_all_punctuation,
    remove_href_pattern,
    remove_extra_whitespace,
)
for cleaning_step in cleaning_steps:
    df['textDisplay'] = df['textDisplay'].apply(cleaning_step)

# process
df["textDisplayWordCount"] = df['textDisplay'].apply(word_count)
df["textStopWordsRemoved"] = df["textDisplay"].apply(remove_stop_words)
df["textTokenized"] = df['textStopWordsRemoved'].apply(tokenize_comment)
# lemmatize, then remove short meaningless tokens from the lemmatized tokens
df["textLemmatized"] = (
    df["textStopWordsRemoved"]
    .apply(lemmatize_comment)
    .apply(remove_tiny_tokens)
)
df["textTekkenCharactersRemoved"] = df["textLemmatized"].apply(remove_tekken_character_names_from_tokens)
# string version of the final token list
df["textProcessedCharactersRemoved"] = df["textTekkenCharactersRemoved"].apply(" ".join)

# part of speech operations: each output column is derived from the stop-word-free text
pos_columns = {
    "pos": part_of_speech,
    "posTag": part_of_speech_tag,
    "posDependency": part_of_speech_dependency,
    "posShape": part_of_speech_shape,
    "posAlpha": part_of_speech_alpha,
    "posStopWord": part_of_speech_is_stop,
}
for pos_column, pos_function in pos_columns.items():
    df[pos_column] = df["textStopWordsRemoved"].apply(pos_function)


# remove rows with empty strings in the 'textProcessedCharactersRemoved' column as these will have
# nothing to pass to the vectorizer when we come to transforming the text input to numerical input
has_processed_text = df["textProcessedCharactersRemoved"].astype(str) != ''
df = df[has_processed_text].reset_index(drop=True)

df.head()

CPU times: user 38.3 s, sys: 59.3 ms, total: 38.3 s
Wall time: 38.6 s


Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textDisplayWordCount,textStopWordsRemoved,textTokenized,textLemmatized,textTekkenCharactersRemoved,textProcessedCharactersRemoved,pos,posTag,posDependency,posShape,posAlpha,posStopWord
0,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now...,10,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[see, get, view, bamco]",see get view bamco,"[VERB, VERB, NOUN, NOUN]","[VBN, VBG, NNS, NNS]","[ROOT, xcomp, dobj, dobj]","[xxxx, xxxx, xxxx, xxxx]","[True, True, True, True]","[False, False, False, False]"
1,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,1,wow,[wow],[wow],[wow],wow,[INTJ],[UH],[ROOT],[xxx],[True],[False]
2,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,2,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[oww, yeaah]",oww yeaah,"[PROPN, PROPN]","[NNP, NNP]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
3,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,150,18,i hope we get an angel version of jin,9,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[hope, version]",hope version,"[PROPN, PROPN, PROPN, PROPN]","[NNP, NNP, NNP, NNP]","[compound, compound, compound, ROOT]","[xxxx, xxxx, xxxx, xxx]","[True, True, True, True]","[False, False, False, False]"
4,rDxrpSqYHD8,@kazamataurus337,2023-11-01 16:10:08+00:00,2023-11-01 16:10:08+00:00,1,0,so it begins,3,begins,[begins],[begin],[begin],begin,[VERB],[VBZ],[ROOT],[xxxx],[True],[False]


# Non-negative Matrix Factorisation (NMF) with 5 topics
- At this point we have the text processed and available in tokenized format and as a string.
- We now need to turn the text into numbers...
    - This can be done in a variety of ways e.g., TF-IDF, bag of words (which we previously used gensim to create as part of the LDA model)
    - We're going to use TF-IDF to create the features.
- Once the features are created we can then create a topic model.
- The previous model returned 30 topics as the optimal number; 30 is too many so we'll now try manually setting the number of topics to 5.

## Create TF-IDF vectorizer

In [4]:
# the lemmatized token lists (Tekken character names removed) are the documents
# that will be transformed into numerical format
texts = df['textTekkenCharactersRemoved']
texts.head(10)

0    [see, get, view, bamco]
1                      [wow]
2               [oww, yeaah]
3            [hope, version]
4                    [begin]
5     [waiting, room, right]
6                      [let]
7                      [wow]
8                [marvelous]
9             [late, bandai]
Name: textTekkenCharactersRemoved, dtype: object

In [5]:
# Create the tfidf vectorizer.
# Every document arrives as a list of tokens, so the preprocessor joins each list into a
# single string, creating the string representation the vectorizer expects.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=' '.join,
    ngram_range=(1, 2),  # use single words and bigrams
    min_df=3,            # discard terms found in fewer than 3 documents
    max_df=0.85,         # discard terms found in more than 85% of documents
    max_features=999,    # upper limit on the size of the vocabulary
)

tfidf_vectorizer

### Fit and transform the text
- _fit_ learns the vocab (terms) and frequencies
- _transform_ takes the knowledge gained during fit and applies it to transform raw text into a structured, numerical representation (a TF-IDF score) that machines can effectively analyze.
- The TF-IDF score reflects the word or phrase's unique usage pattern compared to the entire corpus.
- Higher scores indicate terms that are more statistically relevant and informative for that specific document.

In [6]:
# fit and transform the input text
# fit_transform learns the vocabulary from `texts` and returns the TF-IDF matrix in one step
# (one row per comment, one column per feature)
tfidf = tfidf_vectorizer.fit_transform(texts)

# store the feature names in a variable - we'll use these later for topic summaries
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

- The _tfidf_ variable stores each comment, and the tfidf score for each word in our corpus for a given comment.
- The _tfidf_feature_names_ variable stores our features (words)

In [7]:
# (number of comments, number of TF-IDF features)
tfidf.shape

(1739, 544)

In [8]:
# one feature name per column of the TF-IDF matrix
len(tfidf_feature_names)

544

In [9]:
# preview the first 10 features (single words and bigrams)
tfidf_feature_names[:10]

array(['able', 'absolutely', 'actor', 'actual', 'actually',
       'actually look', 'add', 'aesthetic', 'alisas', 'alright'],
      dtype=object)

### Check the words and TF-IDF scores

In [10]:
# show the comment number, word index and tfidf score for the first 2 comments
# (only the non-zero entries of each comment are listed)
print(tfidf[:2])

  (0, 29)	0.5475514448208861
  (0, 504)	0.5945789339577491
  (0, 182)	0.36290698487249895
  (0, 422)	0.4636397598340532
  (1, 532)	1.0


Each printed line shows a (comment number, word index) pair followed by the TF-IDF score for that word in that comment. So, the output:      
        
        (0, 29) 0.5475514448208861

can be read as comment 0, word index 29 and a TF-IDF score of 0.54755. We can validate this by checking the words in comment 0, and seeing if the indices match from our feature names variable.

In [11]:
# access the tokens of comment 0 (kept in their original order, not sorted)
texts[0]

['see', 'get', 'view', 'bamco']

We can now check the above words whose indices should match the output above.

In [12]:
# Look up the words behind comment 0's non-zero TF-IDF entries.
# The word indices are read from the sparse row itself rather than typed in by hand, so this
# check keeps working if the data or the vectorizer settings change.
for feature_index in tfidf[0].indices:
    print(tfidf_feature_names[feature_index])

bamco
view
get
see


Let's check the TF-IDF score to see if they match.

In [13]:
# store the TF-IDF scores associated with each word for each document as a dense array
tfidf_matrix = tfidf.toarray()

# access TF-IDF scores
# loop through the first 2 comments and print each word that has a non-zero TF-IDF score
for comment_index, comment_scores in enumerate(tfidf_matrix[:2]):
    print(f"\nComment {comment_index}:")
    # enumerate yields integer positions, so they can index the feature names directly
    for word_index, tfidf_score in enumerate(comment_scores):
        if tfidf_score > 0.0:
            print(f"{tfidf_feature_names[word_index]}: {round(tfidf_score, 3)}")


Comment 0:
bamco: 0.548
get: 0.363
see: 0.464
view: 0.595

Comment 1:
wow: 1.0


The scores and words match.

# Create an NMF model

In [14]:
# create a scikit-learn NMF model
nmf = NMF(
    n_components=5,    # number of topics to be extracted
    random_state=42,
    init='nndsvd',     # initialization method for the non-negative matrices; 'nndsvd' works well on sparse data
    solver='cd',       # optimization algorithm: 'cd' (default) is Coordinate Descent, 'mu' is Multiplicative Update
    max_iter=200,      # maximum number of iterations; higher values help convergence but increase computation time
    tol=0.0001,        # convergence tolerance; training stops when the change in error falls below this threshold
    alpha_W=0.0,       # regularization strength for the W (document-topic) matrix; 0.0 (default) means no regularization
    l1_ratio=0.0,      # balance between L1 and L2 penalties; 0.0 (default) means no L1 penalty, only L2
)

# learn the topics from the TF-IDF matrix (fit returns the fitted model itself)
nmf = nmf.fit(tfidf)

- We can check out the weights given to each word in a given topic by exploring `nmf.components_`, which is an array with one row per topic; each row is itself an array of the feature (word) weights for that topic.
- In other words, `nmf.components_` has length x (the number of topics) and each item in it holds one weight per feature (word).

In [15]:
# components_ has one row per topic, so its length should be the set number of topics
len(nmf.components_)

5

Each topic (array) has a feature weight for each feature (word) and so each array will have a length of ~500, so let's explore the first 10 weights in topic 0...


In [16]:
# access first 10 weights of topic 0 (one weight per TF-IDF feature)
nmf.components_[0][:10]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.04638966, 0.00871678, 0.        , 0.        ])

## Apply the model

In [17]:
# apply the fitted NMF model to the TF-IDF representation of the comments to get the
# weight of every topic in every comment.
# `tfidf` was produced by fit_transform on these same `texts`, so it is reused here instead
# of vectorizing the documents a second time (new, unseen documents would need
# tfidf_vectorizer.transform first).
document_weights = nmf.transform(tfidf)

In [18]:
# how many of the most heavily weighted words to keep for each topic (the top x many)
number_of_words = 5

# build the table of top words and transpose it
df_topic = topic_dataframe(
    model=nmf,
    feature_names=tfidf_feature_names,
    number_of_words=number_of_words,
).transpose()

df_topic.head(2)

Unnamed: 0,0,1,2,3,4
0,need,man,trailer,happy,bro
1,bring,bro,miss,plz,need bring


We have a dataframe that contains the top x most heavily weighted (important) words for a given topic. Now we'll create a dataframe that houses the topic number and a string of the words that are most closely linked with the topic.

In [19]:
# Bring together the top x words of every topic into one string of distinct topic words.
# Only the original word columns are joined, so re-running this cell never feeds a previously
# created 'topic_words' column back into itself.
word_columns = [column for column in df_topic.columns if column != 'topic_words']

df_topic['topic_words'] = (
    df_topic[word_columns]
    .apply(lambda row: ' '.join(row), axis=1)  # row (axis=1) operation: one string per topic
    .apply(tokenize_comment)                   # tokenize, which also splits bigrams into single words
    .apply(unique_words_from_tokens)           # removing duplicate words
    .apply(' '.join)                           # back to a single string per topic
)


df_topic.head(3)

Unnamed: 0,0,1,2,3,4,topic_words
0,need,man,trailer,happy,bro,need man trailer happy bro
1,bring,bro,miss,plz,need bring,bring bro miss plz need
2,character,hope,look,game,new,character hope look game new


In [20]:
# show the full contents of every cell so the topic word strings are not truncated
pd.set_option('display.max_colwidth', None)

# create a dataframe with only the topic number (taken from the index) and the topic words
df_topic = (
    df_topic['topic_words']
    .reset_index()
    .set_axis(['topic_number', 'topic_words'], axis=1)
)

df_topic

Unnamed: 0,topic_number,topic_words
0,0,need man trailer happy bro
1,1,bring bro miss plz need
2,2,character hope look game new
3,3,want play trailer baki point
4,4,wait not come return


We now have our 5 topics and the top x words that are most closely associated with these topics...

# Evaluation
- 5 topics had a coherence score of 0.35 which does not meet the key result of a coherence score >0.4
- The topics don't appear to be particularly interpretable, and interpretability is the key objective.
- There is no in-built perplexity score functionality for the NMF model, and, whilst calculations are available that could provide an approximate perplexity score, it is decided that coherence score combined with human interpretation of the topics will suffice for this use case.
- The LDA model appears to have provided a stronger model.
- 10 topics had a coherence score of 0.42 which meets the key result so it is perhaps worth re-running the notebook with 10 topics...

# ---------------------------------- 
# Re-run with 10 topics 

## Create TF-IDF vectorizer

In [21]:
# the lemmatized token lists (Tekken character names removed) are the documents
# that will be transformed into numerical format
texts = df['textTekkenCharactersRemoved']
texts.head(10)

0    [see, get, view, bamco]
1                      [wow]
2               [oww, yeaah]
3            [hope, version]
4                    [begin]
5     [waiting, room, right]
6                      [let]
7                      [wow]
8                [marvelous]
9             [late, bandai]
Name: textTekkenCharactersRemoved, dtype: object

In [22]:
# Create the tfidf vectorizer.
# Every document arrives as a list of tokens, so the preprocessor joins each list into a
# single string, creating the string representation the vectorizer expects.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=' '.join,
    ngram_range=(1, 2),  # use single words and bigrams
    min_df=3,            # discard terms found in fewer than 3 documents
    max_df=0.85,         # discard terms found in more than 85% of documents
    max_features=999,    # upper limit on the size of the vocabulary
)

tfidf_vectorizer

### Fit and transform the text
- _fit_ learns the vocab (terms) and frequencies
- _transform_ takes the knowledge gained during fit and applies it to transform raw text into a structured, numerical representation (a TF-IDF score) that machines can effectively analyze.
- The TF-IDF score reflects the word or phrase's unique usage pattern compared to the entire corpus.
- Higher scores indicate terms that are more statistically relevant and informative for that specific document.

In [23]:
# fit and transform the input text
# fit_transform learns the vocabulary from `texts` and returns the TF-IDF matrix in one step
# (one row per comment, one column per feature)
tfidf = tfidf_vectorizer.fit_transform(texts)

# store the feature names in a variable - we'll use these later for topic summaries
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Create an NMF model

In [24]:
# create a scikit-learn NMF model
nmf = NMF(
    n_components=10,   # number of topics to be extracted
    random_state=42,
    init='nndsvd',     # initialization method for the non-negative matrices; 'nndsvd' works well on sparse data
    solver='cd',       # optimization algorithm: 'cd' (default) is Coordinate Descent, 'mu' is Multiplicative Update
    max_iter=200,      # maximum number of iterations; higher values help convergence but increase computation time
    tol=0.0001,        # convergence tolerance; training stops when the change in error falls below this threshold
    alpha_W=0.0,       # regularization strength for the W (document-topic) matrix; 0.0 (default) means no regularization
    l1_ratio=0.0,      # balance between L1 and L2 penalties; 0.0 (default) means no L1 penalty, only L2
)

# learn the topics from the TF-IDF matrix (fit returns the fitted model itself)
nmf = nmf.fit(tfidf)

- We can check out the weights given to each word in a given topic by exploring `nmf.components_`, which is an array with one row per topic; each row is itself an array of the feature (word) weights for that topic.
- In other words, `nmf.components_` has length x (the number of topics) and each item in it holds one weight per feature (word).

In [25]:
# components_ has one row per topic, so its length should be the set number of topics
len(nmf.components_)

10

Each topic (array) has a feature weight for each feature (word) and so each array will have a length of ~500, so let's explore the first 10 weights in topic 0...


In [26]:
# access first 10 weights of topic 0, one weight per TF-IDF feature
# (these should be different from the 5 topic model)
nmf.components_[0][:10]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.05031129, 0.00918286, 0.        , 0.        ])

## Apply the model

In [27]:
# apply the fitted NMF model to the TF-IDF representation of the comments to get the
# weight of every topic in every comment.
# `tfidf` was produced by fit_transform on these same `texts`, so it is reused here instead
# of vectorizing the documents a second time (new, unseen documents would need
# tfidf_vectorizer.transform first).
document_weights = nmf.transform(tfidf)

In [28]:
# how many of the most heavily weighted words to keep for each topic (the top x many)
number_of_words = 5

# build the table of top words and transpose it
df_topic = topic_dataframe(
    model=nmf,
    feature_names=tfidf_feature_names,
    number_of_words=number_of_words,
).transpose()

df_topic.head()

Unnamed: 0,0,1,2,3,4
0,need,man,happy,bro,baki
1,bring,bro,plz,miss,need bring
2,character,new,new character,game,guest
3,want,play,baki,point,add
4,wait,not wait,not,announce,man


We have a dataframe that contains the top x most heavily weighted (important) words for a given topic. Now we'll create a dataframe that houses the topic number and a string of the words that are most closely linked with the topic.

In [29]:
# Bring together the top x words of every topic into one string of distinct topic words.
# Only the original word columns are joined, so re-running this cell never feeds a previously
# created 'topic_words' column back into itself.
word_columns = [column for column in df_topic.columns if column != 'topic_words']

df_topic['topic_words'] = (
    df_topic[word_columns]
    .apply(lambda row: ' '.join(row), axis=1)  # row (axis=1) operation: one string per topic
    .apply(tokenize_comment)                   # tokenize, which also splits bigrams into single words
    .apply(unique_words_from_tokens)           # removing duplicate words
    .apply(' '.join)                           # back to a single string per topic
)


df_topic.head(3)

Unnamed: 0,0,1,2,3,4,topic_words
0,need,man,happy,bro,baki,need man happy bro baki
1,bring,bro,plz,miss,need bring,bring bro plz miss need
2,character,new,new character,game,guest,character new game guest


In [30]:
# show the full contents of every cell so the topic word strings are not truncated
pd.set_option('display.max_colwidth', None)

# create a dataframe with only the topic number (taken from the index) and the topic words
df_topic = (
    df_topic['topic_words']
    .reset_index()
    .set_axis(['topic_number', 'topic_words'], axis=1)
)

df_topic

Unnamed: 0,topic_number,topic_words
0,0,need man happy bro baki
1,1,bring bro plz miss need
2,2,character new game guest
3,3,want play baki point add
4,4,wait not announce man
5,5,hope return announce
6,6,look like game good
7,7,come glad think boy yes
8,8,trailer get reveal new love
9,9,main legend return let game


We now have our 10 topics and the top x words that are most closely associated with these topics...

# Evaluation
- 10 topics had a coherence score of 0.42 which meets the key result of a coherence score >0.4
- The topics appear to be slightly more interpretable which is the key objective.
- There is no in-built perplexity score functionality for the NMF model, and, whilst calculations are available that could provide an approximate perplexity score, it is decided that coherence score combined with human interpretation of the topics will suffice for this use case.
- The LDA model appears to have provided a stronger model.

# Conclusion
- LDA model met the key results of coherence and perplexity
- LDA topics were more humanly interpretable.