In [None]:
python3 -m spacy download en_core_web_sm


In [None]:
import os

# This resolves ImportError: attempted relative import with no known parent package.
# The `src` package imported below is expected in the parent of the notebook's
# folder (assumption based on the original unconditional chdir('..')).
# Only step up when `src` is not already visible from the working directory, so
# re-running this cell does not climb one more level each time.
if not os.path.isdir('src'):
    os.chdir('..')

# general DS packages
import pandas as pd
import numpy as np

# cleaning and pre-processing
from src.processing.text_cleaning import normalize_text, process_contractions, remove_all_punctuation, remove_emojis, remove_html_unescape, remove_digits, remove_extra_whitespace, remove_website_links
from src.processing.text_processing import tokenize_comment, lemmatize_comment, remove_stop_words, part_of_speech, part_of_speech_tag, part_of_speech_dependency, part_of_speech_shape, part_of_speech_alpha, part_of_speech_is_stop

# modeling
import gensim
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# visualisation
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw comments from the CSV export.
raw_data = pd.read_csv(filepath_or_buffer="data/raw/new_character_reveal_comments.csv")

# Take a deep copy to work on, then wrap it as the frame the rest of the
# notebook cleans and extends.
data = raw_data.copy(deep=True)
df = pd.DataFrame(data=data)

# Clean and process dataframe

In [None]:
%%time

# clean
df['textDisplay'] = df['textDisplay'].apply(normalize_text)
df['textDisplay'] = df['textDisplay'].apply(process_contractions)
df['textDisplay'] = df['textDisplay'].apply(remove_website_links)
df['textDisplay'] = df['textDisplay'].apply(remove_html_unescape)
df['textDisplay'] = df['textDisplay'].apply(remove_emojis)
df['textDisplay'] = df['textDisplay'].apply(remove_digits)
df['textDisplay'] = df['textDisplay'].apply(remove_all_punctuation)
df['textDisplay'] = df['textDisplay'].apply(remove_extra_whitespace)

# process
df["textStopWordsRemoved"] = df["textDisplay"].apply(remove_stop_words)
df["textTokenized"] = df['textStopWordsRemoved'].apply(tokenize_comment)
df["textLemmatized"] = df["textStopWordsRemoved"].apply(lemmatize_comment)

# part of speech operations
df["pos"] = df["textStopWordsRemoved"].apply(part_of_speech)
df["posTag"] = df["textStopWordsRemoved"].apply(part_of_speech_tag)
df["posDependency"] = df["textStopWordsRemoved"].apply(part_of_speech_dependency)
df["posShape"] = df["textStopWordsRemoved"].apply(part_of_speech_shape)
df["posAlpha"] = df["textStopWordsRemoved"].apply(part_of_speech_alpha)
df["posStopWord"] = df["textStopWordsRemoved"].apply(part_of_speech_is_stop)

df.head()

# Topic modeling

## Create corpus and document-term matrix

Create a corpus (a list that contains all of the YouTube comments)...

In [None]:
# create a corpus: one entry per comment, taken from the lemmatized column
corpus = list(df['textLemmatized'].values)

# preview only the first few documents - echoing the whole corpus floods the
# notebook output and bloats the saved file
corpus[:5]

Use Gensim to create a dictionary that stores each word in the corpus and assigns it a unique ID. Then create a bag-of-words document-term matrix, in which each document becomes a list of tuples holding a word's unique ID and how many times it occurs in that document, i.e. (word_id, frequency_in_document).

In [None]:
# create dictionary: Gensim gives every distinct word in the corpus its own ID
comments_dictionary = corpora.Dictionary(documents=corpus)

# create term document frequency: each comment becomes a bag of
# (word_id, frequency_in_document) tuples
document_term_matrix = list(map(comments_dictionary.doc2bow, corpus))

In [None]:
# display the Gensim dictionary built from the corpus
comments_dictionary

In [None]:
# preview the bag-of-words lists for the first five comments only; displaying
# the full matrix prints one list per comment and swamps the notebook output
document_term_matrix[:5]

The document term matrix is a list and each document (YouTube comment) within the list is now a list of tuples instead of tokens (words). The tuples provide the word ID (each word has a unique ID created by Gensim) and the frequency with which that word occurs in the document.


We can also view the actual word with the frequency...

In [None]:
# look up the word that the dictionary stores under ID 0
comments_dictionary[0]

In [None]:
# human-readable view of the document-term matrix for the first 10 YouTube
# comments: each word ID is swapped for the word itself, the count is kept
[
    [(comments_dictionary[token_id], count) for token_id, count in bow]
    for bow in document_term_matrix[:10]
]

We now have the data in a state where we can build the topic model...

# Build the models

In [None]:
# show which Gensim version is installed in this kernel
gensim.__version__

## LDA

In [None]:
# LDA model trained on the unigram bag-of-words matrix
lda_number_of_topics = 3
lda_model = LdaModel(
    corpus=document_term_matrix,
    id2word=comments_dictionary,
    num_topics=lda_number_of_topics,
    passes=10,        # number of iterations over the entire corpus during model training
    random_state=42,  # fixed seed for the training run
)

# Results: the 5 highest-weighted words of every topic
lda_model.print_topics(num_words=5, num_topics=lda_number_of_topics)

### Visualise model

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, document_term_matrix, comments_dictionary)

# save_html opens the target path for writing and does not create missing
# folders, so make sure the output directory exists first
os.makedirs("visualisations", exist_ok=True)
pyLDAvis.save_html(vis, "visualisations/lda_topics_viz.html")
# vis

### How to interpret pyLDAvis’s output?

- Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.
- A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
- A model with too many topics will typically have many overlapping, small bubbles clustered in one region of the chart.
- If you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

## Perplexity and coherence
- Model perplexity and topic coherence provide a convenient measure to indicate how good a given topic model is.
- The lower the perplexity, the better the model.
- The higher the topic coherence, the more human-interpretable the topics are.

In [None]:
# compute Perplexity
# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself: a higher bound (closer to zero) is better. Perplexity is
# 2 ** (-bound), and for that number lower is better.
lda_per_word_bound = lda_model.log_perplexity(chunk=document_term_matrix, total_docs=None)
print(f"\nPer-word likelihood bound: {lda_per_word_bound}")
print(f"Perplexity: {2 ** (-lda_per_word_bound)}")

# Compute Coherence Score (c_v measure, computed against the tokenised corpus)
coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=comments_dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"\nCoherence Score: {coherence_lda}")

# LDA: bigram and trigram

### Build the bigrams and trigrams

In [None]:
# build the bigram and trigram models (a higher threshold yields fewer phrases)
bigram = gensim.models.Phrases(sentences=corpus, min_count=3, threshold=10)
# the trigram model is trained on the corpus after the bigram model has been applied to it
trigram = gensim.models.Phrases(sentences=bigram[corpus], threshold=10)

# Phraser objects take Phrases models as inputs and are optimized for faster phrase application
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram examples for two documents: bigram_mod[...] first combines common
# word pairs, then trigram_mod[...] is applied to that result, forming trigrams
# from frequent bigrams and single words.
for example_index in (0, 135):
    print(trigram_mod[bigram_mod[corpus[example_index]]])

In [None]:
def make_bigrams(corpus):
    """Apply the module-level bigram phraser to every document.

    Args:
        corpus: iterable of documents; each one is passed as-is to
            ``bigram_mod[...]``.

    Returns:
        A list with one transformed document per input document, in the
        same order.
    """
    bigram_docs = []
    for tokens in corpus:
        bigram_docs.append(bigram_mod[tokens])
    return bigram_docs

def make_trigrams(corpus):
    """Apply the bigram phraser, then the trigram phraser, to every document.

    Args:
        corpus: iterable of documents; each one is passed to
            ``bigram_mod[...]`` and the result to ``trigram_mod[...]``.

    Returns:
        A list with one transformed document per input document, in the
        same order.
    """
    trigram_docs = []
    for tokens in corpus:
        with_bigrams = bigram_mod[tokens]
        trigram_docs.append(trigram_mod[with_bigrams])
    return trigram_docs

### Create bigrams corpus

In [None]:
# create bigrams corpus
corpus_bigrams = make_bigrams(corpus)

In [None]:
# create dictionary: one ID per distinct token of the bigrams corpus
bigrams_dictionary = corpora.Dictionary(documents=corpus_bigrams)

# create term document frequency: bag-of-words list for every bigram document
bigrams_document_term_matrix = list(map(bigrams_dictionary.doc2bow, corpus_bigrams))

### Create bigrams model

In [None]:
# LDA model trained on the bigrams bag-of-words matrix
lda_number_of_topics = 3
lda_bigrams_model = LdaModel(
    corpus=bigrams_document_term_matrix,
    id2word=bigrams_dictionary,
    num_topics=lda_number_of_topics,
    passes=10,        # number of iterations over the entire corpus during model training
    random_state=42,  # fixed seed for the training run
)

# Results: the 5 highest-weighted words of every topic
lda_bigrams_model.print_topics(num_words=5, num_topics=lda_number_of_topics)

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_bigrams_model, bigrams_document_term_matrix, bigrams_dictionary)

# save_html opens the target path for writing and does not create missing
# folders, so make sure the output directory exists first
os.makedirs("visualisations", exist_ok=True)
pyLDAvis.save_html(vis, "visualisations/lda_bigrams_topics_viz.html")
# vis

In [None]:
# compute Perplexity
# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself: a higher bound (closer to zero) is better. Perplexity is
# 2 ** (-bound), and for that number lower is better.
lda_bigrams_per_word_bound = lda_bigrams_model.log_perplexity(chunk=bigrams_document_term_matrix, total_docs=None)
print(f"\nPer-word likelihood bound: {lda_bigrams_per_word_bound}")
print(f"Perplexity: {2 ** (-lda_bigrams_per_word_bound)}")

# Compute Coherence Score (c_v measure, computed against the bigrams corpus)
coherence_model_lda_bigrams = CoherenceModel(model=lda_bigrams_model, texts=corpus_bigrams, dictionary=bigrams_dictionary, coherence='c_v')
coherence_lda_bigrams = coherence_model_lda_bigrams.get_coherence()
print(f"\nCoherence Score: {coherence_lda_bigrams}")

The bigrams model has a higher perplexity and a higher coherence score. This means:
- The model is able to...

## Next steps
- BERT model
- Non-negative matrix factorisation (NMF) / explore other models that may be suitable
- try one where you remove all words apart from character names?
- Wordcloud?