Demo ipynb for CTM (Contextualized Topic Models)

Since we are not doing multilingual topic modelling, we can use the Combined TM instead of the zero-shot one.

In [2]:
import pandas as pd
import numpy as np


from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
# from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

import nltk

from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Relative path to the pickled review DataFrame for a single game (Terraria).
dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')

# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
dataset = pd.read_pickle(dataset_path)

# Print the column names, non-null counts and dtypes as a quick sanity check.
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 81776 entries, 63365 to 145140
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         81776 non-null  int64 
 1   app_id        81776 non-null  int64 
 2   app_name      81776 non-null  object
 3   review_text   81776 non-null  object
 4   review_score  81776 non-null  int64 
 5   review_votes  81776 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 4.4+ MB


In [4]:
# data preprocessing

import re

import gensim.utils  # `deaccent` is a function inside this module, not an importable submodule


def clean(raw):
    """Remove hyperlinks and leftover HTML markup/entities from a raw review string.

    The substitutions are applied one after another, in the order listed below.

    Args:
        raw: The raw text (``str``).

    Returns:
        The text with anchor elements replaced by ``'Link.'``, the listed HTML
        entities decoded or dropped, the listed tags removed, and every newline
        turned into a single space.
    """
    # (pattern, replacement) pairs; order is significant because they are chained.
    substitutions = (
        ("<[a][^>]*>(.+?)</[a]>", 'Link.'),
        ('&gt;', ""),
        ('&#x27;', "'"),
        ('&quot;', '"'),
        ('&#x2F;', ' '),
        ('<p>', ' '),
        ('</i>', ''),
        ('&#62;', ''),
        ('<i>', ' '),
        # A newline separates words just like a space does. Deleting it outright
        # would glue the last word of one line to the first word of the next.
        ("\n", ' '),
    )
    result = raw
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

def deEmojify(x):
    """Delete every run of characters that falls inside the emoji ranges listed below.

    Only code points in the four listed Unicode ranges are removed; all other
    characters are returned untouched.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    return re.sub("[" + emoji_ranges + "]+", r'', x, flags=re.UNICODE)
    
def remove_num(texts):
    """Return ``texts`` with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', texts)

def unify_whitespaces(x):
    """Collapse each run of consecutive space characters into one space.

    Only the literal space character is affected; tabs and other whitespace
    are left as they are.
    """
    return re.compile(' +').sub(' ', x)


def deaccent(x):
    '''Replace accented characters with their unaccented counterparts.

    The previous body called ``deaccent(x)`` from inside ``deaccent`` itself,
    which recursed until ``RecursionError``. This version does the work
    directly with the standard library: decompose to NFD, drop the combining
    marks (Unicode category ``Mn``), then recompose to NFC.

    Args:
        x: Text to process; ``bytes`` input is decoded as UTF-8 first.

    Returns:
        The text with combining accent marks removed.
    '''
    import unicodedata  # local import keeps this function self-contained

    if isinstance(x, bytes):
        x = x.decode('utf8')
    decomposed = unicodedata.normalize("NFD", x)
    without_marks = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", without_marks)

from nltk.corpus import stopwords
stop=set(stopwords.words("english"))
def remove_stopword(text):
    """Lower-case the whitespace-separated words of ``text`` and drop stop words.

    Words whose lower-cased form is in the module-level ``stop`` set are
    discarded; the remaining lower-cased words are joined with single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

# only keep alphabets
def remove_non_alphabets(text):
    """Replace every character outside ``a-z`` / ``A-Z`` with a single space."""
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', text)

def cleaning(df, review):
    """Run the full text-cleaning pipeline on column ``review`` of ``df``.

    The column is overwritten in place after each step and nothing is
    returned. The steps run in this order: markup removal, emoji removal,
    lower-casing, digit removal, space collapsing, de-accenting,
    non-letter removal, stop-word removal.
    """
    steps = (
        clean,
        deEmojify,
        lambda s: s.lower(),
        remove_num,
        unify_whitespaces,
        deaccent,
        remove_non_alphabets,
        remove_stopword,
    )
    for step in steps:
        df[review] = df[review].apply(step)

def cleaning_strlist(str_list):
    """Run the full text-cleaning pipeline over an iterable of strings.

    Applies the same steps, in the same order, as ``cleaning`` and returns
    the cleaned strings as a new list.
    """
    steps = (
        clean,
        deEmojify,
        lambda s: s.lower(),
        remove_num,
        unify_whitespaces,
        deaccent,
        remove_non_alphabets,
        remove_stopword,
    )
    cleaned = list(str_list)
    for step in steps:
        cleaned = [step(item) for item in cleaned]
    return cleaned

def cleaning_little(df, review):
    """Lightly clean column ``review`` of ``df`` in place.

    Only markup removal, emoji removal, space collapsing and de-accenting are
    applied; case, digits, punctuation and stop words are kept. Nothing is
    returned.
    """
    for step in (clean, deEmojify, unify_whitespaces, deaccent):
        df[review] = df[review].apply(step)


In [5]:
# create a copy of the dataset, as we need both untouched text and cleaned text

dataset_preprocessed = dataset.copy()

In [6]:
# Full cleaning on the copy: this text feeds the bag-of-words side of the model.
cleaning(dataset_preprocessed, 'review_text')


# Light cleaning on the original frame: this text feeds the contextual embeddings.
# NOTE: this overwrites `dataset['review_text']` in place.
cleaning_little(dataset, 'review_text')

In [None]:
# Pull the review strings out as arrays: fully cleaned text and lightly cleaned text.
X_preprocessed = dataset_preprocessed['review_text'].values
X = dataset['review_text'].values

Apply lemmatization to the preprocessed dataset as well (for the BoW representation).

In [8]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    """Map a POS tag to a WordNet POS constant based on its first letter.

    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag yields ``None``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
    """Tokenize ``text``, POS-tag it, and return the list of lemmatized tokens.

    Each token is lemmatized with the WordNet POS derived from its tag; tokens
    whose tag has no WordNet equivalent are lemmatized as nouns.
    """
    # use nltk to get PoS tag
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for word, treebank_tag in tagged_tokens:
        # convert from nltk Penn Treebank tag to wordnet tag (adj, adv, verb, noun)
        wordnet_pos = get_wordnet_pos(treebank_tag)
        # lemmatize by the PoS, defaulting to noun when there is no mapping
        lemmas.append(lemma.lemmatize(word, pos=wordnet_pos if wordnet_pos else wordnet.NOUN))

    return lemmas

In [11]:
# Lemmatize every document (one token list each), then re-join the tokens with
# single spaces so each document is a plain string again.
# NOTE: X_preprocessed is overwritten, so re-running this cell lemmatizes the
# already-lemmatized text instead of starting from the cleaned reviews.
X_preprocessed = list(map(lambda x: lemmatization(x), X_preprocessed))
X_preprocessed = list(map(lambda x: ' '.join(x), X_preprocessed))

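Reduce the BoW vocabulary to 2,000 words, as recommended by CTM. Note: the cell below has no execution count and `len(tp.vocab)` further down reports 35060, so the outputs shown in this notebook were produced without this reduction.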

In [None]:
# Follows the preprocessing done in CTM:
# keep only the VOCABULARY_SIZE words selected by CountVectorizer in the preprocessed documents.

from sklearn.feature_extraction.text import CountVectorizer

VOCABULARY_SIZE = 2000

# max_features caps the learned vocabulary at VOCABULARY_SIZE terms
# (per scikit-learn, the most frequent ones across the corpus -- see its docs).
vectorizer = CountVectorizer(max_features=VOCABULARY_SIZE)
vectorizer.fit_transform(X_preprocessed)
temp_vocabulary = set(vectorizer.get_feature_names_out())

# Drop every word that is not in the retained vocabulary, document by document.
preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in temp_vocabulary])
                            for doc in X_preprocessed]

# NOTE: overwrites X_preprocessed with the vocabulary-filtered documents.
X_preprocessed = preprocessed_docs_tmp

In [12]:
X_preprocessed[0]

'werewolf rid unicorn shooting rainbow gun build teleporters find hair dresser spider cavern get sword shoot cat take lord moon use yoyo summon sharknado minion shoot sharks enemy find sky temple air wyverns spawn buy music box wizard go record music like play base whenever want go build castle make entirely white marble would seem thing minecraft game dimension trust get use start learning game terrarium simply one satisfy sandbox experience may sound rude compare minecraft imagination'

Training

In [13]:
# "all-mpnet-base-v2" is the name of the sentence-embedding model to use.
tp = TopicModelDataPreparation("all-mpnet-base-v2")

# Lightly cleaned text (X) goes to the contextual embeddings;
# fully cleaned, lemmatized text (X_preprocessed) goes to the bag-of-words.
training_dataset = tp.fit(text_for_contextual=X, text_for_bow=X_preprocessed)

Downloading .gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.32MB/s]
Downloading 1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 225kB/s]
Downloading README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 11.4MB/s]
Downloading config.json: 100%|██████████| 571/571 [00:00<00:00, 662kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 127kB/s]
Downloading data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 37.2MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:05<00:00, 74.2MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 60.9kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 267kB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.16MB/s]
Downloading tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 413kB/s]
Downloading train_script.py: 100%|██████████| 13.1k/13.1k [00:00<00:00, 11.0MB/s]
Downloading vocab.t

In [14]:
# check the vocabs

tp.vocab[:10]

array(['aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaaa', 'aaaaaaaa', 'aaaaaaaaa',
       'aaaaaaaaaaa', 'aaaaaaaaaaaa', 'aaaaaaaaaaaaaaa'], dtype=object)

In [15]:
len(tp.vocab)

35060

Training combined TM

In [16]:
# Number of topics the model is asked to learn.
N_TOPICS = 20

# bow_size must match the vocabulary built by `tp`.
# NOTE(review): contextual_size=768 is presumably the embedding width of
# "all-mpnet-base-v2" -- confirm if the embedding model is changed.
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=N_TOPICS, num_epochs=10)
ctm.fit(training_dataset) # run the model

0it [00:00, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid 

In [17]:
ctm.get_topic_lists(5)

[['disappointed', 'ingame', 'ex', 'experiance', 'success'],
 ['skyrim', 'ign', 'beating', 'scale', 'simulator'],
 ['go', 'kill', 'house', 'stuff', 'get'],
 ['buy', 'play', 'friend', 'friends', 'hour'],
 ['shame', 'unfortunately', 'lastly', 'initially', 'roughly'],
 ['rpg', 'minecraft', 'exploration', 'building', 'adventure'],
 ['unicorn', 'bunny', 'shoot', 'fish', 'lord'],
 ['ever', 'penny', 'best', 'definetly', 'addict'],
 ['assure', 'stress', 'custom', 'prefix', 'mid'],
 ['free', 'release', 'value', 'price', 'pay'],
 ['alright', 'guess', 'ok', 'neat', 'okay'],
 ['mac', 'please', 'fix', 'cant', 'crash'],
 ['average', 'frustrating', 'ingame', 'satisfaction', 'overwhelm'],
 ['terrarium', 'boss', 'minecraft', 'item', 'world'],
 ['collector', 'storage', 'favor', 'express', 'abandon'],
 ['scroller', 'platformer', 'retro', 'action', 'side'],
 ['game', 'play', 'time', 'new', 'one'],
 ['game', 'terrarium', 'minecraft', 'many', 'people'],
 ['lastly', 'harsh', 'enchantment', 'vs', 'sport'],
 ['

In [19]:
import pyLDAvis as vis

# Ask the trained model for its data in the keyword format that pyLDAvis' prepare() takes.
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

# Build the visualisation data and render it inline in the notebook.
ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

  0%|          | 0/1278 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

quantitative evaluation

such as calculating the u_mass or NPMI coherence scores

or the inverted RBO (a topic-diversity metric)

In [32]:
def _get_topics(ctm, k=10):
    return ctm.get_topic_lists(k)

def _get_topic_word_metrix(ctm):
    return ctm.get_topic_word_distribution()

# ref: https://contextualized-topic-models.readthedocs.io/en/latest/readme.html (go to the section: Mono-Lingual Topic Modeling)
# testing_dataset = qt.transform(text_for_contextual=testing_text_for_contextual, text_for_bow=testing_text_for_bow)
# # n_sample how many times to sample the distribution (see the doc)
# ctm.get_doc_topic_distribution(testing_dataset, n_samples=20) # returns a (n_documents, n_topics) matrix with the topic distribution of each document
def _get_topic_document_metrix(ctm, dataset, n_samples=20):
    return ctm.get_doc_topic_distribution(dataset, n_samples=n_samples).T


# Top-10 words per topic and the topic-word matrix, kept for the evaluation cells.
topic_words = _get_topics(ctm, k=10)
topic_word_metrix = _get_topic_word_metrix(ctm)

In [34]:
topic_word_metrix.shape

(20, 35060)

In [37]:
doc_topic_distrbution = ctm.get_doc_topic_distribution(training_dataset,n_samples=20)

  0%|          | 0/1278 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

In [42]:
doc_topic_distrbution.T.shape

(20, 81776)

In [41]:
np.sum(doc_topic_distrbution.T, axis=0)

array([1.0000001 , 0.99999994, 0.99999994, ..., 1.        , 1.        ,
       1.        ], dtype=float32)

In [None]:
# Use Gensim to calculate the coherence score.

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

# `tokens`, `dictionary` and `corpus` were never defined anywhere in this
# notebook, so build them here from the BoW documents. Each document in
# X_preprocessed is a whitespace-joined string; gensim expects token lists.
tokens = [doc.split() for doc in X_preprocessed]
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

# NOTE(review): assumes every word in `topic_words` occurs in `tokens`
# (the model vocabulary was built from the same documents) -- confirm.
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 topn=10,
                                 coherence='c_v')

Save the model

In [21]:
# Directory (relative to the working directory) under which the trained model is saved.
ctm_model_path = Path('./ctm_model')
ctm.save(models_dir=ctm_model_path)



load the model

In [24]:
# NOTE(review): num_epochs=100 and n_components=50 here differ from the training
# cell (num_epochs=10, n_components=N_TOPICS=20), while the checkpoint directory
# name below contains "nc_20" and epoch=9 is loaded. Confirm whether these
# constructor arguments should mirror the training configuration.
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, num_epochs=100, n_components=50)

# The sub-directory name encodes the hyperparameters the model was saved with.
ctm.load(ctm_model_path.joinpath("contextualized_topic_model_nc_20_tpm_0.0_tpv_0.95_hs_prodLDA_ac_(100, 100)_do_softplus_lr_0.2_mo_0.002_rp_0.99"), epoch=9)



In [25]:
ctm.get_topic_lists(5)

[['disappointed', 'ingame', 'ex', 'experiance', 'success'],
 ['skyrim', 'ign', 'beating', 'scale', 'simulator'],
 ['go', 'kill', 'house', 'stuff', 'get'],
 ['buy', 'play', 'friend', 'friends', 'hour'],
 ['shame', 'unfortunately', 'lastly', 'initially', 'roughly'],
 ['rpg', 'minecraft', 'exploration', 'building', 'adventure'],
 ['unicorn', 'bunny', 'shoot', 'fish', 'lord'],
 ['ever', 'penny', 'best', 'definetly', 'addict'],
 ['assure', 'stress', 'custom', 'prefix', 'mid'],
 ['free', 'release', 'value', 'price', 'pay'],
 ['alright', 'guess', 'ok', 'neat', 'okay'],
 ['mac', 'please', 'fix', 'cant', 'crash'],
 ['average', 'frustrating', 'ingame', 'satisfaction', 'overwhelm'],
 ['terrarium', 'boss', 'minecraft', 'item', 'world'],
 ['collector', 'storage', 'favor', 'express', 'abandon'],
 ['scroller', 'platformer', 'retro', 'action', 'side'],
 ['game', 'play', 'time', 'new', 'one'],
 ['game', 'terrarium', 'minecraft', 'many', 'people'],
 ['lastly', 'harsh', 'enchantment', 'vs', 'sport'],
 ['