Demo ipynb for CTM (Contextualized Topic Models)

Since we aren't doing multi-language, we may use the Combined TM instead of zero-shot one

In [1]:
import pandas as pd
import numpy as np


from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
# from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

import nltk

from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')

dataset = pd.read_pickle(dataset_path)

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 81776 entries, 63365 to 145140
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         81776 non-null  int64 
 1   app_id        81776 non-null  int64 
 2   app_name      81776 non-null  object
 3   review_text   81776 non-null  object
 4   review_score  81776 non-null  int64 
 5   review_votes  81776 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 4.4+ MB


In [3]:
# data preprocessing

import re
from gensim.utils import deaccent


def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)
    
def remove_num(texts):
   output = re.sub(r'\d+', '', texts)
   return output

def unify_whitespaces(x):
    cleaned_string = re.sub(' +', ' ', x)
    return cleaned_string


def _deaccent(x):
    '''Deaccent to change characters with accent to the corresponding without accent'''
    return deaccent(x)

from nltk.corpus import stopwords
stop=set(stopwords.words("english"))
def remove_stopword(text):
   text=[word.lower() for word in text.split() if word.lower() not in stop]
   return " ".join(text)

# only keep alphabets
def remove_non_alphabets(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    return text

def cleaning(df, review):
    df[review] = df[review].apply(lambda x: clean(x))
    df[review] = df[review].apply(lambda x: deEmojify(x))

    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: remove_num(x))
    df[review] = df[review].apply(lambda x: unify_whitespaces(x))

    df[review] = df[review].apply(lambda x: _deaccent(x))
    df[review] = df[review].apply(lambda x: remove_non_alphabets(x))
    df[review] = df[review].apply(lambda x: remove_stopword(x))

def cleaning_strlist(str_list):
    str_list = list(map(lambda x: clean(x), str_list))
    str_list = list(map(lambda x: deEmojify(x), str_list))

    str_list = list(map(lambda x: x.lower(), str_list))
    str_list = list(map(lambda x: remove_num(x), str_list))
    str_list = list(map(lambda x: unify_whitespaces(x), str_list))

    str_list = list(map(lambda x: _deaccent(x), str_list))
    str_list = list(map(lambda x: remove_non_alphabets(x), str_list))
    str_list = list(map(lambda x: remove_stopword(x), str_list))
    return str_list

def cleaning_little(df, review):
    df[review] = df[review].apply(lambda x: clean(x))
    df[review] = df[review].apply(lambda x: deEmojify(x))
    df[review] = df[review].apply(lambda x: unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: _deaccent(x))


In [4]:
# create a copy of the dataset, as we need both untouched text and cleaned text

dataset_preprocessed = dataset.copy()

In [5]:
cleaning(dataset_preprocessed, 'review_text')


cleaning_little(dataset, 'review_text')

In [6]:
X_preprocessed = dataset_preprocessed['review_text'].values
X = dataset['review_text'].values

Apply lemmatizing to the preprocessed dataset as well (for BoW)

In [7]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

In [8]:
X_preprocessed = list(map(lambda x: lemmatization(x), X_preprocessed))
X_preprocessed = list(map(lambda x: ' '.join(x), X_preprocessed))

reduce the BoW to 2000 as recommended by CTM

In [9]:
# referencing to preprocessing in CTM
# to keep first 2000 words in the preprpcessed documents

from sklearn.feature_extraction.text import CountVectorizer

VOCABULARY_SIZE = 2000

vectorizer = CountVectorizer(max_features=VOCABULARY_SIZE)
vectorizer.fit_transform(X_preprocessed)
temp_vocabulary = set(vectorizer.get_feature_names_out())

preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in temp_vocabulary])
                            for doc in X_preprocessed]

X_preprocessed = preprocessed_docs_tmp

In [10]:
X_preprocessed[0]

'werewolf rid unicorn rainbow gun build find hair spider cavern get sword shoot cat take lord moon use yoyo summon minion shoot enemy find sky temple air spawn buy music box wizard go record music like play base whenever want go build castle make entirely white would seem thing minecraft game dimension trust get use start game terrarium simply one satisfy sandbox experience may sound compare minecraft imagination'

Training

In [11]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=X, text_for_bow=X_preprocessed)

Batches: 100%|██████████| 409/409 [00:35<00:00, 11.47it/s]


In [12]:
# check the vocabs

tp.vocab[:10]

array(['aaa', 'ability', 'able', 'absolute', 'absolutely', 'absolutly',
       'accesories', 'access', 'accessory', 'accidentally'], dtype=object)

In [13]:
len(tp.vocab)

2000

Training combined TM

In [14]:
N_TOPICS = 20

ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=N_TOPICS, num_epochs=10)
ctm.fit(training_dataset) # run the model

0it [00:00, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [15]:
ctm.get_topic_lists(5)

[['addictive', 'list', 'course', 'diverse', 'surprise'],
 ['cant', 'fix', 'crash', 'work', 'help'],
 ['game', 'play', 'get', 'like', 'terrarium'],
 ['build', 'boss', 'adventure', 'explore', 'craft'],
 ['list', 'main', 'example', 'thousand', 'unless'],
 ['list', 'example', 'sort', 'rather', 'main'],
 ['would', 'slime', 'shoot', 'unicorn', 'kill'],
 ['ever', 'best', 'one', 'ive', 'favorite'],
 ['play', 'buy', 'friend', 'really', 'pc'],
 ['get', 'go', 'make', 'kill', 'thing'],
 ['game', 'play', 'terrarium', 'one', 'hour'],
 ['fun', 'friend', 'lot', 'really', 'multiplayer'],
 ['great', 'worth', 'money', 'low', 'price'],
 ['minecraft', 'like', 'terrarium', 'well', 'terraria'],
 ['game', 'terrarium', 'great', 'boss', 'minecraft'],
 ['good', 'pretty', 'medium', 'nice', 'cool'],
 ['complete', 'main', 'list', 'information', 'possible'],
 ['update', 'hour', 'new', 'content', 'still'],
 ['game', 'terrarium', 'boss', 'world', 'minecraft'],
 ['awesome', 'love', 'amaze', 'amazing', 'must']]

In [16]:
import pyLDAvis as vis

lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

  0%|          | 0/1278 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

quantitative evaluation

like calulating u_mass, u_npmi score

or inversed RBO (diversity metrics)

In [30]:
def _get_topics(ctm, k=10):
    return ctm.get_topic_lists(k)

def _get_topic_word_metrix(ctm):
    return ctm.get_topic_word_distribution()

# ref: https://contextualized-topic-models.readthedocs.io/en/latest/readme.html (go to the section: Mono-Lingual Topic Modeling)
# testing_dataset = qt.transform(text_for_contextual=testing_text_for_contextual, text_for_bow=testing_text_for_bow)
# # n_sample how many times to sample the distribution (see the doc)
# ctm.get_doc_topic_distribution(testing_dataset, n_samples=20) # returns a (n_documents, n_topics) matrix with the topic distribution of each document
def _get_topic_document_metrix(ctm, dataset, n_samples=20):
    return ctm.get_doc_topic_distribution(dataset, n_samples=n_samples).T


topic_words = _get_topics(ctm, k=10)
topic_word_metrix = _get_topic_word_metrix(ctm)
topic_document_metrix = _get_topic_document_metrix(ctm, training_dataset, n_samples=20)

  0%|          | 0/1278 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [18]:
topic_word_metrix.shape

(20, 2000)

In [32]:
topic_document_metrix.shape

(20, 81776)

In [22]:
from gensim import corpora

topic_words = _get_topics(ctm)
topics = ctm.get_predicted_topics(training_dataset, n_samples=20)

documents = pd.DataFrame({"Document": X,
                          "ID": range(len(X)),
                          "Topic": topics})

# remove documents which their topic contains 1<= words
# documents = documents[~documents['Topic'].isin(empty_topic_idxs)]

documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
# cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# bertopic_vectorizer = topic_model.vectorizer_model
# bertopic_analyzer = bertopic_vectorizer.build_analyzer()

# words = bertopic_vectorizer.get_feature_names_out()
# tokens = [bertopic_analyzer(doc) for doc in cleaned_docs]
texts = [doc.split() for doc in documents_per_topic.Document.values]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

  0%|          | 0/1278 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

In [23]:
# use Gensim to calculate coherence score

from gensim.models import CoherenceModel

coherence_model = CoherenceModel(topics=topic_words,
                                 texts=texts,
                                corpus=corpus,
                                dictionary=dictionary,
                                topn=10,
                                coherence='c_v')

In [24]:
print(coherence_model.get_coherence())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.42467335576522947


In [25]:
coherence_model_npmi = CoherenceModel(topics=topic_words,
                                 texts=texts,
                                corpus=corpus,
                                dictionary=dictionary,
                                topn=10,
                                coherence='c_npmi')

print(coherence_model_npmi.get_coherence())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

-0.058668006813056485


Save the model

In [26]:
from datetime import datetime

ctm_model_path = Path(f'./ctm_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
ctm.save(models_dir=ctm_model_path)



load the model

In [29]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, num_epochs=100, n_components=50)

model_datetime = datetime(2023, 1, 11, 19, 5, 43)
ctm_model_path

ctm.load(ctm_model_path.joinpath("contextualized_topic_model_nc_20_tpm_0.0_tpv_0.95_hs_prodLDA_ac_(100, 100)_do_softplus_lr_0.2_mo_0.002_rp_0.99"), epoch=9)



In [None]:
ctm.get_topic_lists(5)

[['disappointed', 'ingame', 'ex', 'experiance', 'success'],
 ['skyrim', 'ign', 'beating', 'scale', 'simulator'],
 ['go', 'kill', 'house', 'stuff', 'get'],
 ['buy', 'play', 'friend', 'friends', 'hour'],
 ['shame', 'unfortunately', 'lastly', 'initially', 'roughly'],
 ['rpg', 'minecraft', 'exploration', 'building', 'adventure'],
 ['unicorn', 'bunny', 'shoot', 'fish', 'lord'],
 ['ever', 'penny', 'best', 'definetly', 'addict'],
 ['assure', 'stress', 'custom', 'prefix', 'mid'],
 ['free', 'release', 'value', 'price', 'pay'],
 ['alright', 'guess', 'ok', 'neat', 'okay'],
 ['mac', 'please', 'fix', 'cant', 'crash'],
 ['average', 'frustrating', 'ingame', 'satisfaction', 'overwhelm'],
 ['terrarium', 'boss', 'minecraft', 'item', 'world'],
 ['collector', 'storage', 'favor', 'express', 'abandon'],
 ['scroller', 'platformer', 'retro', 'action', 'side'],
 ['game', 'play', 'time', 'new', 'one'],
 ['game', 'terrarium', 'minecraft', 'many', 'people'],
 ['lastly', 'harsh', 'enchantment', 'vs', 'sport'],
 ['