Demo ipynb for LDA

Testing the pipeline for a single game

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

import gensim

import spacy

import nltk

import pyLDAvis

In [2]:
# download nltk stopwords
# import nltk
# nltk.download('stopwords')

In [3]:
# load a dataset

dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')

dataset = pd.read_pickle(dataset_path)

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 81776 entries, 63365 to 145140
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         81776 non-null  int64 
 1   app_id        81776 non-null  int64 
 2   app_name      81776 non-null  object
 3   review_text   81776 non-null  object
 4   review_score  81776 non-null  int64 
 5   review_votes  81776 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 4.4+ MB


In [4]:
# data preprocessing

import re

def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)
    
def remove_num(texts):
   output = re.sub(r'\d+', '', texts)
   return output

def unify_whitespaces(x):
    cleaned_string = re.sub(' +', ' ', x)
    return cleaned_string

from nltk.corpus import stopwords
stop=set(stopwords.words("english"))
def remove_stopword(text):
   text=[word.lower() for word in text.split() if word.lower() not in stop]
   return " ".join(text)

# only keep alphabets
def remove_non_alphabets(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    return text

def cleaning(df, review):
    df[review] = df[review].apply(lambda x: clean(x))
    df[review] = df[review].apply(lambda x: deEmojify(x))
    df[review] = df[review].apply(lambda x: remove_num(x))
    df[review] = df[review].apply(lambda x: unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: remove_non_alphabets(x))
    df[review] = df[review].apply(lambda x: remove_stopword(x))
    df[review] = df[review].apply(lambda x: x.lower())

def cleaning_strlist(str_list):
    str_list = list(map(lambda x: clean(x), str_list))
    str_list = list(map(lambda x: deEmojify(x), str_list))
    str_list = list(map(lambda x: remove_num(x), str_list))
    str_list = list(map(lambda x: unify_whitespaces(x), str_list))
    str_list = list(map(lambda x: remove_non_alphabets(x), str_list))
    str_list = list(map(lambda x: remove_stopword(x), str_list))
    str_list = list(map(lambda x: x.lower(), str_list))
    return str_list

In [5]:
# apply data preprocessing

cleaning(dataset, 'review_text')

In [6]:
X = dataset['review_text'].values

In [7]:
X

array(['werewolf riding unicorn shooting rainbows gun build teleporters find hair dresser spider cavern get sword shoots cats take lord moon using yoyo summon sharknado minion shoots sharks enemies find sky temples air wyverns spawn buy music box wizard go record music like playing base whenever want go build castle made entirely white marble would seem thing minecraft game dimension trust get used start learning game terraria simply one satisfying sandbox experiences may sound rude compared minecraft imagination',
       'copies game go around giving people look sad', 'introduction',
       ...,
       'game game start newb get pro hours entertainment even get bored get mods newb',
       'far one greatest games played yet',
       'game awesome eye cthulhu possible'], dtype=object)

In [8]:
# nltk.download('averaged_perceptron_tagger')
t = nltk.word_tokenize(X[0])
tt = nltk.pos_tag(t)
tt

[('werewolf', 'NN'),
 ('riding', 'VBG'),
 ('unicorn', 'JJ'),
 ('shooting', 'NN'),
 ('rainbows', 'NNS'),
 ('gun', 'VBP'),
 ('build', 'JJ'),
 ('teleporters', 'NNS'),
 ('find', 'VBP'),
 ('hair', 'JJ'),
 ('dresser', 'NN'),
 ('spider', 'NN'),
 ('cavern', 'JJ'),
 ('get', 'NN'),
 ('sword', 'JJ'),
 ('shoots', 'NNS'),
 ('cats', 'NNS'),
 ('take', 'VBP'),
 ('lord', 'NN'),
 ('moon', 'NN'),
 ('using', 'VBG'),
 ('yoyo', 'JJ'),
 ('summon', 'JJ'),
 ('sharknado', 'NN'),
 ('minion', 'NN'),
 ('shoots', 'NNS'),
 ('sharks', 'JJ'),
 ('enemies', 'NNS'),
 ('find', 'VBP'),
 ('sky', 'JJ'),
 ('temples', 'NNS'),
 ('air', 'NN'),
 ('wyverns', 'VBZ'),
 ('spawn', 'JJ'),
 ('buy', 'NN'),
 ('music', 'NN'),
 ('box', 'NN'),
 ('wizard', 'NN'),
 ('go', 'VBP'),
 ('record', 'NN'),
 ('music', 'NN'),
 ('like', 'IN'),
 ('playing', 'VBG'),
 ('base', 'NN'),
 ('whenever', 'NN'),
 ('want', 'VBP'),
 ('go', 'VB'),
 ('build', 'JJ'),
 ('castle', 'NN'),
 ('made', 'VBD'),
 ('entirely', 'RB'),
 ('white', 'JJ'),
 ('marble', 'NN'),
 ('would'

In [8]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

In [9]:
# lemmatize the data

X_lemmatized = list(map(lambda x: lemmatization(x), X))

In [11]:
X_lemmatized[0]

['werewolf',
 'rid',
 'unicorn',
 'shooting',
 'rainbow',
 'gun',
 'build',
 'teleporters',
 'find',
 'hair',
 'dresser',
 'spider',
 'cavern',
 'get',
 'sword',
 'shoot',
 'cat',
 'take',
 'lord',
 'moon',
 'use',
 'yoyo',
 'summon',
 'sharknado',
 'minion',
 'shoot',
 'sharks',
 'enemy',
 'find',
 'sky',
 'temple',
 'air',
 'wyverns',
 'spawn',
 'buy',
 'music',
 'box',
 'wizard',
 'go',
 'record',
 'music',
 'like',
 'play',
 'base',
 'whenever',
 'want',
 'go',
 'build',
 'castle',
 'make',
 'entirely',
 'white',
 'marble',
 'would',
 'seem',
 'thing',
 'minecraft',
 'game',
 'dimension',
 'trust',
 'get',
 'use',
 'start',
 'learning',
 'game',
 'terrarium',
 'simply',
 'one',
 'satisfy',
 'sandbox',
 'experience',
 'may',
 'sound',
 'rude',
 'compare',
 'minecraft',
 'imagination']

In [17]:
# use gensim to build a dictionary and train our LDAModel

id2word = gensim.corpora.Dictionary(X_lemmatized)

corpus = [id2word.doc2bow(text) for text in X_lemmatized]

In [20]:
NUM_TOPICS = 20

lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                             id2word=id2word,
                                             num_topics=NUM_TOPICS,         # later can use grid search to find the best number of topics
                                             random_state=42,
                                             chunksize=100,
                                             passes=10,
                                            #  alpha='auto',
                                             workers=2)

inference test

In [21]:
# inference test

inference_test = ["well its been fun guys, but that's it, no more updates, that one was the last one, there is no longer going to be anymore content for this game anymore, there is no way to replay it as there won't be any updates, nope, that was it, the last update, nothing more, this game has no new ways to experience it as there is no more content updates, nothing new to freshen up the experience, its such a shame that this game has no replay-ability, once you beat the game there is like no point to playing again, as they said guys 1.2 will be they final update. nothing more after 1.2, there is no chance they will make another final update right? several years and final updates later: alright, thats it, no more updates we wont be getting anymore, thats it, nothing more, no more updates, for real this time... oh god, redigit made another tweet.",
                  "keeps forcing me to play it",
'''I will leave the cat here, so that everybody who passes by can pet it and give it a thumbs up and awards
　　　 　　／＞　　フ
　　　 　　| 　_　 _ l
　 　　 　／` ミ＿xノ
　　 　 /　　　 　 |
　　　 /　 ヽ　　 ﾉ
　 　 │　　|　|　|
　／￣|　　 |　|　|
　| (￣ヽ＿_ヽ_)__)
　＼二つ''']

inference_test = cleaning_strlist(inference_test)

inference_test = list(map(lambda x: lemmatization(x), inference_test))

corpus_test = [id2word.doc2bow(text) for text in inference_test]

test_output = lda_model[corpus_test]

test_output

<gensim.interfaces.TransformedCorpus at 0x2ae89bdf0>

In [22]:
inference_test[-1]

['leave',
 'cat',
 'everybody',
 'pass',
 'pet',
 'give',
 'thumb',
 'award',
 'l',
 'x']

In [23]:
for i in range(len(test_output)):
    print(sorted(test_output[i], key=lambda x: x[1], reverse=True))

[(1, 0.40155536), (15, 0.13400023), (13, 0.117064886), (16, 0.08842767), (18, 0.08247831), (10, 0.07789028), (17, 0.03888663), (19, 0.033185128), (7, 0.018651282)]
[(1, 0.49903196), (10, 0.27591208), (4, 0.012503109), (0, 0.012503108), (2, 0.012503108), (3, 0.012503108), (5, 0.012503108), (6, 0.012503108), (7, 0.012503108), (8, 0.012503108), (9, 0.012503108), (11, 0.012503108), (12, 0.012503108), (13, 0.012503108), (14, 0.012503108), (15, 0.012503108), (16, 0.012503108), (17, 0.012503108), (18, 0.012503108), (19, 0.012503108)]
[(16, 0.30494246), (7, 0.20502426), (13, 0.10501494), (19, 0.10501245), (5, 0.105011985), (11, 0.10497866)]


visualize the data

In [24]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis



Evaluation

gensim provide functions to calculate, so we don't need to install octis (as the evaluation backend of octis also relies on gensim)

octis seems awesome for simple development, but it installs many packages ;(

In [25]:
# corpus = lemmatized words (?) (list of list of str)

# create a result object from the LDAMulticore model for octis evaluation
# referencing from https://github.com/MIND-Lab/OCTIS/blob/master/octis/models/LDA.py
# and guideline in README: https://github.com/MIND-Lab/OCTIS/tree/master
result_lda_online = {}
result_lda_online['topic-word-matrix'] = lda_model.get_topics()

top_words = 10
topics_output = []
for topic in result_lda_online["topic-word-matrix"]:
    top_k = np.argsort(topic)[-top_words:]
    top_k_words = list(reversed([id2word[i] for i in top_k]))
    topics_output.append(top_k_words)
result_lda_online["topics"] = topics_output

def _get_topic_document_matrix(lda_model, corpus, num_topics=10):
    """
    Return the topic representation of the
    corpus
    """

    id_corpus = corpus

    doc_topic_tuples = []
    for document in id_corpus:
        doc_topic_tuples.append(
            lda_model.get_document_topics(document, minimum_probability=0))

    topic_document = np.zeros((num_topics, len(doc_topic_tuples)))

    for ndoc in range(len(doc_topic_tuples)):
        document = doc_topic_tuples[ndoc]
        for topic_tuple in document:
            topic_document[topic_tuple[0]][ndoc] = topic_tuple[1]
    return topic_document

result_lda_online['topic-document-matrix'] = _get_topic_document_matrix(lda_model, corpus, num_topics=20)

In [44]:
for t in result_lda_online['topics']:
    print(len(t))

10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10


In [28]:
lda_model.show_topics(num_topics=20, num_words=10, formatted=True, log=False)

[(0,
  '0.225*"cool" + 0.171*"life" + 0.062*"starbound" + 0.054*"epic" + 0.038*"hate" + 0.031*"xd" + 0.026*"lol" + 0.025*"real" + 0.023*"win" + 0.018*"consume"'),
 (1,
  '0.089*"game" + 0.055*"play" + 0.049*"hour" + 0.043*"new" + 0.042*"update" + 0.039*"time" + 0.038*"still" + 0.033*"get" + 0.032*"year" + 0.031*"come"'),
 (2,
  '0.049*"world" + 0.043*"go" + 0.029*"build" + 0.025*"dig" + 0.025*"make" + 0.023*"house" + 0.019*"hell" + 0.019*"find" + 0.016*"mine" + 0.015*"die"'),
 (3,
  '0.322*"love" + 0.177*"game" + 0.173*"would" + 0.090*"recommend" + 0.076*"anyone" + 0.060*"much" + 0.009*"gud" + 0.007*"edition" + 0.006*"bought" + 0.006*"animal"'),
 (4,
  '0.230*"best" + 0.204*"game" + 0.139*"play" + 0.137*"ever" + 0.115*"one" + 0.022*"ive" + 0.018*"sandbox" + 0.014*"addictive" + 0.012*"probably" + 0.012*"far"'),
 (5,
  '0.094*"pc" + 0.073*"dont" + 0.065*"im" + 0.054*"cant" + 0.046*"version" + 0.039*"xbox" + 0.037*"super" + 0.036*"computer" + 0.035*"op" + 0.034*"reccomend"'),
 (6,
  '0.05

In [39]:
result_lda_online['topic-document-matrix'][0]

array([0.00066725, 0.00555585, 0.02505902, ..., 0.00384636, 0.007143  ,
       0.00833436])

In [41]:
lda_model.get_topics().shape

(20, 35084)

In [29]:
np.sum(result_lda_online['topic-document-matrix'], axis=0)

array([1.00000004, 1.00000001, 0.99999997, ..., 1.00000001, 0.99999998,
       0.99999998])

In [45]:
from octis.evaluation_metrics.coherence_metrics import Coherence

# we first analysze NPMI

npmi = Coherence(texts=X_lemmatized, topk=10, measure='c_npmi')
nmpi_score = npmi.score(result_lda_online)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/michaelcheng/miniforge3/envs/fyp-test-tm/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/hm/n8540hpn7xd1f0pw3_qnsp4r0000gn/T/ipykernel_72691/3566859521.py", line 6, in <module>
    nmpi_score = npmi.score(result_lda_online)
  File "/Users/michaelcheng/miniforge3/envs/fyp-test-tm/lib/python3.9/site-packages/octis/evaluation_metrics/coherence_metrics.py", line 72, in score
    return npmi.get_coherence()
  File "/Users/michaelcheng/miniforge3/envs/fyp-test-tm/lib/python3.9/site-packages/gensim/models/coherencemodel.py", line 615, in get_coherence
    confirmed_measures = self.get_coherence_per_topic()
  File "/Users/michaelcheng/miniforge3/envs/fyp-test-tm/lib/python3.9/site-packages/gensim/models/coherencemodel.py", line 575, in get_coherence_per_topic
    self.estimate_probabilities(segmented_topics)
  File "/Users/michaelche

In [46]:
nmpi_score

NameError: name 'nmpi_score' is not defined

save model

we need to save the corpora.Dictionary and the LDA model

In [29]:
# save the LDA multicore model (and the corpora.Dictionary object) automatically
lda_model.save('lda_model')     # no need to add file extension

load model (both corpora Dictionary and the LDA model)

In [11]:
id2word_load = gensim.corpora.Dictionary.load('lda_model.id2word')

lda_model_load = gensim.models.ldamulticore.LdaMulticore.load('lda_model')

In [12]:
id2word = id2word_load
lda_model = lda_model_load

In [None]:
# test inference

corpus_load = [id2word_load.doc2bow(text) for text in inference_test]

output_load = lda_model_load[corpus_load]

for i in range(len(output_load)):
    # print(sorted(test_output[i], key=lambda x: x[1], reverse=True))
    print(sorted(output_load[i], key=lambda x: x[1], reverse=True))