# Load Data

In [1]:
import re
import ssl

import pandas as pd
import numpy as np
import tqdm
from scipy.stats import chi2_contingency

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

from langdetect import detect

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

from pprint import pprint


In [3]:
# load data
wo_long = pd.read_csv('/Users/javier/Desktop/cmgt-597/steam-review-data/wo_long_df.csv') # Wo Long: Fallen Dynasty

In [4]:
wo_long.shape

(3859, 8)

In [5]:
# filter helpful and actual reviews
# note: weighted_vote_score measures how helpful a review is. so reviews with a weighted_vote_score of 0 are filtered out

wo_long_df = wo_long[(wo_long['weighted_vote_score'] != 0.0) & # helpful reviews only
             (wo_long['playtime_at_review'] != 0.0) & # actual reviews only (can't review a game you never played!)
             (wo_long['review'].notnull()) # reviews with words only
             ]

wo_long_reviews = wo_long_df[['recommendationid', 'review', 'voted_up']]
wo_long_reviews

Unnamed: 0,recommendationid,review,voted_up
0,134018909,Saw all the reviews about performance and said...,False
1,153417645,I.... really don't know how to review this. On...,False
2,133962852,A good game buried deep beneath the biggest ca...,False
3,153249994,Performance was never fixed,False
4,134236783,After having completely finished this game and...,False
...,...,...,...
2673,133960443,'Mostly Nagative',False
2674,133957247,Dogshit Keybaord and mouse controls. And fuck ...,False
2675,134120443,PLEASE READ!!!\n\nThe people crying about perf...,True
2676,133931773,problem with the sensitivity of mouse & keyboa...,False


# Text Prep

In [6]:
# First tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether

def sent_to_words(sentences): # define function sent_to_words (sentences to words), argument is the dataframe and column of your reviews.
    for sentence in sentences: # for every sentence (review) in the sentences (review df)
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations  | # use the gensim package to tokenize your reviews

wo_long_reviews['data_words'] = list(sent_to_words(wo_long_reviews['review'])) # declare a new column in the lop_review df called 'data_words'
                                                                        # then use the sent_to_words function to tokenize the review column of the lop_review df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wo_long_reviews['data_words'] = list(sent_to_words(wo_long_reviews['review'])) # declare a new column in the lop_review df called 'data_words'


In [7]:
# After tokenization remove stoword and apply lemmatization 

stop_words = stopwords.words('english') # declare english as the stopword language
# stop_words.extend(['app', 'bank', 'account','try','issue','problem', 'face'])  #adding my own stop words

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): # define function process_words. arguments are the column of the tokenized words,
                                                                                                 # stop words library, allowed_postags
    # remove stop words
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts] # remove stop words from tokenized reviews
    texts_out = [] # define variable texts_out. it is an empty list.

    # Lemmatization
    nlp = spacy.load("en_core_web_sm",disable=['parser', 'ner']) # define variable nlp. this is for lemmatization
    for sent in texts: # for every sentence in texts (review df)
        doc = nlp(" ".join(sent)) # declare variable doc.
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) # append the lemmatized words to the texts_out empty list
    
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out] # remove stopwords again
    return texts_out # return the texts_out list

wo_long_reviews['data_ready'] = process_words(wo_long_reviews['data_words'])  # processed Text Data!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wo_long_reviews['data_ready'] = process_words(wo_long_reviews['data_words'])  # processed Text Data!


In [8]:
# cleaning the reviews further to remove non-English text

def detect_my(text):
    try:
        return detect(text)
    except:
        return 'unknown'
    
wo_long_reviews['language'] = wo_long_reviews['review'].apply(detect_my)
wo_long_reviews


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wo_long_reviews['language'] = wo_long_reviews['review'].apply(detect_my)


Unnamed: 0,recommendationid,review,voted_up,data_words,data_ready,language
0,134018909,Saw all the reviews about performance and said...,False,"[saw, all, the, reviews, about, performance, a...","[see, review, performance, say, bad, fact, bad]",en
1,153417645,I.... really don't know how to review this. On...,False,"[really, don, know, how, to, review, this, on,...","[really, know, review, pc, performance, bad, m...",en
2,133962852,A good game buried deep beneath the biggest ca...,False,"[good, game, buried, deep, beneath, the, bigge...","[good, game, bury, deep, beneath, big, catastr...",en
3,153249994,Performance was never fixed,False,"[performance, was, never, fixed]","[performance, never, fix]",en
4,134236783,After having completely finished this game and...,False,"[after, having, completely, finished, this, ga...","[completely, finish, game, get, achievment, bi...",en
...,...,...,...,...,...,...
2673,133960443,'Mostly Nagative',False,"[mostly, nagative]","[mostly, nagative]",en
2674,133957247,Dogshit Keybaord and mouse controls. And fuck ...,False,"[dogshit, keybaord, and, mouse, controls, and,...","[control, say, use, controller, reason, play, ...",en
2675,134120443,PLEASE READ!!!\n\nThe people crying about perf...,True,"[please, read, the, people, crying, about, per...","[read, people, cry, performance, good, pc, dis...",en
2676,133931773,problem with the sensitivity of mouse & keyboa...,False,"[problem, with, the, sensitivity, of, mouse, k...","[keyboard, get, fix, get, fuck]",en


In [9]:
language_check = wo_long_reviews['language'].value_counts()
language_check

language
en         2365
zh-cn        32
so           31
unknown      23
tl           20
de           18
no           18
af           13
cy           13
fr           13
ca           12
tr           11
pt            8
id            8
it            7
nl            6
et            6
es            6
vi            6
sw            6
da            6
sq            6
pl            5
ro            4
ar            4
hr            4
ko            4
sk            3
sl            3
ja            3
th            2
sv            2
lv            1
fi            1
zh-tw         1
ru            1
lt            1
Name: count, dtype: int64

In [23]:
file_name = 'wo-long-language2.csv'

wo_long_reviews.to_csv(f'{file_name}', index = False)
output_filepath = f'/Users/javier/Desktop/cmgt-597/wo-long/{file_name}'
wo_long_reviews.to_csv(output_filepath, index = False)

In [10]:
drop_languages = ['ar', 'ja', 'ko', 'lt', 'pt', 'ru', 'th', 'zh-cn', 'zh-tw'] # these review tagged with these languages are not in English

wo_long_reviews = wo_long_reviews[(~wo_long_reviews['language'].isin(drop_languages)) &
                          ~((wo_long_reviews['language'] == 'unknown') & (wo_long_reviews['data_ready'].apply(len) == 0))
                          ]

wo_long_reviews = wo_long_reviews.drop(columns='language')

In [11]:
wo_long_reviews

Unnamed: 0,recommendationid,review,voted_up,data_words,data_ready
0,134018909,Saw all the reviews about performance and said...,False,"[saw, all, the, reviews, about, performance, a...","[see, review, performance, say, bad, fact, bad]"
1,153417645,I.... really don't know how to review this. On...,False,"[really, don, know, how, to, review, this, on,...","[really, know, review, pc, performance, bad, m..."
2,133962852,A good game buried deep beneath the biggest ca...,False,"[good, game, buried, deep, beneath, the, bigge...","[good, game, bury, deep, beneath, big, catastr..."
3,153249994,Performance was never fixed,False,"[performance, was, never, fixed]","[performance, never, fix]"
4,134236783,After having completely finished this game and...,False,"[after, having, completely, finished, this, ga...","[completely, finish, game, get, achievment, bi..."
...,...,...,...,...,...
2673,133960443,'Mostly Nagative',False,"[mostly, nagative]","[mostly, nagative]"
2674,133957247,Dogshit Keybaord and mouse controls. And fuck ...,False,"[dogshit, keybaord, and, mouse, controls, and,...","[control, say, use, controller, reason, play, ..."
2675,134120443,PLEASE READ!!!\n\nThe people crying about perf...,True,"[please, read, the, people, crying, about, per...","[read, people, cry, performance, good, pc, dis..."
2676,133931773,problem with the sensitivity of mouse & keyboa...,False,"[problem, with, the, sensitivity, of, mouse, k...","[keyboard, get, fix, get, fuck]"


# Base LDA Model

In [12]:
# Create the Dictionary and Corpus needed for Topic Modelling

#  Dictionary
id2word = corpora.Dictionary(wo_long_reviews['data_ready'])

#  Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in wo_long_reviews['data_ready']]

In [13]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10, 
                                            random_state=100,
                                            chunksize=100,
                                            passes=10,
                                            per_word_topics=True)

pprint(lda_model.print_topics())

[(0,
  '0.028*"total" + 0.022*"gtx" + 0.016*"epic" + 0.012*"platinum" + '
  '0.009*"purchasing" + 0.004*"halfway" + 0.003*"judge" + 0.002*"refine" + '
  '0.002*"hinder" + 0.001*"monkey"'),
 (1,
  '0.013*"original" + 0.011*"device" + 0.011*"addict" + 0.010*"third" + '
  '0.010*"rpgs" + 0.009*"steep" + 0.008*"download" + 0.007*"forgot" + '
  '0.007*"curator" + 0.006*"experienced"'),
 (2,
  '0.019*"ridiculous" + 0.014*"gb" + 0.013*"click" + 0.010*"nightmare" + '
  '0.009*"superb" + 0.008*"hint" + 0.006*"pair" + 0.006*"frankly" + '
  '0.004*"tack" + 0.003*"acknowledge"'),
 (3,
  '0.044*"multiplayer" + 0.019*"generally" + 0.018*"downgrade" + 0.017*"home" '
  '+ 0.017*"blast" + 0.016*"frequent" + 0.015*"thus" + 0.013*"piss" + '
  '0.012*"entry" + 0.010*"thinking"'),
 (4,
  '0.028*"date" + 0.024*"exactly" + 0.022*"alot" + 0.018*"theme" + '
  '0.013*"tiger" + 0.012*"vary" + 0.012*"mixed" + 0.012*"conclusion" + '
  '0.011*"transfer" + 0.010*"recover"'),
 (5,
  '0.083*"game" + 0.045*"nioh" + 0.0

In [16]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=wo_long_reviews['data_ready'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.406638310903882

Coherence Score:  0.5099797978399756


# Hyperparameter Tuning

In [15]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # changed the LDA method to Rolly John's
                                                id2word=dictionary,
                                                num_topics=k, 
                                                random_state=100,
                                                chunksize=100,
                                                passes=10,
                                                alpha=a,
                                                eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=wo_long_reviews['data_ready'], dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [16]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)

    # pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
                    
    lda_tuning_results = pd.DataFrame(model_results)

    file_name = 'wo-long-lda-tuning-results2.csv'

    lda_tuning_results.to_csv(f'{file_name}', index = False)
    output_filepath = f'/Users/javier/Desktop/cmgt-597/wo-long/{file_name}'
    lda_tuning_results.to_csv(output_filepath, index = False)
    
    pbar.close()

100%|██████████| 540/540 [55:49<00:00,  6.20s/it]


In [17]:
lda_tuning_results = pd.read_csv('/Users/javier/Desktop/cmgt-597/wo-long/wo-long-lda-tuning-results2.csv')
lda_tuning_results

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,75% Corpus,2,0.01,0.01,0.399168
1,75% Corpus,2,0.01,0.31,0.389871
2,75% Corpus,2,0.01,0.61,0.391345
3,75% Corpus,2,0.01,0.9099999999999999,0.398632
4,75% Corpus,2,0.01,symmetric,0.391345
...,...,...,...,...,...
535,100% Corpus,10,asymmetric,0.01,0.342671
536,100% Corpus,10,asymmetric,0.31,0.450866
537,100% Corpus,10,asymmetric,0.61,0.483881
538,100% Corpus,10,asymmetric,0.9099999999999999,0.580819


# Final Model

In [17]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=7, 
                                            random_state=100,
                                            chunksize=100,
                                            passes=10,
                                            alpha=0.9,
                                            eta=0.9)
                                            #the alpha and beta will yield the highest coherence score, according to the tuning results
                                            

pprint(lda_model.print_topics())

# 7 0.9 0.9


[(0,
  '0.003*"bing" + 0.003*"chill" + 0.003*"qe" + 0.001*"bu" + 0.000*"jian" + '
  '0.000*"jiu" + 0.000*"zai" + 0.000*"bleh" + 0.000*"jiqe" + 0.000*"jide"'),
 (1,
  '0.001*"child" + 0.001*"baby" + 0.000*"bleh" + 0.000*"sekiro" + '
  '0.000*"trash" + 0.000*"gamer" + 0.000*"moron" + 0.000*"mkb" + '
  '0.000*"pronoun" + 0.000*"grass"'),
 (2,
  '0.075*"game" + 0.026*"play" + 0.014*"good" + 0.014*"review" + 0.014*"run" + '
  '0.013*"issue" + 0.012*"performance" + 0.012*"pc" + 0.011*"controller" + '
  '0.011*"get"'),
 (3,
  '0.002*"grind" + 0.001*"word" + 0.001*"paint" + 0.001*"spare" + '
  '0.001*"assez" + 0.001*"evolve" + 0.001*"adult" + 0.001*"boi" + '
  '0.000*"audience" + 0.000*"template"'),
 (4,
  '0.000*"bleh" + 0.000*"sekiro" + 0.000*"gamer" + 0.000*"hr" + 0.000*"trash" '
  '+ 0.000*"moron" + 0.000*"mkb" + 0.000*"pronoun" + 0.000*"teh" + '
  '0.000*"grass"'),
 (5,
  '0.034*"game" + 0.020*"nioh" + 0.014*"boss" + 0.010*"get" + 0.009*"feel" + '
  '0.009*"combat" + 0.009*"make" + 0.008*

In [18]:
# Final Model Evaluation
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=wo_long_reviews['data_ready'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.1850457059346855

Coherence Score:  0.5602101970066972


# Topic-Review Allocation

In [19]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    # Initialize an empty list to store data
    data = []

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Initialize variables to store topic information
        dominant_topic = None
        perc_contribution = None
        topic_keywords = None
        
        # Loop through topics in the row
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_topic = int(topic_num)
                perc_contribution = round(prop_topic, 4)
                break
        
        # Store the extracted information
        data.append([dominant_topic, perc_contribution, topic_keywords])

    # Convert the list of data into a DataFrame
    sent_topics_df = pd.DataFrame(data, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    contents.reset_index(drop=True, inplace=True)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Call the function with appropriate arguments
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=wo_long_reviews.review)

# Format the DataFrame
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'review']

# Display the first 5 rows of the DataFrame
display(df_dominant_topic.head(5))


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,review
0,0,2,0.5523,"game, play, good, review, run, issue, performa...",Saw all the reviews about performance and said...
1,1,2,0.7434,"game, play, good, review, run, issue, performa...",I.... really don't know how to review this. On...
2,2,2,0.7135,"game, play, good, review, run, issue, performa...",A good game buried deep beneath the biggest ca...
3,3,2,0.395,"game, play, good, review, run, issue, performa...",Performance was never fixed
4,4,5,0.807,"game, nioh, boss, get, feel, combat, make, sys...",After having completely finished this game and...


In [20]:
topic_count = df_dominant_topic['Dominant_Topic'].value_counts()
topic_count

Dominant_Topic
2    1495
5    1017
0      43
6      18
4       9
3       9
1       6
Name: count, dtype: int64

In [21]:
wo_long_reviews = df_dominant_topic.merge(wo_long_reviews, how='left')
wo_long_reviews.drop_duplicates(subset=['Document_No'], keep='first', inplace=True)

In [22]:
wo_long_reviews = wo_long_reviews[['Document_No', 'review', 'data_words', 'data_ready', 'Keywords', 'Dominant_Topic', 'Topic_Perc_Contrib', 'voted_up']]

In [23]:
# final dataframe
wo_long_reviews

Unnamed: 0,Document_No,review,data_words,data_ready,Keywords,Dominant_Topic,Topic_Perc_Contrib,voted_up
0,0,Saw all the reviews about performance and said...,"[saw, all, the, reviews, about, performance, a...","[see, review, performance, say, bad, fact, bad]","game, play, good, review, run, issue, performa...",2,0.5523,False
1,1,I.... really don't know how to review this. On...,"[really, don, know, how, to, review, this, on,...","[really, know, review, pc, performance, bad, m...","game, play, good, review, run, issue, performa...",2,0.7434,False
2,2,A good game buried deep beneath the biggest ca...,"[good, game, buried, deep, beneath, the, bigge...","[good, game, bury, deep, beneath, big, catastr...","game, play, good, review, run, issue, performa...",2,0.7135,False
3,3,Performance was never fixed,"[performance, was, never, fixed]","[performance, never, fix]","game, play, good, review, run, issue, performa...",2,0.3950,False
4,4,After having completely finished this game and...,"[after, having, completely, finished, this, ga...","[completely, finish, game, get, achievment, bi...","game, nioh, boss, get, feel, combat, make, sys...",5,0.8070,False
...,...,...,...,...,...,...,...,...
2672,2592,'Mostly Nagative',"[mostly, nagative]","[mostly, nagative]","game, nioh, boss, get, feel, combat, make, sys...",5,0.1650,False
2673,2593,Dogshit Keybaord and mouse controls. And fuck ...,"[dogshit, keybaord, and, mouse, controls, and,...","[control, say, use, controller, reason, play, ...","game, play, good, review, run, issue, performa...",2,0.5900,False
2674,2594,PLEASE READ!!!\n\nThe people crying about perf...,"[please, read, the, people, crying, about, per...","[read, people, cry, performance, good, pc, dis...","game, play, good, review, run, issue, performa...",2,0.7768,True
2675,2595,problem with the sensitivity of mouse & keyboa...,"[problem, with, the, sensitivity, of, mouse, k...","[keyboard, get, fix, get, fuck]","game, play, good, review, run, issue, performa...",2,0.4322,False


In [124]:
file_name = 'wo-long-lda2.csv'

wo_long_reviews.to_csv(f'{file_name}', index = False)
output_filepath = f'/Users/javier/Desktop/cmgt-597/wo-long/{file_name}'
wo_long_reviews.to_csv(output_filepath, index = False)

# Data Analysis

In [25]:
wo_long_lda_final = pd.read_csv('/Users/javier/Desktop/cmgt-597/wo-long/wo-long-lda2.csv')
wo_long_lda_final

Unnamed: 0,Document_No,review,data_words,data_ready,Keywords,Dominant_Topic,Topic_Perc_Contrib,voted_up
0,0,Saw all the reviews about performance and said...,"['saw', 'all', 'the', 'reviews', 'about', 'per...","['see', 'review', 'performance', 'say', 'bad',...","game, play, good, review, run, issue, performa...",2,0.5524,False
1,1,I.... really don't know how to review this. On...,"['really', 'don', 'know', 'how', 'to', 'review...","['really', 'know', 'review', 'pc', 'performanc...","game, play, good, review, run, issue, performa...",2,0.7434,False
2,2,A good game buried deep beneath the biggest ca...,"['good', 'game', 'buried', 'deep', 'beneath', ...","['good', 'game', 'bury', 'deep', 'beneath', 'b...","game, play, good, review, run, issue, performa...",2,0.7135,False
3,3,Performance was never fixed,"['performance', 'was', 'never', 'fixed']","['performance', 'never', 'fix']","game, play, good, review, run, issue, performa...",2,0.3950,False
4,4,After having completely finished this game and...,"['after', 'having', 'completely', 'finished', ...","['completely', 'finish', 'game', 'get', 'achie...","game, nioh, boss, get, feel, combat, make, sys...",5,0.8070,False
...,...,...,...,...,...,...,...,...
2592,2592,'Mostly Nagative',"['mostly', 'nagative']","['mostly', 'nagative']","game, nioh, boss, get, feel, combat, make, sys...",5,0.1650,False
2593,2593,Dogshit Keybaord and mouse controls. And fuck ...,"['dogshit', 'keybaord', 'and', 'mouse', 'contr...","['control', 'say', 'use', 'controller', 'reaso...","game, play, good, review, run, issue, performa...",2,0.5901,False
2594,2594,PLEASE READ!!!\n\nThe people crying about perf...,"['please', 'read', 'the', 'people', 'crying', ...","['read', 'people', 'cry', 'performance', 'good...","game, play, good, review, run, issue, performa...",2,0.7770,True
2595,2595,problem with the sensitivity of mouse & keyboa...,"['problem', 'with', 'the', 'sensitivity', 'of'...","['keyboard', 'get', 'fix', 'get', 'fuck']","game, play, good, review, run, issue, performa...",2,0.4322,False


In [26]:
# distribution of topics across reviews

topic_count = wo_long_lda_final['Dominant_Topic'].value_counts()
topic_count

Dominant_Topic
2    1496
5    1016
0      43
6      19
3       9
4       8
1       6
Name: count, dtype: int64

In [27]:
# Visualize the topics

import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

---